source: trunk/src/trig.cpp @ 902

Last change on this file since 902 was 902, checked in by sam, 11 years ago

core: fix PS3 compilation; the lol_fdiv implementation was missing.

File size: 14.7 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#if defined HAVE_FASTMATH_H
16#   include <fastmath.h>
17#endif
18
19#include "core.h"
20
21using namespace std;
22
23namespace lol
24{
25
26static const double PI     = 3.14159265358979323846264;
27static const double NEG_PI = -3.14159265358979323846264;
28static const double PI_2   = 1.57079632679489661923132;
29static const double PI_4   = 0.785398163397448309615661;
30static const double INV_PI = 0.318309886183790671537768;
31static const double ROOT3  = 1.73205080756887729352745;
32
33static const double ZERO    = 0.0;
34static const double ONE     = 1.0;
35static const double NEG_ONE = -1.0;
36static const double HALF    = 0.5;
37static const double QUARTER = 0.25;
38static const double TWO     = 2.0;
39static const double VERY_SMALL_NUMBER = 0x1.0p-128;
40static const double TWO_EXP_52 = 4503599627370496.0;
41static const double TWO_EXP_54 = 18014398509481984.0;
42
43/** sin Taylor series coefficients. */
44static const double SC[] =
45{
46    -1.6449340668482264364724e-0, // π^2/3!
47    +8.1174242528335364363700e-1, // π^4/5!
48    -1.9075182412208421369647e-1, // π^6/7!
49    +2.6147847817654800504653e-2, // π^8/9!
50    -2.3460810354558236375089e-3, // π^10/11!
51    +1.4842879303107100368487e-4, // π^12/13!
52    -6.9758736616563804745344e-6, // π^14/15!
53    +2.5312174041370276513517e-7, // π^16/17!
54};
55
56/* Note: the last value should be -1.3878952462213772114468e-7 (ie.
57 * π^18/18!) but we tweak it in order to get the better average precision
58 * required for tan() computations when close to π/2+kπ values. */
59static const double CC[] =
60{
61    -4.9348022005446793094172e-0, // π^2/2!
62    +4.0587121264167682181850e-0, // π^4/4!
63    -1.3352627688545894958753e-0, // π^6/6!
64    +2.3533063035889320454188e-1, // π^8/8!
65    -2.5806891390014060012598e-2, // π^10/10!
66    +1.9295743094039230479033e-3, // π^12/12!
67    -1.0463810492484570711802e-4, // π^14/14!
68    +4.3030695870329470072978e-6, // π^16/16!
69    -1.3777e-7,
70};
71
72/* These coefficients use Sloane’s http://oeis.org/A002430 and
73 * http://oeis.org/A036279 sequences for the Taylor series of tan().
74 * Note: the last value should be 2.12485922978838540352881e5 (ie.
75 * 443861162*π^18/1856156927625), but we tweak it in order to get
76 * sub 1e-11 average precision in a larger range. */
77static const double TC[] =
78{
79    3.28986813369645287294483e0, // π^2/3
80    1.29878788045336582981920e1, // 2*π^4/15
81    5.18844961612069061254404e1, // 17*π^6/315
82    2.07509320280908496804928e2, // 62*π^8/2835
83    8.30024701695986756361561e2, // 1382*π^10/155925
84    3.32009324029001216460018e3, // 21844*π^12/6081075
85    1.32803704909665483598490e4, // 929569*π^14/638512875
86    5.31214808666037709352112e4, // 6404582*π^16/10854718875
87    2.373e5,
88};
89
90/* Custom intrinsics */
91#define INLINEATTR __attribute__((always_inline))
92
93#if defined __CELLOS_LV2__
94static inline double lol_fctid(double x) INLINEATTR;
95static inline double lol_fctidz(double x) INLINEATTR;
96static inline double lol_fcfid(double x) INLINEATTR;
97static inline double lol_frsqrte(double x) INLINEATTR;
98static inline double lol_fsel(double c, double gte, double lt) INLINEATTR;
99static inline double lol_fres(double x) INLINEATTR;
100static inline double lol_fdiv(double a, double b) INLINEATTR;
101#endif
102static inline double lol_fabs(double x) INLINEATTR;
103static inline double lol_round(double x) INLINEATTR;
104static inline double lol_trunc(double x) INLINEATTR;
105
106#if defined __CELLOS_LV2__
107static inline double lol_fctid(double x)
108{
109    double r;
110#if defined __SNC__
111    r = __builtin_fctid(x);
112#else
113    __asm__ ("fctid %0, %1"
114             : "=f" (r) : "f" (x));
115#endif
116    return r;
117}
118
119static double lol_fctidz(double x)
120{
121    double r;
122#if defined __SNC__
123    r = __builtin_fctidz(x);
124#else
125    __asm__ ("fctidz %0, %1"
126             : "=f" (r) : "f" (x));
127#endif
128    return r;
129}
130
131static double lol_fcfid(double x)
132{
133    double r;
134#if defined __SNC__
135    r = __builtin_fcfid(x);
136#else
137    __asm__ ("fcfid %0, %1"
138             : "=f" (r) : "f" (x));
139#endif
140    return r;
141}
142
143static double lol_frsqrte(double x)
144{
145#if defined __SNC__
146    return __builtin_frsqrte(x);
147#else
148    double r;
149    __asm__ ("frsqrte %0, %1"
150             : "=f" (r) : "f" (x));
151    return r;
152#endif
153}
154
155static inline double lol_fsel(double c, double gte, double lt)
156{
157#if defined __CELLOS_LV2__ && defined __SNC__
158    return __fsel(c, gte, lt);
159#elif defined __CELLOS_LV2__
160    double r;
161    __asm__ ("fsel %0, %1, %2, %3"
162             : "=f" (r) : "f" (c), "f" (gte), "f" (lt));
163    return r;
164#else
165    return (c >= 0) ? gte : lt;
166#endif
167}
168
169static inline double lol_fres(double x)
170{
171    double ret;
172#if defined __SNC__
173    ret = __builtin_fre(x);
174#else
175    __asm__ ("fres %0, %1"
176             : "=f" (ret) : "f" (x));
177#endif
178    return ret;
179}
180
181static inline double lol_fdiv(double a, double b)
182{
183    /* Estimate */
184    double x0 = lol_fres(b);
185
186    /* Two steps of Newton-Raphson */
187    x0 = (b * x0 - ONE) * -x0 + x0;
188    x0 = (b * x0 - ONE) * -x0 + x0;
189
190    return a * x0;
191}
192#endif /* __CELLOS_LV2__ */
193
194static inline double lol_fabs(double x)
195{
196#if defined __CELLOS_LV2__ && defined __SNC__
197    return __fabs(x);
198#elif defined __CELLOS_LV2__
199    double r;
200    __asm__ ("fabs %0, %1"
201             : "=f" (r) : "f" (x));
202    return r;
203#else
204    return __builtin_fabs(x);
205#endif
206}
207
208static inline double lol_round(double x)
209{
210#if defined __CELLOS_LV2__
211    return lol_fcfid(lol_fctid(x));
212#else
213    return __builtin_round(x);
214#endif
215}
216
217static inline double lol_trunc(double x)
218{
219#if defined __CELLOS_LV2__
220    return lol_fcfid(lol_fctidz(x));
221#else
222    return __builtin_trunc(x);
223#endif
224}
225
226double lol_sin(double x)
227{
228    double absx = lol_fabs(x * INV_PI);
229
230    /* If branches are cheap, skip the cycle count when |x| < π/4,
231     * and only do the Taylor series up to the required precision. */
232#if defined LOL_FEATURE_CHEAP_BRANCHES
233    if (absx < QUARTER)
234    {
235        /* Computing x^4 is one multiplication too many we do, but it helps
236         * interleave the Taylor series operations a lot better. */
237        double x2 = absx * absx;
238        double x4 = x2 * x2;
239        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
240        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
241        double taylor = sub2 * x2 + sub1;
242        return x * taylor;
243    }
244#endif
245
246    /* Wrap |x| to the range [-1, 1] and keep track of the number of
247     * cycles required. If odd, we'll need to change the sign of the
248     * result. */
249#if defined __CELLOS_LV2__
250    double sign = lol_fsel(x, PI, NEG_PI);
251    double num_cycles = lol_round(absx);
252    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
253    sign = lol_fsel(is_even, sign, -sign);
254#else
255    double num_cycles = absx + TWO_EXP_52;
256    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
257
258    double is_even = TWO * num_cycles - ONE;
259    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
260    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
261    __asm__("" : "+m" (is_even));
262    is_even -= TWO * num_cycles - ONE;
263    double sign = is_even;
264#endif
265    absx -= num_cycles;
266
267    /* If branches are very cheap, we have the option to do the Taylor
268     * series at a much lower degree by splitting. */
269#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
270    if (lol_fabs(absx) > QUARTER)
271    {
272        sign = (x * absx >= 0.0) ? sign : -sign;
273
274        double x1 = HALF - lol_fabs(absx);
275        double x2 = x1 * x1;
276        double x4 = x2 * x2;
277        double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
278        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
279        double taylor = sub2 * x2 + sub1;
280
281        return taylor * sign;
282    }
283#endif
284
285    /* Compute a Tailor series for sin() and combine sign information. */
286    sign *= (x >= 0.0) ? PI : NEG_PI;
287
288    double x2 = absx * absx;
289    double x4 = x2 * x2;
290#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
291    double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
292    double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
293#else
294    double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
295    double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
296#endif
297    double taylor = sub2 * x2 + sub1;
298
299    return absx * taylor * sign;
300}
301
302double lol_cos(double x)
303{
304    double absx = lol_fabs(x * INV_PI);
305
306#if defined LOL_FEATURE_CHEAP_BRANCHES
307    if (absx < QUARTER)
308    {
309        double x2 = absx * absx;
310        double x4 = x2 * x2;
311        double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
312        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
313        double taylor = (sub1 * x2 + sub2) * x2 + ONE;
314        return taylor;
315    }
316#endif
317
318#if defined __CELLOS_LV2__
319    double num_cycles = lol_round(absx);
320    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
321    double sign = lol_fsel(is_even, ONE, NEG_ONE);
322#else
323    double num_cycles = absx + TWO_EXP_52;
324    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
325
326    double is_even = TWO * num_cycles - ONE;
327    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
328    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
329    __asm__("" : "+m" (is_even));
330    is_even -= TWO * num_cycles - ONE;
331    double sign = is_even;
332#endif
333    absx -= num_cycles;
334
335#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
336    if (lol_fabs(absx) > QUARTER)
337    {
338        double x1 = HALF - lol_fabs(absx);
339        double x2 = x1 * x1;
340        double x4 = x2 * x2;
341        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
342        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
343        double taylor = sub2 * x2 + sub1;
344
345        return x1 * taylor * sign * PI;
346    }
347#endif
348
349    double x2 = absx * absx;
350    double x4 = x2 * x2;
351#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
352    double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
353    double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
354#else
355    double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
356    double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
357#endif
358    double taylor = sub2 * x2 + sub1;
359
360    return taylor * sign;
361}
362
363void lol_sincos(double x, double *sinx, double *cosx)
364{
365    double absx = lol_fabs(x * INV_PI);
366
367#if defined LOL_FEATURE_CHEAP_BRANCHES
368    if (absx < QUARTER)
369    {
370        double x2 = absx * absx;
371        double x4 = x2 * x2;
372
373        /* Computing the Taylor series to the 11th order is enough to get
374         * x * 1e-11 precision, but we push it to the 13th order so that
375         * tan() has a better precision. */
376        double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
377        double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
378        double taylors = subs2 * x2 + subs1;
379        *sinx = x * taylors;
380
381        double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
382        double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
383        double taylorc = (subc1 * x2 + subc2) * x2 + ONE;
384        *cosx = taylorc;
385
386        return;
387    }
388#endif
389
390#if defined __CELLOS_LV2__
391    double num_cycles = lol_round(absx);
392    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
393
394    double sin_sign = lol_fsel(x, PI, NEG_PI);
395    sin_sign = lol_fsel(is_even, sin_sign, -sin_sign);
396    double cos_sign = lol_fsel(is_even, ONE, NEG_ONE);
397#else
398    double num_cycles = absx + TWO_EXP_52;
399    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
400
401    double is_even = TWO * num_cycles - ONE;
402    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
403    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
404    __asm__("" : "+m" (is_even));
405    is_even -= TWO * num_cycles - ONE;
406    double sin_sign = is_even;
407    double cos_sign = is_even;
408#endif
409    absx -= num_cycles;
410
411#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
412    if (lol_fabs(absx) > QUARTER)
413    {
414        cos_sign = sin_sign;
415        sin_sign = (x * absx >= 0.0) ? sin_sign : -sin_sign;
416
417        double x1 = HALF - lol_fabs(absx);
418        double x2 = x1 * x1;
419        double x4 = x2 * x2;
420
421        double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
422        double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
423        double taylors = subs2 * x2 + subs1;
424        *sinx = taylors * sin_sign;
425
426        double subc1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
427        double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
428        double taylorc = subc2 * x2 + subc1;
429        *cosx = x1 * taylorc * cos_sign * PI;
430
431        return;
432    }
433#endif
434
435#if !defined __CELLOS_LV2__
436    sin_sign *= (x >= 0.0) ? PI : NEG_PI;
437#endif
438
439    double x2 = absx * absx;
440    double x4 = x2 * x2;
441#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
442    double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
443    double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
444    double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
445    double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
446#else
447    double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
448    double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
449    /* Push Taylor series to the 19th order to enhance tan() accuracy. */
450    double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
451    double subc2 = (((CC[8] * x4 + CC[6]) * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
452#endif
453    double taylors = subs2 * x2 + subs1;
454    *sinx = absx * taylors * sin_sign;
455
456    double taylorc = subc2 * x2 + subc1;
457    *cosx = taylorc * cos_sign;
458}
459
460void lol_sincos(float x, float *sinx, float *cosx)
461{
462    double x2 = static_cast<double>(x);
463    double s2, c2;
464    lol_sincos(x2, &s2, &c2);
465    *sinx = static_cast<float>(s2);
466    *cosx = static_cast<float>(c2);
467}
468
469double lol_tan(double x)
470{
471#if defined LOL_FEATURE_CHEAP_BRANCHES
472    double absx = lol_fabs(x * INV_PI);
473
474    /* This value was determined empirically to ensure an error of no
475     * more than x * 1e-11 in this range. */
476    if (absx < 0.163)
477    {
478        double x2 = absx * absx;
479        double x4 = x2 * x2;
480        double sub1 = (((TC[7] * x4 + TC[5]) * x4
481                           + TC[3]) * x4 + TC[1]) * x4 + ONE;
482        double sub2 = (((TC[8] * x4 + TC[6]) * x4
483                           + TC[4]) * x4 + TC[2]) * x4 + TC[0];
484        double taylor = sub2 * x2 + sub1;
485        return x * taylor;
486    }
487#endif
488
489    double sinx, cosx;
490    lol_sincos(x, &sinx, &cosx);
491
492    /* Ensure cosx isn't zero. FIXME: we lose the cosx sign here. */
493    double absc = lol_fabs(cosx);
494#if defined __CELLOS_LV2__
495    double is_cos_not_zero = absc - VERY_SMALL_NUMBER;
496    cosx = lol_fsel(is_cos_not_zero, cosx, VERY_SMALL_NUMBER);
497    return lol_fdiv(sinx, cosx);
498#else
499    if (__unlikely(absc < VERY_SMALL_NUMBER))
500        cosx = VERY_SMALL_NUMBER;
501    return sinx / cosx;
502#endif
503}
504
505} /* namespace lol */
506
Note: See TracBrowser for help on using the repository browser.