source: trunk/src/trig.cpp @ 900

Last change on this file since 900 was 900, checked in by sam, 9 years ago

core: improve tan() accuracy by tweaking higher order Taylor coefficients.

File size: 14.1 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#if defined HAVE_FASTMATH_H
16#   include <fastmath.h>
17#endif
18
19#include "core.h"
20
21using namespace std;
22
23namespace lol
24{
25
26static const double PI     = 3.14159265358979323846264;
27static const double NEG_PI = -3.14159265358979323846264;
28static const double PI_2   = 1.57079632679489661923132;
29static const double PI_4   = 0.785398163397448309615661;
30static const double INV_PI = 0.318309886183790671537768;
31static const double ROOT3  = 1.73205080756887729352745;
32
33static const double ZERO    = 0.0;
34static const double ONE     = 1.0;
35static const double NEG_ONE = -1.0;
36static const double HALF    = 0.5;
37static const double QUARTER = 0.25;
38static const double TWO     = 2.0;
39static const double VERY_SMALL_NUMBER = 0x1.0p-128;
40static const double TWO_EXP_52 = 4503599627370496.0;
41static const double TWO_EXP_54 = 18014398509481984.0;
42
43/** sin Taylor series coefficients. */
44static const double SC[] =
45{
46    -1.6449340668482264364724e-0, // π^2/3!
47    +8.1174242528335364363700e-1, // π^4/5!
48    -1.9075182412208421369647e-1, // π^6/7!
49    +2.6147847817654800504653e-2, // π^8/9!
50    -2.3460810354558236375089e-3, // π^10/11!
51    +1.4842879303107100368487e-4, // π^12/13!
52    -6.9758736616563804745344e-6, // π^14/15!
53    +2.5312174041370276513517e-7, // π^16/17!
54};
55
56/* Note: the last value should be -1.3878952462213772114468e-7 (ie.
57 * π^18/18!) but we tweak it in order to get the better average precision
58 * required for tan() computations when close to π/2+kπ values. */
59static const double CC[] =
60{
61    -4.9348022005446793094172e-0, // π^2/2!
62    +4.0587121264167682181850e-0, // π^4/4!
63    -1.3352627688545894958753e-0, // π^6/6!
64    +2.3533063035889320454188e-1, // π^8/8!
65    -2.5806891390014060012598e-2, // π^10/10!
66    +1.9295743094039230479033e-3, // π^12/12!
67    -1.0463810492484570711802e-4, // π^14/14!
68    +4.3030695870329470072978e-6, // π^16/16!
69    -1.3777e-7,
70};
71
72/* These coefficients use Sloane’s http://oeis.org/A002430 and
73 * http://oeis.org/A036279 sequences for the Taylor series of tan().
74 * Note: the last value should be 2.12485922978838540352881e5 (ie.
75 * 443861162*π^18/1856156927625), but we tweak it in order to get
76 * sub 1e-11 average precision in a larger range. */
77static const double TC[] =
78{
79    3.28986813369645287294483e0, // π^2/3
80    1.29878788045336582981920e1, // 2*π^4/15
81    5.18844961612069061254404e1, // 17*π^6/315
82    2.07509320280908496804928e2, // 62*π^8/2835
83    8.30024701695986756361561e2, // 1382*π^10/155925
84    3.32009324029001216460018e3, // 21844*π^12/6081075
85    1.32803704909665483598490e4, // 929569*π^14/638512875
86    5.31214808666037709352112e4, // 6404582*π^16/10854718875
87    2.373e5,
88};
89
90/* Custom intrinsics */
91#define INLINEATTR __attribute__((always_inline))
92
93#if defined __CELLOS_LV2__
94static inline double lol_fctid(double x) INLINEATTR;
95static inline double lol_fctidz(double x) INLINEATTR;
96static inline double lol_fcfid(double x) INLINEATTR;
97static inline double lol_frsqrte(double x) INLINEATTR;
98#endif
99static inline double lol_fsel(double c, double gte, double lt) INLINEATTR;
100static inline double lol_fabs(double x) INLINEATTR;
101static inline double lol_round(double x) INLINEATTR;
102static inline double lol_trunc(double x) INLINEATTR;
103static inline double lol_round(double x) INLINEATTR;
104static inline double lol_trunc(double x) INLINEATTR;
105
106#if defined __CELLOS_LV2__
107static inline double lol_fctid(double x)
108{
109    double r;
110#if defined __SNC__
111    r = __builtin_fctid(x);
112#else
113    __asm__ ("fctid %0, %1"
114             : "=f"(r) : "f"(x));
115#endif
116    return r;
117}
118
119static double lol_fctidz(double x)
120{
121    double r;
122#if defined __SNC__
123    r = __builtin_fctidz(x);
124#else
125    __asm__ ("fctidz %0, %1"
126             : "=f"(r) : "f"(x));
127#endif
128    return r;
129}
130
131static double lol_fcfid(double x)
132{
133    double r;
134#if defined __SNC__
135    r = __builtin_fcfid(x);
136#else
137    __asm__ ("fcfid %0, %1"
138             : "=f"(r) : "f"(x));
139#endif
140    return r;
141}
142
143static double lol_frsqrte(double x)
144{
145#if defined __SNC__
146    return __builtin_frsqrte(x);
147#else
148    double r;
149    __asm__ ("frsqrte %0, %1"
150             : "=f"(r) : "f"(x));
151    return r;
152#endif
153}
154#endif /* __CELLOS_LV2__ */
155
156static inline double lol_fsel(double c, double gte, double lt)
157{
158#if defined __CELLOS_LV2__ && defined __SNC__
159    return __fsel(c, gte, lt);
160#elif defined __CELLOS_LV2__
161    double r;
162    __asm__ ("fsel %0, %1, %2, %3"
163             : "=f"(r) : "f"(c), "f"(gte), "f"(lt));
164    return r;
165#else
166    return (c >= 0) ? gte : lt;
167#endif
168}
169
170static inline double lol_fabs(double x)
171{
172#if defined __CELLOS_LV2__ && defined __SNC__
173    return __fabs(x);
174#elif defined __CELLOS_LV2__
175    double r;
176    __asm__ ("fabs %0, %1"
177             : "=f"(r) : "f"(x));
178    return r;
179#else
180    return __builtin_fabs(x);
181#endif
182}
183
184static inline double lol_round(double x)
185{
186#if defined __CELLOS_LV2__
187    return lol_fcfid(lol_fctid(x));
188#else
189    return __builtin_round(x);
190#endif
191}
192
193static inline double lol_trunc(double x)
194{
195#if defined __CELLOS_LV2__
196    return lol_fcfid(lol_fctidz(x));
197#else
198    return __builtin_trunc(x);
199#endif
200}
201
202double lol_sin(double x)
203{
204    double absx = lol_fabs(x * INV_PI);
205
206    /* If branches are cheap, skip the cycle count when |x| < π/4,
207     * and only do the Taylor series up to the required precision. */
208#if defined LOL_FEATURE_CHEAP_BRANCHES
209    if (absx < QUARTER)
210    {
211        /* Computing x^4 is one multiplication too many we do, but it helps
212         * interleave the Taylor series operations a lot better. */
213        double x2 = absx * absx;
214        double x4 = x2 * x2;
215        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
216        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
217        double taylor = sub2 * x2 + sub1;
218        return x * taylor;
219    }
220#endif
221
222    /* Wrap |x| to the range [-1, 1] and keep track of the number of
223     * cycles required. If odd, we'll need to change the sign of the
224     * result. */
225#if defined __CELLOS_LV2__
226    double sign = lol_fsel(x, PI, NEG_PI);
227    double num_cycles = lol_round(absx);
228    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
229    sign = lol_fsel(is_even, sign, -sign);
230#else
231    double num_cycles = absx + TWO_EXP_52;
232    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
233
234    double is_even = TWO * num_cycles - ONE;
235    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
236    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
237    __asm__("" : "+m" (is_even));
238    is_even -= TWO * num_cycles - ONE;
239    double sign = is_even;
240#endif
241    absx -= num_cycles;
242
243    /* If branches are very cheap, we have the option to do the Taylor
244     * series at a much lower degree by splitting. */
245#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
246    if (lol_fabs(absx) > QUARTER)
247    {
248        sign = (x * absx >= 0.0) ? sign : -sign;
249
250        double x1 = HALF - lol_fabs(absx);
251        double x2 = x1 * x1;
252        double x4 = x2 * x2;
253        double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
254        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
255        double taylor = sub2 * x2 + sub1;
256
257        return taylor * sign;
258    }
259#endif
260
261    /* Compute a Tailor series for sin() and combine sign information. */
262    sign *= (x >= 0.0) ? PI : NEG_PI;
263
264    double x2 = absx * absx;
265    double x4 = x2 * x2;
266#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
267    double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
268    double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
269#else
270    double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
271    double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
272#endif
273    double taylor = sub2 * x2 + sub1;
274
275    return absx * taylor * sign;
276}
277
278double lol_cos(double x)
279{
280    double absx = lol_fabs(x * INV_PI);
281
282#if defined LOL_FEATURE_CHEAP_BRANCHES
283    if (absx < QUARTER)
284    {
285        double x2 = absx * absx;
286        double x4 = x2 * x2;
287        double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
288        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
289        double taylor = (sub1 * x2 + sub2) * x2 + ONE;
290        return taylor;
291    }
292#endif
293
294#if defined __CELLOS_LV2__
295    double num_cycles = lol_round(absx);
296    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
297    double sign = lol_fsel(is_even, ONE, NEG_ONE);
298#else
299    double num_cycles = absx + TWO_EXP_52;
300    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
301
302    double is_even = TWO * num_cycles - ONE;
303    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
304    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
305    __asm__("" : "+m" (is_even));
306    is_even -= TWO * num_cycles - ONE;
307    double sign = is_even;
308#endif
309    absx -= num_cycles;
310
311#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
312    if (lol_fabs(absx) > QUARTER)
313    {
314        double x1 = HALF - lol_fabs(absx);
315        double x2 = x1 * x1;
316        double x4 = x2 * x2;
317        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
318        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
319        double taylor = sub2 * x2 + sub1;
320
321        return x1 * taylor * sign * PI;
322    }
323#endif
324
325    double x2 = absx * absx;
326    double x4 = x2 * x2;
327#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
328    double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
329    double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
330#else
331    double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
332    double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
333#endif
334    double taylor = sub2 * x2 + sub1;
335
336    return taylor * sign;
337}
338
339void lol_sincos(double x, double *sinx, double *cosx)
340{
341    double absx = lol_fabs(x * INV_PI);
342
343#if defined LOL_FEATURE_CHEAP_BRANCHES
344    if (absx < QUARTER)
345    {
346        double x2 = absx * absx;
347        double x4 = x2 * x2;
348
349        /* Computing the Taylor series to the 11th order is enough to get
350         * x * 1e-11 precision, but we push it to the 13th order so that
351         * tan() has a better precision. */
352        double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
353        double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
354        double taylors = subs2 * x2 + subs1;
355        *sinx = x * taylors;
356
357        double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
358        double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
359        double taylorc = (subc1 * x2 + subc2) * x2 + ONE;
360        *cosx = taylorc;
361
362        return;
363    }
364#endif
365
366#if defined __CELLOS_LV2__
367    double num_cycles = lol_round(absx);
368    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
369
370    double sin_sign = lol_fsel(x, PI, NEG_PI);
371    sin_sign = lol_fsel(is_even, sin_sign, -sin_sign);
372    double cos_sign = lol_fsel(is_even, ONE, NEG_ONE);
373#else
374    double num_cycles = absx + TWO_EXP_52;
375    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
376
377    double is_even = TWO * num_cycles - ONE;
378    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
379    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
380    __asm__("" : "+m" (is_even));
381    is_even -= TWO * num_cycles - ONE;
382    double sin_sign = is_even;
383    double cos_sign = is_even;
384#endif
385    absx -= num_cycles;
386
387#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
388    if (lol_fabs(absx) > QUARTER)
389    {
390        sign = (x * absx >= 0.0) ? sign : -sign;
391
392        double x1 = HALF - lol_fabs(absx);
393        double x2 = x1 * x1;
394        double x4 = x2 * x2;
395
396        double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
397        double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
398        double taylors = subs2 * x2 + subs1;
399        *sinx = taylors * sign;
400
401        double subc1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
402        double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
403        double taylorc = subc2 * x2 + subc1;
404        *cosx = x1 * taylorc * sign * PI;
405
406        return;
407    }
408#endif
409
410    sin_sign *= (x >= 0.0) ? PI : NEG_PI;
411
412    double x2 = absx * absx;
413    double x4 = x2 * x2;
414#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
415    double subs1 = ((CC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
416    double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
417    double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
418    double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
419#else
420    double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
421    double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
422    /* Push Taylor series to the 19th order to enhance tan() accuracy. */
423    double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
424    double subc2 = (((CC[8] * x4 + CC[6]) * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
425#endif
426    double taylors = subs2 * x2 + subs1;
427    *sinx = absx * taylors * sin_sign;
428
429    double taylorc = subc2 * x2 + subc1;
430    *cosx = taylorc * cos_sign;
431}
432
433void lol_sincos(float x, float *sinx, float *cosx)
434{
435    double x2 = static_cast<double>(x);
436    double s2, c2;
437    lol_sincos(x2, &s2, &c2);
438    *sinx = static_cast<float>(s2);
439    *cosx = static_cast<float>(c2);
440}
441
442double lol_tan(double x)
443{
444#if defined LOL_FEATURE_CHEAP_BRANCHES
445    double absx = lol_fabs(x * INV_PI);
446
447    /* This value was determined empirically to ensure an error of no
448     * more than x * 1e-11 in this range. */
449    if (absx < 0.163)
450    {
451        double x2 = absx * absx;
452        double x4 = x2 * x2;
453        double sub1 = (((TC[7] * x4 + TC[5]) * x4
454                           + TC[3]) * x4 + TC[1]) * x4 + ONE;
455        double sub2 = (((TC[8] * x4 + TC[6]) * x4
456                           + TC[4]) * x4 + TC[2]) * x4 + TC[0];
457        double taylor = sub2 * x2 + sub1;
458        return x * taylor;
459    }
460#endif
461
462    double sinx, cosx;
463    lol_sincos(x, &sinx, &cosx);
464
465    /* Ensure cosx isn't zero. FIXME: we lose the cosx sign here. */
466    double absc = lol_fabs(cosx);
467#if defined __CELLOS_LV2__
468    double is_cos_not_zero = absc - VERY_SMALL_NUMBER;
469    cosx = lol_fsel(is_cos_not_zero, cosx, VERY_SMALL_NUMBER);
470    return __fdiv(sinx, cosx);
471#else
472    if (__unlikely(absc < VERY_SMALL_NUMBER))
473        cosx = VERY_SMALL_NUMBER;
474    return sinx / cosx;
475#endif
476}
477
478} /* namespace lol */
479
Note: See TracBrowser for help on using the repository browser.