source: trunk/src/trig.cpp @ 899

Last change on this file since 899 was 899, checked in by sam, 8 years ago

core: implement accelerated lol_sincos() and lol_tan().

File size: 13.7 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#if defined HAVE_FASTMATH_H
16#   include <fastmath.h>
17#endif
18
19#include "core.h"
20
21using namespace std;
22
23namespace lol
24{
25
26static const double PI     = 3.14159265358979323846264;
27static const double NEG_PI = -3.14159265358979323846264;
28static const double PI_2   = 1.57079632679489661923132;
29static const double PI_4   = 0.785398163397448309615661;
30static const double INV_PI = 0.318309886183790671537768;
31static const double ROOT3  = 1.73205080756887729352745;
32
33static const double ZERO    = 0.0;
34static const double ONE     = 1.0;
35static const double NEG_ONE = -1.0;
36static const double HALF    = 0.5;
37static const double QUARTER = 0.25;
38static const double TWO     = 2.0;
39static const double VERY_SMALL_NUMBER = 0x1.0p-128;
40static const double TWO_EXP_52 = 4503599627370496.0;
41static const double TWO_EXP_54 = 18014398509481984.0;
42
43/** sin Taylor series coefficients. */
44static const double SC[] =
45{
46    -1.6449340668482264364724e-0, // pi^2/3!
47    +8.1174242528335364363700e-1, // pi^4/5!
48    -1.9075182412208421369647e-1, // pi^6/7!
49    +2.6147847817654800504653e-2, // pi^8/9!
50    -2.3460810354558236375089e-3, // pi^10/11!
51    +1.4842879303107100368487e-4, // pi^12/13!
52    -6.9758736616563804745344e-6, // pi^14/15!
53    +2.5312174041370276513517e-7, // pi^16/17!
54};
55
56static const double CC[] =
57{
58    -4.9348022005446793094172e-0, // pi^2/2!
59    +4.0587121264167682181850e-0, // pi^4/4!
60    -1.3352627688545894958753e-0, // pi^6/6!
61    +2.3533063035889320454188e-1, // pi^8/8!
62    -2.5806891390014060012598e-2, // pi^10/10!
63    +1.9295743094039230479033e-3, // pi^12/12!
64    -1.0463810492484570711802e-4, // pi^14/14!
65    +4.3030695870329470072978e-6, // pi^16/16!
66};
67
68/* These coefficients use Sloane’s http://oeis.org/A002430 and
69 * http://oeis.org/A036279 sequences for the Taylor series of tan().
70 * Note: the last value should be 443861162*pi^18/1856156927625, ie.
71 * 2.12485922978838540352881e5, but we tweak it in order to get
72 * sub 1e-11 precision in a larger range. */
73static const double TC[] =
74{
75    3.28986813369645287294483e0, // pi^2/3
76    1.29878788045336582981920e1, // 2*pi^4/15
77    5.18844961612069061254404e1, // 17*pi^6/315
78    2.07509320280908496804928e2, // 62*pi^8/2835
79    8.30024701695986756361561e2, // 1382*pi^10/155925
80    3.32009324029001216460018e3, // 21844*pi^12/6081075
81    1.32803704909665483598490e4, // 929569*pi^14/638512875
82    5.31214808666037709352112e4, // 6404582*pi^16/10854718875
83    2.373e5, // XXX: last value tweaked to improve precision
84    //2.12485922978838540352881e5, // 443861162*pi^18/1856156927625
85};
86
87/* Custom intrinsics */
88#define INLINEATTR __attribute__((always_inline))
89
90#if defined __CELLOS_LV2__
91static inline double lol_fctid(double x) INLINEATTR;
92static inline double lol_fctidz(double x) INLINEATTR;
93static inline double lol_fcfid(double x) INLINEATTR;
94static inline double lol_frsqrte(double x) INLINEATTR;
95#endif
96static inline double lol_fsel(double c, double gte, double lt) INLINEATTR;
97static inline double lol_fabs(double x) INLINEATTR;
98static inline double lol_round(double x) INLINEATTR;
99static inline double lol_trunc(double x) INLINEATTR;
100static inline double lol_round(double x) INLINEATTR;
101static inline double lol_trunc(double x) INLINEATTR;
102
103#if defined __CELLOS_LV2__
104static inline double lol_fctid(double x)
105{
106    double r;
107#if defined __SNC__
108    r = __builtin_fctid(x);
109#else
110    __asm__ ("fctid %0, %1"
111             : "=f"(r) : "f"(x));
112#endif
113    return r;
114}
115
116static double lol_fctidz(double x)
117{
118    double r;
119#if defined __SNC__
120    r = __builtin_fctidz(x);
121#else
122    __asm__ ("fctidz %0, %1"
123             : "=f"(r) : "f"(x));
124#endif
125    return r;
126}
127
128static double lol_fcfid(double x)
129{
130    double r;
131#if defined __SNC__
132    r = __builtin_fcfid(x);
133#else
134    __asm__ ("fcfid %0, %1"
135             : "=f"(r) : "f"(x));
136#endif
137    return r;
138}
139
140static double lol_frsqrte(double x)
141{
142#if defined __SNC__
143    return __builtin_frsqrte(x);
144#else
145    double r;
146    __asm__ ("frsqrte %0, %1"
147             : "=f"(r) : "f"(x));
148    return r;
149#endif
150}
151#endif /* __CELLOS_LV2__ */
152
153static inline double lol_fsel(double c, double gte, double lt)
154{
155#if defined __CELLOS_LV2__ && defined __SNC__
156    return __fsel(c, gte, lt);
157#elif defined __CELLOS_LV2__
158    double r;
159    __asm__ ("fsel %0, %1, %2, %3"
160             : "=f"(r) : "f"(c), "f"(gte), "f"(lt));
161    return r;
162#else
163    return (c >= 0) ? gte : lt;
164#endif
165}
166
167static inline double lol_fabs(double x)
168{
169#if defined __CELLOS_LV2__ && defined __SNC__
170    return __fabs(x);
171#elif defined __CELLOS_LV2__
172    double r;
173    __asm__ ("fabs %0, %1"
174             : "=f"(r) : "f"(x));
175    return r;
176#else
177    return __builtin_fabs(x);
178#endif
179}
180
181static inline double lol_round(double x)
182{
183#if defined __CELLOS_LV2__
184    return lol_fcfid(lol_fctid(x));
185#else
186    return __builtin_round(x);
187#endif
188}
189
190static inline double lol_trunc(double x)
191{
192#if defined __CELLOS_LV2__
193    return lol_fcfid(lol_fctidz(x));
194#else
195    return __builtin_trunc(x);
196#endif
197}
198
199double lol_sin(double x)
200{
201    double absx = lol_fabs(x * INV_PI);
202
203    /* If branches are cheap, skip the cycle count when |x| < π/4,
204     * and only do the Taylor series up to the required precision. */
205#if defined LOL_FEATURE_CHEAP_BRANCHES
206    if (absx < QUARTER)
207    {
208        /* Computing x^4 is one multiplication too many we do, but it helps
209         * interleave the Taylor series operations a lot better. */
210        double x2 = absx * absx;
211        double x4 = x2 * x2;
212        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
213        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
214        double taylor = sub2 * x2 + sub1;
215        return x * taylor;
216    }
217#endif
218
219    /* Wrap |x| to the range [-1, 1] and keep track of the number of
220     * cycles required. If odd, we'll need to change the sign of the
221     * result. */
222#if defined __CELLOS_LV2__
223    double sign = lol_fsel(x, PI, NEG_PI);
224    double num_cycles = lol_round(absx);
225    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
226    sign = lol_fsel(is_even, sign, -sign);
227#else
228    double num_cycles = absx + TWO_EXP_52;
229    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
230
231    double is_even = TWO * num_cycles - ONE;
232    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
233    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
234    __asm__("" : "+m" (is_even));
235    is_even -= TWO * num_cycles - ONE;
236    double sign = is_even;
237#endif
238    absx -= num_cycles;
239
240    /* If branches are very cheap, we have the option to do the Taylor
241     * series at a much lower degree by splitting. */
242#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
243    if (lol_fabs(absx) > QUARTER)
244    {
245        sign = (x * absx >= 0.0) ? sign : -sign;
246
247        double x1 = HALF - lol_fabs(absx);
248        double x2 = x1 * x1;
249        double x4 = x2 * x2;
250        double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
251        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
252        double taylor = sub2 * x2 + sub1;
253
254        return taylor * sign;
255    }
256#endif
257
258    /* Compute a Tailor series for sin() and combine sign information. */
259    sign *= (x >= 0.0) ? PI : NEG_PI;
260
261    double x2 = absx * absx;
262    double x4 = x2 * x2;
263#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
264    double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
265    double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
266#else
267    double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
268    double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
269#endif
270    double taylor = sub2 * x2 + sub1;
271
272    return absx * taylor * sign;
273}
274
275double lol_cos(double x)
276{
277    double absx = lol_fabs(x * INV_PI);
278
279#if defined LOL_FEATURE_CHEAP_BRANCHES
280    if (absx < QUARTER)
281    {
282        double x2 = absx * absx;
283        double x4 = x2 * x2;
284        double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
285        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
286        double taylor = (sub1 * x2 + sub2) * x2 + ONE;
287        return taylor;
288    }
289#endif
290
291#if defined __CELLOS_LV2__
292    double num_cycles = lol_round(absx);
293    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
294    double sign = lol_fsel(is_even, ONE, NEG_ONE);
295#else
296    double num_cycles = absx + TWO_EXP_52;
297    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
298
299    double is_even = TWO * num_cycles - ONE;
300    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
301    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
302    __asm__("" : "+m" (is_even));
303    is_even -= TWO * num_cycles - ONE;
304    double sign = is_even;
305#endif
306    absx -= num_cycles;
307
308#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
309    if (lol_fabs(absx) > QUARTER)
310    {
311        double x1 = HALF - lol_fabs(absx);
312        double x2 = x1 * x1;
313        double x4 = x2 * x2;
314        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
315        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
316        double taylor = sub2 * x2 + sub1;
317
318        return x1 * taylor * sign * PI;
319    }
320#endif
321
322    double x2 = absx * absx;
323    double x4 = x2 * x2;
324#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
325    double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
326    double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
327#else
328    double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
329    double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
330#endif
331    double taylor = sub2 * x2 + sub1;
332
333    return taylor * sign;
334}
335
336void lol_sincos(double x, double *sinx, double *cosx)
337{
338    double absx = lol_fabs(x * INV_PI);
339
340#if defined LOL_FEATURE_CHEAP_BRANCHES
341    if (absx < QUARTER)
342    {
343        double x2 = absx * absx;
344        double x4 = x2 * x2;
345
346        double subs1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
347        double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
348        double taylors = subs2 * x2 + subs1;
349        *sinx = x * taylors;
350
351        double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
352        double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
353        double taylorc = (subc1 * x2 + subc2) * x2 + ONE;
354        *cosx = taylorc;
355
356        return;
357    }
358#endif
359
360#if defined __CELLOS_LV2__
361    double num_cycles = lol_round(absx);
362    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
363
364    double sin_sign = lol_fsel(x, PI, NEG_PI);
365    sin_sign = lol_fsel(is_even, sin_sign, -sin_sign);
366    double cos_sign = lol_fsel(is_even, ONE, NEG_ONE);
367#else
368    double num_cycles = absx + TWO_EXP_52;
369    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
370
371    double is_even = TWO * num_cycles - ONE;
372    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
373    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
374    __asm__("" : "+m" (is_even));
375    is_even -= TWO * num_cycles - ONE;
376    double sin_sign = is_even;
377    double cos_sign = is_even;
378#endif
379    absx -= num_cycles;
380
381#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
382    if (lol_fabs(absx) > QUARTER)
383    {
384        sign = (x * absx >= 0.0) ? sign : -sign;
385
386        double x1 = HALF - lol_fabs(absx);
387        double x2 = x1 * x1;
388        double x4 = x2 * x2;
389
390        double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
391        double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
392        double taylors = subs2 * x2 + subs1;
393        *sinx = taylors * sign;
394
395        double subc1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
396        double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
397        double taylorc = subc2 * x2 + subc1;
398        *cosx = x1 * taylorc * sign * PI;
399
400        return;
401    }
402#endif
403
404    sin_sign *= (x >= 0.0) ? PI : NEG_PI;
405
406    double x2 = absx * absx;
407    double x4 = x2 * x2;
408#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
409    double subs1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
410    double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
411    double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
412    double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
413#else
414    double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
415    double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
416    double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
417    double subc2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
418#endif
419    double taylors = subs2 * x2 + subs1;
420    *sinx = absx * taylors * sin_sign;
421
422    double taylorc = subc2 * x2 + subc1;
423    *cosx = taylorc * cos_sign;
424}
425
426void lol_sincos(float x, float *sinx, float *cosx)
427{
428    double x2 = static_cast<double>(x);
429    double s2, c2;
430    lol_sincos(x2, &s2, &c2);
431    *sinx = static_cast<float>(s2);
432    *cosx = static_cast<float>(c2);
433}
434
435double lol_tan(double x)
436{
437#if defined LOL_FEATURE_CHEAP_BRANCHES
438    double absx = lol_fabs(x * INV_PI);
439
440    /* This value was determined empirically to ensure an error of no
441     * more than x * 1e-11 in this range. */
442    if (absx < 0.163)
443    {
444        double x2 = absx * absx;
445        double x4 = x2 * x2;
446        double sub1 = (((TC[7] * x4 + TC[5]) * x4
447                           + TC[3]) * x4 + TC[1]) * x4 + ONE;
448        double sub2 = (((TC[8] * x4 + TC[6]) * x4
449                           + TC[4]) * x4 + TC[2]) * x4 + TC[0];
450        double taylor = sub2 * x2 + sub1;
451        return x * taylor;
452    }
453#endif
454
455    double sinx, cosx;
456    lol_sincos(x, &sinx, &cosx);
457
458    /* Ensure cosx isn't zero. FIXME: we lose the cosx sign here. */
459    double absc = lol_fabs(cosx);
460#if defined __CELLOS_LV2__
461    double is_cos_not_zero = absc - VERY_SMALL_NUMBER;
462    cosx = lol_fsel(is_cos_not_zero, cosx, VERY_SMALL_NUMBER);
463    return __fdiv(sinx, cosx);
464#else
465    if (__unlikely(absc < VERY_SMALL_NUMBER))
466        cosx = VERY_SMALL_NUMBER;
467    return sinx / cosx;
468#endif
469}
470
471} /* namespace lol */
472
Note: See TracBrowser for help on using the repository browser.