source: trunk/src/trig.cpp @ 908

Last change on this file since 908 was 908, checked in by sam, 9 years ago

core: avoid returning to memory when giving GCC floating point hints.

File size: 14.6 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#if defined HAVE_FASTMATH_H
16#   include <fastmath.h>
17#endif
18
19#include <cmath>
20
21#include "core.h"
22
23using namespace std;
24
25namespace lol
26{
27
28static const double PI     = 3.14159265358979323846264;
29static const double NEG_PI = -3.14159265358979323846264;
30static const double PI_2   = 1.57079632679489661923132;
31static const double PI_4   = 0.785398163397448309615661;
32static const double INV_PI = 0.318309886183790671537768;
33static const double ROOT3  = 1.73205080756887729352745;
34
35static const double ZERO    = 0.0;
36static const double ONE     = 1.0;
37static const double NEG_ONE = -1.0;
38static const double HALF    = 0.5;
39static const double QUARTER = 0.25;
40static const double TWO     = 2.0;
41#if defined __GNUC__
42static const double VERY_SMALL_NUMBER = 0x1.0p-128;
43#else
44static const double VERY_SMALL_NUMBER = 3e-39;
45#endif
46static const double TWO_EXP_52 = 4503599627370496.0;
47static const double TWO_EXP_54 = 18014398509481984.0;
48
49/** sin Taylor series coefficients. */
50static const double SC[] =
51{
52    -1.6449340668482264364724e-0, // π^2/3!
53    +8.1174242528335364363700e-1, // π^4/5!
54    -1.9075182412208421369647e-1, // π^6/7!
55    +2.6147847817654800504653e-2, // π^8/9!
56    -2.3460810354558236375089e-3, // π^10/11!
57    +1.4842879303107100368487e-4, // π^12/13!
58    -6.9758736616563804745344e-6, // π^14/15!
59    +2.5312174041370276513517e-7, // π^16/17!
60};
61
62/* Note: the last value should be -1.3878952462213772114468e-7 (ie.
63 * π^18/18!) but we tweak it in order to get the better average precision
64 * required for tan() computations when close to π/2+kπ values. */
65static const double CC[] =
66{
67    -4.9348022005446793094172e-0, // π^2/2!
68    +4.0587121264167682181850e-0, // π^4/4!
69    -1.3352627688545894958753e-0, // π^6/6!
70    +2.3533063035889320454188e-1, // π^8/8!
71    -2.5806891390014060012598e-2, // π^10/10!
72    +1.9295743094039230479033e-3, // π^12/12!
73    -1.0463810492484570711802e-4, // π^14/14!
74    +4.3030695870329470072978e-6, // π^16/16!
75    -1.3777e-7,
76};
77
78/* These coefficients use Sloane’s http://oeis.org/A002430 and
79 * http://oeis.org/A036279 sequences for the Taylor series of tan().
80 * Note: the last value should be 2.12485922978838540352881e5 (ie.
81 * 443861162*π^18/1856156927625), but we tweak it in order to get
82 * sub 1e-11 average precision in a larger range. */
83static const double TC[] =
84{
85    3.28986813369645287294483e0, // π^2/3
86    1.29878788045336582981920e1, // 2*π^4/15
87    5.18844961612069061254404e1, // 17*π^6/315
88    2.07509320280908496804928e2, // 62*π^8/2835
89    8.30024701695986756361561e2, // 1382*π^10/155925
90    3.32009324029001216460018e3, // 21844*π^12/6081075
91    1.32803704909665483598490e4, // 929569*π^14/638512875
92    5.31214808666037709352112e4, // 6404582*π^16/10854718875
93    2.373e5,
94};
95
96#if defined __CELLOS_LV2__
97static inline double lol_fctid(double x) INLINEATTR;
98static inline double lol_fctidz(double x) INLINEATTR;
99static inline double lol_fcfid(double x) INLINEATTR;
100static inline double lol_frsqrte(double x) INLINEATTR;
101static inline double lol_fsel(double c, double gte, double lt) INLINEATTR;
102static inline double lol_fres(double x) INLINEATTR;
103static inline double lol_fdiv(double a, double b) INLINEATTR;
104#endif
105static inline double lol_fabs(double x) INLINEATTR;
106#if defined __GNUC__
107static inline double lol_round(double x) INLINEATTR;
108static inline double lol_trunc(double x) INLINEATTR;
109#endif
110
111#if defined __CELLOS_LV2__
112static inline double lol_fctid(double x)
113{
114    double r;
115#if defined __SNC__
116    r = __builtin_fctid(x);
117#else
118    __asm__ ("fctid %0, %1"
119             : "=f" (r) : "f" (x));
120#endif
121    return r;
122}
123
124static double lol_fctidz(double x)
125{
126    double r;
127#if defined __SNC__
128    r = __builtin_fctidz(x);
129#else
130    __asm__ ("fctidz %0, %1"
131             : "=f" (r) : "f" (x));
132#endif
133    return r;
134}
135
136static double lol_fcfid(double x)
137{
138    double r;
139#if defined __SNC__
140    r = __builtin_fcfid(x);
141#else
142    __asm__ ("fcfid %0, %1"
143             : "=f" (r) : "f" (x));
144#endif
145    return r;
146}
147
148static double lol_frsqrte(double x)
149{
150#if defined __SNC__
151    return __builtin_frsqrte(x);
152#else
153    double r;
154    __asm__ ("frsqrte %0, %1"
155             : "=f" (r) : "f" (x));
156    return r;
157#endif
158}
159
160static inline double lol_fsel(double c, double gte, double lt)
161{
162#if defined __CELLOS_LV2__ && defined __SNC__
163    return __fsel(c, gte, lt);
164#elif defined __CELLOS_LV2__
165    double r;
166    __asm__ ("fsel %0, %1, %2, %3"
167             : "=f" (r) : "f" (c), "f" (gte), "f" (lt));
168    return r;
169#else
170    return (c >= 0) ? gte : lt;
171#endif
172}
173
174static inline double lol_fres(double x)
175{
176    double ret;
177#if defined __SNC__
178    ret = __builtin_fre(x);
179#else
180    __asm__ ("fres %0, %1"
181             : "=f" (ret) : "f" (x));
182#endif
183    return ret;
184}
185
186static inline double lol_fdiv(double a, double b)
187{
188    /* Estimate */
189    double x0 = lol_fres(b);
190
191    /* Two steps of Newton-Raphson */
192    x0 = (b * x0 - ONE) * -x0 + x0;
193    x0 = (b * x0 - ONE) * -x0 + x0;
194
195    return a * x0;
196}
197#endif /* __CELLOS_LV2__ */
198
199static inline double lol_fabs(double x)
200{
201#if defined __CELLOS_LV2__ && defined __SNC__
202    return __fabs(x);
203#elif defined __CELLOS_LV2__
204    double r;
205    __asm__ ("fabs %0, %1"
206             : "=f" (r) : "f" (x));
207    return r;
208#elif defined __GNUC__
209    return __builtin_fabs(x);
210#else
211    return fabs(x);
212#endif
213}
214
215#if defined __GNUC__
216static inline double lol_round(double x)
217{
218#if defined __CELLOS_LV2__
219    return lol_fcfid(lol_fctid(x));
220#else
221    return __builtin_round(x);
222#endif
223}
224
225static inline double lol_trunc(double x)
226{
227#if defined __CELLOS_LV2__
228    return lol_fcfid(lol_fctidz(x));
229#else
230    return __builtin_trunc(x);
231#endif
232}
233#endif
234
235double lol_sin(double x)
236{
237    double absx = lol_fabs(x * INV_PI);
238
239    /* If branches are cheap, skip the cycle count when |x| < π/4,
240     * and only do the Taylor series up to the required precision. */
241#if defined LOL_FEATURE_CHEAP_BRANCHES
242    if (absx < QUARTER)
243    {
244        /* Computing x^4 is one multiplication too many we do, but it helps
245         * interleave the Taylor series operations a lot better. */
246        double x2 = absx * absx;
247        double x4 = x2 * x2;
248        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
249        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
250        double taylor = sub2 * x2 + sub1;
251        return x * taylor;
252    }
253#endif
254
255    /* Wrap |x| to the range [-1, 1] and keep track of the number of
256     * cycles required. If odd, we'll need to change the sign of the
257     * result. */
258#if defined __CELLOS_LV2__
259    double sign = lol_fsel(x, PI, NEG_PI);
260    double num_cycles = lol_round(absx);
261    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
262    sign = lol_fsel(is_even, sign, -sign);
263#else
264    double num_cycles = absx + TWO_EXP_52;
265    FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
266
267    double is_even = TWO * num_cycles - ONE;
268    FP_USE(is_even); is_even += TWO_EXP_54;
269    FP_USE(is_even); is_even -= TWO_EXP_54;
270    FP_USE(is_even);
271    is_even -= TWO * num_cycles - ONE;
272    double sign = is_even;
273#endif
274    absx -= num_cycles;
275
276    /* If branches are very cheap, we have the option to do the Taylor
277     * series at a much lower degree by splitting. */
278#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
279    if (lol_fabs(absx) > QUARTER)
280    {
281        sign = (x * absx >= 0.0) ? sign : -sign;
282
283        double x1 = HALF - lol_fabs(absx);
284        double x2 = x1 * x1;
285        double x4 = x2 * x2;
286        double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
287        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
288        double taylor = sub2 * x2 + sub1;
289
290        return taylor * sign;
291    }
292#endif
293
294    /* Compute a Tailor series for sin() and combine sign information. */
295    sign *= (x >= 0.0) ? PI : NEG_PI;
296
297    double x2 = absx * absx;
298    double x4 = x2 * x2;
299#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
300    double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
301    double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
302#else
303    double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
304    double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
305#endif
306    double taylor = sub2 * x2 + sub1;
307
308    return absx * taylor * sign;
309}
310
311double lol_cos(double x)
312{
313    double absx = lol_fabs(x * INV_PI);
314
315#if defined LOL_FEATURE_CHEAP_BRANCHES
316    if (absx < QUARTER)
317    {
318        double x2 = absx * absx;
319        double x4 = x2 * x2;
320        double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
321        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
322        double taylor = (sub1 * x2 + sub2) * x2 + ONE;
323        return taylor;
324    }
325#endif
326
327#if defined __CELLOS_LV2__
328    double num_cycles = lol_round(absx);
329    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
330    double sign = lol_fsel(is_even, ONE, NEG_ONE);
331#else
332    double num_cycles = absx + TWO_EXP_52;
333    FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
334
335    double is_even = TWO * num_cycles - ONE;
336    FP_USE(is_even); is_even += TWO_EXP_54;
337    FP_USE(is_even); is_even -= TWO_EXP_54;
338    FP_USE(is_even);
339    is_even -= TWO * num_cycles - ONE;
340    double sign = is_even;
341#endif
342    absx -= num_cycles;
343
344#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
345    if (lol_fabs(absx) > QUARTER)
346    {
347        double x1 = HALF - lol_fabs(absx);
348        double x2 = x1 * x1;
349        double x4 = x2 * x2;
350        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
351        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
352        double taylor = sub2 * x2 + sub1;
353
354        return x1 * taylor * sign * PI;
355    }
356#endif
357
358    double x2 = absx * absx;
359    double x4 = x2 * x2;
360#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
361    double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
362    double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
363#else
364    double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
365    double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
366#endif
367    double taylor = sub2 * x2 + sub1;
368
369    return taylor * sign;
370}
371
372void lol_sincos(double x, double *sinx, double *cosx)
373{
374    double absx = lol_fabs(x * INV_PI);
375
376#if defined LOL_FEATURE_CHEAP_BRANCHES
377    if (absx < QUARTER)
378    {
379        double x2 = absx * absx;
380        double x4 = x2 * x2;
381
382        /* Computing the Taylor series to the 11th order is enough to get
383         * x * 1e-11 precision, but we push it to the 13th order so that
384         * tan() has a better precision. */
385        double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
386        double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
387        double taylors = subs2 * x2 + subs1;
388        *sinx = x * taylors;
389
390        double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
391        double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
392        double taylorc = (subc1 * x2 + subc2) * x2 + ONE;
393        *cosx = taylorc;
394
395        return;
396    }
397#endif
398
399#if defined __CELLOS_LV2__
400    double num_cycles = lol_round(absx);
401    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
402
403    double sin_sign = lol_fsel(x, PI, NEG_PI);
404    sin_sign = lol_fsel(is_even, sin_sign, -sin_sign);
405    double cos_sign = lol_fsel(is_even, ONE, NEG_ONE);
406#else
407    double num_cycles = absx + TWO_EXP_52;
408    FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
409
410    double is_even = TWO * num_cycles - ONE;
411    FP_USE(is_even); is_even += TWO_EXP_54;
412    FP_USE(is_even); is_even -= TWO_EXP_54;
413    FP_USE(is_even);
414    is_even -= TWO * num_cycles - ONE;
415    double sin_sign = is_even;
416    double cos_sign = is_even;
417#endif
418    absx -= num_cycles;
419
420#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
421    if (lol_fabs(absx) > QUARTER)
422    {
423        cos_sign = sin_sign;
424        sin_sign = (x * absx >= 0.0) ? sin_sign : -sin_sign;
425
426        double x1 = HALF - lol_fabs(absx);
427        double x2 = x1 * x1;
428        double x4 = x2 * x2;
429
430        double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
431        double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
432        double taylors = subs2 * x2 + subs1;
433        *sinx = taylors * sin_sign;
434
435        double subc1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
436        double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
437        double taylorc = subc2 * x2 + subc1;
438        *cosx = x1 * taylorc * cos_sign * PI;
439
440        return;
441    }
442#endif
443
444#if !defined __CELLOS_LV2__
445    sin_sign *= (x >= 0.0) ? PI : NEG_PI;
446#endif
447
448    double x2 = absx * absx;
449    double x4 = x2 * x2;
450#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
451    double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
452    double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
453    double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
454    double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
455#else
456    double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
457    double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
458    /* Push Taylor series to the 19th order to enhance tan() accuracy. */
459    double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
460    double subc2 = (((CC[8] * x4 + CC[6]) * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
461#endif
462    double taylors = subs2 * x2 + subs1;
463    *sinx = absx * taylors * sin_sign;
464
465    double taylorc = subc2 * x2 + subc1;
466    *cosx = taylorc * cos_sign;
467}
468
469void lol_sincos(float x, float *sinx, float *cosx)
470{
471    double x2 = static_cast<double>(x);
472    double s2, c2;
473    lol_sincos(x2, &s2, &c2);
474    *sinx = static_cast<float>(s2);
475    *cosx = static_cast<float>(c2);
476}
477
478double lol_tan(double x)
479{
480#if defined LOL_FEATURE_CHEAP_BRANCHES
481    double absx = lol_fabs(x * INV_PI);
482
483    /* This value was determined empirically to ensure an error of no
484     * more than x * 1e-11 in this range. */
485    if (absx < 0.163)
486    {
487        double x2 = absx * absx;
488        double x4 = x2 * x2;
489        double sub1 = (((TC[7] * x4 + TC[5]) * x4
490                           + TC[3]) * x4 + TC[1]) * x4 + ONE;
491        double sub2 = (((TC[8] * x4 + TC[6]) * x4
492                           + TC[4]) * x4 + TC[2]) * x4 + TC[0];
493        double taylor = sub2 * x2 + sub1;
494        return x * taylor;
495    }
496#endif
497
498    double sinx, cosx;
499    lol_sincos(x, &sinx, &cosx);
500
501    /* Ensure cosx isn't zero. FIXME: we lose the cosx sign here. */
502    double absc = lol_fabs(cosx);
503#if defined __CELLOS_LV2__
504    double is_cos_not_zero = absc - VERY_SMALL_NUMBER;
505    cosx = lol_fsel(is_cos_not_zero, cosx, VERY_SMALL_NUMBER);
506    return lol_fdiv(sinx, cosx);
507#else
508    if (__unlikely(absc < VERY_SMALL_NUMBER))
509        cosx = VERY_SMALL_NUMBER;
510    return sinx / cosx;
511#endif
512}
513
514} /* namespace lol */
515
Note: See TracBrowser for help on using the repository browser.