source: trunk/src/math/trig.cpp @ 2183

Last change on this file since 2183 was 2183, checked in by sam, 7 years ago

build: fix the WTFPL site URL in all code comments.

  • Property svn:keywords set to Id
File size: 14.7 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://www.wtfpl.net/ for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#if defined HAVE_FASTMATH_H
16#   include <fastmath.h>
17#endif
18
19#include "core.h"
20
21using namespace std;
22
23namespace lol
24{
25
26static const double PI     = 3.14159265358979323846264;
27static const double NEG_PI = -3.14159265358979323846264;
28static const double PI_2   = 1.57079632679489661923132;
29static const double PI_4   = 0.785398163397448309615661;
30static const double INV_PI = 0.318309886183790671537768;
31static const double ROOT3  = 1.73205080756887729352745;
32
33static const double ZERO    = 0.0;
34static const double ONE     = 1.0;
35static const double NEG_ONE = -1.0;
36static const double HALF    = 0.5;
37static const double QUARTER = 0.25;
38static const double TWO     = 2.0;
39#if defined __GNUC__
40static const double VERY_SMALL_NUMBER = 0x1.0p-128;
41#else
42static const double VERY_SMALL_NUMBER = 3e-39;
43#endif
44static const double TWO_EXP_52 = 4503599627370496.0;
45static const double TWO_EXP_54 = 18014398509481984.0;
46
47/** sin Taylor series coefficients. */
48static const double SC[] =
49{
50    -1.6449340668482264364724e-0, // π^2/3!
51    +8.1174242528335364363700e-1, // π^4/5!
52    -1.9075182412208421369647e-1, // π^6/7!
53    +2.6147847817654800504653e-2, // π^8/9!
54    -2.3460810354558236375089e-3, // π^10/11!
55    +1.4842879303107100368487e-4, // π^12/13!
56    -6.9758736616563804745344e-6, // π^14/15!
57    +2.5312174041370276513517e-7, // π^16/17!
58};
59
60/* Note: the last value should be -1.3878952462213772114468e-7 (ie.
61 * π^18/18!) but we tweak it in order to get the better average precision
62 * required for tan() computations when close to π/2+kπ values. */
63static const double CC[] =
64{
65    -4.9348022005446793094172e-0, // π^2/2!
66    +4.0587121264167682181850e-0, // π^4/4!
67    -1.3352627688545894958753e-0, // π^6/6!
68    +2.3533063035889320454188e-1, // π^8/8!
69    -2.5806891390014060012598e-2, // π^10/10!
70    +1.9295743094039230479033e-3, // π^12/12!
71    -1.0463810492484570711802e-4, // π^14/14!
72    +4.3030695870329470072978e-6, // π^16/16!
73    -1.3777e-7,
74};
75
76/* These coefficients use Sloane’s http://oeis.org/A002430 and
77 * http://oeis.org/A036279 sequences for the Taylor series of tan().
78 * Note: the last value should be 2.12485922978838540352881e5 (ie.
79 * 443861162*π^18/1856156927625), but we tweak it in order to get
80 * sub 1e-11 average precision in a larger range. */
81static const double TC[] =
82{
83    3.28986813369645287294483e0, // π^2/3
84    1.29878788045336582981920e1, // 2*π^4/15
85    5.18844961612069061254404e1, // 17*π^6/315
86    2.07509320280908496804928e2, // 62*π^8/2835
87    8.30024701695986756361561e2, // 1382*π^10/155925
88    3.32009324029001216460018e3, // 21844*π^12/6081075
89    1.32803704909665483598490e4, // 929569*π^14/638512875
90    5.31214808666037709352112e4, // 6404582*π^16/10854718875
91    2.373e5,
92};
93
94#if defined __CELLOS_LV2__
95static inline double lol_fctid(double x) INLINEATTR;
96static inline double lol_fctidz(double x) INLINEATTR;
97static inline double lol_fcfid(double x) INLINEATTR;
98static inline double lol_frsqrte(double x) INLINEATTR;
99static inline double lol_fsel(double c, double gte, double lt) INLINEATTR;
100static inline double lol_fres(double x) INLINEATTR;
101static inline double lol_fdiv(double a, double b) INLINEATTR;
102#endif
103static inline double lol_fabs(double x) INLINEATTR;
104#if defined __GNUC__
105static inline double lol_round(double x) INLINEATTR;
106static inline double lol_trunc(double x) INLINEATTR;
107#endif
108
109#if defined __CELLOS_LV2__
110static inline double lol_fctid(double x)
111{
112    double r;
113#if defined __SNC__
114    r = __builtin_fctid(x);
115#else
116    __asm__ ("fctid %0, %1"
117             : "=f" (r) : "f" (x));
118#endif
119    return r;
120}
121
122static double lol_fctidz(double x)
123{
124    double r;
125#if defined __SNC__
126    r = __builtin_fctidz(x);
127#else
128    __asm__ ("fctidz %0, %1"
129             : "=f" (r) : "f" (x));
130#endif
131    return r;
132}
133
134static double lol_fcfid(double x)
135{
136    double r;
137#if defined __SNC__
138    r = __builtin_fcfid(x);
139#else
140    __asm__ ("fcfid %0, %1"
141             : "=f" (r) : "f" (x));
142#endif
143    return r;
144}
145
146static double lol_frsqrte(double x)
147{
148#if defined __SNC__
149    return __builtin_frsqrte(x);
150#else
151    double r;
152    __asm__ ("frsqrte %0, %1"
153             : "=f" (r) : "f" (x));
154    return r;
155#endif
156}
157
158static inline double lol_fsel(double c, double gte, double lt)
159{
160#if defined __CELLOS_LV2__ && defined __SNC__
161    return __fsel(c, gte, lt);
162#elif defined __CELLOS_LV2__
163    double r;
164    __asm__ ("fsel %0, %1, %2, %3"
165             : "=f" (r) : "f" (c), "f" (gte), "f" (lt));
166    return r;
167#else
168    return (c >= 0) ? gte : lt;
169#endif
170}
171
172static inline double lol_fres(double x)
173{
174    double ret;
175#if defined __SNC__
176    ret = __builtin_fre(x);
177#else
178    __asm__ ("fres %0, %1"
179             : "=f" (ret) : "f" (x));
180#endif
181    return ret;
182}
183
184static inline double lol_fdiv(double a, double b)
185{
186    /* Estimate */
187    double x0 = lol_fres(b);
188
189    /* Two steps of Newton-Raphson */
190    x0 = (b * x0 - ONE) * -x0 + x0;
191    x0 = (b * x0 - ONE) * -x0 + x0;
192
193    return a * x0;
194}
195#endif /* __CELLOS_LV2__ */
196
197static inline double lol_fabs(double x)
198{
199#if defined __CELLOS_LV2__ && defined __SNC__
200    return __fabs(x);
201#elif defined __CELLOS_LV2__
202    double r;
203    __asm__ ("fabs %0, %1"
204             : "=f" (r) : "f" (x));
205    return r;
206#elif defined __GNUC__
207    return __builtin_fabs(x);
208#else
209    using std::fabs;
210    return fabs(x);
211#endif
212}
213
214#if defined __GNUC__
215static inline double lol_round(double x)
216{
217#if defined __CELLOS_LV2__
218    return lol_fcfid(lol_fctid(x));
219#else
220    return __builtin_round(x);
221#endif
222}
223
224static inline double lol_trunc(double x)
225{
226#if defined __CELLOS_LV2__
227    return lol_fcfid(lol_fctidz(x));
228#else
229    return __builtin_trunc(x);
230#endif
231}
232#endif
233
234double lol_sin(double x)
235{
236    double absx = lol_fabs(x * INV_PI);
237
238    /* If branches are cheap, skip the cycle count when |x| < π/4,
239     * and only do the Taylor series up to the required precision. */
240#if defined LOL_FEATURE_CHEAP_BRANCHES
241    if (absx < QUARTER)
242    {
243        /* Computing x^4 is one multiplication too many we do, but it helps
244         * interleave the Taylor series operations a lot better. */
245        double x2 = absx * absx;
246        double x4 = x2 * x2;
247        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
248        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
249        double taylor = sub2 * x2 + sub1;
250        return x * taylor;
251    }
252#endif
253
254    /* Wrap |x| to the range [-1, 1] and keep track of the number of
255     * cycles required. If odd, we'll need to change the sign of the
256     * result. */
257#if defined __CELLOS_LV2__
258    double sign = lol_fsel(x, PI, NEG_PI);
259    double num_cycles = lol_round(absx);
260    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
261    sign = lol_fsel(is_even, sign, -sign);
262#else
263    double num_cycles = absx + TWO_EXP_52;
264    FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
265
266    double is_even = TWO * num_cycles - ONE;
267    FP_USE(is_even); is_even += TWO_EXP_54;
268    FP_USE(is_even); is_even -= TWO_EXP_54;
269    FP_USE(is_even);
270    is_even -= TWO * num_cycles - ONE;
271    double sign = is_even;
272#endif
273    absx -= num_cycles;
274
275    /* If branches are very cheap, we have the option to do the Taylor
276     * series at a much lower degree by splitting. */
277#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
278    if (lol_fabs(absx) > QUARTER)
279    {
280        sign = (x * absx >= 0.0) ? sign : -sign;
281
282        double x1 = HALF - lol_fabs(absx);
283        double x2 = x1 * x1;
284        double x4 = x2 * x2;
285        double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
286        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
287        double taylor = sub2 * x2 + sub1;
288
289        return taylor * sign;
290    }
291#endif
292
293#if !defined __CELLOS_LV2__
294    sign *= (x >= 0.0) ? PI : NEG_PI;
295#endif
296
297    /* Compute a Tailor series for sin() and combine sign information. */
298    double x2 = absx * absx;
299    double x4 = x2 * x2;
300#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
301    double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
302    double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
303#else
304    double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
305    double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
306#endif
307    double taylor = sub2 * x2 + sub1;
308
309    return absx * taylor * sign;
310}
311
312double lol_cos(double x)
313{
314    double absx = lol_fabs(x * INV_PI);
315
316#if defined LOL_FEATURE_CHEAP_BRANCHES
317    if (absx < QUARTER)
318    {
319        double x2 = absx * absx;
320        double x4 = x2 * x2;
321        double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
322        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
323        double taylor = (sub1 * x2 + sub2) * x2 + ONE;
324        return taylor;
325    }
326#endif
327
328#if defined __CELLOS_LV2__
329    double num_cycles = lol_round(absx);
330    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
331    double sign = lol_fsel(is_even, ONE, NEG_ONE);
332#else
333    double num_cycles = absx + TWO_EXP_52;
334    FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
335
336    double is_even = TWO * num_cycles - ONE;
337    FP_USE(is_even); is_even += TWO_EXP_54;
338    FP_USE(is_even); is_even -= TWO_EXP_54;
339    FP_USE(is_even);
340    is_even -= TWO * num_cycles - ONE;
341    double sign = is_even;
342#endif
343    absx -= num_cycles;
344
345#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
346    if (lol_fabs(absx) > QUARTER)
347    {
348        double x1 = HALF - lol_fabs(absx);
349        double x2 = x1 * x1;
350        double x4 = x2 * x2;
351        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
352        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
353        double taylor = sub2 * x2 + sub1;
354
355        return x1 * taylor * sign * PI;
356    }
357#endif
358
359    double x2 = absx * absx;
360    double x4 = x2 * x2;
361#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
362    double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
363    double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
364#else
365    double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
366    double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
367#endif
368    double taylor = sub2 * x2 + sub1;
369
370    return taylor * sign;
371}
372
373void lol_sincos(double x, double *sinx, double *cosx)
374{
375    double absx = lol_fabs(x * INV_PI);
376
377#if defined LOL_FEATURE_CHEAP_BRANCHES
378    if (absx < QUARTER)
379    {
380        double x2 = absx * absx;
381        double x4 = x2 * x2;
382
383        /* Computing the Taylor series to the 11th order is enough to get
384         * x * 1e-11 precision, but we push it to the 13th order so that
385         * tan() has a better precision. */
386        double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
387        double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
388        double taylors = subs2 * x2 + subs1;
389        *sinx = x * taylors;
390
391        double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
392        double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
393        double taylorc = (subc1 * x2 + subc2) * x2 + ONE;
394        *cosx = taylorc;
395
396        return;
397    }
398#endif
399
400#if defined __CELLOS_LV2__
401    double num_cycles = lol_round(absx);
402    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
403
404    double sin_sign = lol_fsel(x, PI, NEG_PI);
405    sin_sign = lol_fsel(is_even, sin_sign, -sin_sign);
406    double cos_sign = lol_fsel(is_even, ONE, NEG_ONE);
407#else
408    double num_cycles = absx + TWO_EXP_52;
409    FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
410
411    double is_even = TWO * num_cycles - ONE;
412    FP_USE(is_even); is_even += TWO_EXP_54;
413    FP_USE(is_even); is_even -= TWO_EXP_54;
414    FP_USE(is_even);
415    is_even -= TWO * num_cycles - ONE;
416    double sin_sign = is_even;
417    double cos_sign = is_even;
418#endif
419    absx -= num_cycles;
420
421#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
422    if (lol_fabs(absx) > QUARTER)
423    {
424        cos_sign = sin_sign;
425        sin_sign = (x * absx >= 0.0) ? sin_sign : -sin_sign;
426
427        double x1 = HALF - lol_fabs(absx);
428        double x2 = x1 * x1;
429        double x4 = x2 * x2;
430
431        double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
432        double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
433        double taylors = subs2 * x2 + subs1;
434        *sinx = taylors * sin_sign;
435
436        double subc1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
437        double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
438        double taylorc = subc2 * x2 + subc1;
439        *cosx = x1 * taylorc * cos_sign * PI;
440
441        return;
442    }
443#endif
444
445#if !defined __CELLOS_LV2__
446    sin_sign *= (x >= 0.0) ? PI : NEG_PI;
447#endif
448
449    double x2 = absx * absx;
450    double x4 = x2 * x2;
451#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
452    double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
453    double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
454    double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
455    double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
456#else
457    double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
458    double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
459    /* Push Taylor series to the 19th order to enhance tan() accuracy. */
460    double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
461    double subc2 = (((CC[8] * x4 + CC[6]) * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
462#endif
463    double taylors = subs2 * x2 + subs1;
464    *sinx = absx * taylors * sin_sign;
465
466    double taylorc = subc2 * x2 + subc1;
467    *cosx = taylorc * cos_sign;
468}
469
470void lol_sincos(float x, float *sinx, float *cosx)
471{
472    double x2 = static_cast<double>(x);
473    double s2, c2;
474    lol_sincos(x2, &s2, &c2);
475    *sinx = static_cast<float>(s2);
476    *cosx = static_cast<float>(c2);
477}
478
479double lol_tan(double x)
480{
481#if defined LOL_FEATURE_CHEAP_BRANCHES
482    double absx = lol_fabs(x * INV_PI);
483
484    /* This value was determined empirically to ensure an error of no
485     * more than x * 1e-11 in this range. */
486    if (absx < 0.163)
487    {
488        double x2 = absx * absx;
489        double x4 = x2 * x2;
490        double sub1 = (((TC[7] * x4 + TC[5]) * x4
491                           + TC[3]) * x4 + TC[1]) * x4 + ONE;
492        double sub2 = (((TC[8] * x4 + TC[6]) * x4
493                           + TC[4]) * x4 + TC[2]) * x4 + TC[0];
494        double taylor = sub2 * x2 + sub1;
495        return x * taylor;
496    }
497#endif
498
499    double sinx, cosx;
500    lol_sincos(x, &sinx, &cosx);
501
502    /* Ensure cosx isn't zero. FIXME: we lose the cosx sign here. */
503    double absc = lol_fabs(cosx);
504#if defined __CELLOS_LV2__
505    double is_cos_not_zero = absc - VERY_SMALL_NUMBER;
506    cosx = lol_fsel(is_cos_not_zero, cosx, VERY_SMALL_NUMBER);
507    return lol_fdiv(sinx, cosx);
508#else
509    if (__unlikely(absc < VERY_SMALL_NUMBER))
510        cosx = VERY_SMALL_NUMBER;
511    return sinx / cosx;
512#endif
513}
514
515} /* namespace lol */
516
Note: See TracBrowser for help on using the repository browser.