source: trunk/src/trig.cpp @ 896

Last change on this file since 896 was 896, checked in by sam, 12 years ago

core: add a code shortcut for sin() on platforms that have cheap branches.

File size: 6.7 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#if defined HAVE_FASTMATH_H
16#   include <fastmath.h>
17#endif
18
19#include "core.h"
20
21using namespace std;
22
23namespace lol
24{
25
26static const double PI     = 3.14159265358979323846264;
27static const double NEG_PI = -3.14159265358979323846264;
28static const double PI_2   = 1.57079632679489661923132;
29static const double PI_4   = 0.785398163397448309615661;
30static const double INV_PI = 0.318309886183790671537768;
31static const double ROOT3  = 1.73205080756887729352745;
32
33static const double ZERO    = 0.0;
34static const double ONE     = 1.0;
35static const double NEG_ONE = -1.0;
36static const double HALF    = 0.5;
37static const double QUARTER = 0.25;
38static const double TWO     = 2.0;
39static const double VERY_SMALL_NUMBER = 0x1.0p-128;
40static const double TWO_EXP_52 = 4503599627370496.0;
41static const double TWO_EXP_54 = 18014398509481984.0;
42
43/** sin Taylor series coefficients. */
44static const double SC[] =
45{
46    -1.6449340668482264364724e-0, // pi^2/3!
47    +8.1174242528335364363700e-1, // pi^4/5!
48    -1.9075182412208421369647e-1, // pi^6/7!
49    +2.6147847817654800504653e-2, // pi^8/9!
50    -2.3460810354558236375089e-3, // pi^10/11!
51    +1.4842879303107100368487e-4, // pi^12/13!
52    -6.9758736616563804745344e-6, // pi^14/15!
53    +2.5312174041370276513517e-7, // pi^16/17!
54};
55
56static const double CC[] =
57{
58    -4.9348022005446793094172e-0, // pi^2/2!
59    +4.0587121264167682181850e-0, // pi^4/4!
60    -1.3352627688545894958753e-0, // pi^6/6!
61    +2.3533063035889320454188e-1, // pi^8/8!
62    -2.5806891390014060012598e-2, // pi^10/10!
63    +1.9295743094039230479033e-3, // pi^12/12!
64    -1.0463810492484570711802e-4, // pi^14/14!
65    +4.3030695870329470072978e-6, // pi^16/16!
66};
67
68/* Custom intrinsics */
69#define INLINEATTR __attribute__((always_inline))
70
71#if defined __CELLOS_LV2__
72static inline double lol_fctid(double x) INLINEATTR;
73static inline double lol_fctidz(double x) INLINEATTR;
74static inline double lol_fcfid(double x) INLINEATTR;
75static inline double lol_frsqrte(double x) INLINEATTR;
76#endif
77static inline double lol_fsel(double c, double gte, double lt) INLINEATTR;
78static inline double lol_fabs(double x) INLINEATTR;
79static inline double lol_round(double x) INLINEATTR;
80static inline double lol_trunc(double x) INLINEATTR;
81static inline double lol_round(double x) INLINEATTR;
82static inline double lol_trunc(double x) INLINEATTR;
83
84#if defined __CELLOS_LV2__
85static inline double lol_fctid(double x)
86{
87    double r;
88#if defined __SNC__
89    r = __builtin_fctid(x);
90#else
91    __asm__ ("fctid %0, %1"
92             : "=f"(r) : "f"(x));
93#endif
94    return r;
95}
96
97static double lol_fctidz(double x)
98{
99    double r;
100#if defined __SNC__
101    r = __builtin_fctidz(x);
102#else
103    __asm__ ("fctidz %0, %1"
104             : "=f"(r) : "f"(x));
105#endif
106    return r;
107}
108
109static double lol_fcfid(double x)
110{
111    double r;
112#if defined __SNC__
113    r = __builtin_fcfid(x);
114#else
115    __asm__ ("fcfid %0, %1"
116             : "=f"(r) : "f"(x));
117#endif
118    return r;
119}
120
121static double lol_frsqrte(double x)
122{
123#if defined __SNC__
124    return __builtin_frsqrte(x);
125#else
126    double r;
127    __asm__ ("frsqrte %0, %1"
128             : "=f"(r) : "f"(x));
129    return r;
130#endif
131}
132#endif /* __CELLOS_LV2__ */
133
134static inline double lol_fsel(double c, double gte, double lt)
135{
136#if defined __CELLOS_LV2__ && defined __SNC__
137    return __fsel(c, gte, lt);
138#elif defined __CELLOS_LV2__
139    double r;
140    __asm__ ("fsel %0, %1, %2, %3"
141             : "=f"(r) : "f"(c), "f"(gte), "f"(lt));
142    return r;
143#else
144    return (c >= 0) ? gte : lt;
145#endif
146}
147
148static inline double lol_fabs(double x)
149{
150#if defined __CELLOS_LV2__ && defined __SNC__
151    return __fabs(x);
152#elif defined __CELLOS_LV2__
153    double r;
154    __asm__ ("fabs %0, %1"
155             : "=f"(r) : "f"(x));
156    return r;
157#else
158    return __builtin_fabs(x);
159#endif
160}
161
162static inline double lol_round(double x)
163{
164#if defined __CELLOS_LV2__
165    return lol_fcfid(lol_fctid(x));
166#else
167    return __builtin_round(x);
168#endif
169}
170
171static inline double lol_trunc(double x)
172{
173#if defined __CELLOS_LV2__
174    return lol_fcfid(lol_fctidz(x));
175#else
176    return __builtin_trunc(x);
177#endif
178}
179
180double lol_sin(double x)
181{
182    double absx = lol_fabs(x * INV_PI);
183
184    /* If branches are cheap, skip the cycle count when |x| < π/4,
185     * and only do the Taylor series up to the required precision. */
186#if defined LOL_FEATURE_CHEAP_BRANCHES
187    if (absx < QUARTER)
188    {
189        /* Computing x^4 is one multiplication too many we do, but it helps
190         * interleave the Taylor series operations a lot better. */
191        double x2 = absx * absx;
192        double x4 = x2 * x2;
193        double sub1 = SC[3] * x4 + SC[1];
194        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
195        double taylor = (sub1 * x2 + sub2) * x2 + ONE;
196        return x * taylor;
197    }
198#endif
199
200    /* Wrap |x| to the range [-1, 1] and keep track of the number of
201     * cycles required. If odd, we'll need to change the sign of the
202     * result. */
203#if defined __CELLOS_LV2__
204    double sign = lol_fsel(x, PI, NEG_PI);
205    double num_cycles = lol_round(absx);
206    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
207    sign = lol_fsel(is_even, sign, -sign);
208#else
209    double sign = (x >= 0.0) ? PI : NEG_PI;
210    double num_cycles = absx + TWO_EXP_52;
211    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
212
213    double is_even = TWO * num_cycles - ONE;
214    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
215    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
216    __asm__("" : "+m" (is_even));
217    is_even -= TWO * num_cycles - ONE;
218    sign *= is_even;
219#endif
220    absx -= num_cycles;
221
222#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
223    if (lol_fabs(absx) > QUARTER)
224    {
225        sign = (x * absx >= 0.0) ? is_even : -is_even;
226
227        double k = HALF - lol_fabs(absx);
228        double x2 = k * k;
229        double x4 = x2 * x2;
230        double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
231        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
232        double taylor = (sub1 * x2 + sub2) * x2 + ONE;
233
234        return taylor * sign;
235    }
236#endif
237
238    double x2 = absx * absx;
239    double x4 = x2 * x2;
240#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
241    double sub1 = SC[3] * x4 + SC[1];
242    double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
243#else
244    double sub1 = ((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1];
245    double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
246#endif
247    double taylor = (sub1 * x2 + sub2) * x2 + ONE;
248
249    return absx * taylor * sign;
250}
251
252} /* namespace lol */
253
Note: See TracBrowser for help on using the repository browser.