source: trunk/src/trig.cpp @ 897

Last change on this file since 897 was 897, checked in by sam, 12 years ago

core: implement accelerated cos().

File size: 8.6 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#if defined HAVE_FASTMATH_H
16#   include <fastmath.h>
17#endif
18
19#include "core.h"
20
21using namespace std;
22
23namespace lol
24{
25
26static const double PI     = 3.14159265358979323846264;
27static const double NEG_PI = -3.14159265358979323846264;
28static const double PI_2   = 1.57079632679489661923132;
29static const double PI_4   = 0.785398163397448309615661;
30static const double INV_PI = 0.318309886183790671537768;
31static const double ROOT3  = 1.73205080756887729352745;
32
33static const double ZERO    = 0.0;
34static const double ONE     = 1.0;
35static const double NEG_ONE = -1.0;
36static const double HALF    = 0.5;
37static const double QUARTER = 0.25;
38static const double TWO     = 2.0;
39static const double VERY_SMALL_NUMBER = 0x1.0p-128;
40static const double TWO_EXP_52 = 4503599627370496.0;
41static const double TWO_EXP_54 = 18014398509481984.0;
42
43/** sin Taylor series coefficients. */
44static const double SC[] =
45{
46    -1.6449340668482264364724e-0, // pi^2/3!
47    +8.1174242528335364363700e-1, // pi^4/5!
48    -1.9075182412208421369647e-1, // pi^6/7!
49    +2.6147847817654800504653e-2, // pi^8/9!
50    -2.3460810354558236375089e-3, // pi^10/11!
51    +1.4842879303107100368487e-4, // pi^12/13!
52    -6.9758736616563804745344e-6, // pi^14/15!
53    +2.5312174041370276513517e-7, // pi^16/17!
54};
55
56static const double CC[] =
57{
58    -4.9348022005446793094172e-0, // pi^2/2!
59    +4.0587121264167682181850e-0, // pi^4/4!
60    -1.3352627688545894958753e-0, // pi^6/6!
61    +2.3533063035889320454188e-1, // pi^8/8!
62    -2.5806891390014060012598e-2, // pi^10/10!
63    +1.9295743094039230479033e-3, // pi^12/12!
64    -1.0463810492484570711802e-4, // pi^14/14!
65    +4.3030695870329470072978e-6, // pi^16/16!
66};
67
68/* Custom intrinsics */
69#define INLINEATTR __attribute__((always_inline))
70
71#if defined __CELLOS_LV2__
72static inline double lol_fctid(double x) INLINEATTR;
73static inline double lol_fctidz(double x) INLINEATTR;
74static inline double lol_fcfid(double x) INLINEATTR;
75static inline double lol_frsqrte(double x) INLINEATTR;
76#endif
77static inline double lol_fsel(double c, double gte, double lt) INLINEATTR;
78static inline double lol_fabs(double x) INLINEATTR;
79static inline double lol_round(double x) INLINEATTR;
80static inline double lol_trunc(double x) INLINEATTR;
81static inline double lol_round(double x) INLINEATTR;
82static inline double lol_trunc(double x) INLINEATTR;
83
84#if defined __CELLOS_LV2__
85static inline double lol_fctid(double x)
86{
87    double r;
88#if defined __SNC__
89    r = __builtin_fctid(x);
90#else
91    __asm__ ("fctid %0, %1"
92             : "=f"(r) : "f"(x));
93#endif
94    return r;
95}
96
97static double lol_fctidz(double x)
98{
99    double r;
100#if defined __SNC__
101    r = __builtin_fctidz(x);
102#else
103    __asm__ ("fctidz %0, %1"
104             : "=f"(r) : "f"(x));
105#endif
106    return r;
107}
108
109static double lol_fcfid(double x)
110{
111    double r;
112#if defined __SNC__
113    r = __builtin_fcfid(x);
114#else
115    __asm__ ("fcfid %0, %1"
116             : "=f"(r) : "f"(x));
117#endif
118    return r;
119}
120
121static double lol_frsqrte(double x)
122{
123#if defined __SNC__
124    return __builtin_frsqrte(x);
125#else
126    double r;
127    __asm__ ("frsqrte %0, %1"
128             : "=f"(r) : "f"(x));
129    return r;
130#endif
131}
132#endif /* __CELLOS_LV2__ */
133
134static inline double lol_fsel(double c, double gte, double lt)
135{
136#if defined __CELLOS_LV2__ && defined __SNC__
137    return __fsel(c, gte, lt);
138#elif defined __CELLOS_LV2__
139    double r;
140    __asm__ ("fsel %0, %1, %2, %3"
141             : "=f"(r) : "f"(c), "f"(gte), "f"(lt));
142    return r;
143#else
144    return (c >= 0) ? gte : lt;
145#endif
146}
147
148static inline double lol_fabs(double x)
149{
150#if defined __CELLOS_LV2__ && defined __SNC__
151    return __fabs(x);
152#elif defined __CELLOS_LV2__
153    double r;
154    __asm__ ("fabs %0, %1"
155             : "=f"(r) : "f"(x));
156    return r;
157#else
158    return __builtin_fabs(x);
159#endif
160}
161
162static inline double lol_round(double x)
163{
164#if defined __CELLOS_LV2__
165    return lol_fcfid(lol_fctid(x));
166#else
167    return __builtin_round(x);
168#endif
169}
170
171static inline double lol_trunc(double x)
172{
173#if defined __CELLOS_LV2__
174    return lol_fcfid(lol_fctidz(x));
175#else
176    return __builtin_trunc(x);
177#endif
178}
179
180double lol_sin(double x)
181{
182    double absx = lol_fabs(x * INV_PI);
183
184    /* If branches are cheap, skip the cycle count when |x| < π/4,
185     * and only do the Taylor series up to the required precision. */
186#if defined LOL_FEATURE_CHEAP_BRANCHES
187    if (absx < QUARTER)
188    {
189        /* Computing x^4 is one multiplication too many we do, but it helps
190         * interleave the Taylor series operations a lot better. */
191        double x2 = absx * absx;
192        double x4 = x2 * x2;
193        double sub1 = SC[3] * x4 + SC[1];
194        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
195        double taylor = (sub1 * x2 + sub2) * x2 + ONE;
196        return x * taylor;
197    }
198#endif
199
200    /* Wrap |x| to the range [-1, 1] and keep track of the number of
201     * cycles required. If odd, we'll need to change the sign of the
202     * result. */
203#if defined __CELLOS_LV2__
204    double sign = lol_fsel(x, PI, NEG_PI);
205    double num_cycles = lol_round(absx);
206    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
207    sign = lol_fsel(is_even, sign, -sign);
208#else
209    double num_cycles = absx + TWO_EXP_52;
210    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
211
212    double is_even = TWO * num_cycles - ONE;
213    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
214    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
215    __asm__("" : "+m" (is_even));
216    is_even -= TWO * num_cycles - ONE;
217    double sign = is_even;
218#endif
219    absx -= num_cycles;
220
221    /* If branches are very cheap, we have the option to do the Taylor
222     * series at a much lower degree by splitting. */
223#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
224    if (lol_fabs(absx) > QUARTER)
225    {
226        sign = (x * absx >= 0.0) ? sign : -sign;
227
228        double x1 = HALF - lol_fabs(absx);
229        double x2 = x1 * x1;
230        double x4 = x2 * x2;
231        double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
232        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
233        double taylor = (sub1 * x2 + sub2) * x2 + ONE;
234
235        return taylor * sign;
236    }
237#endif
238
239    sign *= (x >= 0.0) ? PI : NEG_PI;
240
241    double x2 = absx * absx;
242    double x4 = x2 * x2;
243#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
244    double sub1 = SC[3] * x4 + SC[1];
245    double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
246#else
247    double sub1 = ((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1];
248    double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
249#endif
250    double taylor = (sub1 * x2 + sub2) * x2 + ONE;
251
252    return absx * taylor * sign;
253}
254
255double lol_cos(double x)
256{
257    double absx = lol_fabs(x * INV_PI);
258
259#if defined LOL_FEATURE_CHEAP_BRANCHES
260    if (absx < QUARTER)
261    {
262        double x2 = absx * absx;
263        double x4 = x2 * x2;
264        double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
265        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
266        double taylor = (sub1 * x2 + sub2) * x2 + ONE;
267        return taylor;
268    }
269#endif
270
271#if defined __CELLOS_LV2__
272    double num_cycles = lol_round(absx);
273    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
274    double sign = lol_fsel(is_even, ONE, NEG_ONE);
275#else
276    double num_cycles = absx + TWO_EXP_52;
277    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
278
279    double is_even = TWO * num_cycles - ONE;
280    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
281    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
282    __asm__("" : "+m" (is_even));
283    is_even -= TWO * num_cycles - ONE;
284    double sign = is_even;
285#endif
286    absx -= num_cycles;
287
288#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
289    if (lol_fabs(absx) > QUARTER)
290    {
291        double x1 = HALF - lol_fabs(absx);
292        double x2 = x1 * x1;
293        double x4 = x2 * x2;
294        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
295        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
296        double taylor = sub2 * x2 + sub1;
297
298        return x1 * taylor * sign * PI;
299    }
300#endif
301
302    double x2 = absx * absx;
303    double x4 = x2 * x2;
304#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
305    double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
306    double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
307#else
308    double sub1 = ((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1];
309    double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
310#endif
311    double taylor = (sub1 * x2 + sub2) * x2 + ONE;
312
313    return taylor * sign;
314}
315
316} /* namespace lol */
317
Note: See TracBrowser for help on using the repository browser.