1 | // |
---|
2 | // Lol Engine |
---|
3 | // |
---|
4 | // Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net> |
---|
5 | // This program is free software; you can redistribute it and/or |
---|
6 | // modify it under the terms of the Do What The Fuck You Want To |
---|
7 | // Public License, Version 2, as published by Sam Hocevar. See |
---|
8 | // http://sam.zoy.org/projects/COPYING.WTFPL for more details. |
---|
9 | // |
---|
10 | |
---|
11 | #if defined HAVE_CONFIG_H |
---|
12 | # include "config.h" |
---|
13 | #endif |
---|
14 | |
---|
15 | #if defined HAVE_FASTMATH_H |
---|
16 | # include <fastmath.h> |
---|
17 | #endif |
---|
18 | |
---|
19 | #include "core.h" |
---|
20 | |
---|
21 | using namespace std; |
---|
22 | |
---|
23 | namespace lol |
---|
24 | { |
---|
25 | |
---|
26 | static const double PI = 3.14159265358979323846264; |
---|
27 | static const double NEG_PI = -3.14159265358979323846264; |
---|
28 | static const double PI_2 = 1.57079632679489661923132; |
---|
29 | static const double PI_4 = 0.785398163397448309615661; |
---|
30 | static const double INV_PI = 0.318309886183790671537768; |
---|
31 | static const double ROOT3 = 1.73205080756887729352745; |
---|
32 | |
---|
33 | static const double ZERO = 0.0; |
---|
34 | static const double ONE = 1.0; |
---|
35 | static const double NEG_ONE = -1.0; |
---|
36 | static const double HALF = 0.5; |
---|
37 | static const double QUARTER = 0.25; |
---|
38 | static const double TWO = 2.0; |
---|
39 | static const double VERY_SMALL_NUMBER = 0x1.0p-128; |
---|
40 | static const double TWO_EXP_52 = 4503599627370496.0; |
---|
41 | static const double TWO_EXP_54 = 18014398509481984.0; |
---|
42 | |
---|
43 | /** sin Taylor series coefficients. */ |
---|
44 | static const double SC[] = |
---|
45 | { |
---|
46 | -1.6449340668482264364724e-0, // pi^2/3! |
---|
47 | +8.1174242528335364363700e-1, // pi^4/5! |
---|
48 | -1.9075182412208421369647e-1, // pi^6/7! |
---|
49 | +2.6147847817654800504653e-2, // pi^8/9! |
---|
50 | -2.3460810354558236375089e-3, // pi^10/11! |
---|
51 | +1.4842879303107100368487e-4, // pi^12/13! |
---|
52 | -6.9758736616563804745344e-6, // pi^14/15! |
---|
53 | +2.5312174041370276513517e-7, // pi^16/17! |
---|
54 | }; |
---|
55 | |
---|
56 | static const double CC[] = |
---|
57 | { |
---|
58 | -4.9348022005446793094172e-0, // pi^2/2! |
---|
59 | +4.0587121264167682181850e-0, // pi^4/4! |
---|
60 | -1.3352627688545894958753e-0, // pi^6/6! |
---|
61 | +2.3533063035889320454188e-1, // pi^8/8! |
---|
62 | -2.5806891390014060012598e-2, // pi^10/10! |
---|
63 | +1.9295743094039230479033e-3, // pi^12/12! |
---|
64 | -1.0463810492484570711802e-4, // pi^14/14! |
---|
65 | +4.3030695870329470072978e-6, // pi^16/16! |
---|
66 | }; |
---|
67 | |
---|
68 | /* Custom intrinsics */ |
---|
69 | #define INLINEATTR __attribute__((always_inline)) |
---|
70 | |
---|
71 | #if defined __CELLOS_LV2__ |
---|
72 | static inline double lol_fctid(double x) INLINEATTR; |
---|
73 | static inline double lol_fctidz(double x) INLINEATTR; |
---|
74 | static inline double lol_fcfid(double x) INLINEATTR; |
---|
75 | static inline double lol_frsqrte(double x) INLINEATTR; |
---|
76 | #endif |
---|
77 | static inline double lol_fsel(double c, double gte, double lt) INLINEATTR; |
---|
78 | static inline double lol_fabs(double x) INLINEATTR; |
---|
79 | static inline double lol_round(double x) INLINEATTR; |
---|
80 | static inline double lol_trunc(double x) INLINEATTR; |
---|
81 | static inline double lol_round(double x) INLINEATTR; |
---|
82 | static inline double lol_trunc(double x) INLINEATTR; |
---|
83 | |
---|
84 | #if defined __CELLOS_LV2__ |
---|
85 | static inline double lol_fctid(double x) |
---|
86 | { |
---|
87 | double r; |
---|
88 | #if defined __SNC__ |
---|
89 | r = __builtin_fctid(x); |
---|
90 | #else |
---|
91 | __asm__ ("fctid %0, %1" |
---|
92 | : "=f"(r) : "f"(x)); |
---|
93 | #endif |
---|
94 | return r; |
---|
95 | } |
---|
96 | |
---|
97 | static double lol_fctidz(double x) |
---|
98 | { |
---|
99 | double r; |
---|
100 | #if defined __SNC__ |
---|
101 | r = __builtin_fctidz(x); |
---|
102 | #else |
---|
103 | __asm__ ("fctidz %0, %1" |
---|
104 | : "=f"(r) : "f"(x)); |
---|
105 | #endif |
---|
106 | return r; |
---|
107 | } |
---|
108 | |
---|
109 | static double lol_fcfid(double x) |
---|
110 | { |
---|
111 | double r; |
---|
112 | #if defined __SNC__ |
---|
113 | r = __builtin_fcfid(x); |
---|
114 | #else |
---|
115 | __asm__ ("fcfid %0, %1" |
---|
116 | : "=f"(r) : "f"(x)); |
---|
117 | #endif |
---|
118 | return r; |
---|
119 | } |
---|
120 | |
---|
121 | static double lol_frsqrte(double x) |
---|
122 | { |
---|
123 | #if defined __SNC__ |
---|
124 | return __builtin_frsqrte(x); |
---|
125 | #else |
---|
126 | double r; |
---|
127 | __asm__ ("frsqrte %0, %1" |
---|
128 | : "=f"(r) : "f"(x)); |
---|
129 | return r; |
---|
130 | #endif |
---|
131 | } |
---|
132 | #endif /* __CELLOS_LV2__ */ |
---|
133 | |
---|
134 | static inline double lol_fsel(double c, double gte, double lt) |
---|
135 | { |
---|
136 | #if defined __CELLOS_LV2__ && defined __SNC__ |
---|
137 | return __fsel(c, gte, lt); |
---|
138 | #elif defined __CELLOS_LV2__ |
---|
139 | double r; |
---|
140 | __asm__ ("fsel %0, %1, %2, %3" |
---|
141 | : "=f"(r) : "f"(c), "f"(gte), "f"(lt)); |
---|
142 | return r; |
---|
143 | #else |
---|
144 | return (c >= 0) ? gte : lt; |
---|
145 | #endif |
---|
146 | } |
---|
147 | |
---|
148 | static inline double lol_fabs(double x) |
---|
149 | { |
---|
150 | #if defined __CELLOS_LV2__ && defined __SNC__ |
---|
151 | return __fabs(x); |
---|
152 | #elif defined __CELLOS_LV2__ |
---|
153 | double r; |
---|
154 | __asm__ ("fabs %0, %1" |
---|
155 | : "=f"(r) : "f"(x)); |
---|
156 | return r; |
---|
157 | #else |
---|
158 | return __builtin_fabs(x); |
---|
159 | #endif |
---|
160 | } |
---|
161 | |
---|
162 | static inline double lol_round(double x) |
---|
163 | { |
---|
164 | #if defined __CELLOS_LV2__ |
---|
165 | return lol_fcfid(lol_fctid(x)); |
---|
166 | #else |
---|
167 | return __builtin_round(x); |
---|
168 | #endif |
---|
169 | } |
---|
170 | |
---|
171 | static inline double lol_trunc(double x) |
---|
172 | { |
---|
173 | #if defined __CELLOS_LV2__ |
---|
174 | return lol_fcfid(lol_fctidz(x)); |
---|
175 | #else |
---|
176 | return __builtin_trunc(x); |
---|
177 | #endif |
---|
178 | } |
---|
179 | |
---|
180 | double lol_sin(double x) |
---|
181 | { |
---|
182 | double absx = lol_fabs(x * INV_PI); |
---|
183 | |
---|
184 | /* If branches are cheap, skip the cycle count when |x| < π/4, |
---|
185 | * and only do the Taylor series up to the required precision. */ |
---|
186 | #if defined LOL_FEATURE_CHEAP_BRANCHES |
---|
187 | if (absx < QUARTER) |
---|
188 | { |
---|
189 | /* Computing x^4 is one multiplication too many we do, but it helps |
---|
190 | * interleave the Taylor series operations a lot better. */ |
---|
191 | double x2 = absx * absx; |
---|
192 | double x4 = x2 * x2; |
---|
193 | double sub1 = SC[3] * x4 + SC[1]; |
---|
194 | double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0]; |
---|
195 | double taylor = (sub1 * x2 + sub2) * x2 + ONE; |
---|
196 | return x * taylor; |
---|
197 | } |
---|
198 | #endif |
---|
199 | |
---|
200 | /* Wrap |x| to the range [-1, 1] and keep track of the number of |
---|
201 | * cycles required. If odd, we'll need to change the sign of the |
---|
202 | * result. */ |
---|
203 | #if defined __CELLOS_LV2__ |
---|
204 | double sign = lol_fsel(x, PI, NEG_PI); |
---|
205 | double num_cycles = lol_round(absx); |
---|
206 | double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF); |
---|
207 | sign = lol_fsel(is_even, sign, -sign); |
---|
208 | #else |
---|
209 | double num_cycles = absx + TWO_EXP_52; |
---|
210 | __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52; |
---|
211 | |
---|
212 | double is_even = TWO * num_cycles - ONE; |
---|
213 | __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54; |
---|
214 | __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54; |
---|
215 | __asm__("" : "+m" (is_even)); |
---|
216 | is_even -= TWO * num_cycles - ONE; |
---|
217 | double sign = is_even; |
---|
218 | #endif |
---|
219 | absx -= num_cycles; |
---|
220 | |
---|
221 | /* If branches are very cheap, we have the option to do the Taylor |
---|
222 | * series at a much lower degree by splitting. */ |
---|
223 | #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES |
---|
224 | if (lol_fabs(absx) > QUARTER) |
---|
225 | { |
---|
226 | sign = (x * absx >= 0.0) ? sign : -sign; |
---|
227 | |
---|
228 | double x1 = HALF - lol_fabs(absx); |
---|
229 | double x2 = x1 * x1; |
---|
230 | double x4 = x2 * x2; |
---|
231 | double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1]; |
---|
232 | double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0]; |
---|
233 | double taylor = (sub1 * x2 + sub2) * x2 + ONE; |
---|
234 | |
---|
235 | return taylor * sign; |
---|
236 | } |
---|
237 | #endif |
---|
238 | |
---|
239 | sign *= (x >= 0.0) ? PI : NEG_PI; |
---|
240 | |
---|
241 | double x2 = absx * absx; |
---|
242 | double x4 = x2 * x2; |
---|
243 | #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES |
---|
244 | double sub1 = SC[3] * x4 + SC[1]; |
---|
245 | double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0]; |
---|
246 | #else |
---|
247 | double sub1 = ((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]; |
---|
248 | double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0]; |
---|
249 | #endif |
---|
250 | double taylor = (sub1 * x2 + sub2) * x2 + ONE; |
---|
251 | |
---|
252 | return absx * taylor * sign; |
---|
253 | } |
---|
254 | |
---|
255 | double lol_cos(double x) |
---|
256 | { |
---|
257 | double absx = lol_fabs(x * INV_PI); |
---|
258 | |
---|
259 | #if defined LOL_FEATURE_CHEAP_BRANCHES |
---|
260 | if (absx < QUARTER) |
---|
261 | { |
---|
262 | double x2 = absx * absx; |
---|
263 | double x4 = x2 * x2; |
---|
264 | double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1]; |
---|
265 | double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0]; |
---|
266 | double taylor = (sub1 * x2 + sub2) * x2 + ONE; |
---|
267 | return taylor; |
---|
268 | } |
---|
269 | #endif |
---|
270 | |
---|
271 | #if defined __CELLOS_LV2__ |
---|
272 | double num_cycles = lol_round(absx); |
---|
273 | double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF); |
---|
274 | double sign = lol_fsel(is_even, ONE, NEG_ONE); |
---|
275 | #else |
---|
276 | double num_cycles = absx + TWO_EXP_52; |
---|
277 | __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52; |
---|
278 | |
---|
279 | double is_even = TWO * num_cycles - ONE; |
---|
280 | __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54; |
---|
281 | __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54; |
---|
282 | __asm__("" : "+m" (is_even)); |
---|
283 | is_even -= TWO * num_cycles - ONE; |
---|
284 | double sign = is_even; |
---|
285 | #endif |
---|
286 | absx -= num_cycles; |
---|
287 | |
---|
288 | #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES |
---|
289 | if (lol_fabs(absx) > QUARTER) |
---|
290 | { |
---|
291 | double x1 = HALF - lol_fabs(absx); |
---|
292 | double x2 = x1 * x1; |
---|
293 | double x4 = x2 * x2; |
---|
294 | double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE; |
---|
295 | double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0]; |
---|
296 | double taylor = sub2 * x2 + sub1; |
---|
297 | |
---|
298 | return x1 * taylor * sign * PI; |
---|
299 | } |
---|
300 | #endif |
---|
301 | |
---|
302 | double x2 = absx * absx; |
---|
303 | double x4 = x2 * x2; |
---|
304 | #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES |
---|
305 | double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1]; |
---|
306 | double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0]; |
---|
307 | #else |
---|
308 | double sub1 = ((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]; |
---|
309 | double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0]; |
---|
310 | #endif |
---|
311 | double taylor = (sub1 * x2 + sub2) * x2 + ONE; |
---|
312 | |
---|
313 | return taylor * sign; |
---|
314 | } |
---|
315 | |
---|
316 | } /* namespace lol */ |
---|
317 | |
---|