source: trunk/src/trig.cpp @ 891

Last change on this file since 891 was 891, checked in by sam, 10 years ago

optim: split the Taylor series calculation into two separate values.

This is at the cost of one additional multiply, but performance increases
by more than 11%, because the PS3 pipeline is a lot happier now.

File size: 5.2 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#if defined HAVE_FASTMATH_H
16#   include <fastmath.h>
17#endif
18
19#include "core.h"
20
21using namespace std;
22
23namespace lol
24{
25
26static const double PI     = 3.14159265358979323846;
27static const double NEG_PI = -3.14159265358979323846;
28static const double PI_2   = PI / 2.0;
29static const double PI_4   = PI / 4.0;
30static const double INV_PI = 1.0 / PI;
31static const double ROOT3  = 1.732050807568877293527;
32
33static const double ZERO    = 0.0;
34static const double ONE     = 1.0;
35static const double NEG_ONE = -1.0;
36static const double HALF    = 0.5;
37static const double TWO     = 2.0;
38static const double VERY_SMALL_NUMBER = 0x1.0p-128;
39static const double TWO_EXP_52 = 4503599627370496.0;
40static const double TWO_EXP_54 = 18014398509481984.0;
41
42/** sin Taylor series coefficients. */
43static const double SC[]    =
44{
45    -1.6449340668482264364724e-0, // pi^2/3!
46    +8.1174242528335364363700e-1, // pi^4/5!
47    -1.9075182412208421369647e-1, // pi^6/7!
48    +2.6147847817654800504653e-2, // pi^8/9!
49    -2.3460810354558236375089e-3, // pi^10/11!
50    +1.4842879303107100368487e-4, // pi^12/13!
51    -6.9758736616563804745344e-6, // pi^14/15!
52    +2.5312174041370276513517e-7, // pi^16/17!
53};
54
55/* Custom intrinsics */
56#define INLINEATTR __attribute__((always_inline))
57
58#if defined __CELLOS_LV2__
59static inline double lol_fctid(double x) INLINEATTR;
60static inline double lol_fctidz(double x) INLINEATTR;
61static inline double lol_fcfid(double x) INLINEATTR;
62static inline double lol_frsqrte(double x) INLINEATTR;
63#endif
64static inline double lol_fsel(double c, double gte, double lt) INLINEATTR;
65static inline double lol_fabs(double x) INLINEATTR;
66static inline double lol_round(double x) INLINEATTR;
67static inline double lol_trunc(double x) INLINEATTR;
68static inline double lol_round(double x) INLINEATTR;
69static inline double lol_trunc(double x) INLINEATTR;
70
71#if defined __CELLOS_LV2__
72static inline double lol_fctid(double x)
73{
74    double r;
75#if defined __SNC__
76    r = __builtin_fctid(x);
77#else
78    __asm__ ("fctid %0, %1"
79             : "=f"(r) : "f"(x));
80#endif
81    return r;
82}
83
84static double lol_fctidz(double x)
85{
86    double r;
87#if defined __SNC__
88    r = __builtin_fctidz(x);
89#else
90    __asm__ ("fctidz %0, %1"
91             : "=f"(r) : "f"(x));
92#endif
93    return r;
94}
95
96static double lol_fcfid(double x)
97{
98    double r;
99#if defined __SNC__
100    r = __builtin_fcfid(x);
101#else
102    __asm__ ("fcfid %0, %1"
103             : "=f"(r) : "f"(x));
104#endif
105    return r;
106}
107
108static double lol_frsqrte(double x)
109{
110#if defined __SNC__
111    return __builtin_frsqrte(x);
112#else
113    double r;
114    __asm__ ("frsqrte %0, %1"
115             : "=f"(r) : "f"(x));
116    return r;
117#endif
118}
119#endif /* __CELLOS_LV2__ */
120
121static inline double lol_fsel(double c, double gte, double lt)
122{
123#if defined __CELLOS_LV2__ && defined __SNC__
124    return __fsel(c, gte, lt);
125#elif defined __CELLOS_LV2__
126    double r;
127    __asm__ ("fsel %0, %1, %2, %3"
128             : "=f"(r) : "f"(c), "f"(gte), "f"(lt));
129    return r;
130#else
131    return (c >= 0) ? gte : lt;
132#endif
133}
134
135static inline double lol_fabs(double x)
136{
137#if defined __CELLOS_LV2__ && defined __SNC__
138    return __fabs(x);
139#elif defined __CELLOS_LV2__
140    double r;
141    __asm__ ("fabs %0, %1"
142             : "=f"(r) : "f"(x));
143    return r;
144#else
145    return __builtin_fabs(x);
146#endif
147}
148
149static inline double lol_round(double x)
150{
151#if defined __CELLOS_LV2__
152    return lol_fcfid(lol_fctid(x));
153#else
154    return __builtin_round(x);
155#endif
156}
157
158static inline double lol_trunc(double x)
159{
160#if defined __CELLOS_LV2__
161    return lol_fcfid(lol_fctidz(x));
162#else
163    return __builtin_trunc(x);
164#endif
165}
166
167double lol_sin(double x)
168{
169    double absx = lol_fabs(x * INV_PI);
170    double sign = lol_fsel(x, PI, NEG_PI);
171
172    /* To compute sin(x) we build a Taylor series for |x|/pi wrapped to
173     * the range [-1, 1]. We also switch the result sign if the number
174     * of cycles is odd. */
175#if defined __CELLOS_LV2__
176    double num_cycles = lol_round(absx);
177    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
178    sign = lol_fsel(is_even, sign, -sign);
179#else
180    double num_cycles = absx + TWO_EXP_52;
181    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
182
183    double is_even = TWO * num_cycles - ONE;
184    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
185    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
186    __asm__("" : "+m" (is_even));
187    is_even -= TWO * num_cycles - ONE;
188    sign *= is_even;
189#endif
190    double norm_x = absx - num_cycles;
191
192    /* Computing x^4 is one multiplication too many we do, but it helps
193     * interleave the Taylor series operations a lot better. */
194    double x2 = norm_x * norm_x;
195    double x4 = x2 * x2;
196    double sub1 = ((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1];
197    double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
198    double taylor = (sub1 * x2 + sub2) * x2 + ONE;
199
200    double result = norm_x * taylor;
201    return result * sign;
202}
203
204} /* namespace lol */
205
Note: See TracBrowser for help on using the repository browser.