//
// Lol Engine
//
// Copyright: (c) 2010-2012 Sam Hocevar <sam@hocevar.net>
//   This program is free software; you can redistribute it and/or
//   modify it under the terms of the Do What The Fuck You Want To
//   Public License, Version 2, as published by Sam Hocevar. See
//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
//
10 | |
---|
11 | #if defined HAVE_CONFIG_H |
---|
12 | # include "config.h" |
---|
13 | #endif |
---|
14 | |
---|
15 | #if defined __CELLOS_LV2__ |
---|
16 | # if defined __SNC__ |
---|
17 | # include <ppu_altivec_internals.h> |
---|
18 | # else |
---|
19 | # include <altivec.h> |
---|
20 | # endif |
---|
21 | #endif |
---|
22 | |
---|
23 | #include "core.h" |
---|
24 | |
---|
25 | using namespace std; |
---|
26 | |
---|
27 | namespace lol |
---|
28 | { |
---|
29 | |
---|
/* These macros implement a finite iterator useful to build lookup
 * tables. For instance, S64(0) will expand S1(x) for all values of x
 * between 0 and 63. S1 itself is (re)defined immediately before each
 * table below and #undef'd right after, so each table gets its own
 * per-entry formula.
 * Due to the exponential behaviour of the expansions, the stress on the
 * compiler may be important. */
#define S4(x) S1((x)), S1((x)+1), S1((x)+2), S1((x)+3)
#define S16(x) S4((x)), S4((x)+4), S4((x)+8), S4((x)+12)
#define S64(x) S16((x)), S16((x)+16), S16((x)+32), S16((x)+48)
#define S256(x) S64((x)), S64((x)+64), S64((x)+128), S64((x)+192)
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
40 | |
---|
/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. No rounding is performed,
 * and some NaN values may be incorrectly converted to Inf (because
 * the lowest order bits in the mantissa are ignored).
 *
 * @param x  IEEE-754 single-precision bit pattern.
 * @return   Corresponding half-precision bit pattern (truncated). */
static inline uint16_t float_to_half_nobranch(uint32_t x)
{
    /* basetable is indexed by the 9 sign+exponent bits of the float and
     * yields the half's sign, exponent, and saturation bits; the mantissa
     * is merged in separately at the end. */
    static uint16_t const basetable[512] =
    {
        /* i < 103: flush to zero; 103 <= i < 113: leading denormal bit;
         * 113 <= i < 143: rebased normal exponent; otherwise Inf. */
#define S1(i) (((i) < 103) ? 0x0000 : \
               ((i) < 113) ? 0x0400 >> (0x1f & (113 - (i))) : \
               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
        S256(0),
#undef S1
        /* Upper 256 entries: same values with the half sign bit set.
         * NOTE(review): this reads basetable[] inside its own initialiser;
         * it relies on the first 256 elements being initialised before the
         * last 256 at static-init time — confirm this holds on every
         * supported compiler. */
#define S1(i) (0x8000 | basetable[i])
        S256(0),
#undef S1
    };

    /* shifttable gives, for the same 9-bit index, how far the 23-bit float
     * mantissa must be shifted right to land in the half mantissa field:
     * 24 discards it entirely (zero / Inf cases), 126-i renormalises
     * denormals, 13 is the normal (and NaN-preserving) case. */
    static uint8_t const shifttable[512] =
    {
#define S1(i) (((i) < 103) ? 24 : \
               ((i) < 113) ? 126 - (i) : \
               ((i) < 143 || (i) == 255) ? 13 : 24)
        S256(0), S256(0),
#undef S1
    };

    uint16_t bits = basetable[(x >> 23) & 0x1ff];   /* sign + exponent */
    bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff]; /* mantissa */
    return bits;
}
72 | |
---|
/* This method is faster than the OpenEXR implementation (very often
 * used, eg. in Ogre), with the additional benefit of rounding, inspired
 * by James Tursa’s half-precision code.
 *
 * @param x  IEEE-754 single-precision bit pattern.
 * @return   Corresponding half-precision bit pattern, round-to-nearest. */
static inline uint16_t float_to_half_branch(uint32_t x)
{
    uint16_t half_bits = (uint16_t)((x >> 16) & 0x8000); /* sign bit only */
    uint16_t mant = (uint16_t)((x >> 12) & 0x07ff); /* 10 bits + 1 rounding bit */
    unsigned int exp = (x >> 23) & 0xff; /* unsigned int is faster here */

    /* Inf, NaN, or exponent overflow: produce a signed Inf. If the input
     * exponent was 0xff and any mantissa bit was set, the input was a NaN,
     * so keep one mantissa bit set to stay a NaN. */
    if (exp > 142)
    {
        half_bits |= 0x7c00u;
        half_bits |= (exp == 255 && (x & 0x007fffffu));
        return half_bits;
    }

    /* Zero, denormal, or exponent underflowing below even a denormal half:
     * flush to signed zero. */
    if (exp < 103)
        return half_bits;

    /* Exponent underflows, but a denormal half can represent it: restore
     * the implicit leading bit and shift into place with rounding. */
    if (exp < 113)
    {
        mant |= 0x0800u;
        /* The rounding increment may carry, producing mantissa 0 and
         * exponent 1 — which is the correct result. */
        half_bits |= (mant >> (114 - exp)) + ((mant >> (113 - exp)) & 1);
        return half_bits;
    }

    /* Normal number: rebias the exponent (127 -> 15), drop the extra
     * rounding bit, then round to nearest. An overflow here bumps the
     * exponent and clears the mantissa, which is again correct. */
    half_bits |= ((exp - 112) << 10) | (mant >> 1);
    half_bits += mant & 1;
    return half_bits;
}
113 | |
---|
#if 0
/* Vectorised float->half conversion for AltiVec/VMX: converts eight floats
 * starting at src into eight halves at dst in a single pass. Disabled — see
 * the "AltiVec code. Will work one day." call site in half::convert().
 * NOTE(review): requires the vec_* intrinsics from altivec.h, which this
 * file only includes for __CELLOS_LV2__ builds. */
static inline void float_to_half_vector(half *dst, float const *src)
{
    vector unsigned int const v7 = vec_splat_u32(7);
    vector unsigned short const v6 = vec_splat_u16(6);
#if _XBOX
    vector signed short const v9 = vec_splat_u16(9);
    vector unsigned short const v10 = vec_splat_u16(10);
#else
    vector signed short const v0x0040 = {
        0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040};
    vector unsigned short const v0x0400 = {
        0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400};
#endif
    /* Permute masks selecting the high/low 16-bit halves of the eight
     * 32-bit lanes spread across the two input vectors. */
    vector unsigned char const shuffle_high = {
        0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
    vector unsigned char const shuffle_low = {
        2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
    vector unsigned char const v0xbf70 = {
        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70,
        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70};

    vector unsigned short v_mant, v_ret;
    vector signed short v_exp;
    vector unsigned int in0 = (vector unsigned int)vec_ld(0, src);
    vector unsigned int in1 = (vector unsigned int)vec_ld(16, src);

    /* Arithmetic shift right by 7 discards the low mantissa bits
     * (truncation, no rounding). */
    in0 = vec_sra(in0, v7);
    in1 = vec_sra(in1, v7);
    v_exp = (vector signed short)vec_perm(in0, in1, shuffle_high);
    v_mant = (vector unsigned short)vec_perm(in0, in1, shuffle_low);
    /* Saturated byte subtraction rebases the exponent; presumably the
     * saturation also flushes underflows to zero — TODO confirm. */
    v_exp = (vector signed short)vec_subs((vector unsigned char)v_exp, v0xbf70);
#if _XBOX
    v_ret = (vector unsigned short)vec_or(v_exp, vec_sr(v_exp, v9));
#else
    v_ret = (vector unsigned short)vec_madds(v_exp, v0x0040, v_exp);
#endif
    v_mant = vec_sr(v_mant, v6);
    /* Recombine exponent and mantissa; the Xbox path uses shift+or, the
     * generic VMX path an equivalent multiply-add. */
#if _XBOX
    v_ret = vec_or(v_mant, vec_sl(v_ret, v10));
#else
    v_ret = vec_mladd(v_ret, v0x0400, v_mant);
#endif
    vec_st(v_ret, 0, (uint16_t *)dst);
}
#endif
160 | |
---|
/* We use this magic table, inspired by De Bruijn sequences, to compute a
 * branchless integer log2. The actual value fetched is 24-log2(x+1) for x
 * in 1, 3, 7, f, 1f, 3f, 7f, ff, 1fe, 1ff, 3fc, 3fd, 3fe, 3ff. */
static int const shifttable[16] =
{
    /* The -1 slots are never fetched: multiplying any of the listed bit
     * patterns by shiftmagic only ever lands on the other indices. */
    23, 22, 21, 15, -1, 20, 18, 14, 14, 16, 19, -1, 17, -1, -1, -1,
};
/* Multiplier whose top 4 product bits index shifttable above for each of
 * the "filled" mantissa patterns listed in the comment. */
static uint32_t const shiftmagic = 0x05a1a1a2u;
169 | |
---|
/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. Tables are generated using
 * the C++ preprocessor, thanks to a branchless implementation also
 * used in half_to_float_branch(). This code is very fast when performing
 * conversions on arrays of values.
 *
 * @param x  Half-precision bit pattern.
 * @return   Corresponding IEEE-754 single-precision bit pattern. */
static inline uint32_t half_to_float_nobranch(uint16_t x)
{
/* M3/M7/MF smear the leading set bit of i downwards; E then derives the
 * renormalisation shift via the shifttable/shiftmagic pair above. */
#define M3(i) ((i) | ((i) >> 1))
#define M7(i) (M3(i) | (M3(i) >> 2))
#define MF(i) (M7(i) | (M7(i) >> 4))
#define E(i) shifttable[(uint32_t)((uint64_t)MF(i) * shiftmagic) >> 28]

    /* Entries 0..1023: denormal half mantissas renormalised into float
     * mantissa+exponent form; entries 1024..2047: normal half mantissas
     * shifted into place with a 0x38000000 exponent bias folded in. */
    static uint32_t const mantissatable[2048] =
    {
#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i)))
        S1024(0),
#undef S1
#define S1(i) (0x38000000u + ((i) << 13))
        S1024(0),
#undef S1
    };

    /* Indexed by the 6 sign+exponent bits of the half. Combined with the
     * 0x38000000 bias already present in the upper mantissatable half,
     * i == 31 (0x47800000) yields the float Inf/NaN exponent 0x7f800000;
     * i == 32 is the lone sign bit for negative zero/denormals. */
    static uint32_t const exponenttable[64] =
    {
#define S1(i) (((i) == 0) ? 0 : \
               ((i) < 31) ? ((uint32_t)(i) << 23) : \
               ((i) == 31) ? 0x47800000u : \
               ((i) == 32) ? 0x80000000u : \
               ((i) < 63) ? (0x80000000u | (((i) - 32) << 23)) : 0xc7800000)
        S64(0),
#undef S1
    };

    /* Selects the mantissatable half: 0 (denormal path) only for the two
     * all-zero exponent fields (+0 and -0 groups), 1024 otherwise. */
    static int const offsettable[64] =
    {
#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024)
        S64(0),
#undef S1
    };

    return mantissatable[offsettable[x >> 10] + (x & 0x3ff)]
           + exponenttable[x >> 10];
}
213 | |
---|
214 | /* This algorithm is similar to the OpenEXR implementation, except it |
---|
215 | * uses branchless code in the denormal path. This is slower than the |
---|
216 | * table version, but will be more friendly to the cache for occasional |
---|
217 | * uses. */ |
---|
218 | static inline uint32_t half_to_float_branch(uint16_t x) |
---|
219 | { |
---|
220 | uint32_t s = (x & 0x8000u) << 16; |
---|
221 | |
---|
222 | if ((x & 0x7fffu) == 0) |
---|
223 | return (uint32_t)x << 16; |
---|
224 | |
---|
225 | uint32_t e = x & 0x7c00u; |
---|
226 | uint32_t m = x & 0x03ffu; |
---|
227 | |
---|
228 | if (e == 0) |
---|
229 | { |
---|
230 | /* m has 10 significant bits but replicating the leading bit to |
---|
231 | * 8 positions instead of 16 works just as well because of our |
---|
232 | * handcrafted shiftmagic table. */ |
---|
233 | uint32_t v = m | (m >> 1); |
---|
234 | v |= v >> 2; |
---|
235 | v |= v >> 4; |
---|
236 | |
---|
237 | e = shifttable[(v * shiftmagic) >> 28]; |
---|
238 | |
---|
239 | /* We don't have to remove the 10th mantissa bit because it gets |
---|
240 | * added to our underestimated exponent. */ |
---|
241 | return s | (((125 - e) << 23) + (m << e)); |
---|
242 | } |
---|
243 | |
---|
244 | if (e == 0x7c00u) |
---|
245 | { |
---|
246 | /* The amd64 pipeline likes the if() better than a ternary operator |
---|
247 | * or any other trick I could find. --sam */ |
---|
248 | if (m == 0) |
---|
249 | return s | 0x7f800000u; |
---|
250 | return s | 0x7fc00000u; |
---|
251 | } |
---|
252 | |
---|
253 | return s | (((e >> 10) + 112) << 23) | (m << 13); |
---|
254 | } |
---|
255 | |
---|
256 | /* Constructor from float. Uses the non-branching version because benchmarks |
---|
257 | * indicate it is about 80% faster on amd64, and 20% faster on the PS3. The |
---|
258 | * penalty of loading the lookup tables does not seem important. */ |
---|
259 | half half::makefast(float f) |
---|
260 | { |
---|
261 | union { float f; uint32_t x; } u = { f }; |
---|
262 | return makebits(float_to_half_nobranch(u.x)); |
---|
263 | } |
---|
264 | |
---|
265 | /* Constructor from float with better precision. */ |
---|
266 | half half::makeaccurate(float f) |
---|
267 | { |
---|
268 | union { float f; uint32_t x; } u = { f }; |
---|
269 | return makebits(float_to_half_branch(u.x)); |
---|
270 | } |
---|
271 | |
---|
272 | /* Cast to float. Uses the branching version because loading the tables |
---|
273 | * for only one value is going to be cache-expensive. */ |
---|
274 | float half::tofloat(half h) |
---|
275 | { |
---|
276 | union { float f; uint32_t x; } u; |
---|
277 | u.x = half_to_float_branch(h.bits); |
---|
278 | return u.f; |
---|
279 | } |
---|
280 | |
---|
281 | size_t half::convert(half *dst, float const *src, size_t nelem) |
---|
282 | { |
---|
283 | for (size_t i = 0; i < nelem; i++) |
---|
284 | { |
---|
285 | union { float f; uint32_t x; } u; |
---|
286 | u.f = *src++; |
---|
287 | *dst++ = makebits(float_to_half_nobranch(u.x)); |
---|
288 | #if 0 |
---|
289 | /* AltiVec code. Will work one day. */ |
---|
290 | float_to_half_vector(dst, src); |
---|
291 | src += 8; |
---|
292 | dst += 8; |
---|
293 | i += 7; |
---|
294 | #endif |
---|
295 | } |
---|
296 | |
---|
297 | return nelem; |
---|
298 | } |
---|
299 | |
---|
300 | size_t half::convert(float *dst, half const *src, size_t nelem) |
---|
301 | { |
---|
302 | for (size_t i = 0; i < nelem; i++) |
---|
303 | { |
---|
304 | union { float f; uint32_t x; } u; |
---|
305 | #if !defined __CELLOS_LV2__ |
---|
306 | /* This code is really too slow on the PS3, even with the denormal |
---|
307 | * handling stripped off. */ |
---|
308 | u.x = half_to_float_nobranch((*src++).bits); |
---|
309 | #else |
---|
310 | u.x = half_to_float_branch((*src++).bits); |
---|
311 | #endif |
---|
312 | *dst++ = u.f; |
---|
313 | } |
---|
314 | |
---|
315 | return nelem; |
---|
316 | } |
---|
317 | |
---|
318 | } /* namespace lol */ |
---|
319 | |
---|