source: trunk/src/half.cpp @ 882

Last change on this file since 882 was 882, checked in by sam, 8 years ago

core: reactivate half denormals for the PS3.

We know we will not have denormal floats on the PS3, but we should still
create denormal halves in case the other end (maybe the GPU?) knows how
to handle them.

  • Property svn:keywords set to Id
File size: 9.7 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#if defined __CELLOS_LV2__
16#   include <ppu_altivec_internals.h>
17#endif
18
19#include "core.h"
20
21using namespace std;
22
23namespace lol
24{
25
/* These macros implement a finite iterator useful to build lookup
 * tables. For instance, S64(0) will call S1(x) for all values of x
 * between 0 and 63. Each level expands to four calls of the level
 * below it, so the number of S1() expansions grows exponentially
 * (S1024 emits 1024 of them) and the stress on the compiler may be
 * important. */
#define S4(x)    S1((x)),   S1((x)+1),     S1((x)+2),     S1((x)+3)
#define S16(x)   S4((x)),   S4((x)+4),     S4((x)+8),     S4((x)+12)
#define S64(x)   S16((x)),  S16((x)+16),   S16((x)+32),   S16((x)+48)
#define S256(x)  S64((x)),  S64((x)+64),   S64((x)+128),  S64((x)+192)
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
36
/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. No rounding is performed,
 * and some NaN values may be incorrectly converted to Inf. */
static inline uint16_t float_to_half_nobranch(uint32_t x)
{
    /* Indexed by the float's sign + exponent byte (9 bits). Yields the
     * half's sign and exponent bits; for the underflow range (exponent
     * 103..112) it also provides the implicit leading mantissa bit of
     * the resulting denormal. The last 256 entries repeat the first
     * 256 with the sign bit set. */
    static uint16_t const basetable[512] =
    {
#define S1(i) (((i) < 103) ? 0x0000 : \
               ((i) < 113) ? 0x0400 >> (113 - (i)) : \
               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
        S256(0),
#undef S1
#define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
                         ((i) < 113) ? 0x0400 >> (113 - (i)) : \
                         ((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
        S256(0),
#undef S1
    };

    /* How far to shift the 23-bit float mantissa down: 13 for normal
     * numbers (and exponent 255, so NaN payload bits survive), more
     * for denormals, 24 (i.e. discard everything) on underflow. */
    static uint8_t const shifttable[512] =
    {
#define S1(i) (((i) < 103) ? 24 : \
               ((i) < 113) ? 126 - (i) : \
               ((i) < 143 || (i) == 255) ? 13 : 24)
        S256(0), S256(0),
#undef S1
    };

    uint16_t bits = basetable[(x >> 23) & 0x1ff];
    bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
    return bits;
}
69
/* Branching float-to-half conversion with proper round-to-nearest,
 * inspired by James Tursa’s half-precision code. This method is faster
 * than the OpenEXR implementation (very often used, eg. in Ogre). */
static inline uint16_t float_to_half_branch(uint32_t x)
{
    uint16_t const sign = (x >> 16) & 0x8000; /* half's sign bit */
    uint16_t mant = (x >> 12) & 0x07ff; /* 10 mantissa bits + 1 rounding bit */
    unsigned int const exp = (x >> 23) & 0xff; /* biased float exponent */

    /* Zero, float denormal, or an exponent too small even for a half
     * denormal: collapse to signed zero. */
    if (exp < 103)
        return sign;

    /* Inf, NaN, or exponent overflow: produce Inf, but keep one
     * mantissa bit set when the input was NaN so the result stays
     * NaN rather than turning into Inf. */
    if (exp > 142)
    {
        uint16_t ret = sign | 0x7c00u;
        if (exp == 255 && (x & 0x007fffffu))
            ret |= 1;
        return ret;
    }

    /* Representable only as a half denormal: make the implicit leading
     * bit explicit, shift into place and round to nearest. A rounding
     * overflow into exponent 1 is harmless. */
    if (exp < 113)
    {
        mant |= 0x0800u;
        return sign | ((mant >> (114 - exp)) + ((mant >> (113 - exp)) & 1));
    }

    /* Normal case: rebias the exponent, keep 10 mantissa bits, and
     * round using the extra bit. An overflow here clears the mantissa
     * and bumps the exponent, which is the correct result. */
    uint16_t ret = sign | ((exp - 112) << 10) | (mant >> 1);
    return ret + (mant & 1);
}
110
#if 0
/* Disabled AltiVec batch converter: loads 8 floats (two 16-byte loads)
 * from src and stores 8 halves at dst in one pass. Never compiled in —
 * see the matching "#if 0" in half::convert() below. */
static inline void float_to_half_vector(half *dst, float const *src)
{
    vector unsigned int const v7 = vec_splat_u32(7);
    vector unsigned short const v6 = vec_splat_u16(6);
#if _XBOX
    vector signed short const v9 = vec_splat_u16(9);
    vector unsigned short const v10 = vec_splat_u16(10);
#else
    vector signed short const v0x0040 = {
        0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040};
    vector unsigned short const v0x0400 = {
        0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400};
#endif
    /* Shuffle masks gathering the high (resp. low) 16-bit halves of the
     * eight 32-bit input words into a single vector. */
    vector unsigned char const shuffle_high = {
        0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
    vector unsigned char const shuffle_low = {
        2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
    vector unsigned char const v0xbf70 = {
        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70,
        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70};

    vector unsigned short v_mant, v_ret;
    vector signed short v_exp;
    vector unsigned int in0 = (vector unsigned int)vec_ld(0, src);
    vector unsigned int in1 = (vector unsigned int)vec_ld(16, src);

    /* NOTE(review): the arithmetic below appears to mirror the scalar
     * exponent-rebias + mantissa-shift conversion in SIMD form, but it
     * was never validated (the code is under #if 0) — confirm against
     * float_to_half_nobranch() before enabling. */
    in0 = vec_sra(in0, v7);
    in1 = vec_sra(in1, v7);
    v_exp = (vector signed short)vec_perm(in0, in1, shuffle_high);
    v_mant = (vector unsigned short)vec_perm(in0, in1, shuffle_low);
    v_exp = (vector signed short)vec_subs((vector unsigned char)v_exp, v0xbf70);
#if _XBOX
    v_ret = (vector unsigned short)vec_or(v_exp, vec_sr(v_exp, v9));
#else
    v_ret = (vector unsigned short)vec_madds(v_exp, v0x0040, v_exp);
#endif
    v_mant = vec_sr(v_mant, v6);
#if _XBOX
    v_ret = vec_or(v_mant, vec_sl(v_ret, v10));
#else
    v_ret = vec_mladd(v_ret, v0x0400, v_mant);
#endif
    vec_st(v_ret, 0, (uint16_t *)dst);
}
#endif
157
/* Perfect-hash lookup shared by the half_to_float_*() denormal paths:
 * given a mantissa "smeared" into a mask v (every bit below the highest
 * set bit turned on), shifttable[(v * shiftmagic) >> 27] yields the
 * shift that renormalises the mantissa — a de Bruijn-style highest-bit
 * finder. Unreachable hash slots hold 0. */
static int const shifttable[32] =
{
    23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0,
    15, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 17, 0, 18, 19, 0,
};
static uint32_t const shiftmagic = 0x07c4acddu;
164
/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. Tables are generated using
 * the C++ preprocessor, thanks to a branchless implementation also
 * used in half_to_float_branch(). This code is very fast when performing
 * conversions on arrays of values. */
static inline uint32_t half_to_float_nobranch(uint16_t x)
{
/* Compile-time helpers: M3..MFF smear every bit below the highest set
 * bit of i, and E(i) hashes the resulting mask through shifttable to
 * obtain the shift that renormalises a denormal mantissa. */
#define M3(i) ((i) | ((i) >> 1))
#define M7(i) (M3(i) | (M3(i) >> 2))
#define MF(i) (M7(i) | (M7(i) >> 4))
#define MFF(i) (MF(i) | (MF(i) >> 8))
#define E(i) shifttable[(unsigned int)(MFF(i) * shiftmagic) >> 27]

    /* First 1024 entries: denormal half mantissas renormalised into
     * float exponent + mantissa form. Last 1024: normal mantissas
     * widened from 10 to 23 bits, on a 0x38000000 (2^-15) base. */
    static uint32_t const mantissatable[2048] =
    {
#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i)))
        S1024(0),
#undef S1
#define S1(i) (0x38000000u + ((i) << 13))
        S1024(0),
#undef S1
    };

    /* Indexed by the half's sign + exponent field (6 bits): the term
     * added to the mantissa entry. Entries 31/63 supply the Inf/NaN
     * exponent; entry 32 contributes only the sign bit. */
    static uint32_t const exponenttable[64] =
    {
#define S1(i) (((i) == 0) ? 0 : \
               ((i) < 31) ? ((i) << 23) : \
               ((i) == 31) ? 0x47800000u : \
               ((i) == 32) ? 0x80000000u : \
               ((i) < 63) ? (0x80000000u + (((i) - 32) << 23)) : 0xc7800000)
        S64(0),
#undef S1
    };

    /* Selects the mantissatable half: 0 (denormal entries) when the
     * half's exponent field is zero (indices 0 and 32), 1024 (normal
     * entries) otherwise. */
    static int const offsettable[64] =
    {
#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024)
        S64(0),
#undef S1
    };

    return mantissatable[offsettable[x >> 10] + (x & 0x3ff)]
            + exponenttable[x >> 10];
}
209
/* Branching half-to-float conversion, similar to the OpenEXR code but
 * with a branchless denormal path. Slower than the table version, yet
 * kinder to the cache for occasional conversions. */
static inline uint32_t half_to_float_branch(uint16_t x)
{
    uint32_t const sign = (uint32_t)(x & 0x8000u) << 16;
    uint32_t const exp = x & 0x7c00u;
    uint32_t const mant = x & 0x03ffu;

    /* Positive or negative zero */
    if ((exp | mant) == 0)
        return sign;

    /* Inf stays Inf; any NaN becomes the default quiet NaN */
    if (exp == 0x7c00u)
        return sign | (mant ? 0x7fc00000u : 0x7f800000u);

    /* Normal number: rebias the exponent, widen the mantissa */
    if (exp != 0)
        return sign | (((exp >> 10) + 112) << 23) | (mant << 13);

#if !defined __CELLOS_LV2__
    /* Denormal half: locate the leading mantissa bit branchlessly by
     * smearing the mantissa and hashing it through a de Bruijn-style
     * table, then renormalise. The leading bit itself need not be
     * cleared: it lands on the exponent field and corrects the
     * underestimated exponent. */
    static int const log2table[32] =
    {
        23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0,
        15, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 17, 0, 18, 19, 0,
    };
    uint32_t smear = mant | (mant >> 1);
    smear |= smear >> 2;
    smear |= smear >> 4;
    smear |= smear >> 8;
    uint32_t const shift = log2table[(smear * 0x07c4acddu) >> 27];
    return sign | (((125 - shift) << 23) + (mant << shift));
#else
    /* The PS3 has no float denormals: flush to signed zero */
    return sign;
#endif
}
254
255/* Constructor from float. Uses the non-branching version because benchmarks
256 * indicate it is about 80% faster on amd64, and 20% faster on the PS3. The
257 * penalty of loading the lookup tables does not seem important. */
258half half::makefast(float f)
259{
260    union { float f; uint32_t x; } u = { f };
261    return makebits(float_to_half_nobranch(u.x));
262}
263
264/* Constructor from float with better precision. */
265half half::makeaccurate(float f)
266{
267    union { float f; uint32_t x; } u = { f };
268    return makebits(float_to_half_branch(u.x));
269}
270
271/* Cast to float. Uses the branching version because loading the tables
272 * for only one value is going to be cache-expensive. */
273float half::tofloat(half h)
274{
275    union { float f; uint32_t x; } u;
276    u.x = half_to_float_branch(h.bits);
277    return u.f;
278}
279
280size_t half::convert(half *dst, float const *src, size_t nelem)
281{
282    for (size_t i = 0; i < nelem; i++)
283    {
284        union { float f; uint32_t x; } u;
285        u.f = *src++;
286        *dst++ = makebits(float_to_half_nobranch(u.x));
287#if 0
288        /* AltiVec code. Will work one day. */
289        float_to_half_vector(dst, src);
290        src += 8;
291        dst += 8;
292        i += 7;
293#endif
294    }
295
296    return nelem;
297}
298
299size_t half::convert(float *dst, half const *src, size_t nelem)
300{
301    for (size_t i = 0; i < nelem; i++)
302    {
303        union { float f; uint32_t x; } u;
304#if !defined __CELLOS_LV2__
305        /* This code is really too slow on the PS3, even with the denormal
306         * handling stripped off. */
307        u.x = half_to_float_nobranch((*src++).bits);
308#else
309        u.x = half_to_float_branch((*src++).bits);
310#endif
311        *dst++ = u.f;
312    }
313
314    return nelem;
315}
316
317} /* namespace lol */
318
Note: See TracBrowser for help on using the repository browser.