source: trunk/src/half.cpp @ 926

Last change on this file since 926 was 926, checked in by sam, 9 years ago

core: handle denormal halfs properly on the PS3; it's denormal floats that
we do not care about.

  • Property svn:keywords set to Id
File size: 9.6 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#if defined __CELLOS_LV2__
16#   if defined __SNC__
17#       include <ppu_altivec_internals.h>
18#   else
19#       include <altivec.h>
20#   endif
21#endif
22
23#include "core.h"
24
25using namespace std;
26
27namespace lol
28{
29
/* These macros implement a finite iterator useful to build lookup
 * tables. For instance, S64(0) will call S1(x) for all values of x
 * between 0 and 63.
 * Due to the exponential behaviour of the calls, the stress on the
 * compiler may be important. */
/* Usage pattern (see the tables below): #define S1(i) to the expression
 * for one table entry, expand S256(0) or S1024(0) inside an aggregate
 * initialiser, then #undef S1 so it can be redefined for the next
 * table. */
#define S4(x)    S1((x)),   S1((x)+1),     S1((x)+2),     S1((x)+3)
#define S16(x)   S4((x)),   S4((x)+4),     S4((x)+8),     S4((x)+12)
#define S64(x)   S16((x)),  S16((x)+16),   S16((x)+32),   S16((x)+48)
#define S256(x)  S64((x)),  S64((x)+64),   S64((x)+128),  S64((x)+192)
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
40
/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. No rounding is performed,
 * and some NaN values may be incorrectly converted to Inf. */
static inline uint16_t float_to_half_nobranch(uint32_t x)
{
    /* Indexed by the float's nine sign + exponent bits. Entries 0-255
     * are positive, entries 256-511 carry the half sign bit (0x8000).
     * Per biased float exponent e: below 103 the value underflows to
     * zero; 103-112 yields a half denormal, encoded here as the
     * implicit leading one already shifted to its final position;
     * 113-142 is a rebiased normal half exponent; 143 and above
     * saturates to infinity (0x7c00). */
    static uint16_t const basetable[512] =
    {
#define S1(i) (((i) < 103) ? 0x0000 : \
               ((i) < 113) ? 0x0400 >> (113 - (i)) : \
               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
        S256(0),
#undef S1
#define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
                         ((i) < 113) ? 0x0400 >> (113 - (i)) : \
                         ((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
        S256(0),
#undef S1
    };

    /* Right shift applied to the 23-bit float mantissa: 13 for normal
     * values (also for exponent 255, so NaN payload bits survive and
     * keep the result a NaN), 126-e for denormals, and 24 — shifting
     * everything out — when the result is a signed zero. The 256
     * entries are repeated so both sign halves index correctly. */
    static uint8_t const shifttable[512] =
    {
#define S1(i) (((i) < 103) ? 24 : \
               ((i) < 113) ? 126 - (i) : \
               ((i) < 143 || (i) == 255) ? 13 : 24)
        S256(0), S256(0),
#undef S1
    };

    /* Sign and exponent come from the table; the mantissa bits are
     * shifted into place afterwards. */
    uint16_t bits = basetable[(x >> 23) & 0x1ff];
    bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
    return bits;
}
73
/* This method is faster than the OpenEXR implementation (very often
 * used, eg. in Ogre), and it additionally performs rounding, inspired
 * by James Tursa's half-precision code. */
static inline uint16_t float_to_half_branch(uint32_t x)
{
    uint16_t const sign = (x >> 16) & 0x8000; /* Half sign bit */
    /* Eleven mantissa bits: the half's ten plus one rounding bit */
    uint16_t const mant = (x >> 12) & 0x07ff;
    unsigned int const exp = (x >> 23) & 0xff; /* Biased float exponent */

    /* Zero, a float denormal, or an exponent that underflows below the
     * smallest half denormal: the result is a signed zero. */
    if (exp < 103)
        return sign;

    /* NaN, Inf, or an overflow past the largest finite half: return
     * Inf, except that a float NaN (exponent 0xff with a non-zero
     * mantissa) must keep a mantissa bit set so it stays a NaN. */
    if (exp > 142)
    {
        uint16_t ret = sign | 0x7c00u;
        if (exp == 255 && (x & 0x007fffffu))
            ret |= 1;
        return ret;
    }

    /* The exponent underflows, but not too much: the value fits in a
     * half denormal. Make the implicit leading one explicit, shift the
     * mantissa into place and round to nearest. A rounding carry may
     * yield exponent 1 with mantissa 0, which is the correct
     * successor value. */
    if (exp < 113)
    {
        uint16_t const full = mant | 0x0800u;
        return sign | ((full >> (114 - exp)) + ((full >> (113 - exp)) & 1));
    }

    /* Normal case: rebias the exponent and round the mantissa. A carry
     * out of the mantissa bumps the exponent, which is again correct. */
    uint16_t const ret = sign | ((exp - 112) << 10) | (mant >> 1);
    return ret + (mant & 1);
}
114
#if 0
/* Disabled AltiVec/VMX batch conversion: reads eight floats from src
 * (two 16-byte loads) and writes eight halves to dst (one 16-byte
 * store). NOTE(review): kept under #if 0 — see half::convert(), where
 * it is marked "will work one day". */
static inline void float_to_half_vector(half *dst, float const *src)
{
    vector unsigned int const v7 = vec_splat_u32(7);
    vector unsigned short const v6 = vec_splat_u16(6);
#if _XBOX
    /* NOTE(review): vec_splat_u16 produces an unsigned vector assigned
     * to a signed one here — presumably relies on implicit conversion;
     * verify on an Xbox toolchain. */
    vector signed short const v9 = vec_splat_u16(9);
    vector unsigned short const v10 = vec_splat_u16(10);
#else
    vector signed short const v0x0040 = {
        0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040};
    vector unsigned short const v0x0400 = {
        0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400};
#endif
    /* Gather the high (sign/exponent side) and low (mantissa side)
     * 16-bit halves of the eight 32-bit lanes into separate vectors. */
    vector unsigned char const shuffle_high = {
        0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
    vector unsigned char const shuffle_low = {
        2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
    /* Per-byte constant used with vec_subs below; presumably the
     * exponent rebias — TODO(review): confirm against the scalar
     * float_to_half_nobranch() tables before enabling this path. */
    vector unsigned char const v0xbf70 = {
        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70,
        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70};

    vector unsigned short v_mant, v_ret;
    vector signed short v_exp;
    /* Two aligned 16-byte loads: eight floats in total. */
    vector unsigned int in0 = (vector unsigned int)vec_ld(0, src);
    vector unsigned int in1 = (vector unsigned int)vec_ld(16, src);

    in0 = vec_sra(in0, v7);
    in1 = vec_sra(in1, v7);
    v_exp = (vector signed short)vec_perm(in0, in1, shuffle_high);
    v_mant = (vector unsigned short)vec_perm(in0, in1, shuffle_low);
    v_exp = (vector signed short)vec_subs((vector unsigned char)v_exp, v0xbf70);
#if _XBOX
    v_ret = (vector unsigned short)vec_or(v_exp, vec_sr(v_exp, v9));
#else
    v_ret = (vector unsigned short)vec_madds(v_exp, v0x0040, v_exp);
#endif
    v_mant = vec_sr(v_mant, v6);
#if _XBOX
    v_ret = vec_or(v_mant, vec_sl(v_ret, v10));
#else
    v_ret = vec_mladd(v_ret, v0x0400, v_mant);
#endif
    /* One aligned 16-byte store: eight halves. */
    vec_st(v_ret, 0, (uint16_t *)dst);
}
#endif
161
/* Support data for the branchless denormal renormalisation used by
 * both half-to-float paths: after a non-zero 10-bit mantissa has had
 * its leading bit smeared across all lower bits, multiplying by
 * shiftmagic and keeping the top five bits yields a distinct index for
 * each leading-bit position (a de Bruijn-style multiplicative hash —
 * see the usage in half_to_float_branch()). shifttable maps that index
 * to the left shift that renormalises the mantissa; the zero entries
 * are slots the hash never produces for valid inputs. */
static int const shifttable[32] =
{
    23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0,
    15, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 17, 0, 18, 19, 0,
};
static uint32_t const shiftmagic = 0x07c4acddu;
168
/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. Tables are generated using
 * the C++ preprocessor, thanks to a branchless implementation also
 * used in half_to_float_branch(). This code is very fast when performing
 * conversions on arrays of values. */
static inline uint32_t half_to_float_nobranch(uint16_t x)
{
/* M3..MFF smear the highest set bit of i across every bit below it;
 * E(i) then recovers the leading-bit position through the
 * shifttable/shiftmagic hash, exactly like the runtime denormal path
 * in half_to_float_branch(). */
#define M3(i) ((i) | ((i) >> 1))
#define M7(i) (M3(i) | (M3(i) >> 2))
#define MF(i) (M7(i) | (M7(i) >> 4))
#define MFF(i) (MF(i) | (MF(i) >> 8))
#define E(i) shifttable[(unsigned int)(MFF(i) * shiftmagic) >> 27]

    /* Entries 0-1023: denormal half mantissas, renormalised with the
     * exponent correction (125 - shift) folded in; entry 0 maps to 0.
     * Entries 1024-2047: normal mantissas shifted into the float
     * mantissa field on top of a 0x38000000 exponent bias term. */
    static uint32_t const mantissatable[2048] =
    {
#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i)))
        S1024(0),
#undef S1
#define S1(i) (0x38000000u + ((i) << 13))
        S1024(0),
#undef S1
    };

    /* Indexed by the half's six sign + exponent bits: the float sign
     * and rebiased exponent contribution. Exponent field 31 maps to
     * 0x47800000 (and 63 to its negative counterpart 0xc7800000),
     * which added to a normal mantissatable entry lands in the float
     * Inf/NaN range. */
    static uint32_t const exponenttable[64] =
    {
#define S1(i) (((i) == 0) ? 0 : \
               ((i) < 31) ? ((i) << 23) : \
               ((i) == 31) ? 0x47800000u : \
               ((i) == 32) ? 0x80000000u : \
               ((i) < 63) ? (0x80000000u + (((i) - 32) << 23)) : 0xc7800000)
        S64(0),
#undef S1
    };

    /* Selects the denormal (0) or normal (1024) bank of mantissatable;
     * exponent fields 0 and 32 (signed zero and denormals) use the
     * denormal bank. */
    static int const offsettable[64] =
    {
#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024)
        S64(0),
#undef S1
    };

    return mantissatable[offsettable[x >> 10] + (x & 0x3ff)]
            + exponenttable[x >> 10];
}
213
/* This algorithm is similar to the OpenEXR implementation, except it
 * uses branchless code in the denormal path. This is slower than the
 * table version, but will be more friendly to the cache for occasional
 * uses. */
static inline uint32_t half_to_float_branch(uint16_t x)
{
    uint32_t const sign = (uint32_t)(x & 0x8000u) << 16;

    /* Signed zero: moving the sign bit to the top yields ±0.0f. */
    if (!(x & 0x7fffu))
        return (uint32_t)x << 16;

    uint32_t const exp = x & 0x7c00u;
    uint32_t const mant = x & 0x03ffu;

    if (exp == 0x7c00u)
    {
        /* A NaN keeps a quiet mantissa bit set; Inf maps to float Inf.
         * The amd64 pipeline likes the if() better than a ternary
         * operator or any other trick. */
        if (mant != 0)
            return sign | 0x7fc00000u;
        return sign | 0x7f800000u;
    }

    if (exp == 0)
    {
        /* Denormal half. Locate the leading mantissa bit branchlessly:
         * smear it over every lower bit, then use a multiplicative
         * (de Bruijn-style) hash to look up the renormalising shift. */
        static int const renormshift[32] =
        {
            23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0,
            15, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 17, 0, 18, 19, 0,
        };
        uint32_t smear = mant | (mant >> 1);
        smear |= smear >> 2;
        smear |= smear >> 4;
        smear |= smear >> 8;
        uint32_t const shift = renormshift[(smear * 0x07c4acddu) >> 27];

        /* The leading mantissa bit we shift up lands in the exponent
         * field, fixing the underestimated exponent for us. */
        return sign | (((125 - shift) << 23) + (mant << shift));
    }

    /* Normal number: rebias the exponent, widen the mantissa. */
    return sign | (((exp >> 10) + 112) << 23) | (mant << 13);
}
253
254/* Constructor from float. Uses the non-branching version because benchmarks
255 * indicate it is about 80% faster on amd64, and 20% faster on the PS3. The
256 * penalty of loading the lookup tables does not seem important. */
257half half::makefast(float f)
258{
259    union { float f; uint32_t x; } u = { f };
260    return makebits(float_to_half_nobranch(u.x));
261}
262
263/* Constructor from float with better precision. */
264half half::makeaccurate(float f)
265{
266    union { float f; uint32_t x; } u = { f };
267    return makebits(float_to_half_branch(u.x));
268}
269
270/* Cast to float. Uses the branching version because loading the tables
271 * for only one value is going to be cache-expensive. */
272float half::tofloat(half h)
273{
274    union { float f; uint32_t x; } u;
275    u.x = half_to_float_branch(h.bits);
276    return u.f;
277}
278
279size_t half::convert(half *dst, float const *src, size_t nelem)
280{
281    for (size_t i = 0; i < nelem; i++)
282    {
283        union { float f; uint32_t x; } u;
284        u.f = *src++;
285        *dst++ = makebits(float_to_half_nobranch(u.x));
286#if 0
287        /* AltiVec code. Will work one day. */
288        float_to_half_vector(dst, src);
289        src += 8;
290        dst += 8;
291        i += 7;
292#endif
293    }
294
295    return nelem;
296}
297
298size_t half::convert(float *dst, half const *src, size_t nelem)
299{
300    for (size_t i = 0; i < nelem; i++)
301    {
302        union { float f; uint32_t x; } u;
303#if !defined __CELLOS_LV2__
304        /* This code is really too slow on the PS3, even with the denormal
305         * handling stripped off. */
306        u.x = half_to_float_nobranch((*src++).bits);
307#else
308        u.x = half_to_float_branch((*src++).bits);
309#endif
310        *dst++ = u.f;
311    }
312
313    return nelem;
314}
315
316} /* namespace lol */
317
Note: See TracBrowser for help on using the repository browser.