source: trunk/src/math/half.cpp @ 1186

Last change on this file since 1186 was 1186, checked in by sam, 8 years ago

math: fix VS2010 compiler warnings in the float/half conversion tables, and
replace the well known DeBruijn sequence with a custom magic value to spare
one binary operation.

  • Property svn:keywords set to Id
File size: 9.9 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2012 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#if defined __CELLOS_LV2__
16#   if defined __SNC__
17#       include <ppu_altivec_internals.h>
18#   else
19#       include <altivec.h>
20#   endif
21#endif
22
23#include "core.h"
24
25using namespace std;
26
27namespace lol
28{
29
/* These macros implement a finite iterator useful to build lookup
 * tables. For instance, S64(0) will call S1(x) for all values of x
 * between 0 and 63.
 * Due to the exponential behaviour of the calls, the stress on the
 * compiler may be important. */
#define S4(x)    S1((x)),   S1((x)+1),     S1((x)+2),     S1((x)+3)
#define S16(x)   S4((x)),   S4((x)+4),     S4((x)+8),     S4((x)+12)
#define S64(x)   S16((x)),  S16((x)+16),   S16((x)+32),   S16((x)+48)
#define S256(x)  S64((x)),  S64((x)+64),   S64((x)+128),  S64((x)+192)
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)

/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. No rounding is performed,
 * and some NaN values may be incorrectly converted to Inf (because
 * the lowest order bits in the mantissa are ignored). */
static inline uint16_t float_to_half_nobranch(uint32_t x)
{
    /* Indexed by the float’s sign bit and 8 exponent bits: yields the
     * half’s sign, exponent, and (for denormals or overflow) the base
     * bit pattern. The second 256 entries mirror the first with the
     * sign bit set. */
    static uint16_t const basetable[512] =
    {
#define S1(i) (((i) < 103) ? 0x0000 : \
               ((i) < 113) ? 0x0400 >> (0x1f & (113 - (i))) : \
               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
        S256(0),
#undef S1
#define S1(i) (0x8000 | basetable[i])
        S256(0),
#undef S1
    };

    /* Same index: how far to shift the 23-bit float mantissa down.
     * A shift of 24 discards the mantissa entirely (zeroes and values
     * flushed to zero); 13 keeps the top 10 bits for normal halves. */
    static uint8_t const shifttable[512] =
    {
#define S1(i) (((i) < 103) ? 24 : \
               ((i) < 113) ? 126 - (i) : \
               ((i) < 143 || (i) == 255) ? 13 : 24)
        S256(0), S256(0),
#undef S1
    };

    /* Sign + biased exponent select the table entries; the shifted
     * mantissa bits are then merged in. */
    uint32_t const evx = (x >> 23) & 0x1ff;
    return basetable[evx] | ((x & 0x007fffffu) >> shifttable[evx]);
}
72
/* Branching float→half conversion with rounding, inspired by James
 * Tursa’s half-precision code. This method is faster than the widely
 * used OpenEXR implementation (eg. the one found in Ogre). */
static inline uint16_t float_to_half_branch(uint32_t x)
{
    uint16_t const sign = (x >> 16) & 0x8000; /* sign bit, final position */
    uint16_t mant = (x >> 12) & 0x07ff; /* 10 mantissa bits + 1 round bit */
    unsigned int const exp = (x >> 23) & 0xff; /* int arithmetic is faster */

    /* Zero, float denormal, or an exponent too small even for a half
     * denormal: collapse to signed zero. */
    if (exp < 103)
        return sign;

    /* NaN stays NaN; Inf or exponent overflow becomes Inf. */
    if (exp > 142)
    {
        uint16_t ret = sign | 0x7c00u;
        /* An exponent of 0xff with any mantissa bit set means NaN, so
         * make sure at least one mantissa bit survives. */
        if (exp == 255 && (x & 0x007fffffu))
            ret |= 1;
        return ret;
    }

    /* Mild exponent underflow: produce a half denormal. */
    if (exp < 113)
    {
        mant |= 0x0800u; /* restore the implicit leading bit */
        /* The rounding addend may carry into the exponent field, which
         * turns mantissa 0 / exponent 1 — that is the correct result. */
        return sign | ((mant >> (114 - exp)) + ((mant >> (113 - exp)) & 1));
    }

    /* Normal case: rebias the exponent and keep 10 mantissa bits. */
    uint16_t ret = sign | (((exp - 112) << 10) | (mant >> 1));
    /* Rounding; a carry into the exponent field is again correct. */
    return ret + (mant & 1);
}
113
#if 0
/* Disabled AltiVec (PowerPC SIMD) float→half conversion: loads two
 * vectors (eight floats) from src and stores eight 16-bit half values
 * to dst in one go. NOTE(review): never compiled in; half::convert()
 * below keeps a dead reference to it. Mantissa bits are discarded by a
 * plain right shift (truncation, no rounding) and Inf/NaN handling is
 * not obviously present — verify before enabling. */
static inline void float_to_half_vector(half *dst, float const *src)
{
    /* Small shift constants fit vec_splat_*; the larger multiply/merge
     * constants must be spelled out on non-Xbox targets. */
    vector unsigned int const v7 = vec_splat_u32(7);
    vector unsigned short const v6 = vec_splat_u16(6);
#if _XBOX
    vector signed short const v9 = vec_splat_u16(9);
    vector unsigned short const v10 = vec_splat_u16(10);
#else
    vector signed short const v0x0040 = {
        0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040};
    vector unsigned short const v0x0400 = {
        0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400};
#endif
    /* Permute masks selecting the high (resp. low) 16 bits of each of
     * the eight 32-bit lanes spread across the two input vectors. */
    vector unsigned char const shuffle_high = {
        0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
    vector unsigned char const shuffle_low = {
        2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
    vector unsigned char const v0xbf70 = {
        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70,
        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70};

    vector unsigned short v_mant, v_ret;
    vector signed short v_exp;
    vector unsigned int in0 = (vector unsigned int)vec_ld(0, src);
    vector unsigned int in1 = (vector unsigned int)vec_ld(16, src);

    /* Arithmetic shift right by 7 lines sign+exponent up in the high
     * half-word of each lane, top mantissa bits in the low half-word. */
    in0 = vec_sra(in0, v7);
    in1 = vec_sra(in1, v7);
    v_exp = (vector signed short)vec_perm(in0, in1, shuffle_high);
    v_mant = (vector unsigned short)vec_perm(in0, in1, shuffle_low);
    /* Saturating byte subtract of the 0xbf70 pattern — presumably the
     * exponent rebias with underflow clamped to zero; confirm. */
    v_exp = (vector signed short)vec_subs((vector unsigned char)v_exp, v0xbf70);
#if _XBOX
    v_ret = (vector unsigned short)vec_or(v_exp, vec_sr(v_exp, v9));
#else
    /* Multiply-add stands in for the shift-and-or done on Xbox. */
    v_ret = (vector unsigned short)vec_madds(v_exp, v0x0040, v_exp);
#endif
    /* Drop the low 6 of the 16 mantissa bits kept per lane. */
    v_mant = vec_sr(v_mant, v6);
#if _XBOX
    v_ret = vec_or(v_mant, vec_sl(v_ret, v10));
#else
    v_ret = vec_mladd(v_ret, v0x0400, v_mant);
#endif
    vec_st(v_ret, 0, (uint16_t *)dst);
}
#endif
160
/* We use this magic table, inspired by De Bruijn sequences, to compute a
 * branchless integer log2. The actual value fetched is 24-log2(x+1) for x
 * in 1, 3, 7, f, 1f, 3f, 7f, ff, 1fe, 1ff, 3fc, 3fd, 3fe, 3ff.
 * Multiplying one of those x values by shiftmagic and keeping the top
 * 4 bits of the 32-bit product yields a distinct index for each, so a
 * 16-entry table suffices. The -1 entries correspond to indices that no
 * valid input can produce and are never read. Per the commit log, the
 * magic constant is a custom value (not a textbook De Bruijn sequence)
 * chosen to spare one binary operation. Shared by half_to_float_branch()
 * and the table generation in half_to_float_nobranch(). */
static int const shifttable[16] =
{
    23, 22, 21, 15, -1, 20, 18, 14, 14, 16, 19, -1, 17, -1, -1, -1,
};
static uint32_t const shiftmagic = 0x05a1a1a2u;
169
170/* Lookup table-based algorithm from “Fast Half Float Conversions”
171 * by Jeroen van der Zijp, November 2008. Tables are generated using
172 * the C++ preprocessor, thanks to a branchless implementation also
173 * used in half_to_float_branch(). This code is very fast when performing
174 * conversions on arrays of values. */
175static inline uint32_t half_to_float_nobranch(uint16_t x)
176{
177#define M3(i) ((i) | ((i) >> 1))
178#define M7(i) (M3(i) | (M3(i) >> 2))
179#define MF(i) (M7(i) | (M7(i) >> 4))
180#define E(i) shifttable[(uint32_t)((uint64_t)MF(i) * shiftmagic) >> 28]
181
182    static uint32_t const mantissatable[2048] =
183    {
184#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i)))
185        S1024(0),
186#undef S1
187#define S1(i) (0x38000000u + ((i) << 13))
188        S1024(0),
189#undef S1
190    };
191
192    static uint32_t const exponenttable[64] =
193    {
194#define S1(i) (((i) == 0) ? 0 : \
195               ((i) < 31) ? ((uint32_t)(i) << 23) : \
196               ((i) == 31) ? 0x47800000u : \
197               ((i) == 32) ? 0x80000000u : \
198               ((i) < 63) ? (0x80000000u | (((i) - 32) << 23)) : 0xc7800000)
199        S64(0),
200#undef S1
201    };
202
203    static int const offsettable[64] =
204    {
205#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024)
206        S64(0),
207#undef S1
208    };
209
210    return mantissatable[offsettable[x >> 10] + (x & 0x3ff)]
211            + exponenttable[x >> 10];
212}
213
214/* This algorithm is similar to the OpenEXR implementation, except it
215 * uses branchless code in the denormal path. This is slower than the
216 * table version, but will be more friendly to the cache for occasional
217 * uses. */
218static inline uint32_t half_to_float_branch(uint16_t x)
219{
220    uint32_t s = (x & 0x8000u) << 16;
221
222    if ((x & 0x7fffu) == 0)
223        return (uint32_t)x << 16;
224
225    uint32_t e = x & 0x7c00u;
226    uint32_t m = x & 0x03ffu;
227
228    if (e == 0)
229    {
230        /* m has 10 significant bits but replicating the leading bit to
231         * 8 positions instead of 16 works just as well because of our
232         * handcrafted shiftmagic table. */
233        uint32_t v = m | (m >> 1);
234        v |= v >> 2;
235        v |= v >> 4;
236
237        e = shifttable[(v * shiftmagic) >> 28];
238
239        /* We don't have to remove the 10th mantissa bit because it gets
240         * added to our underestimated exponent. */
241        return s | (((125 - e) << 23) + (m << e));
242    }
243
244    if (e == 0x7c00u)
245    {
246        /* The amd64 pipeline likes the if() better than a ternary operator
247         * or any other trick I could find. --sam */
248        if (m == 0)
249            return s | 0x7f800000u;
250        return s | 0x7fc00000u;
251    }
252
253    return s | (((e >> 10) + 112) << 23) | (m << 13);
254}
255
256/* Constructor from float. Uses the non-branching version because benchmarks
257 * indicate it is about 80% faster on amd64, and 20% faster on the PS3. The
258 * penalty of loading the lookup tables does not seem important. */
259half half::makefast(float f)
260{
261    union { float f; uint32_t x; } u = { f };
262    return makebits(float_to_half_nobranch(u.x));
263}
264
265/* Constructor from float with better precision. */
266half half::makeaccurate(float f)
267{
268    union { float f; uint32_t x; } u = { f };
269    return makebits(float_to_half_branch(u.x));
270}
271
272/* Cast to float. Uses the branching version because loading the tables
273 * for only one value is going to be cache-expensive. */
274float half::tofloat(half h)
275{
276    union { float f; uint32_t x; } u;
277    u.x = half_to_float_branch(h.bits);
278    return u.f;
279}
280
281size_t half::convert(half *dst, float const *src, size_t nelem)
282{
283    for (size_t i = 0; i < nelem; i++)
284    {
285        union { float f; uint32_t x; } u;
286        u.f = *src++;
287        *dst++ = makebits(float_to_half_nobranch(u.x));
288#if 0
289        /* AltiVec code. Will work one day. */
290        float_to_half_vector(dst, src);
291        src += 8;
292        dst += 8;
293        i += 7;
294#endif
295    }
296
297    return nelem;
298}
299
300size_t half::convert(float *dst, half const *src, size_t nelem)
301{
302    for (size_t i = 0; i < nelem; i++)
303    {
304        union { float f; uint32_t x; } u;
305#if !defined __CELLOS_LV2__
306        /* This code is really too slow on the PS3, even with the denormal
307         * handling stripped off. */
308        u.x = half_to_float_nobranch((*src++).bits);
309#else
310        u.x = half_to_float_branch((*src++).bits);
311#endif
312        *dst++ = u.f;
313    }
314
315    return nelem;
316}
317
318} /* namespace lol */
319
Note: See TracBrowser for help on using the repository browser.