source: trunk/src/half.cpp @ 879

Last change on this file since 879 was 879, checked in by sam, 9 years ago

core: tune the half precision code so that the best variants are being
used on the PS3 platform.

  • Property svn:keywords set to Id
File size: 8.4 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#include "core.h"
16
17using namespace std;
18
19namespace lol
20{
21
22/* These macros implement a finite iterator useful to build lookup
23 * tables. For instance, S64(0) will call S1(x) for all values of x
24 * between 0 and 63.
25 * Due to the exponential behaviour of the calls, the stress on the
26 * compiler may be important. */
27#define S4(x)    S1((x)),   S1((x)+1),     S1((x)+2),     S1((x)+3)
28#define S16(x)   S4((x)),   S4((x)+4),     S4((x)+8),     S4((x)+12)
29#define S64(x)   S16((x)),  S16((x)+16),   S16((x)+32),   S16((x)+48)
30#define S256(x)  S64((x)),  S64((x)+64),   S64((x)+128),  S64((x)+192)
31#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
32
33/* Lookup table-based algorithm from “Fast Half Float Conversions”
34 * by Jeroen van der Zijp, November 2008. No rounding is performed,
35 * and some NaN values may be incorrectly converted to Inf. */
36static inline uint16_t float_to_half_nobranch(uint32_t x)
37{
38    static uint16_t const basetable[512] =
39    {
40#define S1(i) (((i) < 103) ? 0x0000: \
41               ((i) < 113) ? 0x0400 >> (113 - (i)) : \
42               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
43        S256(0),
44#undef S1
45#define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
46                         ((i) < 113) ? 0x0400 >> (113 - (i)): \
47                         ((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
48        S256(0),
49#undef S1
50    };
51
52    static uint8_t const shifttable[512] =
53    {
54#define S1(i) (((i) < 103) ? 24 : \
55               ((i) < 113) ? 126 - (i) : \
56               ((i) < 143 || (i) == 255) ? 13 : 24)
57        S256(0), S256(0),
58#undef S1
59    };
60
61    uint16_t bits = basetable[(x >> 23) & 0x1ff];
62    bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
63    return bits;
64}
65
66/* This method is faster than the OpenEXR implementation (very often
67 * used, eg. in Ogre), with the additional benefit of rounding, inspired
68 * by James Tursa’s half-precision code. */
69static inline uint16_t float_to_half_branch(uint32_t x)
70{
71    uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
72    uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
73    unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
74
75    /* If zero, or denormal, or exponent underflows too much for a denormal,
76     * return signed zero. */
77#if !defined __CELLOS_LV2__
78    if (e < 103)
79        return bits;
80#else
81    /* PS3 don't know bout my denormals */
82    if (e < 113)
83        return bits;
84#endif
85
86    /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
87    if (e > 142)
88    {
89        bits |= 0x7c00u;
90        /* If exponent was 0xff and one mantissa bit was set, it means NaN,
91         * not Inf, so make sure we set one mantissa bit too. */
92        bits |= e == 255 && (x & 0x007fffffu);
93        return bits;
94    }
95
96#if !defined __CELLOS_LV2__
97    /* If exponent underflows but not too much, return a denormal */
98    if (e < 113)
99    {
100        m |= 0x0800u;
101        /* Extra rounding may overflow and set mantissa to 0 and exponent
102         * to 1, which is OK. */
103        bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
104        return bits;
105    }
106#endif
107
108    bits |= ((e - 112) << 10) | (m >> 1);
109    /* Extra rounding. An overflow will set mantissa to 0 and increment
110     * the exponent, which is OK. */
111    bits += m & 1;
112    return bits;
113}
114
115static int const shifttable[32] =
116{
117    23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0,
118    15, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 17, 0, 18, 19, 0,
119};
120static uint32_t const shiftmagic = 0x07c4acddu;
121
122/* Lookup table-based algorithm from “Fast Half Float Conversions”
123 * by Jeroen van der Zijp, November 2008. Tables are generated using
124 * the C++ preprocessor, thanks to a branchless implementation also
125 * used in half_to_float_branch(). This code is very fast when performing
126 * conversions on arrays of values. */
127static inline uint32_t half_to_float_nobranch(uint16_t x)
128{
129#define M3(i) ((i) | ((i) >> 1))
130#define M7(i) (M3(i) | (M3(i) >> 2))
131#define MF(i) (M7(i) | (M7(i) >> 4))
132#define MFF(i) (MF(i) | (MF(i) >> 8))
133#define E(i) shifttable[(unsigned int)(MFF(i) * shiftmagic) >> 27]
134
135    static uint32_t const mantissatable[2048] =
136    {
137#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i)))
138        S1024(0),
139#undef S1
140#define S1(i) (0x38000000u + ((i) << 13))
141        S1024(0),
142#undef S1
143    };
144
145    static uint32_t const exponenttable[64] =
146    {
147#define S1(i) (((i) == 0) ? 0 : \
148               ((i) < 31) ? ((i) << 23) : \
149               ((i) == 31) ? 0x47800000u : \
150               ((i) == 32) ? 0x80000000u : \
151               ((i) < 63) ? (0x80000000u + (((i) - 32) << 23)) : 0xc7800000)
152        S64(0),
153#undef S1
154    };
155
156    static int const offsettable[64] =
157    {
158#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024)
159        S64(0),
160#undef S1
161    };
162
163    return mantissatable[offsettable[x >> 10] + (x & 0x3ff)]
164            + exponenttable[x >> 10];
165}
166
167/* This algorithm is similar to the OpenEXR implementation, except it
168 * uses branchless code in the denormal path. This is slower than the
169 * table version, but will be more friendly to the cache for occasional
170 * uses. */
171static inline uint32_t half_to_float_branch(uint16_t x)
172{
173    uint32_t s = (x & 0x8000u) << 16;
174
175    if ((x & 0x7fffu) == 0)
176        return (uint32_t)x << 16;
177
178    uint32_t e = x & 0x7c00u;
179    uint32_t m = x & 0x03ffu;
180
181    if (e == 0)
182    {
183#if !defined __CELLOS_LV2__
184        uint32_t v = m | (m >> 1);
185        v |= v >> 2;
186        v |= v >> 4;
187        v |= v >> 8;
188
189        e = shifttable[(v * shiftmagic) >> 27];
190
191        /* We don't have to remove the 10th mantissa bit because it gets
192         * added to our underestimated exponent. */
193        return s | (((125 - e) << 23) + (m << e));
194#else
195        /* PS3 don't know bout my denormals */
196        return s;
197#endif
198    }
199
200    if (e == 0x7c00u)
201    {
202        /* The amd64 pipeline likes the if() better than a ternary operator
203         * or any other trick I could find. --sam */
204        if (m == 0)
205            return s | 0x7f800000u;
206        return s | 0x7fc00000u;
207    }
208
209    return s | (((e >> 10) + 112) << 23) | (m << 13);
210}
211
212/* Constructor from float. Uses the non-branching version because benchmarks
213 * indicate it is always twice as fast. The penalty of loading the lookup
214 * tables does not seem important. */
215half half::makefast(float f)
216{
217    union { float f; uint32_t x; } u = { f };
218#if !defined __CELLOS_LV2__
219    return makebits(float_to_half_nobranch(u.x));
220#else
221    /* This code is slightly faster on the PS3, mostly because we
222     * don't need to care about denormals. */
223    return makebits(float_to_half_branch(u.x));
224#endif
225}
226
227/* Constructor from float with better precision. */
228half half::makeaccurate(float f)
229{
230    union { float f; uint32_t x; } u = { f };
231    return makebits(float_to_half_branch(u.x));
232}
233
234/* Cast to float. Uses the branching version because loading the tables
235 * for only one value is going to be cache-expensive. */
236half::operator float() const
237{
238    /* FIXME: there is a hidden "this" in this method. Export more
239     * code so that it can all work in registers instead. */
240    union { float f; uint32_t x; } u;
241    u.x = half_to_float_branch(bits);
242    return u.f;
243}
244
245size_t half::convert(half *dst, float const *src, size_t nelem)
246{
247    for (size_t i = 0; i < nelem; i++)
248    {
249        union { float f; uint32_t x; } u;
250        u.f = *src++;
251#if !defined __CELLOS_LV2__
252        *dst++ = makebits(float_to_half_nobranch(u.x));
253#else
254        /* This code is slightly faster on the PS3, mostly because we
255         * don't need to care about denormals. */
256        *dst++ = makebits(float_to_half_branch(u.x));
257#endif
258    }
259
260    return nelem;
261}
262
263size_t half::convert(float *dst, half const *src, size_t nelem)
264{
265    for (size_t i = 0; i < nelem; i++)
266    {
267        union { float f; uint32_t x; } u;
268#if !defined __CELLOS_LV2__
269        /* This code is really too slow on the PS3, even with the denormal
270         * handling stripped off. */
271        u.x = half_to_float_nobranch((*src++).bits);
272#else
273        u.x = half_to_float_branch((*src++).bits);
274#endif
275        *dst++ = u.f;
276    }
277
278    return nelem;
279}
280
281} /* namespace lol */
282
Note: See TracBrowser for help on using the repository browser.