source: trunk/src/half.cpp @ 872

Last change on this file since 872 was 872, checked in by sam, 10 years ago

core: minor refactoring in the float / half conversions to accomodate
for future array versions.

  • Property svn:keywords set to Id
File size: 6.4 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#include "core.h"
16
17using namespace std;
18
19namespace lol
20{
21
22/* These macros implement a finite iterator useful to build lookup
23 * tables. For instance, S64(0) will call S1(x) for all values of x
24 * between 0 and 63.
25 * Due to the exponential behaviour of the calls, the stress on the
26 * compiler may be important. */
27#define S4(x)    S1((x)),   S1((x)+1),     S1((x)+2),     S1((x)+3)
28#define S16(x)   S4((x)),   S4((x)+4),     S4((x)+8),     S4((x)+12)
29#define S64(x)   S16((x)),  S16((x)+16),   S16((x)+32),   S16((x)+48)
30#define S256(x)  S64((x)),  S64((x)+64),   S64((x)+128),  S64((x)+192)
31#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
32
33/* Lookup table-based algorithm from “Fast Half Float Conversions”
34 * by Jeroen van der Zijp, November 2008. No rounding is performed,
35 * and some NaN values may be incorrectly converted to Inf. */
36static inline uint16_t float_to_half_nobranch(uint32_t x)
37{
38    static uint16_t const basetable[512] =
39    {
40#define S1(i) (((i) < 103) ? 0x0000: \
41               ((i) < 113) ? 0x0400 >> (113 - (i)) : \
42               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
43        S256(0),
44#undef S1
45#define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
46                         ((i) < 113) ? 0x0400 >> (113 - (i)): \
47                         ((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
48        S256(0),
49#undef S1
50    };
51
52    static uint8_t const shifttable[512] =
53    {
54#define S1(i) (((i) < 103) ? 24 : \
55               ((i) < 113) ? 126 - (i) : \
56               ((i) < 143 || (i) == 255) ? 13 : 24)
57        S256(0), S256(0),
58#undef S1
59    };
60
61    uint16_t bits = basetable[(x >> 23) & 0x1ff];
62    bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
63    return bits;
64}
65
66/* This method is faster than the OpenEXR implementation (very often
67 * used, eg. in Ogre), with the additional benefit of rounding, inspired
68 * by James Tursa’s half-precision code. */
69static inline uint16_t float_to_half_branch(uint32_t x)
70{
71    uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
72    uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
73    unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
74
75    /* If zero, or denormal, or exponent underflows too much for a denormal,
76     * return signed zero. */
77    if (e < 103)
78        return bits;
79
80    /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
81    if (e > 142)
82    {
83        bits |= 0x7c00u;
84        /* If exponent was 0xff and one mantissa bit was set, it means NaN,
85         * not Inf, so make sure we set one mantissa bit too. */
86        bits |= e == 255 && (x & 0x007fffffu);
87        return bits;
88    }
89
90    /* If exponent underflows but not too much, return a denormal */
91    if (e < 113)
92    {
93        m |= 0x0800u;
94        /* Extra rounding may overflow and set mantissa to 0 and exponent
95         * to 1, which is OK. */
96        bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
97        return bits;
98    }
99
100    bits |= ((e - 112) << 10) | (m >> 1);
101    /* Extra rounding. An overflow will set mantissa to 0 and increment
102     * the exponent, which is OK. */
103    bits += m & 1;
104    return bits;
105}
106
107static int const shifttable[32] =
108{
109    23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0,
110    15, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 17, 0, 18, 19, 0,
111};
112static uint32_t const shiftmagic = 0x07c4acddu;
113
114/* Lookup table-based algorithm from “Fast Half Float Conversions”
115 * by Jeroen van der Zijp, November 2008. Tables are generated using
116 * the C++ preprocessor, thanks to a branchless implementation also
117 * used in half_to_float_branch(). This code is actually almost always
118 * slower than the branching one. */
119static inline uint32_t half_to_float_nobranch(uint16_t x)
120{
121#define M3(i) ((i) | ((i) >> 1))
122#define M7(i) (M3(i) | (M3(i) >> 2))
123#define MF(i) (M7(i) | (M7(i) >> 4))
124#define MFF(i) (MF(i) | (MF(i) >> 8))
125#define E(i) shifttable[(unsigned int)(MFF(i) * shiftmagic) >> 27]
126
127    static uint32_t const mantissatable[2048] =
128    {
129#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i)))
130        S1024(0),
131#undef S1
132#define S1(i) (0x38000000u + ((i) << 13))
133        S1024(0),
134#undef S1
135    };
136
137    static uint32_t const exponenttable[64] =
138    {
139#define S1(i) (((i) == 0) ? 0 : \
140               ((i) < 31) ? ((i) << 23) : \
141               ((i) == 31) ? 0x47800000u : \
142               ((i) == 32) ? 0x80000000u : \
143               ((i) < 63) ? (0x80000000u + (((i) - 32) << 23)) : 0xc7800000)
144        S64(0),
145#undef S1
146    };
147
148    static int const offsettable[64] =
149    {
150#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024)
151        S64(0),
152#undef S1
153    };
154
155    return mantissatable[offsettable[x >> 10] + (x & 0x3ff)]
156            + exponenttable[x >> 10];
157}
158
159/* This algorithm is similar to the OpenEXR implementation, except it
160 * uses branchless code in the denormal path. */
161static inline uint32_t half_to_float_branch(uint16_t x)
162{
163    uint32_t s = (x & 0x8000u) << 16;
164
165    if ((x & 0x7fffu) == 0)
166        return (uint32_t)x << 16;
167
168    uint32_t e = x & 0x7c00u;
169    uint32_t m = x & 0x03ffu;
170
171    if (e == 0)
172    {
173        uint32_t v = m | (m >> 1);
174        v |= v >> 2;
175        v |= v >> 4;
176        v |= v >> 8;
177
178        e = shifttable[(v * shiftmagic) >> 27];
179
180        /* We don't have to remove the 10th mantissa bit because it gets
181         * added to our underestimated exponent. */
182        return s | (((125 - e) << 23) + (m << e));
183    }
184
185    if (e == 0x7c00u)
186    {
187        /* The amd64 pipeline likes the if() better than a ternary operator
188         * or any other trick I could find. --sam */
189        if (m == 0)
190            return s | 0x7f800000u;
191        return s | 0x7fc00000u;
192    }
193
194    return s | (((e >> 10) + 112) << 23) | (m << 13);
195}
196
197half half::makefast(float f)
198{
199    union { float f; uint32_t x; } u = { f };
200    return makebits(float_to_half_nobranch(u.x));
201}
202
203half half::makeslow(float f)
204{
205    union { float f; uint32_t x; } u = { f };
206    return makebits(float_to_half_branch(u.x));
207}
208
209half::operator float() const
210{
211    union { float f; uint32_t x; } u;
212    u.x = half_to_float_branch(bits);
213    return u.f;
214}
215
216} /* namespace lol */
217
Note: See TracBrowser for help on using the repository browser.