source: trunk/src/half.cpp @ 883

Last change on this file since 883 was 883, checked in by sam, 9 years ago

core: use <altivec.h> with ppu-gcc, <ppu_altivec_internals.h> with SNC.

  • Property svn:keywords set to Id
File size: 9.7 KB
Line 
1//
2// Lol Engine
3//
4// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
5//   This program is free software; you can redistribute it and/or
6//   modify it under the terms of the Do What The Fuck You Want To
7//   Public License, Version 2, as published by Sam Hocevar. See
8//   http://sam.zoy.org/projects/COPYING.WTFPL for more details.
9//
10
11#if defined HAVE_CONFIG_H
12#   include "config.h"
13#endif
14
15#if defined __CELLOS_LV2__
16#   if defined __SNC__
17#       include <ppu_altivec_internals.h>
18#   else
19#       include <altivec.h>
20#   endif
21#endif
22
23#include "core.h"
24
25using namespace std;
26
27namespace lol
28{
29
/* These macros implement a finite iterator useful to build lookup
 * tables. For instance, S64(0) will call S1(x) for all values of x
 * between 0 and 63. The caller is expected to #define S1(i) to the
 * desired per-entry expression before expanding one of these, and to
 * #undef it afterwards so the next table can redefine it.
 * Due to the exponential behaviour of the calls, the stress on the
 * compiler may be important. */
#define S4(x)    S1((x)),   S1((x)+1),     S1((x)+2),     S1((x)+3)
#define S16(x)   S4((x)),   S4((x)+4),     S4((x)+8),     S4((x)+12)
#define S64(x)   S16((x)),  S16((x)+16),   S16((x)+32),   S16((x)+48)
#define S256(x)  S64((x)),  S64((x)+64),   S64((x)+128),  S64((x)+192)
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
40
/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. No rounding is performed,
 * and some NaN values may be incorrectly converted to Inf. */
static inline uint16_t float_to_half_nobranch(uint32_t x)
{
    /* Sign + exponent part of the half, indexed by the float’s sign
     * bit and 8-bit exponent (9 bits total). The first 256 entries
     * cover positive floats; the second 256 are the same values with
     * the half sign bit (0x8000) set. Exponents below 103 flush to
     * zero, 103–112 produce denormal halves, 113–142 are normal, and
     * anything above saturates to Inf (0x7c00). */
    static uint16_t const basetable[512] =
    {
#define S1(i) (((i) < 103) ? 0x0000 : \
               ((i) < 113) ? 0x0400 >> (113 - (i)) : \
               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
        S256(0),
#undef S1
#define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
                         ((i) < 113) ? 0x0400 >> (113 - (i)) : \
                         ((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
        S256(0),
#undef S1
    };

    /* Right-shift to apply to the 23-bit float mantissa for the same
     * index: 24 discards the mantissa entirely (zero/underflow and
     * overflow cases), 13 keeps the top 10 bits for normal halves and
     * for NaN (i == 255, so NaN payload bits survive), and 126 - i
     * aligns the mantissa for denormal halves. */
    static uint8_t const shifttable[512] =
    {
#define S1(i) (((i) < 103) ? 24 : \
               ((i) < 113) ? 126 - (i) : \
               ((i) < 143 || (i) == 255) ? 13 : 24)
        S256(0), S256(0),
#undef S1
    };

    uint16_t bits = basetable[(x >> 23) & 0x1ff];
    bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
    return bits;
}
73
/* This method is faster than the OpenEXR implementation (very often
 * used, eg. in Ogre), with the additional benefit of rounding, inspired
 * by James Tursa’s half-precision code. Takes the raw IEEE 754 float
 * bit pattern and returns the corresponding half bit pattern. */
static inline uint16_t float_to_half_branch(uint32_t x)
{
    uint16_t sign = (x >> 16) & 0x8000; /* Sign bit, moved into place */
    uint16_t mant = (x >> 12) & 0x07ff; /* Mantissa plus one rounding bit */
    unsigned int exp = (x >> 23) & 0xff; /* Using int is faster here */

    /* Zero, float denormal, or an exponent that underflows past even
     * a half denormal: the result is a signed zero. */
    if (exp < 103)
        return sign;

    /* NaN stays NaN; Inf or exponent overflow becomes Inf. */
    if (exp > 142)
    {
        uint16_t ret = sign | 0x7c00u;
        /* A float NaN (exponent 0xff, nonzero mantissa) must keep at
         * least one mantissa bit set so it does not collapse to Inf. */
        if (exp == 255 && (x & 0x007fffffu))
            ret |= 1;
        return ret;
    }

    /* Mild exponent underflow: produce a denormal half. */
    if (exp < 113)
    {
        mant |= 0x0800u; /* Restore the implicit leading one */
        /* The rounding addition may carry into the exponent field,
         * yielding mantissa 0 and exponent 1, which is correct. */
        return sign | ((mant >> (114 - exp)) + ((mant >> (113 - exp)) & 1));
    }

    /* Normal case: rebias the exponent and round using the extra bit.
     * A carry out of the mantissa bumps the exponent, which is OK. */
    uint16_t ret = sign | ((exp - 112) << 10) | (mant >> 1);
    return ret + (mant & 1);
}
114
#if 0
/* AltiVec/VMX conversion of eight packed floats to eight packed halves.
 * Disabled (#if 0): see half::convert() — “Will work one day.”
 * NOTE(review): the bit manipulation below has no scalar reference in
 * this file; the comments here describe apparent intent — verify
 * against the scalar converters before enabling. */
static inline void float_to_half_vector(half *dst, float const *src)
{
    vector unsigned int const v7 = vec_splat_u32(7);
    vector unsigned short const v6 = vec_splat_u16(6);
#if _XBOX
    /* Xbox 360 variant uses shift/or instead of multiply-add below. */
    vector signed short const v9 = vec_splat_u16(9);
    vector unsigned short const v10 = vec_splat_u16(10);
#else
    vector signed short const v0x0040 = {
        0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040};
    vector unsigned short const v0x0400 = {
        0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400};
#endif
    /* Permute masks gathering the high/low 16-bit halves of each of the
     * eight 32-bit lanes across the two input vectors. */
    vector unsigned char const shuffle_high = {
        0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
    vector unsigned char const shuffle_low = {
        2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
    vector unsigned char const v0xbf70 = {
        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70,
        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70};

    vector unsigned short v_mant, v_ret;
    vector signed short v_exp;
    vector unsigned int in0 = (vector unsigned int)vec_ld(0, src);
    vector unsigned int in1 = (vector unsigned int)vec_ld(16, src);

    /* Shift each float right by 7 so sign/exponent land in the high
     * 16 bits and the top mantissa bits in the low 16 bits. */
    in0 = vec_sra(in0, v7);
    in1 = vec_sra(in1, v7);
    v_exp = (vector signed short)vec_perm(in0, in1, shuffle_high);
    v_mant = (vector unsigned short)vec_perm(in0, in1, shuffle_low);
    /* Saturated subtract presumably rebiases the exponent — confirm. */
    v_exp = (vector signed short)vec_subs((vector unsigned char)v_exp, v0xbf70);
#if _XBOX
    v_ret = (vector unsigned short)vec_or(v_exp, vec_sr(v_exp, v9));
#else
    v_ret = (vector unsigned short)vec_madds(v_exp, v0x0040, v_exp);
#endif
    v_mant = vec_sr(v_mant, v6);
#if _XBOX
    v_ret = vec_or(v_mant, vec_sl(v_ret, v10));
#else
    v_ret = vec_mladd(v_ret, v0x0400, v_mant);
#endif
    vec_st(v_ret, 0, (uint16_t *)dst);
}
#endif
161
/* Multiply-and-shift perfect hash used by both half_to_float
 * implementations to locate the highest set bit of a 10-bit mantissa.
 * The mantissa bits are first smeared downwards (so the value becomes
 * 2^k - 1), then multiplied by shiftmagic; the top 5 bits of the
 * product index shifttable, which yields the left shift that
 * normalises the denormal mantissa. Unused slots are 0. */
static int const shifttable[32] =
{
    23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0,
    15, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 17, 0, 18, 19, 0,
};
/* De Bruijn-style magic constant for the hash above. */
static uint32_t const shiftmagic = 0x07c4acddu;
168
/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. Tables are generated using
 * the C++ preprocessor, thanks to a branchless implementation also
 * used in half_to_float_branch(). This code is very fast when performing
 * conversions on arrays of values. */
static inline uint32_t half_to_float_nobranch(uint16_t x)
{
    /* Compile-time helpers: M3/M7/MF/MFF smear the highest set bit of
     * i into every lower position (giving 2^k - 1), and E(i) feeds the
     * result through the file-scope shifttable/shiftmagic hash to get
     * the shift that normalises a denormal mantissa i. */
#define M3(i) ((i) | ((i) >> 1))
#define M7(i) (M3(i) | (M3(i) >> 2))
#define MF(i) (M7(i) | (M7(i) >> 4))
#define MFF(i) (MF(i) | (MF(i) >> 8))
#define E(i) shifttable[(unsigned int)(MFF(i) * shiftmagic) >> 27]

    /* Float bits contributed by the 10-bit half mantissa: the first
     * 1024 entries handle a zero exponent field (zero and denormals),
     * the second 1024 handle normal numbers (mantissa widened by 13
     * bits on top of a rebased exponent of 0x38000000). */
    static uint32_t const mantissatable[2048] =
    {
#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i)))
        S1024(0),
#undef S1
#define S1(i) (0x38000000u + ((i) << 13))
        S1024(0),
#undef S1
    };

    /* Float sign + exponent contributed by the 6-bit half sign +
     * exponent field: 31 and 63 map to Inf/NaN exponents, 32 is the
     * negative-zero/denormal sign bit. */
    static uint32_t const exponenttable[64] =
    {
#define S1(i) (((i) == 0) ? 0 : \
               ((i) < 31) ? ((i) << 23) : \
               ((i) == 31) ? 0x47800000u : \
               ((i) == 32) ? 0x80000000u : \
               ((i) < 63) ? (0x80000000u + (((i) - 32) << 23)) : 0xc7800000)
        S64(0),
#undef S1
    };

    /* Selects the second half of mantissatable for normal numbers;
     * exponent fields 0 and 32 (zero/denormal) use the first half. */
    static int const offsettable[64] =
    {
#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024)
        S64(0),
#undef S1
    };

    return mantissatable[offsettable[x >> 10] + (x & 0x3ff)]
            + exponenttable[x >> 10];
}
213
214/* This algorithm is similar to the OpenEXR implementation, except it
215 * uses branchless code in the denormal path. This is slower than the
216 * table version, but will be more friendly to the cache for occasional
217 * uses. */
218static inline uint32_t half_to_float_branch(uint16_t x)
219{
220    uint32_t s = (x & 0x8000u) << 16;
221
222    if ((x & 0x7fffu) == 0)
223        return (uint32_t)x << 16;
224
225    uint32_t e = x & 0x7c00u;
226    uint32_t m = x & 0x03ffu;
227
228    if (e == 0)
229    {
230#if !defined __CELLOS_LV2__
231        uint32_t v = m | (m >> 1);
232        v |= v >> 2;
233        v |= v >> 4;
234        v |= v >> 8;
235
236        e = shifttable[(v * shiftmagic) >> 27];
237
238        /* We don't have to remove the 10th mantissa bit because it gets
239         * added to our underestimated exponent. */
240        return s | (((125 - e) << 23) + (m << e));
241#else
242        /* PS3 don't know bout my denormals */
243        return s;
244#endif
245    }
246
247    if (e == 0x7c00u)
248    {
249        /* The amd64 pipeline likes the if() better than a ternary operator
250         * or any other trick I could find. --sam */
251        if (m == 0)
252            return s | 0x7f800000u;
253        return s | 0x7fc00000u;
254    }
255
256    return s | (((e >> 10) + 112) << 23) | (m << 13);
257}
258
259/* Constructor from float. Uses the non-branching version because benchmarks
260 * indicate it is about 80% faster on amd64, and 20% faster on the PS3. The
261 * penalty of loading the lookup tables does not seem important. */
262half half::makefast(float f)
263{
264    union { float f; uint32_t x; } u = { f };
265    return makebits(float_to_half_nobranch(u.x));
266}
267
268/* Constructor from float with better precision. */
269half half::makeaccurate(float f)
270{
271    union { float f; uint32_t x; } u = { f };
272    return makebits(float_to_half_branch(u.x));
273}
274
275/* Cast to float. Uses the branching version because loading the tables
276 * for only one value is going to be cache-expensive. */
277float half::tofloat(half h)
278{
279    union { float f; uint32_t x; } u;
280    u.x = half_to_float_branch(h.bits);
281    return u.f;
282}
283
284size_t half::convert(half *dst, float const *src, size_t nelem)
285{
286    for (size_t i = 0; i < nelem; i++)
287    {
288        union { float f; uint32_t x; } u;
289        u.f = *src++;
290        *dst++ = makebits(float_to_half_nobranch(u.x));
291#if 0
292        /* AltiVec code. Will work one day. */
293        float_to_half_vector(dst, src);
294        src += 8;
295        dst += 8;
296        i += 7;
297#endif
298    }
299
300    return nelem;
301}
302
303size_t half::convert(float *dst, half const *src, size_t nelem)
304{
305    for (size_t i = 0; i < nelem; i++)
306    {
307        union { float f; uint32_t x; } u;
308#if !defined __CELLOS_LV2__
309        /* This code is really too slow on the PS3, even with the denormal
310         * handling stripped off. */
311        u.x = half_to_float_nobranch((*src++).bits);
312#else
313        u.x = half_to_float_branch((*src++).bits);
314#endif
315        *dst++ = u.f;
316    }
317
318    return nelem;
319}
320
321} /* namespace lol */
322
Note: See TracBrowser for help on using the repository browser.