Changeset 872


Ignore:
Timestamp:
Aug 29, 2011, 2:07:54 AM (8 years ago)
Author:
sam
Message:

core: minor refactoring in the float / half conversions to accomodate
for future array versions.

Location:
trunk
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/half.cpp

    r869 r872  
    2020{
    2121
     22/* These macros implement a finite iterator useful to build lookup
     23 * tables. For instance, S64(0) will call S1(x) for all values of x
     24 * between 0 and 63.
     25 * Due to the exponential behaviour of the calls, the stress on the
     26 * compiler may be important. */
     27#define S4(x)    S1((x)),   S1((x)+1),     S1((x)+2),     S1((x)+3)
     28#define S16(x)   S4((x)),   S4((x)+4),     S4((x)+8),     S4((x)+12)
     29#define S64(x)   S16((x)),  S16((x)+16),   S16((x)+32),   S16((x)+48)
     30#define S256(x)  S64((x)),  S64((x)+64),   S64((x)+128),  S64((x)+192)
     31#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
     32
    2233/* Lookup table-based algorithm from “Fast Half Float Conversions”
    2334 * by Jeroen van der Zijp, November 2008. No rounding is performed,
    2435 * and some NaN values may be incorrectly converted to Inf. */
    25 half half::makefast(float f)
    26 {
    27 #define S4(x)    S1(4*(x)),  S1(4*(x)+1),  S1(4*(x)+2),  S1(4*(x)+3)
    28 #define S16(x)   S4(4*(x)),  S4(4*(x)+1),  S4(4*(x)+2),  S4(4*(x)+3)
    29 #define S64(x)  S16(4*(x)), S16(4*(x)+1), S16(4*(x)+2), S16(4*(x)+3)
    30 #define S256(x) S64(4*(x)), S64(4*(x)+1), S64(4*(x)+2), S64(4*(x)+3)
    31 
     36static inline uint16_t float_to_half_nobranch(uint32_t x)
     37{
    3238    static uint16_t const basetable[512] =
    3339    {
     
    5359    };
    5460
    55     union { float f; uint32_t x; } u = { f };
    56 
    57     uint16_t bits = basetable[(u.x >> 23) & 0x1ff];
    58     bits |= (u.x & 0x007fffff) >> shifttable[(u.x >> 23) & 0x1ff];
    59     return makebits(bits);
     61    uint16_t bits = basetable[(x >> 23) & 0x1ff];
     62    bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
     63    return bits;
    6064}
    6165
     
    6367 * used, eg. in Ogre), with the additional benefit of rounding, inspired
    6468 * by James Tursa’s half-precision code. */
    65 half half::makeslow(float f)
    66 {
    67     union { float f; uint32_t x; } u = { f };
    68 
    69     uint16_t bits = (u.x >> 16) & 0x8000; /* Get the sign */
    70     uint16_t m = (u.x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
    71     unsigned int e = (u.x >> 23) & 0xff; /* Using int is faster here */
     69static inline uint16_t float_to_half_branch(uint32_t x)
     70{
     71    uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
     72    uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
     73    unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
    7274
    7375    /* If zero, or denormal, or exponent underflows too much for a denormal,
    7476     * return signed zero. */
    7577    if (e < 103)
    76         return makebits(bits);
     78        return bits;
    7779
    7880    /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
     
    8284        /* If exponent was 0xff and one mantissa bit was set, it means NaN,
    8385         * not Inf, so make sure we set one mantissa bit too. */
    84         bits |= e == 255 && (u.x & 0x007fffffu);
    85         return makebits(bits);
     86        bits |= e == 255 && (x & 0x007fffffu);
     87        return bits;
    8688    }
    8789
     
    9395         * to 1, which is OK. */
    9496        bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
    95         return makebits(bits);
     97        return bits;
    9698    }
    9799
     
    100102     * the exponent, which is OK. */
    101103    bits += m & 1;
    102     return makebits(bits);
    103 }
    104 
    105 half::operator float() const
    106 {
    107     union { float f; uint32_t x; } u;
    108 
    109     uint32_t s = (m_bits & 0x8000u) << 16;
    110 
    111     if ((m_bits & 0x7fffu) == 0)
    112     {
    113         u.x = (uint32_t)m_bits << 16;
    114         return u.f;
    115     }
    116 
    117     uint32_t e = m_bits & 0x7c00u;
    118     uint32_t m = m_bits & 0x03ffu;
     104    return bits;
     105}
     106
     107static int const shifttable[32] =
     108{
     109    23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0,
     110    15, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 17, 0, 18, 19, 0,
     111};
     112static uint32_t const shiftmagic = 0x07c4acddu;
     113
     114/* Lookup table-based algorithm from “Fast Half Float Conversions”
     115 * by Jeroen van der Zijp, November 2008. Tables are generated using
     116 * the C++ preprocessor, thanks to a branchless implementation also
     117 * used in half_to_float_branch(). This code is actually almost always
     118 * slower than the branching one. */
     119static inline uint32_t half_to_float_nobranch(uint16_t x)
     120{
     121#define M3(i) ((i) | ((i) >> 1))
     122#define M7(i) (M3(i) | (M3(i) >> 2))
     123#define MF(i) (M7(i) | (M7(i) >> 4))
     124#define MFF(i) (MF(i) | (MF(i) >> 8))
     125#define E(i) shifttable[(unsigned int)(MFF(i) * shiftmagic) >> 27]
     126
     127    static uint32_t const mantissatable[2048] =
     128    {
     129#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i)))
     130        S1024(0),
     131#undef S1
     132#define S1(i) (0x38000000u + ((i) << 13))
     133        S1024(0),
     134#undef S1
     135    };
     136
     137    static uint32_t const exponenttable[64] =
     138    {
     139#define S1(i) (((i) == 0) ? 0 : \
     140               ((i) < 31) ? ((i) << 23) : \
     141               ((i) == 31) ? 0x47800000u : \
     142               ((i) == 32) ? 0x80000000u : \
     143               ((i) < 63) ? (0x80000000u + (((i) - 32) << 23)) : 0xc7800000)
     144        S64(0),
     145#undef S1
     146    };
     147
     148    static int const offsettable[64] =
     149    {
     150#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024)
     151        S64(0),
     152#undef S1
     153    };
     154
     155    return mantissatable[offsettable[x >> 10] + (x & 0x3ff)]
     156            + exponenttable[x >> 10];
     157}
     158
     159/* This algorithm is similar to the OpenEXR implementation, except it
     160 * uses branchless code in the denormal path. */
     161static inline uint32_t half_to_float_branch(uint16_t x)
     162{
     163    uint32_t s = (x & 0x8000u) << 16;
     164
     165    if ((x & 0x7fffu) == 0)
     166        return (uint32_t)x << 16;
     167
     168    uint32_t e = x & 0x7c00u;
     169    uint32_t m = x & 0x03ffu;
    119170
    120171    if (e == 0)
    121172    {
    122         static int const shifttable[32] =
    123         {
    124             10, 1, 9, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 7, 0,
    125             2, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 0, 5, 6, 0,
    126         };
    127 
    128173        uint32_t v = m | (m >> 1);
    129174        v |= v >> 2;
     
    131176        v |= v >> 8;
    132177
    133         e = shifttable[(v * 0x07C4ACDDU) >> 27];
    134         m <<= e;
     178        e = shifttable[(v * shiftmagic) >> 27];
    135179
    136180        /* We don't have to remove the 10th mantissa bit because it gets
    137181         * added to our underestimated exponent. */
    138         u.x = s | (((112 - e) << 23) + (m << 13));
    139         return u.f;
     182        return s | (((125 - e) << 23) + (m << e));
    140183    }
    141184
     
    145188         * or any other trick I could find. --sam */
    146189        if (m == 0)
    147             u.x = s | 0x7f800000u;
    148         else
    149             u.x = s | 0x7fc00000u;
    150 
    151         return u.f;
    152     }
    153 
    154     u.x = s | (((e >> 10) + 112) << 23) | (m << 13);
    155 
     190            return s | 0x7f800000u;
     191        return s | 0x7fc00000u;
     192    }
     193
     194    return s | (((e >> 10) + 112) << 23) | (m << 13);
     195}
     196
     197half half::makefast(float f)
     198{
     199    union { float f; uint32_t x; } u = { f };
     200    return makebits(float_to_half_nobranch(u.x));
     201}
     202
     203half half::makeslow(float f)
     204{
     205    union { float f; uint32_t x; } u = { f };
     206    return makebits(float_to_half_branch(u.x));
     207}
     208
     209half::operator float() const
     210{
     211    union { float f; uint32_t x; } u;
     212    u.x = half_to_float_branch(bits);
    156213    return u.f;
    157214}
  • trunk/src/half.h

    r871 r872  
    2424class half
    2525{
    26 private:
    27     uint16_t m_bits;
    28 
    2926public:
    3027    inline half() { }
     
    3734    inline int isnan() const
    3835    {
    39         return ((m_bits & 0x7c00u) == 0x7c00u) && (m_bits & 0x03ffu);
     36        return ((bits & 0x7c00u) == 0x7c00u) && (bits & 0x03ffu);
    4037    }
    4138
    4239    inline int isfinite() const
    4340    {
    44         return (m_bits & 0x7c00u) != 0x7c00u;
     41        return (bits & 0x7c00u) != 0x7c00u;
    4542    }
    4643
    4744    inline int isinf() const
    4845    {
    49         return (uint16_t)(m_bits << 1) == (0x7c00u << 1);
     46        return (uint16_t)(bits << 1) == (0x7c00u << 1);
    5047    }
    5148
    5249    inline int isnormal() const
    5350    {
    54         return (isfinite() && (m_bits & 0x7c00u)) || ((m_bits & 0x7fffu) == 0);
    55     }
    56 
    57     inline uint16_t bits()
    58     {
    59         return m_bits;
     51        return (isfinite() && (bits & 0x7c00u)) || ((bits & 0x7fffu) == 0);
    6052    }
    6153
     
    6557
    6658    /* Operations */
    67     inline half operator -() { return makebits(m_bits ^ 0x8000u); }
     59    inline half operator -() { return makebits(bits ^ 0x8000u); }
    6860    inline half &operator +=(float f) { return (*this = (half)(*this + f)); }
    6961    inline half &operator -=(float f) { return (*this = (half)(*this - f)); }
     
    9082    {
    9183        half ret;
    92         ret.m_bits = x;
     84        ret.bits = x;
    9385        return ret;
    9486    }
     87
     88    /* Internal representation */
     89    uint16_t bits;
    9590};
    9691
  • trunk/test/half.cpp

    r871 r872  
    5555            half a = half::makebits(i);
    5656            uint16_t b = i;
    57             CPPUNIT_ASSERT_EQUAL(a.bits(), b);
     57            CPPUNIT_ASSERT_EQUAL(a.bits, b);
    5858        }
    5959    }
     
    6565            half a = half::makeslow(pairs[i].f);
    6666            uint16_t b = pairs[i].x;
    67             CPPUNIT_ASSERT_EQUAL(a.bits(), b);
     67            CPPUNIT_ASSERT_EQUAL(a.bits, b);
    6868        }
    6969    }
     
    7575            half a = half::makefast(pairs[i].f);
    7676            uint16_t b = pairs[i].x;
    77             CPPUNIT_ASSERT_EQUAL(a.bits(), b);
     77            CPPUNIT_ASSERT_EQUAL(a.bits, b);
    7878        }
    7979    }
     
    191191            {
    192192                CPPUNIT_ASSERT(!isnan(f));
    193                 CPPUNIT_ASSERT_EQUAL(g.bits(), h.bits());
     193                CPPUNIT_ASSERT_EQUAL(g.bits, h.bits);
    194194            }
    195195        }
     
    259259
    260260        half a = one + 0.0f;
    261         CPPUNIT_ASSERT_EQUAL(one.bits(), a.bits());
     261        CPPUNIT_ASSERT_EQUAL(one.bits, a.bits);
    262262        a += 0.0f;
    263         CPPUNIT_ASSERT_EQUAL(one.bits(), a.bits());
     263        CPPUNIT_ASSERT_EQUAL(one.bits, a.bits);
    264264        a -= 0.0f;
    265         CPPUNIT_ASSERT_EQUAL(one.bits(), a.bits());
     265        CPPUNIT_ASSERT_EQUAL(one.bits, a.bits);
    266266        a *= 1.0f;
    267         CPPUNIT_ASSERT_EQUAL(one.bits(), a.bits());
     267        CPPUNIT_ASSERT_EQUAL(one.bits, a.bits);
    268268        a /= 1.0f;
    269         CPPUNIT_ASSERT_EQUAL(one.bits(), a.bits());
     269        CPPUNIT_ASSERT_EQUAL(one.bits, a.bits);
    270270
    271271        half b = one + 0.0f;
    272         CPPUNIT_ASSERT_EQUAL(one.bits(), b.bits());
     272        CPPUNIT_ASSERT_EQUAL(one.bits, b.bits);
    273273        b += 1.0f;
    274         CPPUNIT_ASSERT_EQUAL(two.bits(), b.bits());
     274        CPPUNIT_ASSERT_EQUAL(two.bits, b.bits);
    275275        b *= 2.0f;
    276         CPPUNIT_ASSERT_EQUAL(four.bits(), b.bits());
     276        CPPUNIT_ASSERT_EQUAL(four.bits, b.bits);
    277277        b -= 2.0f;
    278         CPPUNIT_ASSERT_EQUAL(two.bits(), b.bits());
     278        CPPUNIT_ASSERT_EQUAL(two.bits, b.bits);
    279279        b /= 2.0f;
    280         CPPUNIT_ASSERT_EQUAL(one.bits(), b.bits());
     280        CPPUNIT_ASSERT_EQUAL(one.bits, b.bits);
    281281
    282282        half c = 1.0f - zero;
    283         CPPUNIT_ASSERT_EQUAL(one.bits(), c.bits());
     283        CPPUNIT_ASSERT_EQUAL(one.bits, c.bits);
    284284
    285285        half d = 2.0f - one;
    286         CPPUNIT_ASSERT_EQUAL(one.bits(), d.bits());
     286        CPPUNIT_ASSERT_EQUAL(one.bits, d.bits);
    287287
    288288        half e = 2.0f + (-one);
    289         CPPUNIT_ASSERT_EQUAL(one.bits(), e.bits());
     289        CPPUNIT_ASSERT_EQUAL(one.bits, e.bits);
    290290
    291291        half f = (2.0f * two) / (1.0f + one);
    292         CPPUNIT_ASSERT_EQUAL(two.bits(), f.bits());
     292        CPPUNIT_ASSERT_EQUAL(two.bits, f.bits);
    293293    }
    294294
  • trunk/test/lol-bench.cpp

    r870 r872  
    4141
    4242        half h = half::makeslow(u.f);
    43         total ^= h.bits();
     43        total ^= h.bits;
    4444    }
    4545    Log::Info("time for makeslow: %f (hash %04x)\n", timer.GetMs(), total);
     
    5151
    5252        half h = half::makefast(u.f);
    53         total ^= h.bits();
     53        total ^= h.bits;
    5454    }
    5555    Log::Info("time for makefast: %f (hash %04x)\n", timer.GetMs(), total);
Note: See TracChangeset for help on using the changeset viewer.