Changeset 882 for trunk/src/half.cpp
 Timestamp:
 Aug 30, 2011, 7:19:08 PM (9 years ago)
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

trunk/src/half.cpp
r879 r882 11 11 #if defined HAVE_CONFIG_H 12 12 # include "config.h" 13 #endif 14 15 #if defined __CELLOS_LV2__ 16 # include <ppu_altivec_internals.h> 13 17 #endif 14 18 … … 38 42 static uint16_t const basetable[512] = 39 43 { 40 #define S1(i) (((i) < 103) ? 0x0000 : \ 44 #define S1(i) (((i) < 103) ? 0x0000 : \ 41 45 ((i) < 113) ? 0x0400 >> (113 - (i)) : \ 42 46 ((i) < 143) ? ((i) - 112) << 10 : 0x7c00) … … 44 48 #undef S1 45 49 #define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \ 46 ((i) < 113) ? 0x0400 >> (113 - (i)) : \ 50 ((i) < 113) ? 0x0400 >> (113 - (i)) : \ 47 51 ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)) 48 52 S256(0), … … 73 77 unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */ 74 78 75 /* If zero, or denormal, or exponent underflows too much for a denormal, 76 * return signed zero. */ 77 #if !defined __CELLOS_LV2__ 79 /* If zero, or denormal, or exponent underflows too much for a denormal 80 * half, return signed zero. */ 78 81 if (e < 103) 79 82 return bits; 80 #else 81 /* PS3 don't know bout my denormals */ 82 if (e < 113) 83 return bits; 84 #endif 85 83 86 84 /* If NaN, return NaN. If Inf or exponent overflow, return Inf.
*/ … … 94 92 } 95 93 96 #if !defined __CELLOS_LV2__ 97 94 /* If exponent underflows but not too much, return a denormal */ 98 95 if (e < 113) … … 104 101 return bits; 105 102 } 106 #endif 107 103 108 104 bits |= ((e - 112) << 10) | (m >> 1); … … 112 108 return bits; 113 109 } 110 111 #if 0 112 static inline void float_to_half_vector(half *dst, float const *src) 113 { 114 vector unsigned int const v7 = vec_splat_u32(7); 115 vector unsigned short const v6 = vec_splat_u16(6); 116 #if _XBOX 117 vector signed short const v9 = vec_splat_u16(9); 118 vector unsigned short const v10 = vec_splat_u16(10); 119 #else 120 vector signed short const v0x0040 = { 121 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040}; 122 vector unsigned short const v0x0400 = { 123 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400}; 124 #endif 125 vector unsigned char const shuffle_high = { 126 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; 127 vector unsigned char const shuffle_low = { 128 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31}; 129 vector unsigned char const v0xbf70 = { 130 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 131 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70}; 132 133 vector unsigned short v_mant, v_ret; 134 vector signed short v_exp; 135 vector unsigned int in0 = (vector unsigned int)vec_ld(0, src); 136 vector unsigned int in1 = (vector unsigned int)vec_ld(16, src); 137 138 in0 = vec_sra(in0, v7); 139 in1 = vec_sra(in1, v7); 140 v_exp = (vector signed short)vec_perm(in0, in1, shuffle_high); 141 v_mant = (vector unsigned short)vec_perm(in0, in1, shuffle_low); 142 v_exp = (vector signed short)vec_subs((vector unsigned char)v_exp, v0xbf70); 143 #if _XBOX 144 v_ret = (vector unsigned short)vec_or(v_exp, vec_sr(v_exp, v9)); 145 #else 146 v_ret = (vector unsigned short)vec_madds(v_exp, v0x0040, v_exp); 147 #endif 148 v_mant = vec_sr(v_mant, v6); 149 #if _XBOX 150 v_ret = vec_or(v_mant, vec_sl(v_ret, v10)); 151 #else 152 v_ret = 
vec_mladd(v_ret, v0x0400, v_mant); 153 #endif 154 vec_st(v_ret, 0, (uint16_t *)dst); 155 } 156 #endif 114 157 115 158 static int const shifttable[32] = … … 211 254 212 255 /* Constructor from float. Uses the non-branching version because benchmarks 213 * indicate it is always twice as fast. The penalty of loading the lookup 214 * tables does not seem important. */ 256 * indicate it is about 80% faster on amd64, and 20% faster on the PS3. The 257 * penalty of loading the lookup tables does not seem important. */ 215 258 half half::makefast(float f) 216 259 { 217 260 union { float f; uint32_t x; } u = { f }; 218 #if !defined __CELLOS_LV2__ 219 261 return makebits(float_to_half_nobranch(u.x)); 220 #else 221 /* This code is slightly faster on the PS3, mostly because we 222 * don't need to care about denormals. */ 223 return makebits(float_to_half_branch(u.x)); 224 #endif 225 262 } 226 263 … … 234 271 /* Cast to float. Uses the branching version because loading the tables 235 272 * for only one value is going to be cache-expensive. */ 236 half::operator float() const 237 { 238 /* FIXME: there is a hidden "this" in this method. Export more 239 * code so that it can all work in registers instead. */ 273 float half::tofloat(half h) 274 { 240 275 union { float f; uint32_t x; } u; 241 u.x = half_to_float_branch(bits); 276 u.x = half_to_float_branch(h.bits); 242 277 return u.f; 243 278 } … … 249 284 union { float f; uint32_t x; } u; 250 285 u.f = *src++; 251 #if !defined __CELLOS_LV2__ 252 286 *dst++ = makebits(float_to_half_nobranch(u.x)); 253 #else 254 /* This code is slightly faster on the PS3, mostly because we 255 * don't need to care about denormals. */ 256 *dst++ = makebits(float_to_half_branch(u.x)); 287 #if 0 288 /* AltiVec code. Will work one day. */ 289 float_to_half_vector(dst, src); 290 src += 8; 291 dst += 8; 292 i += 7; 257 293 #endif 258 294 }
Note: See TracChangeset
for help on using the changeset viewer.