blob: 01c99c147dd2f8deafe88ed4f305225a9c441487 [file] [log] [blame]
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "harness/compat.h"
#include "reference_math.h"
#include <limits.h>
#if !defined(_WIN32)
#include <string.h>
#endif
#include "Utility.h"
#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
#include <xmmintrin.h>
#endif
#if defined( __SSE2__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
#include <emmintrin.h>
#endif
#ifndef M_PI_4
#define M_PI_4 (M_PI/4)
#endif
#define EVALUATE( x ) x
#define CONCATENATE(x, y) x ## EVALUATE(y)
#pragma STDC FP_CONTRACT OFF
static void __log2_ep(double *hi, double *lo, double x);
typedef union
{
uint64_t i;
double d;
}uint64d_t;
static const uint64d_t _CL_NAN = { 0x7ff8000000000000ULL };
#define cl_make_nan() _CL_NAN.d
static double reduce1( double x );
static double reduce1( double x )
{
if( fabs(x) >= HEX_DBL( +, 1, 0, +, 53 ) )
{
if( fabs(x) == INFINITY )
return cl_make_nan();
return 0.0; //we patch up the sign for sinPi and cosPi later, since they need different signs
}
// Find the nearest multiple of 2
const double r = copysign( HEX_DBL( +, 1, 0, +, 53 ), x );
double z = x + r;
z -= r;
// subtract it from x. Value is now in the range -1 <= x <= 1
return x - z;
}
/*
static double reduceHalf( double x );
static double reduceHalf( double x )
{
if( fabs(x) >= HEX_DBL( +, 1, 0, +, 52 ) )
{
if( fabs(x) == INFINITY )
return cl_make_nan();
return 0.0; //we patch up the sign for sinPi and cosPi later, since they need different signs
}
// Find the nearest multiple of 1
const double r = copysign( HEX_DBL( +, 1, 0, +, 52 ), x );
double z = x + r;
z -= r;
// subtract it from x. Value is now in the range -0.5 <= x <= 0.5
return x - z;
}
*/
double reference_acospi( double x) { return reference_acos( x ) / M_PI; }
double reference_asinpi( double x) { return reference_asin( x ) / M_PI; }
double reference_atanpi( double x) { return reference_atan( x ) / M_PI; }
double reference_atan2pi( double y, double x ) { return reference_atan2( y, x) / M_PI; }
double reference_cospi( double x)
{
if( reference_fabs(x) >= HEX_DBL( +, 1, 0, +, 52 ) )
{
if( reference_fabs(x) == INFINITY )
return cl_make_nan();
//Note this probably fails for odd values between 0x1.0p52 and 0x1.0p53.
//However, when starting with single precision inputs, there will be no odd values.
return 1.0;
}
x = reduce1(x+0.5);
// reduce to [-0.5, 0.5]
if( x < -0.5 )
x = -1 - x;
else if ( x > 0.5 )
x = 1 - x;
// cosPi zeros are all +0
if( x == 0.0 )
return 0.0;
return reference_sin( x * M_PI );
}
double reference_relaxed_cospi(double x) { return reference_cospi(x); }
double reference_relaxed_divide( double x, double y ) { return (float)(((float) x ) / ( (float) y )); }
double reference_divide( double x, double y ) { return x / y; }
// Add a + b. If the result modulo overflowed, write 1 to *carry, otherwise 0
static inline cl_ulong add_carry( cl_ulong a, cl_ulong b, cl_ulong *carry )
{
cl_ulong result = a + b;
*carry = result < a;
return result;
}
// Subtract a - b. If the result modulo overflowed, write 1 to *carry, otherwise 0
static inline cl_ulong sub_carry( cl_ulong a, cl_ulong b, cl_ulong *carry )
{
cl_ulong result = a - b;
*carry = result > a;
return result;
}
static float fallback_frexpf( float x, int *iptr )
{
cl_uint u, v;
float fu, fv;
memcpy( &u, &x, sizeof(u));
cl_uint exponent = u & 0x7f800000U;
cl_uint mantissa = u & ~0x7f800000U;
// add 1 to the exponent
exponent += 0x00800000U;
if( (cl_int) exponent < (cl_int) 0x01000000 )
{ // subnormal, NaN, Inf
mantissa |= 0x3f000000U;
v = mantissa & 0xff800000U;
u = mantissa;
memcpy( &fv, &v, sizeof(v));
memcpy( &fu, &u, sizeof(u));
fu -= fv;
memcpy( &v, &fv, sizeof(v));
memcpy( &u, &fu, sizeof(u));
exponent = u & 0x7f800000U;
mantissa = u & ~0x7f800000U;
*iptr = (exponent >> 23) + (-126 + 1 -126);
u = mantissa | 0x3f000000U;
memcpy( &fu, &u, sizeof(u));
return fu;
}
*iptr = (exponent >> 23) - 127;
u = mantissa | 0x3f000000U;
memcpy( &fu, &u, sizeof(u));
return fu;
}
static inline int extractf( float, cl_uint * );
static inline int extractf( float x, cl_uint *mant )
{
static float (*frexppf)(float, int*) = NULL;
int e;
// verify that frexp works properly
if( NULL == frexppf )
{
if( 0.5f == frexpf( HEX_FLT( +, 1, 0, -, 130 ), &e ) && e == -129 )
frexppf = frexpf;
else
frexppf = fallback_frexpf;
}
*mant = (cl_uint) (HEX_FLT( +, 1, 0, +, 32 ) * fabsf( frexppf( x, &e )));
return e - 1;
}
// Shift right by shift bits. Any bits lost on the right side are bitwise OR'd together and ORd into the LSB of the result
static inline void shift_right_sticky_64( cl_ulong *p, int shift );
static inline void shift_right_sticky_64( cl_ulong *p, int shift )
{
cl_ulong sticky = 0;
cl_ulong r = *p;
// C doesn't handle shifts greater than the size of the variable dependably
if( shift >= 64 )
{
sticky |= (0 != r);
r = 0;
}
else
{
sticky |= (0 != (r << (64-shift)));
r >>= shift;
}
*p = r | sticky;
}
// Add two 64 bit mantissas. Bits that are below the LSB of the result are OR'd into the LSB of the result
static inline void add64( cl_ulong *p, cl_ulong c, int *exponent );
static inline void add64( cl_ulong *p, cl_ulong c, int *exponent )
{
cl_ulong carry;
c = add_carry(c, *p, &carry);
if( carry )
{
carry = c & 1; // set aside sticky bit
c >>= 1; // right shift to deal with overflow
c |= carry | 0x8000000000000000ULL; // or in carry bit, and sticky bit. The latter is to prevent rounding from believing we are exact half way case
*exponent = *exponent + 1; // adjust exponent
}
*p = c;
}
// IEEE-754 round to nearest, ties to even rounding
static float round_to_nearest_even_float( cl_ulong p, int exponent );
static float round_to_nearest_even_float( cl_ulong p, int exponent )
{
union{ cl_uint u; cl_float d;} u;
// If mantissa is zero, return 0.0f
if (p == 0) return 0.0f;
// edges
if( exponent > 127 )
{
volatile float r = exponent * CL_FLT_MAX; // signal overflow
// attempt to fool the compiler into not optimizing the above line away
if( r > CL_FLT_MAX )
return INFINITY;
return r;
}
if( exponent == -150 && p > 0x8000000000000000ULL)
return HEX_FLT( +, 1, 0, -, 149 );
if( exponent <= -150 ) return 0.0f;
//Figure out which bits go where
int shift = 8 + 32;
if( exponent < -126 )
{
shift -= 126 + exponent; // subnormal: shift is not 52
exponent = -127; // set exponent to 0
}
else
p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it.
// Assemble the double (round toward zero)
u.u = (cl_uint)(p >> shift) | ((cl_uint) (exponent + 127) << 23);
// put a representation of the residual bits into hi
p <<= (64-shift);
//round to nearest, ties to even based on the unused portion of p
if( p < 0x8000000000000000ULL ) return u.d;
if( p == 0x8000000000000000ULL ) u.u += u.u & 1U;
else u.u++;
return u.d;
}
static float round_to_nearest_even_float_ftz( cl_ulong p, int exponent );
static float round_to_nearest_even_float_ftz( cl_ulong p, int exponent )
{
extern int gCheckTininessBeforeRounding;
union{ cl_uint u; cl_float d;} u;
int shift = 8 + 32;
// If mantissa is zero, return 0.0f
if (p == 0) return 0.0f;
// edges
if( exponent > 127 )
{
volatile float r = exponent * CL_FLT_MAX; // signal overflow
// attempt to fool the compiler into not optimizing the above line away
if( r > CL_FLT_MAX )
return INFINITY;
return r;
}
// Deal with FTZ for gCheckTininessBeforeRounding
if( exponent < (gCheckTininessBeforeRounding - 127) )
return 0.0f;
if( exponent == -127 ) // only happens for machines that check tininess after rounding
p = (p&1) | (p>>1);
else
p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it.
cl_ulong q = p;
// Assemble the double (round toward zero)
u.u = (cl_uint)(q >> shift) | ((cl_uint) (exponent + 127) << 23);
// put a representation of the residual bits into hi
q <<= (64-shift);
//round to nearest, ties to even based on the unused portion of p
if( q > 0x8000000000000000ULL )
u.u++;
else if( q == 0x8000000000000000ULL )
u.u += u.u & 1U;
// Deal with FTZ for ! gCheckTininessBeforeRounding
if( 0 == (u.u & 0x7f800000U ) )
return 0.0f;
return u.d;
}
// IEEE-754 round toward zero.
static float round_toward_zero_float( cl_ulong p, int exponent );
static float round_toward_zero_float( cl_ulong p, int exponent )
{
union{ cl_uint u; cl_float d;} u;
// If mantissa is zero, return 0.0f
if (p == 0) return 0.0f;
// edges
if( exponent > 127 )
{
volatile float r = exponent * CL_FLT_MAX; // signal overflow
// attempt to fool the compiler into not optimizing the above line away
if( r > CL_FLT_MAX )
return CL_FLT_MAX;
return r;
}
if( exponent <= -149 )
return 0.0f;
//Figure out which bits go where
int shift = 8 + 32;
if( exponent < -126 )
{
shift -= 126 + exponent; // subnormal: shift is not 52
exponent = -127; // set exponent to 0
}
else
p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it.
// Assemble the double (round toward zero)
u.u = (cl_uint)(p >> shift) | ((cl_uint) (exponent + 127) << 23);
return u.d;
}
static float round_toward_zero_float_ftz( cl_ulong p, int exponent );
static float round_toward_zero_float_ftz( cl_ulong p, int exponent )
{
extern int gCheckTininessBeforeRounding;
union{ cl_uint u; cl_float d;} u;
int shift = 8 + 32;
// If mantissa is zero, return 0.0f
if (p == 0) return 0.0f;
// edges
if( exponent > 127 )
{
volatile float r = exponent * CL_FLT_MAX; // signal overflow
// attempt to fool the compiler into not optimizing the above line away
if( r > CL_FLT_MAX )
return CL_FLT_MAX;
return r;
}
// Deal with FTZ for gCheckTininessBeforeRounding
if( exponent < -126 )
return 0.0f;
cl_ulong q = p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it.
// Assemble the double (round toward zero)
u.u = (cl_uint)(q >> shift) | ((cl_uint) (exponent + 127) << 23);
// put a representation of the residual bits into hi
q <<= (64-shift);
return u.d;
}
// Subtract two significands.
static inline void sub64( cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC );
static inline void sub64( cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC )
{
cl_ulong carry;
p = sub_carry( *c, p, &carry );
if( carry )
{
*signC ^= 0x80000000U;
p = -p;
}
// normalize
if( p )
{
int shift = 32;
cl_ulong test = 1ULL << 32;
while( 0 == (p & 0x8000000000000000ULL))
{
if( p < test )
{
p <<= shift;
*expC = *expC - shift;
}
shift >>= 1;
test <<= shift;
}
}
else
{
// zero result.
*expC = -200;
*signC = 0; // IEEE rules say a - a = +0 for all rounding modes except -inf
}
*c = p;
}
float reference_fma( float a, float b, float c, int shouldFlush )
{
static const cl_uint kMSB = 0x80000000U;
// Make bits accessible
union{ cl_uint u; cl_float d; } ua; ua.d = a;
union{ cl_uint u; cl_float d; } ub; ub.d = b;
union{ cl_uint u; cl_float d; } uc; uc.d = c;
// deal with Nans, infinities and zeros
if( isnan( a ) || isnan( b ) || isnan(c) ||
isinf( a ) || isinf( b ) || isinf(c) ||
0 == ( ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior
0 == ( ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior
0 == ( uc.u & ~kMSB) ) // c == 0, defeat host FTZ behavior
{
FPU_mode_type oldMode;
RoundingMode oldRoundMode = kRoundToNearestEven;
if( isinf( c ) && !isinf(a) && !isinf(b) )
return (c + a) + b;
if (gIsInRTZMode)
oldRoundMode = set_round(kRoundTowardZero, kfloat);
memset( &oldMode, 0, sizeof( oldMode ) );
if( shouldFlush )
ForceFTZ( &oldMode );
a = (float) reference_multiply( a, b ); // some risk that the compiler will insert a non-compliant fma here on some platforms.
a = (float) reference_add( a, c ); // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
if( shouldFlush )
RestoreFPState( &oldMode );
if( gIsInRTZMode )
set_round(oldRoundMode, kfloat);
return a;
}
// extract exponent and mantissa
// exponent is a standard unbiased signed integer
// mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
cl_uint mantA, mantB, mantC;
int expA = extractf( a, &mantA );
int expB = extractf( b, &mantB );
int expC = extractf( c, &mantC );
cl_uint signC = uc.u & kMSB; // We'll need the sign bit of C later to decide if we are adding or subtracting
// exact product of A and B
int exponent = expA + expB;
cl_uint sign = (ua.u ^ ub.u) & kMSB;
cl_ulong product = (cl_ulong) mantA * (cl_ulong) mantB;
// renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
// The MSB might not be set. If so, fix that. Otherwise, reflect the fact that we got another power of two from the multiplication
if( 0 == (0x8000000000000000ULL & product) )
product <<= 1;
else
exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then our exponent increased.
//infinite precision add
cl_ulong addend = (cl_ulong) mantC << 32;
if( exponent >= expC )
{
// Shift C relative to the product so that their exponents match
if( exponent > expC )
shift_right_sticky_64( &addend, exponent - expC );
// Add
if( sign ^ signC )
sub64( &product, addend, &sign, &exponent );
else
add64( &product, addend, &exponent );
}
else
{
// Shift the product relative to C so that their exponents match
shift_right_sticky_64( &product, expC - exponent );
// add
if( sign ^ signC )
sub64( &addend, product, &signC, &expC );
else
add64( &addend, product, &expC );
product = addend;
exponent = expC;
sign = signC;
}
// round to IEEE result -- we do not do flushing to zero here. That part is handled manually in ternary.c.
if (gIsInRTZMode)
{
if( shouldFlush )
ua.d = round_toward_zero_float_ftz( product, exponent);
else
ua.d = round_toward_zero_float( product, exponent);
}
else
{
if( shouldFlush )
ua.d = round_to_nearest_even_float_ftz( product, exponent);
else
ua.d = round_to_nearest_even_float( product, exponent);
}
// Set the sign
ua.u |= sign;
return ua.d;
}
double reference_relaxed_exp10( double x)
{
return reference_exp10(x);
}
double reference_exp10( double x) { return reference_exp2( x * HEX_DBL( +, 1, a934f0979a371, +, 1 ) ); }
int reference_ilogb( double x )
{
extern int gDeviceILogb0, gDeviceILogbNaN;
union { cl_double f; cl_ulong u;} u;
u.f = (float) x;
cl_int exponent = (cl_int) (u.u >> 52) & 0x7ff;
if( exponent == 0x7ff )
{
if( u.u & 0x000fffffffffffffULL )
return gDeviceILogbNaN;
return CL_INT_MAX;
}
if( exponent == 0 )
{ // deal with denormals
u.f = x * HEX_DBL( +, 1, 0, +, 64 );
exponent = (cl_int) (u.u >> 52) & 0x7ff;
if( exponent == 0 )
return gDeviceILogb0;
return exponent - (1023 + 64);
}
return exponent - 1023;
}
double reference_nan( cl_uint x )
{
union{ cl_uint u; cl_float f; }u;
u.u = x | 0x7fc00000U;
return (double) u.f;
}
double reference_maxmag( double x, double y )
{
double fabsx = fabs(x);
double fabsy = fabs(y);
if( fabsx < fabsy )
return y;
if( fabsy < fabsx )
return x;
return reference_fmax( x, y );
}
double reference_minmag( double x, double y )
{
double fabsx = fabs(x);
double fabsy = fabs(y);
if( fabsx > fabsy )
return y;
if( fabsy > fabsx )
return x;
return reference_fmin( x, y );
}
//double my_nextafter( double x, double y ){ return (double) nextafterf( (float) x, (float) y ); }
double reference_relaxed_mad( double a, double b, double c)
{
return ((float) a )* ((float) b) + (float) c;
}
double reference_mad( double a, double b, double c )
{
return a * b + c;
}
double reference_recip( double x) { return 1.0 / x; }
double reference_rootn( double x, int i )
{
//rootn ( x, 0 ) returns a NaN.
if( 0 == i )
return cl_make_nan();
//rootn ( x, n ) returns a NaN for x < 0 and n is even.
if( x < 0 && 0 == (i&1) )
return cl_make_nan();
if( x == 0.0 )
{
switch( i & 0x80000001 )
{
//rootn ( +-0, n ) is +0 for even n > 0.
case 0:
return 0.0f;
//rootn ( +-0, n ) is +-0 for odd n > 0.
case 1:
return x;
//rootn ( +-0, n ) is +inf for even n < 0.
case 0x80000000:
return INFINITY;
//rootn ( +-0, n ) is +-inf for odd n < 0.
case 0x80000001:
return copysign(INFINITY, x);
}
}
double sign = x;
x = reference_fabs(x);
x = reference_exp2( reference_log2(x) / (double) i );
return reference_copysignd( x, sign );
}
double reference_rsqrt( double x) { return 1.0 / reference_sqrt(x); }
//double reference_sincos( double x, double *c ){ *c = cos(x); return sin(x); }
double reference_sinpi( double x)
{
double r = reduce1(x);
// reduce to [-0.5, 0.5]
if( r < -0.5 )
r = -1 - r;
else if ( r > 0.5 )
r = 1 - r;
// sinPi zeros have the same sign as x
if( r == 0.0 )
return reference_copysignd(0.0, x);
return reference_sin( r * M_PI );
}
double reference_relaxed_sinpi(double x) { return reference_sinpi(x); }
double reference_tanpi( double x)
{
// set aside the sign (allows us to preserve sign of -0)
double sign = reference_copysignd( 1.0, x);
double z = reference_fabs(x);
// if big and even -- caution: only works if x only has single precision
if( z >= HEX_DBL( +, 1, 0, +, 24 ) )
{
if( z == INFINITY )
return x - x; // nan
return reference_copysignd( 0.0, x); // tanpi ( n ) is copysign( 0.0, n) for even integers n.
}
// reduce to the range [ -0.5, 0.5 ]
double nearest = reference_rint( z ); // round to nearest even places n + 0.5 values in the right place for us
int i = (int) nearest; // test above against 0x1.0p24 avoids overflow here
z -= nearest;
//correction for odd integer x for the right sign of zero
if( (i&1) && z == 0.0 )
sign = -sign;
// track changes to the sign
sign *= reference_copysignd(1.0, z); // really should just be an xor
z = reference_fabs(z); // remove the sign again
// reduce once more
// If we don't do this, rounding error in z * M_PI will cause us not to return infinities properly
if( z > 0.25 )
{
z = 0.5 - z;
return sign / reference_tan( z * M_PI ); // use system tan to get the right result
}
//
return sign * reference_tan( z * M_PI ); // use system tan to get the right result
}
double reference_pown( double x, int i) { return reference_pow( x, (double) i ); }
double reference_powr( double x, double y )
{
//powr ( x, y ) returns NaN for x < 0.
if( x < 0.0 )
return cl_make_nan();
//powr ( x, NaN ) returns the NaN for x >= 0.
//powr ( NaN, y ) returns the NaN.
if( isnan(x) || isnan(y) )
return x + y; // Note: behavior different here than for pow(1,NaN), pow(NaN, 0)
if( x == 1.0 )
{
//powr ( +1, +-inf ) returns NaN.
if( reference_fabs(y) == INFINITY )
return cl_make_nan();
//powr ( +1, y ) is 1 for finite y. (NaN handled above)
return 1.0;
}
if( y == 0.0 )
{
//powr ( +inf, +-0 ) returns NaN.
//powr ( +-0, +-0 ) returns NaN.
if( x == 0.0 || x == INFINITY )
return cl_make_nan();
//powr ( x, +-0 ) is 1 for finite x > 0. (x <= 0, NaN, INF already handled above)
return 1.0;
}
if( x == 0.0 )
{
//powr ( +-0, -inf) is +inf.
//powr ( +-0, y ) is +inf for finite y < 0.
if( y < 0.0 )
return INFINITY;
//powr ( +-0, y ) is +0 for y > 0. (NaN, y==0 handled above)
return 0.0;
}
// x = +inf
if( isinf(x) )
{
if( y < 0 )
return 0;
return INFINITY;
}
double fabsx = reference_fabs(x);
double fabsy = reference_fabs(y);
//y = +-inf cases
if( isinf(fabsy) )
{
if( y < 0 )
{
if( fabsx < 1 )
return INFINITY;
return 0;
}
if( fabsx < 1 )
return 0;
return INFINITY;
}
double hi, lo;
__log2_ep(&hi, &lo, x);
double prod = y * hi;
double result = reference_exp2(prod);
return result;
}
double reference_fract( double x, double *ip )
{
if(isnan(x)) {
*ip = cl_make_nan();
return cl_make_nan();
}
float i;
float f = modff((float) x, &i );
if( f < 0.0 )
{
f = 1.0f + f;
i -= 1.0f;
if( f == 1.0f )
f = HEX_FLT( +, 1, fffffe, -, 1 );
}
*ip = i;
return f;
}
//double my_fdim( double x, double y){ return fdimf( (float) x, (float) y ); }
double reference_add( double x, double y )
{
volatile float a = (float) x;
volatile float b = (float) y;
#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
// defeat x87
__m128 va = _mm_set_ss( (float) a );
__m128 vb = _mm_set_ss( (float) b );
va = _mm_add_ss( va, vb );
_mm_store_ss( (float*) &a, va );
#elif defined(__PPC__)
// Most Power host CPUs do not support the non-IEEE mode (NI) which flushes denorm's to zero.
// As such, the reference add with FTZ must be emulated in sw.
if (fpu_control & _FPU_MASK_NI) {
union{ cl_uint u; cl_float d; } ua; ua.d = a;
union{ cl_uint u; cl_float d; } ub; ub.d = b;
cl_uint mantA, mantB;
cl_ulong addendA, addendB, sum;
int expA = extractf( a, &mantA );
int expB = extractf( b, &mantB );
cl_uint signA = ua.u & 0x80000000U;
cl_uint signB = ub.u & 0x80000000U;
// Force matching exponents if an operand is 0
if (a == 0.0f) {
expA = expB;
} else if (b == 0.0f) {
expB = expA;
}
addendA = (cl_ulong)mantA << 32;
addendB = (cl_ulong)mantB << 32;
if (expA >= expB) {
// Shift B relative to the A so that their exponents match
if( expA > expB )
shift_right_sticky_64( &addendB, expA - expB );
// add
if( signA ^ signB )
sub64( &addendA, addendB, &signA, &expA );
else
add64( &addendA, addendB, &expA );
} else {
// Shift the A relative to B so that their exponents match
shift_right_sticky_64( &addendA, expB - expA );
// add
if( signA ^ signB )
sub64( &addendB, addendA, &signB, &expB );
else
add64( &addendB, addendA, &expB );
addendA = addendB;
expA = expB;
signA = signB;
}
// round to IEEE result
if (gIsInRTZMode) {
ua.d = round_toward_zero_float_ftz( addendA, expA );
} else {
ua.d = round_to_nearest_even_float_ftz( addendA, expA );
}
// Set the sign
ua.u |= signA;
a = ua.d;
} else {
a += b;
}
#else
a += b;
#endif
return (double) a;
}
double reference_subtract( double x, double y )
{
volatile float a = (float) x;
volatile float b = (float) y;
#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
// defeat x87
__m128 va = _mm_set_ss( (float) a );
__m128 vb = _mm_set_ss( (float) b );
va = _mm_sub_ss( va, vb );
_mm_store_ss( (float*) &a, va );
#else
a -= b;
#endif
return a;
}
//double reference_divide( double x, double y ){ return (float) x / (float) y; }
double reference_multiply( double x, double y)
{
volatile float a = (float) x;
volatile float b = (float) y;
#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
// defeat x87
__m128 va = _mm_set_ss( (float) a );
__m128 vb = _mm_set_ss( (float) b );
va = _mm_mul_ss( va, vb );
_mm_store_ss( (float*) &a, va );
#elif defined(__PPC__)
// Most Power host CPUs do not support the non-IEEE mode (NI) which flushes denorm's to zero.
// As such, the reference multiply with FTZ must be emulated in sw.
if (fpu_control & _FPU_MASK_NI) {
// extract exponent and mantissa
// exponent is a standard unbiased signed integer
// mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
union{ cl_uint u; cl_float d; } ua; ua.d = a;
union{ cl_uint u; cl_float d; } ub; ub.d = b;
cl_uint mantA, mantB;
int expA = extractf( a, &mantA );
int expB = extractf( b, &mantB );
// exact product of A and B
int exponent = expA + expB;
cl_uint sign = (ua.u ^ ub.u) & 0x80000000U;
cl_ulong product = (cl_ulong) mantA * (cl_ulong) mantB;
// renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
// The MSB might not be set. If so, fix that. Otherwise, reflect the fact that we got another power of two from the multiplication
if( 0 == (0x8000000000000000ULL & product) )
product <<= 1;
else
exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then our exponent increased.
// round to IEEE result -- we do not do flushing to zero here. That part is handled manually in ternary.c.
if (gIsInRTZMode) {
ua.d = round_toward_zero_float_ftz( product, exponent);
} else {
ua.d = round_to_nearest_even_float_ftz( product, exponent);
}
// Set the sign
ua.u |= sign;
a = ua.d;
} else {
a *= b;
}
#else
a *= b;
#endif
return a;
}
/*double my_remquo( double x, double y, int *iptr )
{
if( isnan(x) || isnan(y) ||
fabs(x) == INFINITY ||
y == 0.0 )
{
*iptr = 0;
return NAN;
}
return (double) remquof( (float) x, (float) y, iptr );
}*/
double reference_lgamma_r( double x, int *signp )
{
// This is not currently tested
*signp = 0;
return x;
}
int reference_isequal( double x, double y ){ return x == y; }
int reference_isfinite( double x ){ return 0 != isfinite(x); }
int reference_isgreater( double x, double y ){ return x > y; }
int reference_isgreaterequal( double x, double y ){ return x >= y; }
int reference_isinf( double x ){ return 0 != isinf(x); }
int reference_isless( double x, double y ){ return x < y; }
int reference_islessequal( double x, double y ){ return x <= y; }
int reference_islessgreater( double x, double y ){ return 0 != islessgreater( x, y ); }
int reference_isnan( double x ){ return 0 != isnan( x ); }
int reference_isnormal( double x ){ return 0 != isnormal( (float) x ); }
int reference_isnotequal( double x, double y ){ return x != y; }
int reference_isordered( double x, double y){ return x == x && y == y; }
int reference_isunordered( double x, double y ){ return isnan(x) || isnan( y ); }
int reference_signbit( float x ){ return 0 != signbit( x ); }
#if 1 // defined( _MSC_VER )
//Missing functions for win32
float reference_copysign( float x, float y )
{
union { float f; cl_uint u;} ux, uy;
ux.f = x; uy.f = y;
ux.u &= 0x7fffffffU;
ux.u |= uy.u & 0x80000000U;
return ux.f;
}
double reference_copysignd( double x, double y )
{
union { double f; cl_ulong u;} ux, uy;
ux.f = x; uy.f = y;
ux.u &= 0x7fffffffffffffffULL;
ux.u |= uy.u & 0x8000000000000000ULL;
return ux.f;
}
double reference_round( double x )
{
double absx = reference_fabs(x);
if( absx < 0.5 )
return reference_copysignd( 0.0, x );
if( absx < HEX_DBL( +, 1, 0, +, 53 ) )
x = reference_trunc( x + reference_copysignd( 0.5, x ) );
return x;
}
double reference_trunc( double x )
{
if( fabs(x) < HEX_DBL( +, 1, 0, +, 53 ) )
{
cl_long l = (cl_long) x;
return reference_copysignd( (double) l, x );
}
return x;
}
#ifndef FP_ILOGB0
#define FP_ILOGB0 INT_MIN
#endif
#ifndef FP_ILOGBNAN
#define FP_ILOGBNAN INT_MAX
#endif
double reference_cbrt(double x){ return reference_copysignd( reference_pow( reference_fabs(x), 1.0/3.0 ), x ); }
/*
double reference_scalbn(double x, int i)
{ // suitable for checking single precision scalbnf only
if( i > 300 )
return copysign( INFINITY, x);
if( i < -300 )
return copysign( 0.0, x);
union{ cl_ulong u; double d;} u;
u.u = ((cl_ulong) i + 1023) << 52;
return x * u.d;
}
*/
double reference_rint( double x )
{
if( reference_fabs(x) < HEX_DBL( +, 1, 0, +, 52 ) )
{
double magic = reference_copysignd( HEX_DBL( +, 1, 0, +, 52 ), x );
double rounded = (x + magic) - magic;
x = reference_copysignd( rounded, x );
}
return x;
}
double reference_acosh( double x )
{ // not full precision. Sufficient precision to cover float
if( isnan(x) )
return x + x;
if( x < 1.0 )
return cl_make_nan();
return reference_log( x + reference_sqrt(x + 1) * reference_sqrt(x-1) );
}
double reference_asinh( double x )
{
/*
* ====================================================
* This function is from fdlibm: http://www.netlib.org
* It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunSoft, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
if( isnan(x) || isinf(x) )
return x + x;
double absx = reference_fabs(x);
if( absx < HEX_DBL( +, 1, 0, -, 28 ) )
return x;
double sign = reference_copysignd(1.0, x);
if( absx > HEX_DBL( +, 1, 0, +, 28 ) )
return sign * (reference_log( absx ) + 0.693147180559945309417232121458176568); // log(2)
if( absx > 2.0 )
return sign * reference_log( 2.0 * absx + 1.0 / (reference_sqrt( x * x + 1.0 ) + absx));
return sign * reference_log1p( absx + x*x / (1.0 + reference_sqrt(1.0 + x*x)));
}
double reference_atanh( double x )
{
/*
* ====================================================
* This function is from fdlibm: http://www.netlib.org
* It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunSoft, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
if( isnan(x) )
return x + x;
double signed_half = reference_copysignd( 0.5, x );
x = reference_fabs(x);
if( x > 1.0 )
return cl_make_nan();
if( x < 0.5 )
return signed_half * reference_log1p( 2.0 * ( x + x*x / (1-x) ) );
return signed_half * reference_log1p(2.0 * x / (1-x));
}
double reference_relaxed_atan(double x) { return reference_atan(x); }
double reference_relaxed_exp2( double x )
{
return reference_exp2(x);
}
double reference_exp2( double x )
{ // Note: only suitable for verifying single precision. Doesn't have range of a full double exp2 implementation.
if( x == 0.0 )
return 1.0;
// separate x into fractional and integer parts
double i = reference_rint( x ); // round to nearest integer
if( i < -150 )
return 0.0;
if( i > 129 )
return INFINITY;
double f = x - i; // -0.5 <= f <= 0.5
// find exp2(f)
// calculate as p(f) = (exp2(f)-1)/f
// exp2(f) = f * p(f) + 1
// p(f) is a minimax polynomial with error within 0x1.c1fd80f0d1ab7p-50
double p = 0.693147180560184539289 +
(0.240226506955902863183 +
(0.055504108656833424373 +
(0.009618129212846484796 +
(0.001333355902958566035 +
(0.000154034191902497930 +
(0.000015252317761038105 +
(0.000001326283129417092 + 0.000000102593187638680 * f)*f)*f)*f)*f)*f)*f)*f;
f *= p;
f += 1.0;
// scale by 2 ** i
union{ cl_ulong u; double d; } u;
int exponent = (int) i + 1023;
u.u = (cl_ulong) exponent << 52;
return f * u.d;
}
double reference_expm1( double x )
{ // Note: only suitable for verifying single precision. Doesn't have range of a full double expm1 implementation. It is only accurate to 47 bits or less.
// early out for small numbers and NaNs
if( ! (reference_fabs(x) > HEX_DBL( +, 1, 0, -, 24 )) )
return x;
// early out for large negative numbers
if( x < -130.0 )
return -1.0;
// early out for large positive numbers
if( x > 100.0 )
return INFINITY;
// separate x into fractional and integer parts
double i = reference_rint( x ); // round to nearest integer
double f = x - i; // -0.5 <= f <= 0.5
// reduce f to the range -0.0625 .. f.. 0.0625
int index = (int) (f * 16.0) + 8; // 0...16
static const double reduction[17] = { -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625,
0.0,
+0.0625, +0.125, +0.1875, +0.25, +0.3125, +0.375, +0.4375, +0.5 };
// exponentials[i] = expm1(reduction[i])
static const double exponentials[17] = { HEX_DBL( -, 1, 92e9a0720d3ec, -, 2 ), HEX_DBL( -, 1, 6adb1cd9205ee, -, 2 ),
HEX_DBL( -, 1, 40373d42ce2e3, -, 2 ), HEX_DBL( -, 1, 12d35a41ba104, -, 2 ),
HEX_DBL( -, 1, c5041854df7d4, -, 3 ), HEX_DBL( -, 1, 5e25fb4fde211, -, 3 ),
HEX_DBL( -, 1, e14aed893eef4, -, 4 ), HEX_DBL( -, 1, f0540438fd5c3, -, 5 ),
HEX_DBL( +, 0, 0, +, 0 ),
HEX_DBL( +, 1, 082b577d34ed8, -, 4 ), HEX_DBL( +, 1, 10b022db7ae68, -, 3 ),
HEX_DBL( +, 1, a65c0b85ac1a9, -, 3 ), HEX_DBL( +, 1, 22d78f0fa061a, -, 2 ),
HEX_DBL( +, 1, 77a45d8117fd5, -, 2 ), HEX_DBL( +, 1, d1e944f6fbdaa, -, 2 ),
HEX_DBL( +, 1, 190048ef6002, -, 1 ), HEX_DBL( +, 1, 4c2531c3c0d38, -, 1 ),
};
f -= reduction[index];
// find expm1(f)
// calculate as p(f) = (exp(f)-1)/f
// expm1(f) = f * p(f)
// p(f) is a minimax polynomial with error within 0x1.1d7693618d001p-48 over the range +- 0.0625
double p = 0.999999999999998001599 +
(0.499999999999839628284 +
(0.166666666672817459505 +
(0.041666666612283048687 +
(0.008333330214567431435 +
(0.001389005319303770070 + 0.000198833381525156667 * f)*f)*f)*f)*f)*f;
f *= p; // expm1( reduced f )
// expm1(f) = (exmp1( reduced_f) + 1.0) * ( exponentials[index] + 1 ) - 1
// = exmp1( reduced_f) * exponentials[index] + exmp1( reduced_f) + exponentials[index] + 1 -1
// = exmp1( reduced_f) * exponentials[index] + exmp1( reduced_f) + exponentials[index]
f += exponentials[index] + f * exponentials[index];
// scale by e ** i
int exponent = (int) i;
if( 0 == exponent )
return f; // precise answer for x near 1
// table of e**(i-150)
static const double exp_table[128+150+1] =
{
HEX_DBL( +, 1, 82e16284f5ec5, -, 217 ), HEX_DBL( +, 1, 06e9996332ba1, -, 215 ),
HEX_DBL( +, 1, 6555cb289e44b, -, 214 ), HEX_DBL( +, 1, e5ab364643354, -, 213 ),
HEX_DBL( +, 1, 4a0bd18e64df7, -, 211 ), HEX_DBL( +, 1, c094499cc578e, -, 210 ),
HEX_DBL( +, 1, 30d759323998c, -, 208 ), HEX_DBL( +, 1, 9e5278ab1d4cf, -, 207 ),
HEX_DBL( +, 1, 198fa3f30be25, -, 205 ), HEX_DBL( +, 1, 7eae636d6144e, -, 204 ),
HEX_DBL( +, 1, 040f1036f4863, -, 202 ), HEX_DBL( +, 1, 6174e477a895f, -, 201 ),
HEX_DBL( +, 1, e065b82dd95a, -, 200 ), HEX_DBL( +, 1, 4676be491d129, -, 198 ),
HEX_DBL( +, 1, bbb5da5f7c823, -, 197 ), HEX_DBL( +, 1, 2d884eef5fdcb, -, 195 ),
HEX_DBL( +, 1, 99d3397ab8371, -, 194 ), HEX_DBL( +, 1, 1681497ed15b3, -, 192 ),
HEX_DBL( +, 1, 7a870f597fdbd, -, 191 ), HEX_DBL( +, 1, 013c74edba307, -, 189 ),
HEX_DBL( +, 1, 5d9ec4ada7938, -, 188 ), HEX_DBL( +, 1, db2edfd20fa7c, -, 187 ),
HEX_DBL( +, 1, 42eb9f39afb0b, -, 185 ), HEX_DBL( +, 1, b6e4f282b43f4, -, 184 ),
HEX_DBL( +, 1, 2a42764857b19, -, 182 ), HEX_DBL( +, 1, 9560792d19314, -, 181 ),
HEX_DBL( +, 1, 137b6ce8e052c, -, 179 ), HEX_DBL( +, 1, 766b45dd84f18, -, 178 ),
HEX_DBL( +, 1, fce362fe6e7d, -, 177 ), HEX_DBL( +, 1, 59d34dd8a5473, -, 175 ),
HEX_DBL( +, 1, d606847fc727a, -, 174 ), HEX_DBL( +, 1, 3f6a58b795de3, -, 172 ),
HEX_DBL( +, 1, b2216c6efdac1, -, 171 ), HEX_DBL( +, 1, 2705b5b153fb8, -, 169 ),
HEX_DBL( +, 1, 90fa1509bd50d, -, 168 ), HEX_DBL( +, 1, 107df698da211, -, 166 ),
HEX_DBL( +, 1, 725ae6e7b9d35, -, 165 ), HEX_DBL( +, 1, f75d6040aeff6, -, 164 ),
HEX_DBL( +, 1, 56126259e093c, -, 162 ), HEX_DBL( +, 1, d0ec7df4f7bd4, -, 161 ),
HEX_DBL( +, 1, 3bf2cf6722e46, -, 159 ), HEX_DBL( +, 1, ad6b22f55db42, -, 158 ),
HEX_DBL( +, 1, 23d1f3e5834a, -, 156 ), HEX_DBL( +, 1, 8c9feab89b876, -, 155 ),
HEX_DBL( +, 1, 0d88cf37f00dd, -, 153 ), HEX_DBL( +, 1, 6e55d2bf838a7, -, 152 ),
HEX_DBL( +, 1, f1e6b68529e33, -, 151 ), HEX_DBL( +, 1, 525be4e4e601d, -, 149 ),
HEX_DBL( +, 1, cbe0a45f75eb1, -, 148 ), HEX_DBL( +, 1, 3884e838aea68, -, 146 ),
HEX_DBL( +, 1, a8c1f14e2af5d, -, 145 ), HEX_DBL( +, 1, 20a717e64a9bd, -, 143 ),
HEX_DBL( +, 1, 8851d84118908, -, 142 ), HEX_DBL( +, 1, 0a9bdfb02d24, -, 140 ),
HEX_DBL( +, 1, 6a5bea046b42e, -, 139 ), HEX_DBL( +, 1, ec7f3b269efa8, -, 138 ),
HEX_DBL( +, 1, 4eafb87eab0f2, -, 136 ), HEX_DBL( +, 1, c6e2d05bbc, -, 135 ),
HEX_DBL( +, 1, 35208867c2683, -, 133 ), HEX_DBL( +, 1, a425b317eeacd, -, 132 ),
HEX_DBL( +, 1, 1d8508fa8246a, -, 130 ), HEX_DBL( +, 1, 840fbc08fdc8a, -, 129 ),
HEX_DBL( +, 1, 07b7112bc1ffe, -, 127 ), HEX_DBL( +, 1, 666d0dad2961d, -, 126 ),
HEX_DBL( +, 1, e726c3f64d0fe, -, 125 ), HEX_DBL( +, 1, 4b0dc07cabf98, -, 123 ),
HEX_DBL( +, 1, c1f2daf3b6a46, -, 122 ), HEX_DBL( +, 1, 31c5957a47de2, -, 120 ),
HEX_DBL( +, 1, 9f96445648b9f, -, 119 ), HEX_DBL( +, 1, 1a6baeadb4fd1, -, 117 ),
HEX_DBL( +, 1, 7fd974d372e45, -, 116 ), HEX_DBL( +, 1, 04da4d1452919, -, 114 ),
HEX_DBL( +, 1, 62891f06b345, -, 113 ), HEX_DBL( +, 1, e1dd273aa8a4a, -, 112 ),
HEX_DBL( +, 1, 4775e0840bfdd, -, 110 ), HEX_DBL( +, 1, bd109d9d94bda, -, 109 ),
HEX_DBL( +, 1, 2e73f53fba844, -, 107 ), HEX_DBL( +, 1, 9b138170d6bfe, -, 106 ),
HEX_DBL( +, 1, 175af0cf60ec5, -, 104 ), HEX_DBL( +, 1, 7baee1bffa80b, -, 103 ),
HEX_DBL( +, 1, 02057d1245ceb, -, 101 ), HEX_DBL( +, 1, 5eafffb34ba31, -, 100 ),
HEX_DBL( +, 1, dca23bae16424, -, 99 ), HEX_DBL( +, 1, 43e7fc88b8056, -, 97 ),
HEX_DBL( +, 1, b83bf23a9a9eb, -, 96 ), HEX_DBL( +, 1, 2b2b8dd05b318, -, 94 ),
HEX_DBL( +, 1, 969d47321e4cc, -, 93 ), HEX_DBL( +, 1, 1452b7723aed2, -, 91 ),
HEX_DBL( +, 1, 778fe2497184c, -, 90 ), HEX_DBL( +, 1, fe7116182e9cc, -, 89 ),
HEX_DBL( +, 1, 5ae191a99585a, -, 87 ), HEX_DBL( +, 1, d775d87da854d, -, 86 ),
HEX_DBL( +, 1, 4063f8cc8bb98, -, 84 ), HEX_DBL( +, 1, b374b315f87c1, -, 83 ),
HEX_DBL( +, 1, 27ec458c65e3c, -, 81 ), HEX_DBL( +, 1, 923372c67a074, -, 80 ),
HEX_DBL( +, 1, 1152eaeb73c08, -, 78 ), HEX_DBL( +, 1, 737c5645114b5, -, 77 ),
HEX_DBL( +, 1, f8e6c24b5592e, -, 76 ), HEX_DBL( +, 1, 571db733a9d61, -, 74 ),
HEX_DBL( +, 1, d257d547e083f, -, 73 ), HEX_DBL( +, 1, 3ce9b9de78f85, -, 71 ),
HEX_DBL( +, 1, aebabae3a41b5, -, 70 ), HEX_DBL( +, 1, 24b6031b49bda, -, 68 ),
HEX_DBL( +, 1, 8dd5e1bb09d7e, -, 67 ), HEX_DBL( +, 1, 0e5b73d1ff53d, -, 65 ),
HEX_DBL( +, 1, 6f741de1748ec, -, 64 ), HEX_DBL( +, 1, f36bd37f42f3e, -, 63 ),
HEX_DBL( +, 1, 536452ee2f75c, -, 61 ), HEX_DBL( +, 1, cd480a1b7482, -, 60 ),
HEX_DBL( +, 1, 39792499b1a24, -, 58 ), HEX_DBL( +, 1, aa0de4bf35b38, -, 57 ),
HEX_DBL( +, 1, 2188ad6ae3303, -, 55 ), HEX_DBL( +, 1, 898471fca6055, -, 54 ),
HEX_DBL( +, 1, 0b6c3afdde064, -, 52 ), HEX_DBL( +, 1, 6b7719a59f0e, -, 51 ),
HEX_DBL( +, 1, ee001eed62aa, -, 50 ), HEX_DBL( +, 1, 4fb547c775da8, -, 48 ),
HEX_DBL( +, 1, c8464f7616468, -, 47 ), HEX_DBL( +, 1, 36121e24d3bba, -, 45 ),
HEX_DBL( +, 1, a56e0c2ac7f75, -, 44 ), HEX_DBL( +, 1, 1e642baeb84a, -, 42 ),
HEX_DBL( +, 1, 853f01d6d53ba, -, 41 ), HEX_DBL( +, 1, 0885298767e9a, -, 39 ),
HEX_DBL( +, 1, 67852a7007e42, -, 38 ), HEX_DBL( +, 1, e8a37a45fc32e, -, 37 ),
HEX_DBL( +, 1, 4c1078fe9228a, -, 35 ), HEX_DBL( +, 1, c3527e433fab1, -, 34 ),
HEX_DBL( +, 1, 32b48bf117da2, -, 32 ), HEX_DBL( +, 1, a0db0d0ddb3ec, -, 31 ),
HEX_DBL( +, 1, 1b48655f37267, -, 29 ), HEX_DBL( +, 1, 81056ff2c5772, -, 28 ),
HEX_DBL( +, 1, 05a628c699fa1, -, 26 ), HEX_DBL( +, 1, 639e3175a689d, -, 25 ),
HEX_DBL( +, 1, e355bbaee85cb, -, 24 ), HEX_DBL( +, 1, 4875ca227ec38, -, 22 ),
HEX_DBL( +, 1, be6c6fdb01612, -, 21 ), HEX_DBL( +, 1, 2f6053b981d98, -, 19 ),
HEX_DBL( +, 1, 9c54c3b43bc8b, -, 18 ), HEX_DBL( +, 1, 18354238f6764, -, 16 ),
HEX_DBL( +, 1, 7cd79b5647c9b, -, 15 ), HEX_DBL( +, 1, 02cf22526545a, -, 13 ),
HEX_DBL( +, 1, 5fc21041027ad, -, 12 ), HEX_DBL( +, 1, de16b9c24a98f, -, 11 ),
HEX_DBL( +, 1, 44e51f113d4d6, -, 9 ), HEX_DBL( +, 1, b993fe00d5376, -, 8 ),
HEX_DBL( +, 1, 2c155b8213cf4, -, 6 ), HEX_DBL( +, 1, 97db0ccceb0af, -, 5 ),
HEX_DBL( +, 1, 152aaa3bf81cc, -, 3 ), HEX_DBL( +, 1, 78b56362cef38, -, 2 ),
HEX_DBL( +, 1, 0, +, 0 ), HEX_DBL( +, 1, 5bf0a8b145769, +, 1 ),
HEX_DBL( +, 1, d8e64b8d4ddae, +, 2 ), HEX_DBL( +, 1, 415e5bf6fb106, +, 4 ),
HEX_DBL( +, 1, b4c902e273a58, +, 5 ), HEX_DBL( +, 1, 28d389970338f, +, 7 ),
HEX_DBL( +, 1, 936dc5690c08f, +, 8 ), HEX_DBL( +, 1, 122885aaeddaa, +, 10 ),
HEX_DBL( +, 1, 749ea7d470c6e, +, 11 ), HEX_DBL( +, 1, fa7157c470f82, +, 12 ),
HEX_DBL( +, 1, 5829dcf95056, +, 14 ), HEX_DBL( +, 1, d3c4488ee4f7f, +, 15 ),
HEX_DBL( +, 1, 3de1654d37c9a, +, 17 ), HEX_DBL( +, 1, b00b5916ac955, +, 18 ),
HEX_DBL( +, 1, 259ac48bf05d7, +, 20 ), HEX_DBL( +, 1, 8f0ccafad2a87, +, 21 ),
HEX_DBL( +, 1, 0f2ebd0a8002, +, 23 ), HEX_DBL( +, 1, 709348c0ea4f9, +, 24 ),
HEX_DBL( +, 1, f4f22091940bd, +, 25 ), HEX_DBL( +, 1, 546d8f9ed26e1, +, 27 ),
HEX_DBL( +, 1, ceb088b68e804, +, 28 ), HEX_DBL( +, 1, 3a6e1fd9eecfd, +, 30 ),
HEX_DBL( +, 1, ab5adb9c436, +, 31 ), HEX_DBL( +, 1, 226af33b1fdc1, +, 33 ),
HEX_DBL( +, 1, 8ab7fb5475fb7, +, 34 ), HEX_DBL( +, 1, 0c3d3920962c9, +, 36 ),
HEX_DBL( +, 1, 6c932696a6b5d, +, 37 ), HEX_DBL( +, 1, ef822f7f6731d, +, 38 ),
HEX_DBL( +, 1, 50bba3796379a, +, 40 ), HEX_DBL( +, 1, c9aae4631c056, +, 41 ),
HEX_DBL( +, 1, 370470aec28ed, +, 43 ), HEX_DBL( +, 1, a6b765d8cdf6d, +, 44 ),
HEX_DBL( +, 1, 1f43fcc4b662c, +, 46 ), HEX_DBL( +, 1, 866f34a725782, +, 47 ),
HEX_DBL( +, 1, 0953e2f3a1ef7, +, 49 ), HEX_DBL( +, 1, 689e221bc8d5b, +, 50 ),
HEX_DBL( +, 1, ea215a1d20d76, +, 51 ), HEX_DBL( +, 1, 4d13fbb1a001a, +, 53 ),
HEX_DBL( +, 1, c4b334617cc67, +, 54 ), HEX_DBL( +, 1, 33a43d282a519, +, 56 ),
HEX_DBL( +, 1, a220d397972eb, +, 57 ), HEX_DBL( +, 1, 1c25c88df6862, +, 59 ),
HEX_DBL( +, 1, 8232558201159, +, 60 ), HEX_DBL( +, 1, 0672a3c9eb871, +, 62 ),
HEX_DBL( +, 1, 64b41c6d37832, +, 63 ), HEX_DBL( +, 1, e4cf766fe49be, +, 64 ),
HEX_DBL( +, 1, 49767bc0483e3, +, 66 ), HEX_DBL( +, 1, bfc951eb8bb76, +, 67 ),
HEX_DBL( +, 1, 304d6aeca254b, +, 69 ), HEX_DBL( +, 1, 9d97010884251, +, 70 ),
HEX_DBL( +, 1, 19103e4080b45, +, 72 ), HEX_DBL( +, 1, 7e013cd114461, +, 73 ),
HEX_DBL( +, 1, 03996528e074c, +, 75 ), HEX_DBL( +, 1, 60d4f6fdac731, +, 76 ),
HEX_DBL( +, 1, df8c5af17ba3b, +, 77 ), HEX_DBL( +, 1, 45e3076d61699, +, 79 ),
HEX_DBL( +, 1, baed16a6e0da7, +, 80 ), HEX_DBL( +, 1, 2cffdfebde1a1, +, 82 ),
HEX_DBL( +, 1, 9919cabefcb69, +, 83 ), HEX_DBL( +, 1, 160345c9953e3, +, 85 ),
HEX_DBL( +, 1, 79dbc9dc53c66, +, 86 ), HEX_DBL( +, 1, 00c810d464097, +, 88 ),
HEX_DBL( +, 1, 5d009394c5c27, +, 89 ), HEX_DBL( +, 1, da57de8f107a8, +, 90 ),
HEX_DBL( +, 1, 425982cf597cd, +, 92 ), HEX_DBL( +, 1, b61e5ca3a5e31, +, 93 ),
HEX_DBL( +, 1, 29bb825dfcf87, +, 95 ), HEX_DBL( +, 1, 94a90db0d6fe2, +, 96 ),
HEX_DBL( +, 1, 12fec759586fd, +, 98 ), HEX_DBL( +, 1, 75c1dc469e3af, +, 99 ),
HEX_DBL( +, 1, fbfd219c43b04, +, 100 ), HEX_DBL( +, 1, 5936d44e1a146, +, 102 ),
HEX_DBL( +, 1, d531d8a7ee79c, +, 103 ), HEX_DBL( +, 1, 3ed9d24a2d51b, +, 105 ),
HEX_DBL( +, 1, b15cfe5b6e17b, +, 106 ), HEX_DBL( +, 1, 268038c2c0e, +, 108 ),
HEX_DBL( +, 1, 9044a73545d48, +, 109 ), HEX_DBL( +, 1, 1002ab6218b38, +, 111 ),
HEX_DBL( +, 1, 71b3540cbf921, +, 112 ), HEX_DBL( +, 1, f6799ea9c414a, +, 113 ),
HEX_DBL( +, 1, 55779b984f3eb, +, 115 ), HEX_DBL( +, 1, d01a210c44aa4, +, 116 ),
HEX_DBL( +, 1, 3b63da8e9121, +, 118 ), HEX_DBL( +, 1, aca8d6b0116b8, +, 119 ),
HEX_DBL( +, 1, 234de9e0c74e9, +, 121 ), HEX_DBL( +, 1, 8bec7503ca477, +, 122 ),
HEX_DBL( +, 1, 0d0eda9796b9, +, 124 ), HEX_DBL( +, 1, 6db0118477245, +, 125 ),
HEX_DBL( +, 1, f1056dc7bf22d, +, 126 ), HEX_DBL( +, 1, 51c2cc3433801, +, 128 ),
HEX_DBL( +, 1, cb108ffbec164, +, 129 ), HEX_DBL( +, 1, 37f780991b584, +, 131 ),
HEX_DBL( +, 1, a801c0ea8ac4d, +, 132 ), HEX_DBL( +, 1, 20247cc4c46c1, +, 134 ),
HEX_DBL( +, 1, 87a0553328015, +, 135 ), HEX_DBL( +, 1, 0a233dee4f9bb, +, 137 ),
HEX_DBL( +, 1, 69b7f55b808ba, +, 138 ), HEX_DBL( +, 1, eba064644060a, +, 139 ),
HEX_DBL( +, 1, 4e184933d9364, +, 141 ), HEX_DBL( +, 1, c614fe2531841, +, 142 ),
HEX_DBL( +, 1, 3494a9b171bf5, +, 144 ), HEX_DBL( +, 1, a36798b9d969b, +, 145 ),
HEX_DBL( +, 1, 1d03d8c0c04af, +, 147 ), HEX_DBL( +, 1, 836026385c974, +, 148 ),
HEX_DBL( +, 1, 073fbe9ac901d, +, 150 ), HEX_DBL( +, 1, 65cae0969f286, +, 151 ),
HEX_DBL( +, 1, e64a58639cae8, +, 152 ), HEX_DBL( +, 1, 4a77f5f9b50f9, +, 154 ),
HEX_DBL( +, 1, c12744a3a28e3, +, 155 ), HEX_DBL( +, 1, 313b3b6978e85, +, 157 ),
HEX_DBL( +, 1, 9eda3a31e587e, +, 158 ), HEX_DBL( +, 1, 19ebe56b56453, +, 160 ),
HEX_DBL( +, 1, 7f2bc6e599b7e, +, 161 ), HEX_DBL( +, 1, 04644610df2ff, +, 163 ),
HEX_DBL( +, 1, 61e8b490ac4e6, +, 164 ), HEX_DBL( +, 1, e103201f299b3, +, 165 ),
HEX_DBL( +, 1, 46e1b637beaf5, +, 167 ), HEX_DBL( +, 1, bc473cfede104, +, 168 ),
HEX_DBL( +, 1, 2deb1b9c85e2d, +, 170 ), HEX_DBL( +, 1, 9a5981ca67d1, +, 171 ),
HEX_DBL( +, 1, 16dc8a9ef670b, +, 173 ), HEX_DBL( +, 1, 7b03166942309, +, 174 ),
HEX_DBL( +, 1, 0190be03150a7, +, 176 ), HEX_DBL( +, 1, 5e1152f9a8119, +, 177 ),
HEX_DBL( +, 1, dbca9263f8487, +, 178 ), HEX_DBL( +, 1, 43556dee93bee, +, 180 ),
HEX_DBL( +, 1, b774c12967dfa, +, 181 ), HEX_DBL( +, 1, 2aa4306e922c2, +, 183 ),
HEX_DBL( +, 1, 95e54c5dd4217, +, 184 ) };
// scale by e**i -- (expm1(f) + 1)*e**i - 1 = expm1(f) * e**i + e**i - 1 = e**i
return exp_table[exponent+150] + (f * exp_table[exponent+150] - 1.0);
}
double reference_fmax( double x, double y )
{
if( isnan(y) )
return x;
return x >= y ? x : y;
}
double reference_fmin( double x, double y )
{
if( isnan(y) )
return x;
return x <= y ? x : y;
}
double reference_hypot( double x, double y )
{
// Since the inputs are actually floats, we don't have to worry about range here
if( isinf(x) || isinf(y) )
return INFINITY;
return sqrt( x * x + y * y );
}
int reference_ilogbl( long double x)
{
extern int gDeviceILogb0, gDeviceILogbNaN;
// Since we are just using this to verify double precision, we can
// use the double precision ilogb here
union { double f; cl_ulong u;} u;
u.f = (double) x;
int exponent = (int)(u.u >> 52) & 0x7ff;
if( exponent == 0x7ff )
{
if( u.u & 0x000fffffffffffffULL )
return gDeviceILogbNaN;
return CL_INT_MAX;
}
if( exponent == 0 )
{ // deal with denormals
u.f = x * HEX_DBL( +, 1, 0, +, 64 );
exponent = (cl_uint)(u.u >> 52) & 0x7ff;
if( exponent == 0 )
return gDeviceILogb0;
exponent -= 1023 + 64;
return exponent;
}
return exponent - 1023;
}
//double reference_log2( double x )
//{
// return log( x ) * 1.44269504088896340735992468100189214;
//}
double reference_relaxed_log2( double x )
{
return reference_log2(x);
}
double reference_log2( double x )
{
if( isnan(x) || x < 0.0 || x == -INFINITY)
return cl_make_nan();
if( x == 0.0f)
return -INFINITY;
if( x == INFINITY )
return INFINITY;
double hi, lo;
__log2_ep( &hi, &lo, x );
return hi;
}
double reference_log1p( double x )
{ // This function is suitable only for verifying log1pf(). It produces several double precision ulps of error.
// Handle small and NaN
if( ! ( reference_fabs(x) > HEX_DBL( +, 1, 0, -, 53 ) ) )
return x;
// deal with special values
if( x <= -1.0 )
{
if( x < -1.0 )
return cl_make_nan();
return -INFINITY;
}
// infinity
if( x == INFINITY )
return INFINITY;
// High precision result for when near 0, to avoid problems with the reference result falling in the wrong binade.
if( reference_fabs(x) < HEX_DBL( +, 1, 0, -, 28 ) )
return (1.0 - 0.5 * x) * x;
// Our polynomial is only good in the region +-2**-4.
// If we aren't in that range then we need to reduce to be in that range
double correctionLo = -0.0; // correction down stream to compensate for the reduction, if any
double correctionHi = -0.0; // correction down stream to compensate for the exponent, if any
if( reference_fabs(x) > HEX_DBL( +, 1, 0, -, 4 ) )
{
x += 1.0; // double should cover any loss of precision here
// separate x into (1+f) * 2**i
union{ double d; cl_ulong u;} u; u.d = x;
int i = (int) ((u.u >> 52) & 0x7ff) - 1023;
u.u &= 0x000fffffffffffffULL;
int index = (int) (u.u >> 48 );
u.u |= 0x3ff0000000000000ULL;
double f = u.d;
// further reduce f to be within 1/16 of 1.0
static const double scale_table[16] = { 1.0, HEX_DBL( +, 1, d2d2d2d6e3f79, -, 1 ), HEX_DBL( +, 1, b8e38e42737a1, -, 1 ), HEX_DBL( +, 1, a1af28711adf3, -, 1 ),
HEX_DBL( +, 1, 8cccccd88dd65, -, 1 ), HEX_DBL( +, 1, 79e79e810ec8f, -, 1 ), HEX_DBL( +, 1, 68ba2e94df404, -, 1 ), HEX_DBL( +, 1, 590b216defb29, -, 1 ),
HEX_DBL( +, 1, 4aaaaab1500ed, -, 1 ), HEX_DBL( +, 1, 3d70a3e0d6f73, -, 1 ), HEX_DBL( +, 1, 313b13bb39f4f, -, 1 ), HEX_DBL( +, 1, 25ed09823f1cc, -, 1 ),
HEX_DBL( +, 1, 1b6db6e77457b, -, 1 ), HEX_DBL( +, 1, 11a7b96a3a34f, -, 1 ), HEX_DBL( +, 1, 0888888e46fea, -, 1 ), HEX_DBL( +, 1, 00000038e9862, -, 1 ) };
// correction_table[i] = -log( scale_table[i] )
// All entries have >= 64 bits of precision (rather than the expected 53)
static const double correction_table[16] = { -0.0, HEX_DBL( +, 1, 7a5c722c16058, -, 4 ), HEX_DBL( +, 1, 323db16c89ab1, -, 3 ), HEX_DBL( +, 1, a0f87d180629, -, 3 ),
HEX_DBL( +, 1, 050279324e17c, -, 2 ), HEX_DBL( +, 1, 36f885bb270b0, -, 2 ), HEX_DBL( +, 1, 669b771b5cc69, -, 2 ), HEX_DBL( +, 1, 94203a6292a05, -, 2 ),
HEX_DBL( +, 1, bfb4f9cb333a4, -, 2 ), HEX_DBL( +, 1, e982376ddb80e, -, 2 ), HEX_DBL( +, 1, 08d5d8769b2b2, -, 1 ), HEX_DBL( +, 1, 1c288bc00e0cf, -, 1 ),
HEX_DBL( +, 1, 2ec7535b31ecb, -, 1 ), HEX_DBL( +, 1, 40bed0adc63fb, -, 1 ), HEX_DBL( +, 1, 521a5c0330615, -, 1 ), HEX_DBL( +, 1, 62e42f7dd092c, -, 1 ) };
f *= scale_table[index];
correctionLo = correction_table[index];
// log( 2**(i) ) = i * log(2)
correctionHi = (double)i * 0.693147180559945309417232121458176568;
x = f - 1.0;
}
// minmax polynomial for p(x) = (log(x+1) - x)/x valid over the range x = [-1/16, 1/16]
// max error HEX_DBL( +, 1, 048f61f9a5eca, -, 52 )
double p = HEX_DBL( -, 1, cc33de97a9d7b, -, 46 ) +
(HEX_DBL( -, 1, fffffffff3eb7, -, 2 ) +
(HEX_DBL( +, 1, 5555555633ef7, -, 2 ) +
(HEX_DBL( -, 1, 00000062c78, -, 2 ) +
(HEX_DBL( +, 1, 9999958a3321, -, 3 ) +
(HEX_DBL( -, 1, 55534ce65c347, -, 3 ) +
(HEX_DBL( +, 1, 24957208391a5, -, 3 ) +
(HEX_DBL( -, 1, 02287b9a5b4a1, -, 3 ) +
HEX_DBL( +, 1, c757d922180ed, -, 4 ) * x)*x)*x)*x)*x)*x)*x)*x;
// log(x+1) = x * p(x) + x
x += x * p;
return correctionHi + (correctionLo + x);
}
double reference_logb( double x )
{
union { float f; cl_uint u;} u;
u.f = (float) x;
cl_int exponent = (u.u >> 23) & 0xff;
if( exponent == 0xff )
return x * x;
if( exponent == 0 )
{ // deal with denormals
u.u = (u.u & 0x007fffff) | 0x3f800000;
u.f -= 1.0f;
exponent = (u.u >> 23) & 0xff;
if( exponent == 0 )
return -INFINITY;
return exponent - (127 + 126);
}
return exponent - 127;
}
double reference_relaxed_reciprocal(double x)
{
return 1.0f / ((float) x);
}
double reference_reciprocal( double x )
{
return 1.0 / x;
}
double reference_remainder( double x, double y )
{
int i;
return reference_remquo( x, y, &i );
}
double reference_lgamma( double x)
{
/*
* ====================================================
* This function is from fdlibm. http://www.netlib.org
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunSoft, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*
*/
static const double //two52 = 4.50359962737049600000e+15, /* 0x43300000, 0x00000000 */
half= 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */
a0 = 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */
a1 = 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */
a2 = 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */
a3 = 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */
a4 = 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */
a5 = 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */
a6 = 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */
a7 = 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */
a8 = 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */
a9 = 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */
a10 = 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */
a11 = 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */
tc = 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */
tf = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */
/* tt = -(tail of tf) */
tt = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */
t0 = 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */
t1 = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */
t2 = 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */
t3 = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */
t4 = 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */
t5 = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */
t6 = 6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */
t7 = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */
t8 = 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */
t9 = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */
t10 = 8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */
t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */
t12 = 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */
t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */
t14 = 3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */
u0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
u1 = 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */
u2 = 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */
u3 = 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */
u4 = 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */
u5 = 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */
v1 = 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */
v2 = 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */
v3 = 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */
v4 = 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */
v5 = 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */
s0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
s1 = 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */
s2 = 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */
s3 = 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */
s4 = 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */
s5 = 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */
s6 = 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */
r1 = 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */
r2 = 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */
r3 = 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */
r4 = 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */
r5 = 7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */
r6 = 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */
w0 = 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */
w1 = 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */
w2 = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */
w3 = 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */
w4 = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */
w5 = 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */
w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
static const double zero= 0.00000000000000000000e+00;
double t,y,z,nadj,p,p1,p2,p3,q,r,w;
cl_int i,hx,lx,ix;
union{ double d; cl_ulong u;}u; u.d = x;
hx = (cl_int) (u.u >> 32);
lx = (cl_int) (u.u & 0xffffffffULL);
/* purge off +-inf, NaN, +-0, and negative arguments */
// *signgamp = 1;
ix = hx&0x7fffffff;
if(ix>=0x7ff00000) return x*x;
if((ix|lx)==0) return INFINITY;
if(ix<0x3b900000) { /* |x|<2**-70, return -log(|x|) */
if(hx<0) {
// *signgamp = -1;
return -reference_log(-x);
} else return -reference_log(x);
}
if(hx<0) {
if(ix>=0x43300000) /* |x|>=2**52, must be -integer */
return INFINITY;
t = reference_sinpi(x);
if(t==zero) return INFINITY; /* -integer */
nadj = reference_log(pi/reference_fabs(t*x));
// if(t<zero) *signgamp = -1;
x = -x;
}
/* purge off 1 and 2 */
if((((ix-0x3ff00000)|lx)==0)||(((ix-0x40000000)|lx)==0)) r = 0;
/* for x < 2.0 */
else if(ix<0x40000000) {
if(ix<=0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */
r = -reference_log(x);
if(ix>=0x3FE76944) {y = 1.0-x; i= 0;}
else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;}
else {y = x; i=2;}
} else {
r = zero;
if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */
else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */
else {y=x-one;i=2;}
}
switch(i) {
case 0:
z = y*y;
p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10))));
p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11)))));
p = y*p1+p2;
r += (p-0.5*y); break;
case 1:
z = y*y;
w = z*y;
p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12))); /* parallel comp */
p2 = t1+w*(t4+w*(t7+w*(t10+w*t13)));
p3 = t2+w*(t5+w*(t8+w*(t11+w*t14)));
p = z*p1-(tt-w*(p2+y*p3));
r += (tf + p); break;
case 2:
p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5)))));
p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5))));
r += (-0.5*y + p1/p2);
}
}
else if(ix<0x40200000) { /* x < 8.0 */
i = (int)x;
t = zero;
y = x-(double)i;
p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6))))));
q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6)))));
r = half*y+p/q;
z = one; /* lgamma(1+s) = log(s) + lgamma(s) */
switch(i) {
case 7: z *= (y+6.0); /* FALLTHRU */
case 6: z *= (y+5.0); /* FALLTHRU */
case 5: z *= (y+4.0); /* FALLTHRU */
case 4: z *= (y+3.0); /* FALLTHRU */
case 3: z *= (y+2.0); /* FALLTHRU */
r += reference_log(z); break;
}
/* 8.0 <= x < 2**58 */
} else if (ix < 0x43900000) {
t = reference_log(x);
z = one/x;
y = z*z;
w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6)))));
r = (x-half)*(t-one)+w;
} else
/* 2**58 <= x <= inf */
r = x*(reference_log(x)-one);
if(hx<0) r = nadj - r;
return r;
}
#endif // _MSC_VER
double reference_assignment( double x ){ return x; }
int reference_not( double x )
{
int r = !x;
return r;
}
#pragma mark -
#pragma mark Double testing
#ifndef M_PIL
#define M_PIL 3.14159265358979323846264338327950288419716939937510582097494459230781640628620899L
#endif
static long double reduce1l( long double x );
#ifdef __PPC__
// Since long double on PPC is really extended precision double arithmetic
// consisting of two doubles (a high and low). This form of long double has
// the potential of representing a number with more than LDBL_MANT_DIG digits
// such that reduction algorithm used for other architectures will not work.
// Instead and alternate reduction method is used.
static long double reduce1l( long double x )
{
union {
long double ld;
double d[2];
} u;
// Reduce the high and low halfs separately.
u.ld = x;
return ((long double)reduce1(u.d[0]) + reduce1(u.d[1]));
}
#else // !__PPC__
static long double reduce1l( long double x )
{
static long double unit_exp = 0;
if( 0.0L == unit_exp )
unit_exp = scalbnl( 1.0L, LDBL_MANT_DIG);
if( reference_fabsl(x) >= unit_exp )
{
if( reference_fabsl(x) == INFINITY )
return cl_make_nan();
return 0.0L; //we patch up the sign for sinPi and cosPi later, since they need different signs
}
// Find the nearest multiple of 2
const long double r = reference_copysignl( unit_exp, x );
long double z = x + r;
z -= r;
// subtract it from x. Value is now in the range -1 <= x <= 1
return x - z;
}
#endif // __PPC__
long double reference_acospil( long double x){ return reference_acosl( x ) / M_PIL; }
long double reference_asinpil( long double x){ return reference_asinl( x ) / M_PIL; }
long double reference_atanpil( long double x){ return reference_atanl( x ) / M_PIL; }
long double reference_atan2pil( long double y, long double x){ return reference_atan2l( y, x) / M_PIL; }
long double reference_cospil( long double x)
{
if( reference_fabsl(x) >= HEX_LDBL( +, 1, 0, +, 54 ) )
{
if( reference_fabsl(x) == INFINITY )
return cl_make_nan();
//Note this probably fails for odd values between 0x1.0p52 and 0x1.0p53.
//However, when starting with single precision inputs, there will be no odd values.
return 1.0L;
}
x = reduce1l(x);
#if DBL_MANT_DIG >= LDBL_MANT_DIG
// phase adjust
double xhi = 0.0;
double xlo = 0.0;
xhi = (double) x + 0.5;
if(reference_fabsl(x) > 0.5L)
{
xlo = xhi - x;
xlo = 0.5 - xlo;
}
else
{
xlo = xhi - 0.5;
xlo = x - xlo;
}
// reduce to [-0.5, 0.5]
if( xhi < -0.5 )
{
xhi = -1.0 - xhi;
xlo = -xlo;
}
else if ( xhi > 0.5 )
{
xhi = 1.0 - xhi;
xlo = -xlo;
}
// cosPi zeros are all +0
if( xhi == 0.0 && xlo == 0.0 )
return 0.0;
xhi *= M_PI;
xlo *= M_PI;
xhi += xlo;
return reference_sinl( xhi );
#else
// phase adjust
x += 0.5L;
// reduce to [-0.5, 0.5]
if( x < -0.5L )
x = -1.0L - x;
else if ( x > 0.5L )
x = 1.0L - x;
// cosPi zeros are all +0
if( x == 0.0L )
return 0.0L;
return reference_sinl( x * M_PIL );
#endif
}
long double reference_dividel( long double x, long double y)
{
double dx = x;
double dy = y;
return dx/dy;
}
typedef struct{ double hi, lo; } double_double;
// Split doubles_double into a series of consecutive 26-bit precise doubles and a remainder.
// Note for later -- for multiplication, it might be better to split each double into a power of two and two 26 bit portions
// multiplication of a double double by a known power of two is cheap. The current approach causes some inexact arithmetic in mul_dd.
static inline void split_dd( double_double x, double_double *hi, double_double *lo )
{
union{ double d; cl_ulong u;}u;
u.d = x.hi;
u.u &= 0xFFFFFFFFF8000000ULL;
hi->hi = u.d;
x.hi -= u.d;
u.d = x.hi;
u.u &= 0xFFFFFFFFF8000000ULL;
hi->lo = u.d;
x.hi -= u.d;
double temp = x.hi;
x.hi += x.lo;
x.lo -= x.hi - temp;
u.d = x.hi;
u.u &= 0xFFFFFFFFF8000000ULL;
lo->hi = u.d;
x.hi -= u.d;
lo->lo = x.hi + x.lo;
}
static inline double_double accum_d( double_double a, double b )
{
double temp;
if( fabs(b) > fabs(a.hi) )
{
temp = a.hi;
a.hi += b;
a.lo += temp - (a.hi - b);
}
else
{
temp = a.hi;
a.hi += b;
a.lo += b - (a.hi - temp);
}
if( isnan( a.lo ) )
a.lo = 0.0;
return a;
}
static inline double_double add_dd( double_double a, double_double b )
{
double_double r = {-0.0 -0.0 };
if( isinf(a.hi) || isinf( b.hi ) ||
isnan(a.hi) || isnan( b.hi ) ||
0.0 == a.hi || 0.0 == b.hi )
{
r.hi = a.hi + b.hi;
r.lo = a.lo + b.lo;
if( isnan( r.lo ) )
r.lo = 0.0;
return r;
}
//merge sort terms by magnitude -- here we assume that |a.hi| > |a.lo|, |b.hi| > |b.lo|, so we don't have to do the first merge pass
double terms[4] = { a.hi, b.hi, a.lo, b.lo };
double temp;
//Sort hi terms
if( fabs(terms[0]) < fabs(terms[1]) )
{
temp = terms[0];
terms[0] = terms[1];
terms[1] = temp;
}
//sort lo terms
if( fabs(terms[2]) < fabs(terms[3]) )
{
temp = terms[2];
terms[2] = terms[3];
terms[3] = temp;
}
// Fix case where small high term is less than large low term
if( fabs(terms[1]) < fabs(terms[2]) )
{
temp = terms[1];
terms[1] = terms[2];
terms[2] = temp;
}
// accumulate the results
r.hi = terms[2] + terms[3];
r.lo = terms[3] - (r.hi - terms[2]);
temp = r.hi;
r.hi += terms[1];
r.lo += temp - (r.hi - terms[1]);
temp = r.hi;
r.hi += terms[0];
r.lo += temp - (r.hi - terms[0]);
// canonicalize the result
temp = r.hi;
r.hi += r.lo;
r.lo = r.lo - (r.hi - temp);
if( isnan( r.lo ) )
r.lo = 0.0;
return r;
}
static inline double_double mul_dd( double_double a, double_double b )
{
double_double result = {-0.0,-0.0};
// Inf, nan and 0
if( isnan( a.hi ) || isnan( b.hi ) ||
isinf( a.hi ) || isinf( b.hi ) ||
0.0 == a.hi || 0.0 == b.hi )
{
result.hi = a.hi * b.hi;
return result;
}
double_double ah, al, bh, bl;
split_dd( a, &ah, &al );
split_dd( b, &bh, &bl );
double p0 = ah.hi * bh.hi; // exact (52 bits in product) 0
double p1 = ah.hi * bh.lo; // exact (52 bits in product) 26
double p2 = ah.lo * bh.hi; // exact (52 bits in product) 26
double p3 = ah.lo * bh.lo; // exact (52 bits in product) 52
double p4 = al.hi * bh.hi; // exact (52 bits in product) 52
double p5 = al.hi * bh.lo; // exact (52 bits in product) 78
double p6 = al.lo * bh.hi; // inexact (54 bits in product) 78
double p7 = al.lo * bh.lo; // inexact (54 bits in product) 104
double p8 = ah.hi * bl.hi; // exact (52 bits in product) 52
double p9 = ah.hi * bl.lo; // inexact (54 bits in product) 78
double pA = ah.lo * bl.hi; // exact (52 bits in product) 78
double pB = ah.lo * bl.lo; // inexact (54 bits in product) 104
double pC = al.hi * bl.hi; // exact (52 bits in product) 104
// the last 3 terms are two low to appear in the result
// accumulate from bottom up
#if 0
// works but slow
result.hi = pC;
result = accum_d( result, pB );
result = accum_d( result, p7 );
result = accum_d( result, pA );
result = accum_d( result, p9 );
result = accum_d( result, p6 );
result = accum_d( result, p5 );
result = accum_d( result, p8 );
result = accum_d( result, p4 );
result = accum_d( result, p3 );
result = accum_d( result, p2 );
result = accum_d( result, p1 );
result = accum_d( result, p0 );
// canonicalize the result
double temp = result.hi;
result.hi += result.lo;
result.lo -= (result.hi - temp);
if( isnan( result.lo ) )
result.lo = 0.0;
return result;
#else
// take advantage of the known relative magnitudes of the partial products to avoid some sorting
// Combine 2**-78 and 2**-104 terms. Here we are a bit sloppy about canonicalizing the double_doubles
double_double t0 = { pA, pC };
double_double t1 = { p9, pB };
double_double t2 = { p6, p7 };
double temp0, temp1, temp2;
t0 = accum_d( t0, p5 ); // there is an extra 2**-78 term to deal with
// Add in 2**-52 terms. Here we are a bit sloppy about canonicalizing the double_doubles
temp0 = t0.hi; temp1 = t1.hi; temp2 = t2.hi;
t0.hi += p3; t1.hi += p4; t2.hi += p8;
temp0 -= t0.hi-p3; temp1 -= t1.hi-p4; temp2 -= t2.hi - p8;
t0.lo += temp0; t1.lo += temp1; t2.lo += temp2;
// Add in 2**-26 terms. Here we are a bit sloppy about canonicalizing the double_doubles
temp1 = t1.hi; temp2 = t2.hi;
t1.hi += p1; t2.hi += p2;
temp1 -= t1.hi-p1; temp2 -= t2.hi - p2;
t1.lo += temp1; t2.lo += temp2;
// Combine accumulators to get the low bits of result
t1 = add_dd( t1, add_dd( t2, t0 ) );
// Add in MSB's, and round to precision
return accum_d( t1, p0 ); // canonicalizes
#endif
}
long double reference_exp10l( long double z )
{
const double_double log2_10 = { HEX_DBL( +, 1, a934f0979a371, +, 1 ), HEX_DBL( +, 1, 7f2495fb7fa6d, -, 53 ) };
double_double x;
int j;
// Handle NaNs
if( isnan(z) )
return z;
// init x
x.hi = z;
x.lo = z - x.hi;
// 10**x = exp2( x * log2(10) )
x = mul_dd( x, log2_10); // x * log2(10)
//Deal with overflow and underflow for exp2(x) stage next
if( x.hi >= 1025 )
return INFINITY;
if( x.hi < -1075-24 )
return +0.0;
// find nearest integer to x
int i = (int) rint(x.hi);
// x now holds fractional part. The result would be then 2**i * exp2( x )
x.hi -= i;
// We could attempt to find a minimax polynomial for exp2(x) over the range x = [-0.5, 0.5].
// However, this would converge very slowly near the extrema, where 0.5**n is not a lot different
// from 0.5**(n+1), thereby requiring something like a 20th order polynomial to get 53 + 24 bits
// of precision. Instead we further reduce the range to [-1/32, 1/32] by observing that
//
// 2**(a+b) = 2**a * 2**b
//
// We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and reduce the range
// of x to [-1/32, 1/32] by subtracting away the nearest value of n/16 from x.
const double_double corrections[17] =
{
{ HEX_DBL( +, 1, 6a09e667f3bcd, -, 1 ), HEX_DBL( -, 1, bdd3413b26456, -, 55 ) },
{ HEX_DBL( +, 1, 7a11473eb0187, -, 1 ), HEX_DBL( -, 1, 41577ee04992f, -, 56 ) },
{ HEX_DBL( +, 1, 8ace5422aa0db, -, 1 ), HEX_DBL( +, 1, 6e9f156864b27, -, 55 ) },
{ HEX_DBL( +, 1, 9c49182a3f09, -, 1 ), HEX_DBL( +, 1, c7c46b071f2be, -, 57 ) },
{ HEX_DBL( +, 1, ae89f995ad3ad, -, 1 ), HEX_DBL( +, 1, 7a1cd345dcc81, -, 55 ) },
{ HEX_DBL( +, 1, c199bdd85529c, -, 1 ), HEX_DBL( +, 1, 11065895048dd, -, 56 ) },
{ HEX_DBL( +, 1, d5818dcfba487, -, 1 ), HEX_DBL( +, 1, 2ed02d75b3707, -, 56 ) },
{ HEX_DBL( +, 1, ea4afa2a490da, -, 1 ), HEX_DBL( -, 1, e9c23179c2893, -, 55 ) },
{ HEX_DBL( +, 1, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 ) },
{ HEX_DBL( +, 1, 0b5586cf9890f, +, 0 ), HEX_DBL( +, 1, 8a62e4adc610b, -, 54 ) },
{ HEX_DBL( +, 1, 172b83c7d517b, +, 0 ), HEX_DBL( -, 1, 19041b9d78a76, -, 55 ) },
{ HEX_DBL( +, 1, 2387a6e756238, +, 0 ), HEX_DBL( +, 1, 9b07eb6c70573, -, 54 ) },
{ HEX_DBL( +, 1, 306fe0a31b715, +, 0 ), HEX_DBL( +, 1, 6f46ad23182e4, -, 55 ) },
{ HEX_DBL( +, 1, 3dea64c123422, +, 0 ), HEX_DBL( +, 1, ada0911f09ebc, -, 55 ) },
{ HEX_DBL( +, 1, 4bfdad5362a27, +, 0 ), HEX_DBL( +, 1, d4397afec42e2, -, 56 ) },
{ HEX_DBL( +, 1, 5ab07dd485429, +, 0 ), HEX_DBL( +, 1, 6324c054647ad, -, 54 ) },
{ HEX_DBL( +, 1, 6a09e667f3bcd, +, 0 ), HEX_DBL( -, 1, bdd3413b26456, -, 54 ) }
};
int index = (int) rint( x.hi * 16.0 );
x.hi -= (double) index * 0.0625;
// canonicalize x
double temp = x.hi;
x.hi += x.lo;
x.lo -= x.hi - temp;
// Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32]. Max Error: 2 * 0x1.e112p-87
const double_double c[] = {
{HEX_DBL( +, 1, 62e42fefa39ef, -, 1 ), HEX_DBL( +, 1, abc9e3ac1d244, -, 56 )},
{HEX_DBL( +, 1, ebfbdff82c58f, -, 3 ), HEX_DBL( -, 1, 5e4987a631846, -, 57 )},