#ifndef LIGO_INLINE_MATH_X86_ASM #define LIGO_INLINE_MATH_X86_ASM #include "portableInline.h" #define M_PI 3.14159265358979323846 #define M_TWO_PI 6.28318530717958647692 #define M_SQRT2 1.41421356237309504880 #define __lrint_code \ long int __lrintres; \ __asm__ __volatile__( "fistpl %0" \ : "=m"( __lrintres ) \ : "t"( __x ) \ : "st" ); \ return __lrintres #ifdef __cplusplus extern "C" { #endif LIGO_INLINE int _rintf( float __x ) { __lrint_code; } LIGO_INLINE int _rint( double __x ) { __lrint_code; } LIGO_INLINE void sincos( double __x, double* __sinx, double* __cosx ) { register long double __cosr; register long double __sinr; __asm __volatile__( "push %%rax\n\t" "fsincos\n\t" "fnstsw %%ax\n\t" "testl $0x400, %%eax\n\t" "jz 1f\n\t" "fldpi\n\t" "fadd %%st(0)\n\t" "fxch %%st(1)\n\t" "2: fprem1\n\t" "fnstsw %%ax\n\t" "testl $0x400, %%eax\n\t" "jnz 2b\n\t" "fstp %%st(1)\n\t" "fsincos\n\t" "1: pop %%rax\n\t" : "=t"( __cosr ), "=u"( __sinr ) : "0"( __x ) ); *__sinx = __sinr; *__cosx = __cosr; } /* Fast Pentium FPU SQRT command */ LIGO_INLINE double lsqrt( double __x ) { register double __result; __asm __volatile__( "fsqrt" : "=t"( __result ) : "0"( __x ) ); return __result; } /* Fast Pentium FPU 2^x command for -1<=x<=1*/ LIGO_INLINE double l2xr( double __x ) { register double __result; __asm __volatile__( "f2xm1\n\t fld1\n\t faddp\n\t" : "=t"( __result ) : "0"( __x ) ); return __result; } /* Fast Pentium FPU round to nearest integer command */ LIGO_INLINE double lrndint( double __x ) { register double __result; __asm __volatile__( "frndint" : "=t"( __result ) : "0"( __x ) ); return __result; } /* Fast Pentium FPU to multiply with log2(10) */ LIGO_INLINE double lmullog210( double __x ) { register double __result; __asm __volatile__( "fldl2t\n\t fmulp" : "=t"( __result ) : "0"( __x ) ); return __result; } /* Fast Pentium FPU log10(x) command */ LIGO_INLINE double llog10( double __x ) { register double __result; __asm __volatile__( "fldlg2\n\t fxch %%st(1)\n\t fyl2x" : "=t"( __result ) : "0"( __x ) ); return __result; } /* Fast Pentium absolute value */ LIGO_INLINE double lfabs( double __x ) { register double __result; __asm __volatile__( "fabs" : "=t"( __result ) : "0"( __x ) ); return __result; } /* Fast Pentium ATAN2 */ LIGO_INLINE double latan2( double __y, double __x ) { register long double __atanr; __asm __volatile( "fpatan\n\t" : "=t"( __atanr ) : "0"( __x ), "u"( __y ) ); return __atanr; } /* SSE MXCSR The MXCSR register is a 32-bit register containing flags for control and status information regarding SSE instructions. As of SSE3, only bits 0-15 have been defined. Pnemonic Bit Location Description FZ bit 15 Flush To Zero R+ bit 14 Round Positive R- bit 13 Round Negative RZ bits 13 and 14 Round To Zero RN bits 13 and 14 are 0 Round To Nearest PM bit 12 Precision Mask UM bit 11 Underflow Mask OM bit 10 Overflow Mask ZM bit 9 Divide By Zero Mask DM bit 8 Denormal Mask IM bit 7 Invalid Operation Mask DAZ bit 6 Denormals Are Zero PE bit 5 Precision Flag UE bit 4 Underflow Flag OE bit 3 Overflow Flag ZE bit 2 Divide By Zero Flag DE bit 1 Denormal Flag IE bit 0 Invalid Operation Flag FZ mode causes all underflowing operations to simply go to zero. This saves some processing time, but loses precision. The R+, R-, RN, and RZ rounding modes determine how the lowest bit is generated. Normally, RN is used. PM, UM, MM, ZM, DM, and IM are masks that tell the processor to ignore the exceptions that happen, if they do. This keeps the program from having to deal with problems, but might cause invalid results. DAZ tells the CPU to force all Denormals to zero. A Denormal is a number that is so small that FPU can't renormalize it due to limited exponent ranges. They're just like normal numbers, but they take considerably longer to process. Note that not all processors support DAZ. PE, UE, ME, ZE, DE, and IE are the exception flags that are set if they happen, and aren't unmasked. Programs can check these to see if something interesting happened. These bits are "sticky", which means that once they're set, they stay set forever until the program clears them. This means that the indicated exception could have happened several operations ago, but nobody bothered to clear it. DAZ wasn't available in the first version of SSE. Since setting a reserved bit in MXCSR causes a general protection fault, we need to be able to check the availability of this feature without causing problems. To do this, one needs to set up a 512-byte area of memory to save the SSE state to, using fxsave, and then one needs to inspect bytes 28 through 31 for the MXCSR_MASK value. If bit 6 is set, DAZ is supported, otherwise, it isn't. */ LIGO_INLINE unsigned long long read_mxcsr( void ) { unsigned long long mxcsr; asm( "stmxcsr %0" : "=m"( mxcsr ) ); return mxcsr; } LIGO_INLINE void write_mxcsr( unsigned long long val ) { asm( "ldmxcsr %0" ::"m"( val ) ); } /* Set FZ and DAZ bits, disabling underflows and denorms * This fixes long execution times caused by 0.0 inputs to filter modules. * See the inlineMath.h header file for more information on the bits we are * setting * */ LIGO_INLINE void fz_daz( void ) { write_mxcsr( read_mxcsr( ) | 1 | 1 << 15 ); } #ifdef __cplusplus } #endif #endif //LIGO_INLINE_MATH_X86_ASM