/* Inline x86 assembly math helpers: x87 FPU wrappers and SSE MXCSR access. */
#ifndef LIGO_INLINE_MATH_X86_ASM
#define LIGO_INLINE_MATH_X86_ASM
#include "portableInline.h"
/*
 * Math constants used by the inline helpers in this header.
 * Each definition is guarded so that a value already supplied by another
 * header (for example <math.h>) is kept instead of being redefined.
 */
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

#ifndef M_TWO_PI
#define M_TWO_PI 6.28318530717958647692
#endif

#ifndef M_SQRT2
#define M_SQRT2 1.41421356237309504880
#endif
/*
 * Shared function body for _rintf() and _rint().
 *
 * Must be expanded inside a function that has a floating-point variable
 * named __x in scope and returns an integer type; the expansion ends with a
 * `return`, so it has to be the last statement of that function.
 *
 * `fistpl` stores the x87 top-of-stack value as a 32-bit integer and pops
 * the stack. The destination is therefore declared `int` so that the store
 * width matches the object width (a `long int` is 64 bits on x86-64 and would
 * leave its upper half unwritten). The "st" clobber tells the compiler that
 * the input held in st(0) is consumed by the instruction.
 */
#define __lrint_code \
    int __lrintres; \
    __asm__ __volatile__( "fistpl %0" \
                          : "=m"( __lrintres ) \
                          : "t"( __x ) \
                          : "st" ); \
    return __lrintres
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Convert a float to an integer using the x87 FPU.
 *
 * The whole body comes from the __lrint_code macro, which loads __x into
 * st(0), stores it as an integer with `fistpl`, and supplies the function's
 * return statement.
 * NOTE(review): the rounding applied is presumably whatever mode the x87
 * control word currently selects -- nothing in this file sets it.
 *
 * __x: value to convert.
 * Returns the converted integer.
 */
LIGO_INLINE int
_rintf( float __x )
{
__lrint_code;
}
/*
 * Convert a double to an integer using the x87 FPU.
 *
 * The whole body comes from the __lrint_code macro, which loads __x into
 * st(0), stores it as an integer with `fistpl`, and supplies the function's
 * return statement.
 * NOTE(review): the rounding applied is presumably whatever mode the x87
 * control word currently selects -- nothing in this file sets it.
 *
 * __x: value to convert.
 * Returns the converted integer.
 */
LIGO_INLINE int
_rint( double __x )
{
__lrint_code;
}
/*
 * Compute the sine and cosine of __x in one step with the x87 FSINCOS
 * instruction.
 *
 * FSINCOS sets the C2 flag (bit 10, mask 0x400, of the FPU status word) and
 * leaves the operand untouched when the argument is outside the range it can
 * handle. In that case the argument is reduced modulo 2*pi with FPREM1
 * (repeated until C2 reports the reduction is complete) and FSINCOS is run
 * again on the reduced value.
 *
 * The status word is read through AX. AX and the condition codes are
 * declared as clobbers rather than being saved with push/pop: pushing inside
 * inline assembly writes below the stack pointer, which on x86-64 can
 * overwrite locals the compiler keeps in the red zone.
 *
 * __x:    angle in radians.
 * __sinx: receives sin(__x); must point to writable storage.
 * __cosx: receives cos(__x); must point to writable storage.
 */
LIGO_INLINE void
sincos( double __x, double* __sinx, double* __cosx )
{
    long double __cosr;
    long double __sinr;

    __asm__ __volatile__(
        "fsincos\n\t"
        "fnstsw %%ax\n\t"
        "testl $0x400, %%eax\n\t" /* C2 set => argument out of range */
        "jz 1f\n\t"
        "fldpi\n\t"
        "fadd %%st(0)\n\t" /* st(0) = 2*pi */
        "fxch %%st(1)\n\t"
        "2: fprem1\n\t" /* partial remainder of x / (2*pi) */
        "fnstsw %%ax\n\t"
        "testl $0x400, %%eax\n\t" /* C2 set => reduction incomplete */
        "jnz 2b\n\t"
        "fstp %%st(1)\n\t" /* discard 2*pi, keep reduced argument */
        "fsincos\n\t"
        "1:"
        : "=t"( __cosr ), "=u"( __sinr )
        : "0"( __x )
        : "ax", "cc" );

    *__sinx = __sinr;
    *__cosx = __cosr;
}
/*
 * Square root via the x87 FSQRT instruction.
 *
 * __x: operand, passed to the instruction in st(0).
 * Returns the value FSQRT leaves in st(0).
 */
LIGO_INLINE double
lsqrt( double __x )
{
    double root;

    __asm__ __volatile__( "fsqrt"
                          : "=t"( root )
                          : "0"( __x ) );
    return root;
}
/*
 * Compute 2^__x with the x87 F2XM1 instruction, for -1 <= __x <= 1.
 *
 * F2XM1 yields 2^x - 1; the constant 1.0 is then pushed with FLD1 and added
 * back with FADDP, which also pops it, leaving 2^x in st(0).
 *
 * FLD1 temporarily needs one additional x87 stack slot. The "st(1)" clobber
 * reserves that slot so the push cannot overflow the register stack when the
 * caller already has x87 values live.
 *
 * __x: exponent; the caller is responsible for keeping it within [-1, 1].
 * Returns 2 raised to __x.
 */
LIGO_INLINE double
l2xr( double __x )
{
    double __result;

    __asm__ __volatile__( "f2xm1\n\t"
                          "fld1\n\t"
                          "faddp\n\t"
                          : "=t"( __result )
                          : "0"( __x )
                          : "st(1)" );
    return __result;
}
/*
 * Round to an integral value with the x87 FRNDINT instruction.
 *
 * The result stays in floating-point format. FRNDINT follows the rounding
 * mode currently selected in the FPU control word (round-to-nearest unless
 * it has been changed).
 *
 * __x: value to round.
 * Returns the rounded value as a double.
 */
LIGO_INLINE double
lrndint( double __x )
{
    double rounded;

    __asm__ __volatile__( "frndint"
                          : "=t"( rounded )
                          : "0"( __x ) );
    return rounded;
}
/*
 * Multiply __x by log2(10) using the x87 built-in constant.
 *
 * FLDL2T pushes log2(10); FMULP multiplies it into the operand and pops it,
 * leaving the product in st(0).
 *
 * FLDL2T temporarily needs one additional x87 stack slot. The "st(1)"
 * clobber reserves that slot so the push cannot overflow the register stack
 * when the caller already has x87 values live.
 *
 * __x: value to scale.
 * Returns __x * log2(10).
 */
LIGO_INLINE double
lmullog210( double __x )
{
    double __result;

    __asm__ __volatile__( "fldl2t\n\t"
                          "fmulp"
                          : "=t"( __result )
                          : "0"( __x )
                          : "st(1)" );
    return __result;
}
/*
 * Base-10 logarithm via the x87 FYL2X instruction.
 *
 * FLDLG2 pushes log10(2), FXCH brings __x back to st(0), and FYL2X computes
 * st(1) * log2(st(0)) = log10(2) * log2(__x) = log10(__x), popping the stack
 * so the result ends up in st(0).
 *
 * FLDLG2 temporarily needs one additional x87 stack slot. The "st(1)"
 * clobber reserves that slot so the push cannot overflow the register stack
 * when the caller already has x87 values live.
 *
 * __x: operand.
 * Returns log10(__x).
 */
LIGO_INLINE double
llog10( double __x )
{
    double __result;

    __asm__ __volatile__( "fldlg2\n\t"
                          "fxch %%st(1)\n\t"
                          "fyl2x"
                          : "=t"( __result )
                          : "0"( __x )
                          : "st(1)" );
    return __result;
}
/*
 * Absolute value via the x87 FABS instruction.
 *
 * __x: operand, passed to the instruction in st(0).
 * Returns the value FABS leaves in st(0).
 */
LIGO_INLINE double
lfabs( double __x )
{
    double magnitude;

    __asm__ __volatile__( "fabs"
                          : "=t"( magnitude )
                          : "0"( __x ) );
    return magnitude;
}
/*
 * Two-argument arctangent via the x87 FPATAN instruction.
 *
 * FPATAN computes arctan(st(1) / st(0)), writes the result to st(1) and pops
 * the stack, so the result is returned in the new st(0). __x is supplied in
 * st(0) and __y in st(1).
 *
 * Because the instruction pops the __y input, "st(1)" is listed as a
 * clobber. Without it the compiler would still believe __y occupies an x87
 * register after the asm and its model of the register stack would go out
 * of sync with the hardware.
 *
 * __y: ordinate.
 * __x: abscissa.
 * Returns the angle computed by FPATAN, converted to double.
 */
LIGO_INLINE double
latan2( double __y, double __x )
{
    long double __atanr;

    __asm__ __volatile__( "fpatan"
                          : "=t"( __atanr )
                          : "0"( __x ), "u"( __y )
                          : "st(1)" );
    return __atanr;
}
/*
SSE MXCSR
The MXCSR register is a 32-bit register containing flags for control and status
information regarding SSE instructions. As of SSE3, only bits 0-15 have been
defined.
Mnemonic Bit Location Description
FZ bit 15 Flush To Zero
R+ bit 14 Round Positive
R- bit 13 Round Negative
RZ bits 13 and 14 Round To Zero
RN bits 13 and 14 are 0 Round To Nearest
PM bit 12 Precision Mask
UM bit 11 Underflow Mask
OM bit 10 Overflow Mask
ZM bit 9 Divide By Zero Mask
DM bit 8 Denormal Mask
IM bit 7 Invalid Operation Mask
DAZ bit 6 Denormals Are Zero
PE bit 5 Precision Flag
UE bit 4 Underflow Flag
OE bit 3 Overflow Flag
ZE bit 2 Divide By Zero Flag
DE bit 1 Denormal Flag
IE bit 0 Invalid Operation Flag
FZ mode causes all underflowing operations to simply go to zero. This saves some
processing time, but loses precision.
The R+, R-, RN, and RZ rounding modes determine how the lowest bit is generated.
Normally, RN is used.
PM, UM, OM, ZM, DM, and IM are masks that tell the processor to ignore the
exceptions that happen, if they do. This keeps the program from having to deal
with problems, but might cause invalid results.
DAZ tells the CPU to force all Denormals to zero. A Denormal is a number that is
so small that FPU can't renormalize it due to limited exponent ranges. They're
just like normal numbers, but they take considerably longer to process. Note
that not all processors support DAZ.
PE, UE, OE, ZE, DE, and IE are the exception flags that are set if they happen,
and aren't unmasked. Programs can check these to see if something interesting
happened. These bits are "sticky", which means that once they're set, they stay
set forever until the program clears them. This means that the indicated
exception could have happened several operations ago, but nobody bothered to
clear it.
DAZ wasn't available in the first version of SSE. Since setting a reserved bit
in MXCSR causes a general protection fault, we need to be able to check the
availability of this feature without causing problems. To do this, one needs to
set up a 512-byte area of memory to save the SSE state to, using fxsave, and
then one needs to inspect bytes 28 through 31 for the MXCSR_MASK value. If bit 6
is set, DAZ is supported, otherwise, it isn't.
*/
/*
 * Read the SSE control/status register (MXCSR).
 *
 * STMXCSR stores exactly 32 bits, so the value is captured in a 32-bit
 * object and then widened for the return. Storing into a 64-bit variable
 * would leave its upper half uninitialized.
 *
 * The asm is volatile because MXCSR holds sticky status flags that change as
 * floating-point code runs; repeated reads must not be merged or hoisted.
 * The `__asm__` spelling keeps the header usable in strict ISO C modes where
 * the plain `asm` keyword is not recognized.
 *
 * Returns the MXCSR contents zero-extended to unsigned long long.
 */
LIGO_INLINE unsigned long long
read_mxcsr( void )
{
    unsigned int mxcsr = 0;

    __asm__ __volatile__( "stmxcsr %0" : "=m"( mxcsr ) );
    return mxcsr;
}
/*
 * Load a new value into the SSE control/status register (MXCSR).
 *
 * LDMXCSR reads exactly 32 bits from memory, so the low 32 bits of val are
 * copied into a 32-bit object that serves as the instruction operand; any
 * higher bits of val are ignored, as before. The `__asm__` spelling keeps the
 * header usable in strict ISO C modes where the plain `asm` keyword is not
 * recognized.
 *
 * val: new MXCSR contents. Setting a reserved bit makes the CPU raise a
 *      general-protection fault (see the MXCSR notes earlier in this file).
 */
LIGO_INLINE void
write_mxcsr( unsigned long long val )
{
    unsigned int mxcsr = (unsigned int)val;

    __asm__ __volatile__( "ldmxcsr %0" : : "m"( mxcsr ) );
}
/*
 * Set the FZ (flush-to-zero, bit 15) and DAZ (denormals-are-zero, bit 6)
 * bits in MXCSR, disabling underflows and denormals.
 * This fixes long execution times caused by 0.0 inputs to filter modules.
 * See the MXCSR bit table earlier in this file for the meaning of each bit.
 *
 * The previous expression OR-ed in bit 0, which is the sticky
 * invalid-operation flag (IE), and never set DAZ.
 *
 * NOTE(review): DAZ is a reserved bit on the earliest SSE implementations and
 * setting it there faults; this assumes a CPU that supports it -- confirm for
 * the deployment targets.
 */
LIGO_INLINE void
fz_daz( void )
{
    const unsigned long long fz_mask = 1ULL << 15;
    const unsigned long long daz_mask = 1ULL << 6;

    write_mxcsr( read_mxcsr( ) | fz_mask | daz_mask );
}
#ifdef __cplusplus
}
#endif
#endif //LIGO_INLINE_MATH_X86_ASM