/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library

   The default is to use the SSE1 version. If you define USE_SSE2,
   the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   not expect any significant performance improvement with SSE2.
*/

/* Copyright (C) 2007  Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty.  In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/
#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si;   // vector of 2 int (mmx)
#endif
54
/* declare some SSE constants -- why can't I figure a better way to do that? */
55
#define _PS_CONST(Name, Val) \
56
static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
57
#define _PI32_CONST(Name, Val) \
58
static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
59
#define _PS_CONST_TYPE(Name, Type, Val) \
60
static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
64
/* the smallest non denormalized float number */
65
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
66
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
67
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
69
_PS_CONST_TYPE(sign_mask, int, 0x80000000);
70
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
73
_PI32_CONST(inv1, ~1);
76
_PI32_CONST(0x7f, 0x7f);
78
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
79
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
80
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
81
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
82
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
83
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
84
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
85
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
86
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
87
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
88
_PS_CONST(cephes_log_q1, -2.12194440e-4);
89
_PS_CONST(cephes_log_q2, 0.693359375);
95
void sincos_ps(v4sf x, v4sf *s, v4sf *c);
98
typedef union xmm_mm_union {
103
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
104
xmm_mm_union u; u.xmm = xmm_; \
109
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
110
xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
115
/* natural logarithm computed for 4 simultaneous float
116
return NaN for x <= 0
118
v4sf log_ps(v4sf x) {
124
v4sf one = *(v4sf*)_ps_1;
126
v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
127
v4sf e, mask, tmp, z, y;
129
x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */
132
/* part 1: x = frexpf(x, &e); */
133
COPY_XMM_TO_MM(x, mm0, mm1);
134
mm0 = _mm_srli_pi32(mm0, 23);
135
mm1 = _mm_srli_pi32(mm1, 23);
137
emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
139
/* keep only the fractional part */
140
x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
141
x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
144
/* now e=mm0:mm1 contain the really base-2 exponent */
145
mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
146
mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
147
e = _mm_cvtpi32x2_ps(mm0, mm1);
148
_mm_empty(); /* bye bye mmx */
150
emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
151
e = _mm_cvtepi32_ps(emm0);
154
e = _mm_add_ps(e, one);
160
} else { x = x - 1.0; }
163
mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
164
tmp = _mm_and_ps(x, mask);
165
x = _mm_sub_ps(x, one);
166
e = _mm_sub_ps(e, _mm_and_ps(one, mask));
167
x = _mm_add_ps(x, tmp);
172
y = *(v4sf*)_ps_cephes_log_p0;
173
y = _mm_mul_ps(y, x);
174
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
175
y = _mm_mul_ps(y, x);
176
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
177
y = _mm_mul_ps(y, x);
178
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
179
y = _mm_mul_ps(y, x);
180
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
181
y = _mm_mul_ps(y, x);
182
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
183
y = _mm_mul_ps(y, x);
184
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
185
y = _mm_mul_ps(y, x);
186
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
187
y = _mm_mul_ps(y, x);
188
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
189
y = _mm_mul_ps(y, x);
191
y = _mm_mul_ps(y, z);
194
tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
195
y = _mm_add_ps(y, tmp);
198
tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
199
y = _mm_sub_ps(y, tmp);
201
tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
202
x = _mm_add_ps(x, y);
203
x = _mm_add_ps(x, tmp);
204
x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
208
_PS_CONST(exp_hi, 88.3762626647949f);
209
_PS_CONST(exp_lo, -88.3762626647949f);
211
_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
212
_PS_CONST(cephes_exp_C1, 0.693359375);
213
_PS_CONST(cephes_exp_C2, -2.12194440e-4);
215
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
216
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
217
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
218
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
219
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
220
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
222
v4sf exp_ps(v4sf x) {
223
v4sf tmp = _mm_setzero_ps(), fx;
229
v4sf one = *(v4sf*)_ps_1;
230
v4sf mask, z, y, pow2n;
232
x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
233
x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
235
/* express exp(x) as exp(g + n*log(2)) */
236
fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
237
fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
239
/* how to perform a floorf with SSE: just below */
241
/* step 1 : cast to int */
242
tmp = _mm_movehl_ps(tmp, fx);
243
mm0 = _mm_cvttps_pi32(fx);
244
mm1 = _mm_cvttps_pi32(tmp);
245
/* step 2 : cast back to float */
246
tmp = _mm_cvtpi32x2_ps(mm0, mm1);
248
emm0 = _mm_cvttps_epi32(fx);
249
tmp = _mm_cvtepi32_ps(emm0);
251
/* if greater, substract 1 */
252
mask = _mm_cmpgt_ps(tmp, fx);
253
mask = _mm_and_ps(mask, one);
254
fx = _mm_sub_ps(tmp, mask);
256
tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
257
z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
258
x = _mm_sub_ps(x, tmp);
259
x = _mm_sub_ps(x, z);
263
y = *(v4sf*)_ps_cephes_exp_p0;
264
y = _mm_mul_ps(y, x);
265
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
266
y = _mm_mul_ps(y, x);
267
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
268
y = _mm_mul_ps(y, x);
269
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
270
y = _mm_mul_ps(y, x);
271
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
272
y = _mm_mul_ps(y, x);
273
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
274
y = _mm_mul_ps(y, z);
275
y = _mm_add_ps(y, x);
276
y = _mm_add_ps(y, one);
280
z = _mm_movehl_ps(z, fx);
281
mm0 = _mm_cvttps_pi32(fx);
282
mm1 = _mm_cvttps_pi32(z);
283
mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
284
mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
285
mm0 = _mm_slli_pi32(mm0, 23);
286
mm1 = _mm_slli_pi32(mm1, 23);
288
COPY_MM_TO_XMM(mm0, mm1, pow2n);
291
emm0 = _mm_cvttps_epi32(fx);
292
emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
293
emm0 = _mm_slli_epi32(emm0, 23);
294
pow2n = _mm_castsi128_ps(emm0);
296
y = _mm_mul_ps(y, pow2n);
300
_PS_CONST(minus_cephes_DP1, -0.78515625);
301
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
302
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
303
_PS_CONST(sincof_p0, -1.9515295891E-4);
304
_PS_CONST(sincof_p1, 8.3321608736E-3);
305
_PS_CONST(sincof_p2, -1.6666654611E-1);
306
_PS_CONST(coscof_p0, 2.443315711809948E-005);
307
_PS_CONST(coscof_p1, -1.388731625493765E-003);
308
_PS_CONST(coscof_p2, 4.166664568298827E-002);
309
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
   it runs also on old athlons XPs and the pentium III of your grand
   mother.

   The code is the exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good, 1.33 times faster than the
   macos vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
   too bad for an SSE1 function (with no special tuning) !
   However the latter libraries probably have a much better handling of NaN,
   Inf, denormalized and other special arguments..

   On my core 1 duo, the execution of this function takes approximately 95 cycles.

   From what I have observed on the experiments with Intel AMath lib, switching to an
   SSE2 version would improve the perf by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   reach any significant performance..
*/
340
v4sf sin_ps(v4sf x) { // any x
341
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
346
v2si mm0, mm1, mm2, mm3;
348
v4sf swap_sign_bit, poly_mask, z, tmp, y2;
351
/* take the absolute value */
352
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
353
/* extract the sign bit (upper one) */
354
sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
357
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
359
//printf("plop:"); print4(y);
361
/* store the integer part of y in mm0 */
362
emm2 = _mm_cvttps_epi32(y);
363
/* j=(j+1) & (~1) (see the cephes sources) */
364
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
365
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
366
y = _mm_cvtepi32_ps(emm2);
367
/* get the swap sign flag */
368
emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
369
emm0 = _mm_slli_epi32(emm0, 29);
370
/* get the polynom selection mask
371
there is one polynom for 0 <= x <= Pi/4
372
and another one for Pi/4<x<=Pi/2
374
Both branches will be computed.
376
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
377
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
379
swap_sign_bit = _mm_castsi128_ps(emm0);
380
poly_mask = _mm_castsi128_ps(emm2);
381
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
383
/* store the integer part of y in mm0:mm1 */
384
xmm2 = _mm_movehl_ps(xmm2, y);
385
mm2 = _mm_cvttps_pi32(y);
386
mm3 = _mm_cvttps_pi32(xmm2);
387
/* j=(j+1) & (~1) (see the cephes sources) */
388
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
389
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
390
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
391
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
392
y = _mm_cvtpi32x2_ps(mm2, mm3);
393
/* get the swap sign flag */
394
mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
395
mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
396
mm0 = _mm_slli_pi32(mm0, 29);
397
mm1 = _mm_slli_pi32(mm1, 29);
398
/* get the polynom selection mask */
399
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
400
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
401
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
402
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
404
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
405
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
406
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
407
_mm_empty(); /* good-bye mmx */
410
/* The magic pass: "Extended precision modular arithmetic"
411
x = ((x - y * DP1) - y * DP2) - y * DP3; */
412
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
413
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
414
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
415
xmm1 = _mm_mul_ps(y, xmm1);
416
xmm2 = _mm_mul_ps(y, xmm2);
417
xmm3 = _mm_mul_ps(y, xmm3);
418
x = _mm_add_ps(x, xmm1);
419
x = _mm_add_ps(x, xmm2);
420
x = _mm_add_ps(x, xmm3);
422
/* Evaluate the first polynom (0 <= x <= Pi/4) */
423
y = *(v4sf*)_ps_coscof_p0;
426
y = _mm_mul_ps(y, z);
427
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
428
y = _mm_mul_ps(y, z);
429
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
430
y = _mm_mul_ps(y, z);
431
y = _mm_mul_ps(y, z);
432
tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
433
y = _mm_sub_ps(y, tmp);
434
y = _mm_add_ps(y, *(v4sf*)_ps_1);
436
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
438
y2 = *(v4sf*)_ps_sincof_p0;
439
y2 = _mm_mul_ps(y2, z);
440
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
441
y2 = _mm_mul_ps(y2, z);
442
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
443
y2 = _mm_mul_ps(y2, z);
444
y2 = _mm_mul_ps(y2, x);
445
y2 = _mm_add_ps(y2, x);
447
/* select the correct result from the two polynoms */
449
y2 = _mm_and_ps(xmm3, y2); //, xmm3);
450
y = _mm_andnot_ps(xmm3, y);
451
y = _mm_add_ps(y,y2);
452
/* update the sign */
453
y = _mm_xor_ps(y, sign_bit);
458
/* almost the same as sin_ps */
459
v4sf cos_ps(v4sf x) { // any x
460
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
464
v2si mm0, mm1, mm2, mm3;
466
v4sf sign_bit, poly_mask, z, tmp, y2;
468
/* take the absolute value */
469
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
472
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
475
/* store the integer part of y in mm0 */
476
emm2 = _mm_cvttps_epi32(y);
477
/* j=(j+1) & (~1) (see the cephes sources) */
478
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
479
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
480
y = _mm_cvtepi32_ps(emm2);
482
emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
484
/* get the swap sign flag */
485
emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
486
emm0 = _mm_slli_epi32(emm0, 29);
487
/* get the polynom selection mask */
488
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
489
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
491
sign_bit = _mm_castsi128_ps(emm0);
492
poly_mask = _mm_castsi128_ps(emm2);
494
/* store the integer part of y in mm0:mm1 */
495
xmm2 = _mm_movehl_ps(xmm2, y);
496
mm2 = _mm_cvttps_pi32(y);
497
mm3 = _mm_cvttps_pi32(xmm2);
499
/* j=(j+1) & (~1) (see the cephes sources) */
500
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
501
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
502
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
503
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
505
y = _mm_cvtpi32x2_ps(mm2, mm3);
508
mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
509
mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
511
/* get the swap sign flag in mm0:mm1 and the
512
polynom selection mask in mm2:mm3 */
514
mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
515
mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
516
mm0 = _mm_slli_pi32(mm0, 29);
517
mm1 = _mm_slli_pi32(mm1, 29);
519
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
520
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
522
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
523
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
525
COPY_MM_TO_XMM(mm0, mm1, sign_bit);
526
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
527
_mm_empty(); /* good-bye mmx */
529
/* The magic pass: "Extended precision modular arithmetic"
530
x = ((x - y * DP1) - y * DP2) - y * DP3; */
531
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
532
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
533
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
534
xmm1 = _mm_mul_ps(y, xmm1);
535
xmm2 = _mm_mul_ps(y, xmm2);
536
xmm3 = _mm_mul_ps(y, xmm3);
537
x = _mm_add_ps(x, xmm1);
538
x = _mm_add_ps(x, xmm2);
539
x = _mm_add_ps(x, xmm3);
541
/* Evaluate the first polynom (0 <= x <= Pi/4) */
542
y = *(v4sf*)_ps_coscof_p0;
545
y = _mm_mul_ps(y, z);
546
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
547
y = _mm_mul_ps(y, z);
548
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
549
y = _mm_mul_ps(y, z);
550
y = _mm_mul_ps(y, z);
551
tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
552
y = _mm_sub_ps(y, tmp);
553
y = _mm_add_ps(y, *(v4sf*)_ps_1);
555
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
557
y2 = *(v4sf*)_ps_sincof_p0;
558
y2 = _mm_mul_ps(y2, z);
559
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
560
y2 = _mm_mul_ps(y2, z);
561
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
562
y2 = _mm_mul_ps(y2, z);
563
y2 = _mm_mul_ps(y2, x);
564
y2 = _mm_add_ps(y2, x);
566
/* select the correct result from the two polynoms */
568
y2 = _mm_and_ps(xmm3, y2); //, xmm3);
569
y = _mm_andnot_ps(xmm3, y);
570
y = _mm_add_ps(y,y2);
571
/* update the sign */
572
y = _mm_xor_ps(y, sign_bit);
577
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
578
it is almost as fast, and gives you a free cosine with your sine */
579
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
580
v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
582
v4si emm0, emm2, emm4;
584
v2si mm0, mm1, mm2, mm3, mm4, mm5;
586
v4sf swap_sign_bit_sin, poly_mask, z, tmp, y2, ysin1, ysin2;
590
/* take the absolute value */
591
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
592
/* extract the sign bit (upper one) */
593
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
596
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
599
/* store the integer part of y in emm2 */
600
emm2 = _mm_cvttps_epi32(y);
602
/* j=(j+1) & (~1) (see the cephes sources) */
603
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
604
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
605
y = _mm_cvtepi32_ps(emm2);
609
/* get the swap sign flag for the sine */
610
emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
611
emm0 = _mm_slli_epi32(emm0, 29);
612
swap_sign_bit_sin = _mm_castsi128_ps(emm0);
614
/* get the polynom selection mask for the sine*/
615
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
616
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
617
poly_mask = _mm_castsi128_ps(emm2);
619
/* store the integer part of y in mm2:mm3 */
620
xmm3 = _mm_movehl_ps(xmm3, y);
621
mm2 = _mm_cvttps_pi32(y);
622
mm3 = _mm_cvttps_pi32(xmm3);
624
/* j=(j+1) & (~1) (see the cephes sources) */
625
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
626
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
627
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
628
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
630
y = _mm_cvtpi32x2_ps(mm2, mm3);
635
/* get the swap sign flag for the sine */
636
mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
637
mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
638
mm0 = _mm_slli_pi32(mm0, 29);
639
mm1 = _mm_slli_pi32(mm1, 29);
641
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
643
/* get the polynom selection mask for the sine */
645
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
646
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
647
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
648
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
650
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
653
/* The magic pass: "Extended precision modular arithmetic"
654
x = ((x - y * DP1) - y * DP2) - y * DP3; */
655
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
656
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
657
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
658
xmm1 = _mm_mul_ps(y, xmm1);
659
xmm2 = _mm_mul_ps(y, xmm2);
660
xmm3 = _mm_mul_ps(y, xmm3);
661
x = _mm_add_ps(x, xmm1);
662
x = _mm_add_ps(x, xmm2);
663
x = _mm_add_ps(x, xmm3);
666
emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
667
emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
668
emm4 = _mm_slli_epi32(emm4, 29);
669
sign_bit_cos = _mm_castsi128_ps(emm4);
671
/* get the sign flag for the cosine */
672
mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
673
mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
674
mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
675
mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
676
mm4 = _mm_slli_pi32(mm4, 29);
677
mm5 = _mm_slli_pi32(mm5, 29);
678
COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
679
_mm_empty(); /* good-bye mmx */
682
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
685
/* Evaluate the first polynom (0 <= x <= Pi/4) */
687
y = *(v4sf*)_ps_coscof_p0;
689
y = _mm_mul_ps(y, z);
690
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
691
y = _mm_mul_ps(y, z);
692
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
693
y = _mm_mul_ps(y, z);
694
y = _mm_mul_ps(y, z);
695
tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
696
y = _mm_sub_ps(y, tmp);
697
y = _mm_add_ps(y, *(v4sf*)_ps_1);
699
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
701
y2 = *(v4sf*)_ps_sincof_p0;
702
y2 = _mm_mul_ps(y2, z);
703
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
704
y2 = _mm_mul_ps(y2, z);
705
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
706
y2 = _mm_mul_ps(y2, z);
707
y2 = _mm_mul_ps(y2, x);
708
y2 = _mm_add_ps(y2, x);
710
/* select the correct result from the two polynoms */
712
ysin2 = _mm_and_ps(xmm3, y2);
713
ysin1 = _mm_andnot_ps(xmm3, y);
714
y2 = _mm_sub_ps(y2,ysin2);
715
y = _mm_sub_ps(y, ysin1);
717
xmm1 = _mm_add_ps(ysin1,ysin2);
718
xmm2 = _mm_add_ps(y,y2);
720
/* update the sign */
721
*s = _mm_xor_ps(xmm1, sign_bit_sin);
722
*c = _mm_xor_ps(xmm2, sign_bit_cos);