/*
Copyright(c) 2002-2005 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information please visit: http://bmagic.sourceforge.net
*/
#ifndef BMSSE2__H__INCLUDED__
#define BMSSE2__H__INCLUDED__

// Header implements processor-specific intrinsics declarations for the
// SSE2 instruction set.

#include <mmintrin.h>
#include <emmintrin.h>

namespace bm
{
/** @defgroup SSE2 Processor specific optimizations for SSE2 instructions
 */

/*!
    @brief SSE2 reinitialization guard class

    SSE2 requires a call to _mm_empty() when intermixing
    MMX integer commands with floating-point arithmetic.
    This class guards critical code fragments where SSE2 integer
    commands are used.

    @ingroup SSE2
*/
class sse2_empty_guard
{
public:
    BMFORCEINLINE sse2_empty_guard()
    {
        _mm_empty();
    }

    BMFORCEINLINE ~sse2_empty_guard()
    {
        _mm_empty();
    }
};

# ifndef BM_SET_MMX_GUARD
#  define BM_SET_MMX_GUARD  sse2_empty_guard  bm_mmx_guard_;
# endif
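/*
    Illustrative usage sketch (editorial, not part of the library; the
    function name and parameters below are hypothetical). The guard's
    constructor and destructor both issue _mm_empty(), so MMX state is
    cleared on entry to and exit from the guarded scope:

    \code
    void process(float* out, const unsigned* bits)
    {
        {
            BM_SET_MMX_GUARD  // expands to: sse2_empty_guard bm_mmx_guard_;
            // ... SSE2/MMX integer processing of bits ...
        }
        out[0] += 1.0f;  // x87 floating point is safe again here
    }
    \endcode
*/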
/*!
    @brief XOR array elements to specified mask
    *dst = *src ^ mask

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst,
                         const __m128i* BMRESTRICT src,
                         const __m128i* BMRESTRICT src_end,
                         bm::word_t mask)
{
     __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
     do
     {
        __m128i xmm1 = _mm_load_si128(src);

        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst, xmm1);

        ++dst;
        ++src;
     } while (src < src_end);
}
/*!
    @brief Inverts array elements and ANDs them with the specified mask
    *dst = ~*src & mask

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst,
                            const __m128i* BMRESTRICT src,
                            const __m128i* BMRESTRICT src_end,
                            bm::word_t mask)
{
     __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
     do
     {
        //_mm_prefetch((const char*)(src)+1024, _MM_HINT_NTA);
        //_mm_prefetch((const char*)(src)+1088, _MM_HINT_NTA);

        __m128i xmm1 = _mm_load_si128(src);

        xmm1 = _mm_andnot_si128(xmm1, xmm2); // xmm1 = (~xmm1) & xmm2
        _mm_store_si128(dst, xmm1);

        ++dst;
        ++src;
     } while (src < src_end);
}
/*!
    @brief AND array elements against another array
    *dst &= *src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_and_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        // loop unrolled 4x: each iteration processes 64 bytes
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
/*!
    @brief OR array elements against another array
    *dst |= *src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_or_arr(__m128i* BMRESTRICT dst,
                 const __m128i* BMRESTRICT src,
                 const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        // loop unrolled 4x: each iteration processes 64 bytes
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
/*!
    @brief XOR array elements against another array
    *dst ^= *src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_xor_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        // loop unrolled 4x: each iteration processes 64 bytes
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
/*!
    @brief AND-NOT (SUB) array elements against another array
    *dst &= ~*src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_sub_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        // loop unrolled 4x: each iteration processes 64 bytes
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2); // (~src) & dst
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
/*!
    @brief SSE2 block memset
    *dst = value

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_set_block(__m128i* BMRESTRICT dst,
                    __m128i* BMRESTRICT dst_end,
                    bm::word_t value)
{
    __m128i xmm0 = _mm_set_epi32 (value, value, value, value);
    do
    {
        // loop unrolled 8x: each iteration fills 128 bytes
        _mm_store_si128(dst,   xmm0);
        _mm_store_si128(dst+1, xmm0);
        _mm_store_si128(dst+2, xmm0);
        _mm_store_si128(dst+3, xmm0);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm0);
        _mm_store_si128(dst+6, xmm0);
        _mm_store_si128(dst+7, xmm0);

        dst += 8;
    } while (dst < dst_end);
}
/*!
    @brief SSE2 block copy
    *dst = *src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_copy_block(__m128i* BMRESTRICT dst,
                     const __m128i* BMRESTRICT src,
                     const __m128i* BMRESTRICT src_end)
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);

        src += 8;
        dst += 8;
    } while (src < src_end);
}
/*!
    @brief Invert array elements
    *first = ~*first

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_invert_arr(bm::word_t* first, bm::word_t* last)
{
    __m128i xmm1 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF,
                                 0xFFFFFFFF, 0xFFFFFFFF);
    __m128i* wrd_ptr = (__m128i*)first;
    do
    {
        _mm_prefetch((const char*)(wrd_ptr)+512, _MM_HINT_NTA);

        __m128i xmm0 = _mm_load_si128(wrd_ptr);
        xmm0 = _mm_xor_si128(xmm0, xmm1); // XOR with all-ones inverts
        _mm_store_si128(wrd_ptr, xmm0);

        ++wrd_ptr;
    } while (wrd_ptr < (__m128i*)last);
}
/*!
    SSE2-optimized bitcounting function; implements the parallel (SWAR)
    bitcounting algorithm for the SSE2 instruction set, applying the
    classic 32-bit population count to four words at a time:

    unsigned CalcBitCount32(unsigned b)
    {
        b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        b = (b + (b >> 4)) & 0x0F0F0F0F;
        b = b + (b >> 8);
        b = (b + (b >> 16)) & 0x0000003F;
        return b;
    }

    @ingroup SSE2
*/
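/*
    Quick sanity trace (editorial illustration): for b = 0x0000000F
      step 1: (0x0F & 0x55..) + ((0x0F >> 1) & 0x55..) = 0x05 + 0x05 = 0x0A
              (each 2-bit field now holds the bit count of its pair)
      step 2: (0x0A & 0x33..) + ((0x0A >> 2) & 0x33..) = 0x02 + 0x02 = 0x04
              (each 4-bit field holds the count of its nibble)
      steps 3-5 fold the byte and halfword sums together, leaving
      4 == popcount(0x0000000F).
*/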
BMFORCEINLINE
bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0

    __m128i tmp1, tmp2;
    do
    {
        __m128i b = _mm_load_si128(block++);

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);       // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);       // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2);  // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);       // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);       // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2);  // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);       // tmp1 = b >> 4
        b = _mm_add_epi32(b, tmp1);        // b = b + (b >> 4)
        b = _mm_and_si128(b, m3);          //           & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);      // tmp1 = b >> 8
        b = _mm_add_epi32(b, tmp1);        // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);     // b >> 16
        b = _mm_add_epi32(b, tmp1);        // b + (b >> 16)
        b = _mm_and_si128(b, m4);          //           & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);     // mcnt += b

    } while (block < block_end);

    __declspec(align(16)) bm::id_t tcnt[4];
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
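/*
    Usage sketch (editorial illustration; the block size of 2048 words and
    the alignment attribute are assumptions, not part of this header):

    \code
    __declspec(align(16)) bm::word_t blk[2048];
    // ... fill blk ...
    bm::id_t cnt = sse2_bit_count((const __m128i*)blk,
                                  (const __m128i*)(blk + 2048));
    \endcode
*/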
BMFORCEINLINE
__m128i sse2_and(__m128i a, __m128i b)
{
    return _mm_and_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_or(__m128i a, __m128i b)
{
    return _mm_or_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_xor(__m128i a, __m128i b)
{
    return _mm_xor_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_sub(__m128i a, __m128i b)
{
    return _mm_andnot_si128(b, a); // a & ~b (set subtraction)
}
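/*
    Note (editorial): _mm_andnot_si128(x, y) computes (~x) & y, so passing
    the operands as (b, a) yields a & ~b, i.e. "a minus b" in bit-set terms.
    For example, a = 0b1100 and b = 0b1010 gives 0b0100.
*/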
/*!
    @brief Bitcount of an operation between two blocks
    (the operation is passed in as a functor, e.g. sse2_and)

    @ingroup SSE2
*/
template<class Func>
BMFORCEINLINE
bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
                           const __m128i* BMRESTRICT block_end,
                           const __m128i* BMRESTRICT mask_block,
                           Func sse2_func)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0

    do
    {
        __m128i tmp1, tmp2;
        __m128i b = _mm_load_si128(block++);

        tmp1 = _mm_load_si128(mask_block++);

        b = sse2_func(b, tmp1); // apply the operation (AND/OR/XOR/SUB)

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);       // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);       // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2);  // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);       // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);       // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2);  // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);       // tmp1 = b >> 4
        b = _mm_add_epi32(b, tmp1);        // b = b + (b >> 4)
        b = _mm_and_si128(b, m3);          //           & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);      // tmp1 = b >> 8
        b = _mm_add_epi32(b, tmp1);        // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);     // b >> 16
        b = _mm_add_epi32(b, tmp1);        // b + (b >> 16)
        b = _mm_and_si128(b, m4);          //           & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);     // mcnt += b

    } while (block < block_end);

    __declspec(align(16)) bm::id_t tcnt[4];
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
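/*
    Usage sketch (editorial illustration; blk_a/blk_b and the block size
    are assumptions): counts bits of the intersection of two aligned
    blocks without materializing the AND result.

    \code
    bm::id_t and_cnt = sse2_bit_count_op((const __m128i*)blk_a,
                                         (const __m128i*)(blk_a + 2048),
                                         (const __m128i*)blk_b,
                                         sse2_and);
    \endcode
*/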
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_ARR(first, last) \
    sse2_invert_arr(first, last)

#define VECT_AND_ARR(dst, src, src_end) \
    sse2_and_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_OR_ARR(dst, src, src_end) \
    sse2_or_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SUB_ARR(dst, src, src_end) \
    sse2_sub_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_XOR_ARR(dst, src, src_end) \
    sse2_xor_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_COPY_BLOCK(dst, src, src_end) \
    sse2_copy_block((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SET_BLOCK(dst, dst_end, value) \
    sse2_set_block((__m128i*) (dst), (__m128i*) (dst_end), (value))
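/*
    Usage sketch (editorial illustration): the VECT_* macros are the hook
    points the library core dispatches to when SSE2 optimization is
    enabled. get_block()/get_mask() and the 2048-word block size are
    hypothetical here.

    \code
    bm::word_t* dst = get_block();        // 16-byte aligned destination
    const bm::word_t* src = get_mask();   // 16-byte aligned source
    VECT_AND_ARR(dst, src, src + 2048);   // dst &= src, 128 bits per op
    \endcode
*/

} // namespace bm

#endif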