#ifndef BMSSE2__H__INCLUDED__
#define BMSSE2__H__INCLUDED__

#include <emmintrin.h>

namespace bm
{

/*!
    @brief MMX/SSE transition guard.
    Issues _mm_empty() (EMMS) on construction and destruction so that
    x87 floating-point code can safely follow MMX/SSE2 integer code.
*/
class sse2_empty_guard
{
public:
    BMFORCEINLINE sse2_empty_guard()
    {
        _mm_empty();
    }

    BMFORCEINLINE ~sse2_empty_guard()
    {
        _mm_empty();
    }
};
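
/*
    Illustrative usage sketch (not part of the library itself): the guard is
    an RAII wrapper, so a scope that runs vector code can be bracketed like

        {
            bm::sse2_empty_guard guard; // _mm_empty() issued here
            // ... SSE2 block operations ...
        }                               // and again in the destructor
*/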

/*!
    @brief XOR array elements against a 32-bit mask: dst[i] = src[i] ^ mask
*/
BMFORCEINLINE
void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst,
                         const __m128i* BMRESTRICT src,
                         const __m128i* BMRESTRICT src_end,
                         bm::word_t mask)
{
    __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
    do
    {
        __m128i xmm1 = _mm_load_si128(src);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst, xmm1);
        ++dst;
        ++src;
    } while (src < src_end);
}
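
/*
    Note (illustrative, not a library guarantee): with mask = 0xFFFFFFFF this
    routine writes the bitwise complement of src into dst, e.g.

        // invert a source block into a destination block (alignment and
        // size constraints follow from the aligned loads/stores above)
        sse2_xor_arr_2_mask(dst, src, src_end, ~0u);
*/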

/*!
    @brief AND-NOT array elements against a 32-bit mask: dst[i] = ~src[i] & mask
*/
BMFORCEINLINE
void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst,
                            const __m128i* BMRESTRICT src,
                            const __m128i* BMRESTRICT src_end,
                            bm::word_t mask)
{
    __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
    do
    {
        __m128i xmm1 = _mm_load_si128(src);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst, xmm1);
        ++dst;
        ++src;
    } while (src < src_end);
}

/*!
    @brief AND bit-blocks: dst[i] &= src[i]
*/
BMFORCEINLINE
void sse2_and_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}

/*!
    @brief OR bit-blocks: dst[i] |= src[i]
*/
BMFORCEINLINE
void sse2_or_arr(__m128i* BMRESTRICT dst,
                 const __m128i* BMRESTRICT src,
                 const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}

/*!
    @brief XOR bit-blocks: dst[i] ^= src[i]
*/
BMFORCEINLINE
void sse2_xor_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}

/*!
    @brief SUB (AND-NOT) bit-blocks: dst[i] &= ~src[i]
*/
BMFORCEINLINE
void sse2_sub_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
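
/*
    Usage sketch for the block operators above (illustrative; the constraints
    are inferred from the code, not stated by the library): all pointers must
    be 16-byte aligned because the routines use _mm_load_si128/_mm_store_si128,
    and (src_end - src) must be a multiple of four __m128i words since each
    loop iteration is unrolled 4x (eight for sse2_copy_block below).

        // hypothetical example: AND two aligned blocks of `len` 32-bit words
        // (len assumed to be a multiple of 16)
        sse2_and_arr((__m128i*)dst_block,
                     (const __m128i*)src_block,
                     (const __m128i*)(src_block + len));
*/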

/*!
    @brief Fill a block with a 32-bit value: dst[i] = value
*/
BMFORCEINLINE
void sse2_set_block(__m128i* BMRESTRICT dst,
                    __m128i* BMRESTRICT dst_end,
                    bm::word_t value)
{
    __m128i xmm0 = _mm_set_epi32(value, value, value, value);
    do
    {
        _mm_store_si128(dst, xmm0);
    } while (++dst < dst_end);

    _mm_sfence();
}

/*!
    @brief Copy a block: dst[i] = src[i]
*/
BMFORCEINLINE
void sse2_copy_block(__m128i* BMRESTRICT dst,
                     const __m128i* BMRESTRICT src,
                     const __m128i* BMRESTRICT src_end)
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);

        src += 8;
        dst += 8;

    } while (src < src_end);
}

/*!
    @brief Invert a word array: w = ~w for every 32-bit word in [first, last)
*/
BMFORCEINLINE
void sse2_invert_arr(bm::word_t* first, bm::word_t* last)
{
    __m128i xmm1 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF,
                                 0xFFFFFFFF, 0xFFFFFFFF);
    __m128i* wrd_ptr = (__m128i*)first;

    do
    {
        _mm_prefetch((const char*)(wrd_ptr)+512, _MM_HINT_NTA);

        __m128i xmm0 = _mm_load_si128(wrd_ptr);
        xmm0 = _mm_xor_si128(xmm0, xmm1);
        _mm_store_si128(wrd_ptr, xmm0);
        ++wrd_ptr;
    } while (wrd_ptr < (__m128i*)last);
}

/*!
    @brief Population count of a bit-block: number of 1 bits in [block, block_end)
*/
inline
bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // SWAR masks, replicated into all four 32-bit lanes
    __m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // running count = 0

    __m128i tmp1, tmp2;
    do
    {
        __m128i b = _mm_load_si128(block);
        ++block;

        // b = (b & 0x55555555) + ((b >> 1) & 0x55555555);  2-bit sums
        tmp1 = _mm_srli_epi32(b, 1);
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);
        b    = _mm_add_epi32(tmp1, tmp2);

        // b = (b & 0x33333333) + ((b >> 2) & 0x33333333);  4-bit sums
        tmp1 = _mm_srli_epi32(b, 2);
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);
        b    = _mm_add_epi32(tmp1, tmp2);

        // b = (b + (b >> 4)) & 0x0F0F0F0F;                 8-bit sums
        tmp1 = _mm_srli_epi32(b, 4);
        b    = _mm_add_epi32(b, tmp1);
        b    = _mm_and_si128(b, m3);

        // b = b + (b >> 8);                                16-bit sums
        tmp1 = _mm_srli_epi32(b, 8);
        b    = _mm_add_epi32(b, tmp1);

        // b = (b + (b >> 16)) & 0x0000003F;                per-lane count
        tmp1 = _mm_srli_epi32(b, 16);
        b    = _mm_add_epi32(b, tmp1);
        b    = _mm_and_si128(b, m4);

        mcnt = _mm_add_epi32(mcnt, b);

    } while (block < block_end);

    // 16-byte aligned spill buffer (MSVC-style alignment specifier)
    __declspec(align(16)) bm::id_t tcnt[4];
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
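
/*
    The loop above is the classic SWAR population count applied to each
    32-bit lane in parallel. A scalar sketch of the same reduction, shown
    only for reference (not used by the library):

        unsigned popcount32(unsigned b)
        {
            b = (b & 0x55555555) + ((b >> 1) & 0x55555555);
            b = (b & 0x33333333) + ((b >> 2) & 0x33333333);
            b = (b + (b >> 4)) & 0x0F0F0F0F;
            b = b + (b >> 8);
            b = (b + (b >> 16)) & 0x0000003F;
            return b;
        }
*/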

// Operation functors used with sse2_bit_count_op() below

BMFORCEINLINE
__m128i sse2_and(__m128i a, __m128i b)
{
    return _mm_and_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_or(__m128i a, __m128i b)
{
    return _mm_or_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_xor(__m128i a, __m128i b)
{
    return _mm_xor_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_sub(__m128i a, __m128i b)
{
    // bit "subtraction": a & ~b
    return _mm_andnot_si128(b, a);
}


/*!
    @brief Population count of sse2_func(block[i], mask_block[i]) over [block, block_end)
*/
template<class Func>
bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
                           const __m128i* BMRESTRICT block_end,
                           const __m128i* BMRESTRICT mask_block,
                           Func sse2_func)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    __m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // running count = 0
    do
    {
        __m128i tmp1, tmp2;
        __m128i b = _mm_load_si128(block++);

        tmp1 = _mm_load_si128(mask_block++);

        b = sse2_func(b, tmp1); // combine with the mask block first

        // same SWAR per-lane popcount reduction as in sse2_bit_count()
        tmp1 = _mm_srli_epi32(b, 1);
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);
        b    = _mm_add_epi32(tmp1, tmp2);

        tmp1 = _mm_srli_epi32(b, 2);
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);
        b    = _mm_add_epi32(tmp1, tmp2);

        tmp1 = _mm_srli_epi32(b, 4);
        b    = _mm_add_epi32(b, tmp1);
        b    = _mm_and_si128(b, m3);

        tmp1 = _mm_srli_epi32(b, 8);
        b    = _mm_add_epi32(b, tmp1);

        tmp1 = _mm_srli_epi32(b, 16);
        b    = _mm_add_epi32(b, tmp1);
        b    = _mm_and_si128(b, m4);

        mcnt = _mm_add_epi32(mcnt, b);

    } while (block < block_end);

    // 16-byte aligned spill buffer (MSVC-style alignment specifier)
    __declspec(align(16)) bm::id_t tcnt[4];
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
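
/*
    Usage sketch (illustrative; the pointer casts mirror the VECT_* macros
    below): count the bits common to two aligned word blocks by combining
    them with sse2_and before the popcount reduction.

        // blk_a, blk_b: hypothetical aligned bm::word_t arrays of len words
        bm::id_t cnt =
            bm::sse2_bit_count_op((__m128i*)blk_a,
                                  (__m128i*)(blk_a + len),
                                  (__m128i*)blk_b,
                                  bm::sse2_and);
*/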

// Map the generic VECT_* operation names onto the SSE2 routines above.
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)

#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_ARR(first, last) \
    sse2_invert_arr(first, last);

#define VECT_AND_ARR(dst, src, src_end) \
    sse2_and_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_OR_ARR(dst, src, src_end) \
    sse2_or_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SUB_ARR(dst, src, src_end) \
    sse2_sub_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_XOR_ARR(dst, src, src_end) \
    sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_COPY_BLOCK(dst, src, src_end) \
    sse2_copy_block((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SET_BLOCK(dst, dst_end, value) \
    sse2_set_block((__m128i*) dst, (__m128i*) (dst_end), (value))
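
/*
    Sketch of how the VECT_* macros are invoked (illustrative; dst and src
    here are hypothetical bm::word_t* block pointers with the alignment and
    size constraints noted earlier):

        VECT_AND_ARR(dst, src, src + len);          // dst[i] &= src[i]
        bm::id_t n = VECT_BITCOUNT(src, src + len); // popcount of the block
*/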

} // namespace bm

#endif