48
@brief SSE2 reinitialization guard class
50
SSE2 requires to call _mm_empty() if we are intermixing
51
MMX integer commands with floating point arithmetics.
52
This class guards critical code fragments where SSE2 integer
58
class sse2_empty_guard
61
BMFORCEINLINE sse2_empty_guard()
66
BMFORCEINLINE ~sse2_empty_guard()
73
# ifndef BM_SET_MMX_GUARD
74
# define BM_SET_MMX_GUARD sse2_empty_guard bm_mmx_guard_;
79
@brief XOR array elements to specified mask
85
void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst,
86
const __m128i* BMRESTRICT src,
87
const __m128i* BMRESTRICT src_end,
90
__m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
93
__m128i xmm1 = _mm_load_si128(src);
95
xmm1 = _mm_xor_si128(xmm1, xmm2);
96
_mm_store_si128(dst, xmm1);
100
} while (src < src_end);
104
@brief Inverts array elements and NOT them to specified mask
110
void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst,
111
const __m128i* BMRESTRICT src,
112
const __m128i* BMRESTRICT src_end,
115
__m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
118
//_mm_prefetch((const char*)(src)+1024, _MM_HINT_NTA);
119
//_mm_prefetch((const char*)(src)+1088, _MM_HINT_NTA);
121
__m128i xmm1 = _mm_load_si128(src);
123
xmm1 = _mm_andnot_si128(xmm1, xmm2); // xmm1 = (~xmm1) & xmm2
124
_mm_store_si128(dst, xmm1);
128
} while (src < src_end);
132
@brief AND array elements against another array
138
void sse2_and_arr(__m128i* BMRESTRICT dst,
139
const __m128i* BMRESTRICT src,
140
const __m128i* BMRESTRICT src_end)
145
_mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);
147
xmm1 = _mm_load_si128(src++);
148
xmm2 = _mm_load_si128(dst);
149
xmm1 = _mm_and_si128(xmm1, xmm2);
150
_mm_store_si128(dst++, xmm1);
152
xmm1 = _mm_load_si128(src++);
153
xmm2 = _mm_load_si128(dst);
154
xmm1 = _mm_and_si128(xmm1, xmm2);
155
_mm_store_si128(dst++, xmm1);
157
xmm1 = _mm_load_si128(src++);
158
xmm2 = _mm_load_si128(dst);
159
xmm1 = _mm_and_si128(xmm1, xmm2);
160
_mm_store_si128(dst++, xmm1);
162
xmm1 = _mm_load_si128(src++);
163
xmm2 = _mm_load_si128(dst);
164
xmm1 = _mm_and_si128(xmm1, xmm2);
165
_mm_store_si128(dst++, xmm1);
167
} while (src < src_end);
174
@brief OR array elements against another array
180
void sse2_or_arr(__m128i* BMRESTRICT dst,
181
const __m128i* BMRESTRICT src,
182
const __m128i* BMRESTRICT src_end)
187
_mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);
189
xmm1 = _mm_load_si128(src++);
190
xmm2 = _mm_load_si128(dst);
191
xmm1 = _mm_or_si128(xmm1, xmm2);
192
_mm_store_si128(dst++, xmm1);
194
xmm1 = _mm_load_si128(src++);
195
xmm2 = _mm_load_si128(dst);
196
xmm1 = _mm_or_si128(xmm1, xmm2);
197
_mm_store_si128(dst++, xmm1);
199
xmm1 = _mm_load_si128(src++);
200
xmm2 = _mm_load_si128(dst);
201
xmm1 = _mm_or_si128(xmm1, xmm2);
202
_mm_store_si128(dst++, xmm1);
204
xmm1 = _mm_load_si128(src++);
205
xmm2 = _mm_load_si128(dst);
206
xmm1 = _mm_or_si128(xmm1, xmm2);
207
_mm_store_si128(dst++, xmm1);
209
} while (src < src_end);
213
@brief OR array elements against another array
219
void sse2_xor_arr(__m128i* BMRESTRICT dst,
220
const __m128i* BMRESTRICT src,
221
const __m128i* BMRESTRICT src_end)
226
_mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);
228
xmm1 = _mm_load_si128(src++);
229
xmm2 = _mm_load_si128(dst);
230
xmm1 = _mm_xor_si128(xmm1, xmm2);
231
_mm_store_si128(dst++, xmm1);
233
xmm1 = _mm_load_si128(src++);
234
xmm2 = _mm_load_si128(dst);
235
xmm1 = _mm_xor_si128(xmm1, xmm2);
236
_mm_store_si128(dst++, xmm1);
238
xmm1 = _mm_load_si128(src++);
239
xmm2 = _mm_load_si128(dst);
240
xmm1 = _mm_xor_si128(xmm1, xmm2);
241
_mm_store_si128(dst++, xmm1);
243
xmm1 = _mm_load_si128(src++);
244
xmm2 = _mm_load_si128(dst);
245
xmm1 = _mm_xor_si128(xmm1, xmm2);
246
_mm_store_si128(dst++, xmm1);
248
} while (src < src_end);
253
@brief AND-NOT (SUB) array elements against another array
259
void sse2_sub_arr(__m128i* BMRESTRICT dst,
260
const __m128i* BMRESTRICT src,
261
const __m128i* BMRESTRICT src_end)
266
_mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);
268
xmm1 = _mm_load_si128(src++);
269
xmm2 = _mm_load_si128(dst);
270
xmm1 = _mm_andnot_si128(xmm1, xmm2);
271
_mm_store_si128(dst++, xmm1);
273
xmm1 = _mm_load_si128(src++);
274
xmm2 = _mm_load_si128(dst);
275
xmm1 = _mm_andnot_si128(xmm1, xmm2);
276
_mm_store_si128(dst++, xmm1);
278
xmm1 = _mm_load_si128(src++);
279
xmm2 = _mm_load_si128(dst);
280
xmm1 = _mm_andnot_si128(xmm1, xmm2);
281
_mm_store_si128(dst++, xmm1);
283
xmm1 = _mm_load_si128(src++);
284
xmm2 = _mm_load_si128(dst);
285
xmm1 = _mm_andnot_si128(xmm1, xmm2);
286
_mm_store_si128(dst++, xmm1);
288
} while (src < src_end);
292
@brief SSE2 block memset
299
void sse2_set_block(__m128i* BMRESTRICT dst,
300
__m128i* BMRESTRICT dst_end,
303
__m128i xmm0 = _mm_set_epi32 (value, value, value, value);
306
_mm_store_si128(dst, xmm0);
308
_mm_store_si128(dst+1, xmm0);
309
_mm_store_si128(dst+2, xmm0);
310
_mm_store_si128(dst+3, xmm0);
312
_mm_store_si128(dst+4, xmm0);
313
_mm_store_si128(dst+5, xmm0);
314
_mm_store_si128(dst+6, xmm0);
315
_mm_store_si128(dst+7, xmm0);
319
} while (++dst < dst_end);
325
@brief SSE2 block copy
331
void sse2_copy_block(__m128i* BMRESTRICT dst,
332
const __m128i* BMRESTRICT src,
333
const __m128i* BMRESTRICT src_end)
335
__m128i xmm0, xmm1, xmm2, xmm3;
338
_mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);
340
xmm0 = _mm_load_si128(src+0);
341
xmm1 = _mm_load_si128(src+1);
342
xmm2 = _mm_load_si128(src+2);
343
xmm3 = _mm_load_si128(src+3);
345
_mm_store_si128(dst+0, xmm0);
346
_mm_store_si128(dst+1, xmm1);
347
_mm_store_si128(dst+2, xmm2);
348
_mm_store_si128(dst+3, xmm3);
350
xmm0 = _mm_load_si128(src+4);
351
xmm1 = _mm_load_si128(src+5);
352
xmm2 = _mm_load_si128(src+6);
353
xmm3 = _mm_load_si128(src+7);
355
_mm_store_si128(dst+4, xmm0);
356
_mm_store_si128(dst+5, xmm1);
357
_mm_store_si128(dst+6, xmm2);
358
_mm_store_si128(dst+7, xmm3);
363
} while (src < src_end);
368
@brief Invert array elements
376
void sse2_invert_arr(bm::word_t* first, bm::word_t* last)
378
__m128i xmm1 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF,
379
0xFFFFFFFF, 0xFFFFFFFF);
380
__m128i* wrd_ptr = (__m128i*)first;
384
_mm_prefetch((const char*)(wrd_ptr)+512, _MM_HINT_NTA);
386
__m128i xmm0 = _mm_load_si128(wrd_ptr);
387
xmm0 = _mm_xor_si128(xmm0, xmm1);
388
_mm_store_si128(wrd_ptr, xmm0);
390
} while (wrd_ptr < (__m128i*)last);
// Route the generic VECT_SET_BLOCK hook to the SSE2 implementation
// (arguments are cast to the 128-bit vector type).
#define VECT_SET_BLOCK(dst, dst_end, value) \
    sse2_set_block((__m128i*) dst, (__m128i*) (dst_end), (value))
248
bm::id_t sse2_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
249
const __m128i* BMRESTRICT block_end,
250
unsigned* BMRESTRICT bit_count)
252
const unsigned mu1 = 0x55555555;
253
const unsigned mu2 = 0x33333333;
254
const unsigned mu3 = 0x0F0F0F0F;
255
const unsigned mu4 = 0x0000003F;
258
__m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
259
__m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
260
__m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
261
__m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
263
mcnt = _mm_xor_si128(m1, m1); // bit_cnt = 0
264
ccnt = _mm_xor_si128(m1, m1); // change_cnt = 0
268
int count = (block_end - block)*4; //0;//1;
270
bm::word_t w, w0, w_prev;//, w_l;
271
const int w_shift = sizeof(w) * 8 - 1;
272
bool first_word = true;
276
const bm::word_t* blk = (const bm::word_t*) block;
279
BM_INCWORD_BITCOUNT(count, w);
280
count -= (w_prev = (w0 >> w_shift)); // negative value correction
283
bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
288
// ---------------------------------------------------------------------
290
__m128i b = _mm_load_si128(block);
293
tmp1 = _mm_srli_epi32(b, 1); // tmp1 = b >> 1
294
tmp2 = _mm_xor_si128(b, tmp1); // tmp2 = tmp1 ^ b;
295
_mm_store_si128((__m128i*)tcnt, tmp2);
299
// SSE4: _mm_test_all_zero()
301
// b = (b & 0x55555555) + (b >> 1 & 0x55555555);
302
//tmp1 = _mm_srli_epi32(b, 1); // tmp1 = (b >> 1 & 0x55555555)
303
tmp1 = _mm_and_si128(tmp1, m1);
304
tmp2 = _mm_and_si128(b, m1); // tmp2 = (b & 0x55555555)
305
b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
307
// b = (b & 0x33333333) + (b >> 2 & 0x33333333);
308
tmp1 = _mm_srli_epi32(b, 2); // (b >> 2 & 0x33333333)
309
tmp1 = _mm_and_si128(tmp1, m2);
310
tmp2 = _mm_and_si128(b, m2); // (b & 0x33333333)
311
b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
313
// b = (b + (b >> 4)) & 0x0F0F0F0F;
314
tmp1 = _mm_srli_epi32(b, 4); // tmp1 = b >> 4
315
b = _mm_add_epi32(b, tmp1); // b = b + (b >> 4)
316
b = _mm_and_si128(b, m3); //& 0x0F0F0F0F
319
tmp1 = _mm_srli_epi32 (b, 8); // tmp1 = b >> 8
320
b = _mm_add_epi32(b, tmp1); // b = b + (b >> 8)
322
// b = (b + (b >> 16)) & 0x0000003F;
323
tmp1 = _mm_srli_epi32 (b, 16); // b >> 16
324
b = _mm_add_epi32(b, tmp1); // b + (b >> 16)
325
b = _mm_and_si128(b, m4); // (b >> 16) & 0x0000003F;
327
mcnt = _mm_add_epi32(mcnt, b); // mcnt += b
331
// ---------------------------------------------------------------------
333
//__m128i b = _mm_load_si128(block);
335
//w = _mm_extract_epi32(b, i);
337
const bm::word_t* BMRESTRICT blk = (const bm::word_t*) block;
347
BM_INCWORD_BITCOUNT(count, tcnt[0]);
348
count -= !(w_prev ^ (w0 & 1));
349
count -= w_prev = (w0 >> w_shift);
353
count -= !w_prev; w_prev ^= w_prev;
358
BM_INCWORD_BITCOUNT(count, tcnt[1]);
359
count -= !(w_prev ^ (w0 & 1));
360
count -= w_prev = (w0 >> w_shift);
364
count -= !w_prev; w_prev ^= w_prev;
368
BM_INCWORD_BITCOUNT(count, tcnt[2]);
369
count -= !(w_prev ^ (w0 & 1));
370
count -= w_prev = (w0 >> w_shift);
374
count -= !w_prev; w_prev ^= w_prev;
378
BM_INCWORD_BITCOUNT(count, tcnt[3]);
379
count -= !(w_prev ^ (w0 & 1));
380
count -= w_prev = (w0 >> w_shift);
384
count -= !w_prev; w_prev ^= w_prev;
387
} while (++block < block_end);
389
_mm_store_si128((__m128i*)tcnt, mcnt);
390
*bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];