1
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
<html><head><meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1">
3
<title>BitMagic: bmsse2.h Source File</title>
4
<link href="doxygen.css" rel="stylesheet" type="text/css">
6
<!-- Generated by Doxygen 1.4.1 -->
7
<div class="qindex"><a class="qindex" href="index.html">Main Page</a> | <a class="qindex" href="modules.html">Modules</a> | <a class="qindex" href="namespaces.html">Namespace List</a> | <a class="qindex" href="hierarchy.html">Class Hierarchy</a> | <a class="qindex" href="classes.html">Alphabetical List</a> | <a class="qindex" href="annotated.html">Data Structures</a> | <a class="qindex" href="dirs.html">Directories</a> | <a class="qindex" href="files.html">File List</a> | <a class="qindex" href="namespacemembers.html">Namespace Members</a> | <a class="qindex" href="functions.html">Data Fields</a> | <a class="qindex" href="globals.html">Globals</a> | <a class="qindex" href="examples.html">Examples</a></div>
9
<a class="el" href="dir_000000.html">src</a></div>
10
<h1>bmsse2.h</h1><a href="a00116.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment">00001 <span class="comment">/*</span>
11
00002 <span class="comment">Copyright(c) 2002-2005 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)</span>
12
00003 <span class="comment"></span>
13
00004 <span class="comment">Permission is hereby granted, free of charge, to any person </span>
14
00005 <span class="comment">obtaining a copy of this software and associated documentation </span>
15
00006 <span class="comment">files (the "Software"), to deal in the Software without restriction, </span>
16
00007 <span class="comment">including without limitation the rights to use, copy, modify, merge, </span>
17
00008 <span class="comment">publish, distribute, sublicense, and/or sell copies of the Software, </span>
18
00009 <span class="comment">and to permit persons to whom the Software is furnished to do so, </span>
19
00010 <span class="comment">subject to the following conditions:</span>
20
00011 <span class="comment"></span>
21
00012 <span class="comment">The above copyright notice and this permission notice shall be included </span>
22
00013 <span class="comment">in all copies or substantial portions of the Software.</span>
23
00014 <span class="comment"></span>
24
00015 <span class="comment">THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, </span>
25
00016 <span class="comment">EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES </span>
26
00017 <span class="comment">OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. </span>
27
00018 <span class="comment">IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, </span>
28
00019 <span class="comment">DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, </span>
29
00020 <span class="comment">ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR </span>
30
00021 <span class="comment">OTHER DEALINGS IN THE SOFTWARE.</span>
31
00022 <span class="comment"></span>
32
00023 <span class="comment">For more information please visit: http://bmagic.sourceforge.net</span>
33
00024 <span class="comment"></span>
34
00025 <span class="comment">*/</span>
37
00028 <span class="preprocessor">#ifndef BMSSE2__H__INCLUDED__</span>
38
00029 <span class="preprocessor"></span><span class="preprocessor">#define BMSSE2__H__INCLUDED__</span>
39
00030 <span class="preprocessor"></span>
41
00032 <span class="comment">// Header implements processor specific intrinsics declarations for SSE2</span>
42
00033 <span class="comment">// instruction set</span>
43
00034 <span class="preprocessor">#include&lt;emmintrin.h&gt;</span>
47
00038 <span class="keyword">namespace </span>bm
49
00040 <span class="comment"></span>
50
00041 <span class="comment">/** @defgroup SSE2 Processor specific optimizations for SSE2 instructions</span>
51
00042 <span class="comment"> * @ingroup bmagic</span>
52
00043 <span class="comment"> */</span>
54
00045 <span class="comment"></span>
55
00046 <span class="comment">/*! </span>
56
00047 <span class="comment"> @brief SSE2 reinitialization guard class</span>
57
00048 <span class="comment"></span>
58
00049 <span class="comment"> SSE2 requires a call to _mm_empty() if we are intermixing</span>
59
00050 <span class="comment"> MMX integer commands with floating point arithmetic.</span>
60
00051 <span class="comment"> This class guards critical code fragments where SSE2 integer</span>
61
00052 <span class="comment"> is used.</span>
62
00053 <span class="comment"></span>
63
00054 <span class="comment"> @ingroup SSE2</span>
64
00055 <span class="comment"></span>
65
00056 <span class="comment">*/</span>
66
<a name="l00057"></a><a class="code" href="a00110.html">00057</a> <span class="keyword">class </span><a class="code" href="a00110.html">sse2_empty_guard</a>
68
00059 <span class="keyword">public</span>:
69
<a name="l00060"></a><a class="code" href="a00110.html#a0">00060</a> <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> <a class="code" href="a00110.html#a0">sse2_empty_guard</a>()
74
<a name="l00065"></a><a class="code" href="a00110.html#a1">00065</a> <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> <a class="code" href="a00110.html#a1">~sse2_empty_guard</a>()
80
00071 <span class="comment">/*</span>
81
00072 <span class="comment"># ifndef BM_SET_MMX_GUARD</span>
82
00073 <span class="comment"># define BM_SET_MMX_GUARD sse2_empty_guard bm_mmx_guard_;</span>
83
00074 <span class="comment"># endif</span>
84
00075 <span class="comment">*/</span>
85
00076 <span class="comment"></span>
86
00077 <span class="comment">/*! </span>
87
00078 <span class="comment"> @brief XOR array elements with the specified mask</span>
88
00079 <span class="comment"> *dst = *src ^ mask</span>
89
00080 <span class="comment"></span>
90
00081 <span class="comment"> @ingroup SSE2</span>
91
00082 <span class="comment">*/</span>
92
00083 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a>
93
<a name="l00084"></a><a class="code" href="a00136.html#ga0">00084</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga0">sse2_xor_arr_2_mask</a>(__m128i* BMRESTRICT dst,
94
00085 <span class="keyword">const</span> __m128i* BMRESTRICT src,
95
00086 <span class="keyword">const</span> __m128i* BMRESTRICT src_end,
96
00087 bm::word_t mask)
98
00089 __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
99
00090 <span class="keywordflow">do</span>
101
00092 __m128i xmm1 = _mm_load_si128(src);
103
00094 xmm1 = _mm_xor_si128(xmm1, xmm2);
104
00095 _mm_store_si128(dst, xmm1);
108
00099 } <span class="keywordflow">while</span> (src < src_end);
110
00101 <span class="comment"></span>
111
00102 <span class="comment">/*! </span>
112
00103 <span class="comment"> @brief Inverts array elements and ANDs them with the specified mask</span>
113
00104 <span class="comment"> *dst = ~*src & mask</span>
114
00105 <span class="comment"></span>
115
00106 <span class="comment"> @ingroup SSE2</span>
116
00107 <span class="comment">*/</span>
117
00108 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a>
118
<a name="l00109"></a><a class="code" href="a00136.html#ga1">00109</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga1">sse2_andnot_arr_2_mask</a>(__m128i* BMRESTRICT dst,
119
00110 <span class="keyword">const</span> __m128i* BMRESTRICT src,
120
00111 <span class="keyword">const</span> __m128i* BMRESTRICT src_end,
121
00112 bm::word_t mask)
123
00114 __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
124
00115 <span class="keywordflow">do</span>
126
00117 <span class="comment">//_mm_prefetch((const char*)(src)+1024, _MM_HINT_NTA);</span>
127
00118 <span class="comment">//_mm_prefetch((const char*)(src)+1088, _MM_HINT_NTA);</span>
129
00120 __m128i xmm1 = _mm_load_si128(src);
131
00122 xmm1 = _mm_andnot_si128(xmm1, xmm2); <span class="comment">// xmm1 = (~xmm1) & xmm2 </span>
132
00123 _mm_store_si128(dst, xmm1);
136
00127 } <span class="keywordflow">while</span> (src < src_end);
138
00129 <span class="comment"></span>
139
00130 <span class="comment">/*! </span>
140
00131 <span class="comment"> @brief AND array elements against another array</span>
141
00132 <span class="comment"> *dst &= *src</span>
142
00133 <span class="comment"></span>
143
00134 <span class="comment"> @ingroup SSE2</span>
144
00135 <span class="comment">*/</span>
145
00136 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a>
146
<a name="l00137"></a><a class="code" href="a00136.html#ga2">00137</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga2">sse2_and_arr</a>(__m128i* BMRESTRICT dst,
147
00138 <span class="keyword">const</span> __m128i* BMRESTRICT src,
148
00139 <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
150
00141 __m128i xmm1, xmm2;
151
00142 <span class="keywordflow">do</span>
153
00144 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
155
00146 xmm1 = _mm_load_si128(src++);
156
00147 xmm2 = _mm_load_si128(dst);
157
00148 xmm1 = _mm_and_si128(xmm1, xmm2);
158
00149 _mm_store_si128(dst++, xmm1);
160
00151 xmm1 = _mm_load_si128(src++);
161
00152 xmm2 = _mm_load_si128(dst);
162
00153 xmm1 = _mm_and_si128(xmm1, xmm2);
163
00154 _mm_store_si128(dst++, xmm1);
165
00156 xmm1 = _mm_load_si128(src++);
166
00157 xmm2 = _mm_load_si128(dst);
167
00158 xmm1 = _mm_and_si128(xmm1, xmm2);
168
00159 _mm_store_si128(dst++, xmm1);
170
00161 xmm1 = _mm_load_si128(src++);
171
00162 xmm2 = _mm_load_si128(dst);
172
00163 xmm1 = _mm_and_si128(xmm1, xmm2);
173
00164 _mm_store_si128(dst++, xmm1);
175
00166 } <span class="keywordflow">while</span> (src < src_end);
180
00171 <span class="comment"></span>
181
00172 <span class="comment">/*! </span>
182
00173 <span class="comment"> @brief OR array elements against another array</span>
183
00174 <span class="comment"> *dst |= *src</span>
184
00175 <span class="comment"></span>
185
00176 <span class="comment"> @ingroup SSE2</span>
186
00177 <span class="comment">*/</span>
187
00178 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a>
188
<a name="l00179"></a><a class="code" href="a00136.html#ga3">00179</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga3">sse2_or_arr</a>(__m128i* BMRESTRICT dst,
189
00180 <span class="keyword">const</span> __m128i* BMRESTRICT src,
190
00181 <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
192
00183 __m128i xmm1, xmm2;
193
00184 <span class="keywordflow">do</span>
195
00186 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
197
00188 xmm1 = _mm_load_si128(src++);
198
00189 xmm2 = _mm_load_si128(dst);
199
00190 xmm1 = _mm_or_si128(xmm1, xmm2);
200
00191 _mm_store_si128(dst++, xmm1);
202
00193 xmm1 = _mm_load_si128(src++);
203
00194 xmm2 = _mm_load_si128(dst);
204
00195 xmm1 = _mm_or_si128(xmm1, xmm2);
205
00196 _mm_store_si128(dst++, xmm1);
207
00198 xmm1 = _mm_load_si128(src++);
208
00199 xmm2 = _mm_load_si128(dst);
209
00200 xmm1 = _mm_or_si128(xmm1, xmm2);
210
00201 _mm_store_si128(dst++, xmm1);
212
00203 xmm1 = _mm_load_si128(src++);
213
00204 xmm2 = _mm_load_si128(dst);
214
00205 xmm1 = _mm_or_si128(xmm1, xmm2);
215
00206 _mm_store_si128(dst++, xmm1);
217
00208 } <span class="keywordflow">while</span> (src < src_end);
219
00210 <span class="comment"></span>
220
00211 <span class="comment">/*! </span>
221
00212 <span class="comment"> @brief XOR array elements against another array</span>
222
00213 <span class="comment"> *dst ^= *src</span>
223
00214 <span class="comment"></span>
224
00215 <span class="comment"> @ingroup SSE2</span>
225
00216 <span class="comment">*/</span>
226
00217 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a>
227
<a name="l00218"></a><a class="code" href="a00136.html#ga4">00218</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga4">sse2_xor_arr</a>(__m128i* BMRESTRICT dst,
228
00219 <span class="keyword">const</span> __m128i* BMRESTRICT src,
229
00220 <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
231
00222 __m128i xmm1, xmm2;
232
00223 <span class="keywordflow">do</span>
234
00225 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
236
00227 xmm1 = _mm_load_si128(src++);
237
00228 xmm2 = _mm_load_si128(dst);
238
00229 xmm1 = _mm_xor_si128(xmm1, xmm2);
239
00230 _mm_store_si128(dst++, xmm1);
241
00232 xmm1 = _mm_load_si128(src++);
242
00233 xmm2 = _mm_load_si128(dst);
243
00234 xmm1 = _mm_xor_si128(xmm1, xmm2);
244
00235 _mm_store_si128(dst++, xmm1);
246
00237 xmm1 = _mm_load_si128(src++);
247
00238 xmm2 = _mm_load_si128(dst);
248
00239 xmm1 = _mm_xor_si128(xmm1, xmm2);
249
00240 _mm_store_si128(dst++, xmm1);
251
00242 xmm1 = _mm_load_si128(src++);
252
00243 xmm2 = _mm_load_si128(dst);
253
00244 xmm1 = _mm_xor_si128(xmm1, xmm2);
254
00245 _mm_store_si128(dst++, xmm1);
256
00247 } <span class="keywordflow">while</span> (src < src_end);
259
00250 <span class="comment"></span>
260
00251 <span class="comment">/*! </span>
261
00252 <span class="comment"> @brief AND-NOT (SUB) array elements against another array</span>
262
00253 <span class="comment"> *dst &= ~*src</span>
263
00254 <span class="comment"></span>
264
00255 <span class="comment"> @ingroup SSE2</span>
265
00256 <span class="comment">*/</span>
266
00257 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a>
267
<a name="l00258"></a><a class="code" href="a00136.html#ga5">00258</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga5">sse2_sub_arr</a>(__m128i* BMRESTRICT dst,
268
00259 <span class="keyword">const</span> __m128i* BMRESTRICT src,
269
00260 <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
271
00262 __m128i xmm1, xmm2;
272
00263 <span class="keywordflow">do</span>
274
00265 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
276
00267 xmm1 = _mm_load_si128(src++);
277
00268 xmm2 = _mm_load_si128(dst);
278
00269 xmm1 = _mm_andnot_si128(xmm1, xmm2);
279
00270 _mm_store_si128(dst++, xmm1);
281
00272 xmm1 = _mm_load_si128(src++);
282
00273 xmm2 = _mm_load_si128(dst);
283
00274 xmm1 = _mm_andnot_si128(xmm1, xmm2);
284
00275 _mm_store_si128(dst++, xmm1);
286
00277 xmm1 = _mm_load_si128(src++);
287
00278 xmm2 = _mm_load_si128(dst);
288
00279 xmm1 = _mm_andnot_si128(xmm1, xmm2);
289
00280 _mm_store_si128(dst++, xmm1);
291
00282 xmm1 = _mm_load_si128(src++);
292
00283 xmm2 = _mm_load_si128(dst);
293
00284 xmm1 = _mm_andnot_si128(xmm1, xmm2);
294
00285 _mm_store_si128(dst++, xmm1);
296
00287 } <span class="keywordflow">while</span> (src < src_end);
298
00289 <span class="comment"></span>
299
00290 <span class="comment">/*! </span>
300
00291 <span class="comment"> @brief SSE2 block memset</span>
301
00292 <span class="comment"> *dst = value</span>
302
00293 <span class="comment"></span>
303
00294 <span class="comment"> @ingroup SSE2</span>
304
00295 <span class="comment">*/</span>
306
00297 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a>
307
<a name="l00298"></a><a class="code" href="a00136.html#ga6">00298</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga6">sse2_set_block</a>(__m128i* BMRESTRICT dst,
308
00299 __m128i* BMRESTRICT dst_end,
309
00300 bm::word_t value)
311
00302 __m128i xmm0 = _mm_set_epi32 (value, value, value, value);
312
00303 <span class="keywordflow">do</span>
314
00305 _mm_store_si128(dst, xmm0);
315
00306 <span class="comment">/* </span>
316
00307 <span class="comment"> _mm_store_si128(dst+1, xmm0);</span>
317
00308 <span class="comment"> _mm_store_si128(dst+2, xmm0);</span>
318
00309 <span class="comment"> _mm_store_si128(dst+3, xmm0);</span>
319
00310 <span class="comment"></span>
320
00311 <span class="comment"> _mm_store_si128(dst+4, xmm0);</span>
321
00312 <span class="comment"> _mm_store_si128(dst+5, xmm0);</span>
322
00313 <span class="comment"> _mm_store_si128(dst+6, xmm0);</span>
323
00314 <span class="comment"> _mm_store_si128(dst+7, xmm0);</span>
324
00315 <span class="comment"></span>
325
00316 <span class="comment"> dst += 8;</span>
326
00317 <span class="comment">*/</span>
327
00318 } <span class="keywordflow">while</span> (++dst < dst_end);
331
00322 <span class="comment"></span>
332
00323 <span class="comment">/*! </span>
333
00324 <span class="comment"> @brief SSE2 block copy</span>
334
00325 <span class="comment"> *dst = *src</span>
335
00326 <span class="comment"></span>
336
00327 <span class="comment"> @ingroup SSE2</span>
337
00328 <span class="comment">*/</span>
338
00329 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a>
339
<a name="l00330"></a><a class="code" href="a00136.html#ga7">00330</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga7">sse2_copy_block</a>(__m128i* BMRESTRICT dst,
340
00331 <span class="keyword">const</span> __m128i* BMRESTRICT src,
341
00332 <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
343
00334 __m128i xmm0, xmm1, xmm2, xmm3;
344
00335 <span class="keywordflow">do</span>
346
00337 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
348
00339 xmm0 = _mm_load_si128(src+0);
349
00340 xmm1 = _mm_load_si128(src+1);
350
00341 xmm2 = _mm_load_si128(src+2);
351
00342 xmm3 = _mm_load_si128(src+3);
353
00344 _mm_store_si128(dst+0, xmm0);
354
00345 _mm_store_si128(dst+1, xmm1);
355
00346 _mm_store_si128(dst+2, xmm2);
356
00347 _mm_store_si128(dst+3, xmm3);
358
00349 xmm0 = _mm_load_si128(src+4);
359
00350 xmm1 = _mm_load_si128(src+5);
360
00351 xmm2 = _mm_load_si128(src+6);
361
00352 xmm3 = _mm_load_si128(src+7);
363
00354 _mm_store_si128(dst+4, xmm0);
364
00355 _mm_store_si128(dst+5, xmm1);
365
00356 _mm_store_si128(dst+6, xmm2);
366
00357 _mm_store_si128(dst+7, xmm3);
371
00362 } <span class="keywordflow">while</span> (src < src_end);
374
00365 <span class="comment"></span>
375
00366 <span class="comment">/*! </span>
376
00367 <span class="comment"> @brief Invert array elements</span>
377
00368 <span class="comment"> *dst = ~*dst</span>
378
00369 <span class="comment"> or</span>
379
00370 <span class="comment"> *dst ^= 0xFFFFFFFF</span>
380
00371 <span class="comment"></span>
381
00372 <span class="comment"> @ingroup SSE2</span>
382
00373 <span class="comment">*/</span>
383
00374 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a>
384
<a name="l00375"></a><a class="code" href="a00136.html#ga8">00375</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga8">sse2_invert_arr</a>(bm::word_t* first, bm::word_t* last)
386
00377 __m128i xmm1 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF,
387
00378 0xFFFFFFFF, 0xFFFFFFFF);
388
00379 __m128i* wrd_ptr = (__m128i*)first;
390
00381 <span class="keywordflow">do</span>
392
00383 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(wrd_ptr)+512, _MM_HINT_NTA);
394
00385 __m128i xmm0 = _mm_load_si128(wrd_ptr);
395
00386 xmm0 = _mm_xor_si128(xmm0, xmm1);
396
00387 _mm_store_si128(wrd_ptr, xmm0);
398
00389 } <span class="keywordflow">while</span> (wrd_ptr < (__m128i*)last);
402
00393 <span class="comment"></span>
403
00394 <span class="comment">/*!</span>
404
00395 <span class="comment"> SSE2 optimized bitcounting function implements parallel bitcounting</span>
405
00396 <span class="comment"> algorithm for SSE2 instruction set.</span>
406
00397 <span class="comment"></span>
407
00398 <span class="comment">&lt;pre&gt;</span>
408
00399 <span class="comment">unsigned CalcBitCount32(unsigned b)</span>
409
00400 <span class="comment">{</span>
410
00401 <span class="comment"> b = (b & 0x55555555) + (b >> 1 & 0x55555555);</span>
411
00402 <span class="comment"> b = (b & 0x33333333) + (b >> 2 & 0x33333333);</span>
412
00403 <span class="comment"> b = (b + (b >> 4)) & 0x0F0F0F0F;</span>
413
00404 <span class="comment"> b = b + (b >> 8);</span>
414
00405 <span class="comment"> b = (b + (b >> 16)) & 0x0000003F;</span>
415
00406 <span class="comment"> return b;</span>
416
00407 <span class="comment">}</span>
417
00408 <span class="comment">&lt;/pre&gt;</span>
418
00409 <span class="comment"></span>
419
00410 <span class="comment"> @ingroup SSE2</span>
420
00411 <span class="comment"></span>
421
00412 <span class="comment">*/</span>
422
00413 <span class="keyword">inline</span>
423
<a name="l00414"></a><a class="code" href="a00136.html#ga9">00414</a> bm::id_t <a class="code" href="a00136.html#ga9">sse2_bit_count</a>(<span class="keyword">const</span> __m128i* block, <span class="keyword">const</span> __m128i* block_end)
425
00416 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu1 = 0x55555555;
426
00417 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu2 = 0x33333333;
427
00418 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu3 = 0x0F0F0F0F;
428
00419 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu4 = 0x0000003F;
430
00421 <span class="comment">// Loading masks</span>
431
00422 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
432
00423 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
433
00424 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
434
00425 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
436
00427 mcnt = _mm_xor_si128(m1, m1); <span class="comment">// cnt = 0</span>
438
00429 __m128i tmp1, tmp2;
439
00430 <span class="keywordflow">do</span>
441
00432 __m128i b = _mm_load_si128(block);
444
00435 <span class="comment">// b = (b & 0x55555555) + (b >> 1 & 0x55555555);</span>
445
00436 tmp1 = _mm_srli_epi32(b, 1); <span class="comment">// tmp1 = (b >> 1 & 0x55555555)</span>
446
00437 tmp1 = _mm_and_si128(tmp1, m1);
447
00438 tmp2 = _mm_and_si128(b, m1); <span class="comment">// tmp2 = (b & 0x55555555)</span>
448
00439 b = _mm_add_epi32(tmp1, tmp2); <span class="comment">// b = tmp1 + tmp2</span>
450
00441 <span class="comment">// b = (b & 0x33333333) + (b >> 2 & 0x33333333);</span>
451
00442 tmp1 = _mm_srli_epi32(b, 2); <span class="comment">// (b >> 2 & 0x33333333)</span>
452
00443 tmp1 = _mm_and_si128(tmp1, m2);
453
00444 tmp2 = _mm_and_si128(b, m2); <span class="comment">// (b & 0x33333333)</span>
454
00445 b = _mm_add_epi32(tmp1, tmp2); <span class="comment">// b = tmp1 + tmp2</span>
456
00447 <span class="comment">// b = (b + (b >> 4)) & 0x0F0F0F0F;</span>
457
00448 tmp1 = _mm_srli_epi32(b, 4); <span class="comment">// tmp1 = b >> 4</span>
458
00449 b = _mm_add_epi32(b, tmp1); <span class="comment">// b = b + (b >> 4)</span>
459
00450 b = _mm_and_si128(b, m3); <span class="comment">// & 0x0F0F0F0F</span>
461
00452 <span class="comment">// b = b + (b >> 8);</span>
462
00453 tmp1 = _mm_srli_epi32 (b, 8); <span class="comment">// tmp1 = b >> 8</span>
463
00454 b = _mm_add_epi32(b, tmp1); <span class="comment">// b = b + (b >> 8)</span>
465
00456 <span class="comment">// b = (b + (b >> 16)) & 0x0000003F;</span>
466
00457 tmp1 = _mm_srli_epi32 (b, 16); <span class="comment">// b >> 16</span>
467
00458 b = _mm_add_epi32(b, tmp1); <span class="comment">// b + (b >> 16)</span>
468
00459 b = _mm_and_si128(b, m4); <span class="comment">// (b + (b >> 16)) & 0x0000003F</span>
470
00461 mcnt = _mm_add_epi32(mcnt, b); <span class="comment">// mcnt += b</span>
472
00463 } <span class="keywordflow">while</span> (block < block_end);
474
00465 __declspec(align(16)) bm::id_t tcnt[4];
475
00466 _mm_store_si128((__m128i*)tcnt, mcnt);
477
00468 <span class="keywordflow">return</span> tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
480
00471 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a>
481
<a name="l00472"></a><a class="code" href="a00129.html#a173">00472</a> __m128i <a class="code" href="a00129.html#a173">sse2_and</a>(__m128i a, __m128i b)
483
00474 <span class="keywordflow">return</span> _mm_and_si128(a, b);
486
00477 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a>
487
<a name="l00478"></a><a class="code" href="a00129.html#a174">00478</a> __m128i <a class="code" href="a00129.html#a174">sse2_or</a>(__m128i a, __m128i b)
489
00480 <span class="keywordflow">return</span> _mm_or_si128(a, b);
493
00484 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a>
494
<a name="l00485"></a><a class="code" href="a00129.html#a175">00485</a> __m128i <a class="code" href="a00129.html#a175">sse2_xor</a>(__m128i a, __m128i b)
496
00487 <span class="keywordflow">return</span> _mm_xor_si128(a, b);
499
00490 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a>
500
<a name="l00491"></a><a class="code" href="a00129.html#a176">00491</a> __m128i <a class="code" href="a00129.html#a176">sse2_sub</a>(__m128i a, __m128i b)
502
00493 <span class="keywordflow">return</span> _mm_andnot_si128(b, a);
506
00497 <span class="keyword">template</span>&lt;<span class="keyword">class</span> Func&gt;
507
<a name="l00498"></a><a class="code" href="a00129.html#a177">00498</a> bm::id_t <a class="code" href="a00129.html#a177">sse2_bit_count_op</a>(<span class="keyword">const</span> __m128i* BMRESTRICT block,
508
00499 <span class="keyword">const</span> __m128i* BMRESTRICT block_end,
509
00500 <span class="keyword">const</span> __m128i* BMRESTRICT mask_block,
510
00501 Func sse2_func)
512
00503 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu1 = 0x55555555;
513
00504 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu2 = 0x33333333;
514
00505 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu3 = 0x0F0F0F0F;
515
00506 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu4 = 0x0000003F;
517
00508 <span class="comment">// Loading masks</span>
518
00509 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
519
00510 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
520
00511 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
521
00512 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
523
00514 mcnt = _mm_xor_si128(m1, m1); <span class="comment">// cnt = 0</span>
524
00515 <span class="keywordflow">do</span>
526
00517 __m128i tmp1, tmp2;
527
00518 __m128i b = _mm_load_si128(block++);
529
00520 tmp1 = _mm_load_si128(mask_block++);
531
00522 b = sse2_func(b, tmp1);
533
00524 <span class="comment">// b = (b & 0x55555555) + (b >> 1 & 0x55555555);</span>
534
00525 tmp1 = _mm_srli_epi32(b, 1); <span class="comment">// tmp1 = (b >> 1 & 0x55555555)</span>
535
00526 tmp1 = _mm_and_si128(tmp1, m1);
536
00527 tmp2 = _mm_and_si128(b, m1); <span class="comment">// tmp2 = (b & 0x55555555)</span>
537
00528 b = _mm_add_epi32(tmp1, tmp2); <span class="comment">// b = tmp1 + tmp2</span>
539
00530 <span class="comment">// b = (b & 0x33333333) + (b >> 2 & 0x33333333);</span>
540
00531 tmp1 = _mm_srli_epi32(b, 2); <span class="comment">// (b >> 2 & 0x33333333)</span>
541
00532 tmp1 = _mm_and_si128(tmp1, m2);
542
00533 tmp2 = _mm_and_si128(b, m2); <span class="comment">// (b & 0x33333333)</span>
543
00534 b = _mm_add_epi32(tmp1, tmp2); <span class="comment">// b = tmp1 + tmp2</span>
545
00536 <span class="comment">// b = (b + (b >> 4)) & 0x0F0F0F0F;</span>
546
00537 tmp1 = _mm_srli_epi32(b, 4); <span class="comment">// tmp1 = b >> 4</span>
547
00538 b = _mm_add_epi32(b, tmp1); <span class="comment">// b = b + (b >> 4)</span>
548
00539 b = _mm_and_si128(b, m3); <span class="comment">// & 0x0F0F0F0F</span>
550
00541 <span class="comment">// b = b + (b >> 8);</span>
551
00542 tmp1 = _mm_srli_epi32 (b, 8); <span class="comment">// tmp1 = b >> 8</span>
552
00543 b = _mm_add_epi32(b, tmp1); <span class="comment">// b = b + (b >> 8)</span>
554
00545 <span class="comment">// b = (b + (b >> 16)) & 0x0000003F;</span>
555
00546 tmp1 = _mm_srli_epi32 (b, 16); <span class="comment">// b >> 16</span>
556
00547 b = _mm_add_epi32(b, tmp1); <span class="comment">// b + (b >> 16)</span>
557
00548 b = _mm_and_si128(b, m4); <span class="comment">// (b + (b >> 16)) & 0x0000003F</span>
559
00550 mcnt = _mm_add_epi32(mcnt, b); <span class="comment">// mcnt += b</span>
561
00552 } <span class="keywordflow">while</span> (block < block_end);
563
00554 __declspec(align(16)) bm::id_t tcnt[4];
564
00555 _mm_store_si128((__m128i*)tcnt, mcnt);
566
00557 <span class="keywordflow">return</span> tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
572
<a name="l00563"></a><a class="code" href="a00116.html#a0">00563</a> <span class="preprocessor">#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\</span>
573
00564 <span class="preprocessor"> sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)</span>
574
00565 <span class="preprocessor"></span>
575
<a name="l00566"></a><a class="code" href="a00116.html#a1">00566</a> <span class="preprocessor">#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\</span>
576
00567 <span class="preprocessor"> sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)</span>
577
00568 <span class="preprocessor"></span>
578
<a name="l00569"></a><a class="code" href="a00116.html#a2">00569</a> <span class="preprocessor">#define VECT_BITCOUNT(first, last) \</span>
579
00570 <span class="preprocessor"> sse2_bit_count((__m128i*) (first), (__m128i*) (last)) </span>
580
00571 <span class="preprocessor"></span>
581
<a name="l00572"></a><a class="code" href="a00116.html#a3">00572</a> <span class="preprocessor">#define VECT_BITCOUNT_AND(first, last, mask) \</span>
582
00573 <span class="preprocessor"> sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and) </span>
583
00574 <span class="preprocessor"></span>
584
<a name="l00575"></a><a class="code" href="a00116.html#a4">00575</a> <span class="preprocessor">#define VECT_BITCOUNT_OR(first, last, mask) \</span>
585
00576 <span class="preprocessor"> sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or) </span>
586
00577 <span class="preprocessor"></span>
587
<a name="l00578"></a><a class="code" href="a00116.html#a5">00578</a> <span class="preprocessor">#define VECT_BITCOUNT_XOR(first, last, mask) \</span>
588
00579 <span class="preprocessor"> sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor) </span>
589
00580 <span class="preprocessor"></span>
590
<a name="l00581"></a><a class="code" href="a00116.html#a6">00581</a> <span class="preprocessor">#define VECT_BITCOUNT_SUB(first, last, mask) \</span>
591
00582 <span class="preprocessor"> sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub) </span>
592
00583 <span class="preprocessor"></span>
593
<a name="l00584"></a><a class="code" href="a00116.html#a7">00584</a> <span class="preprocessor">#define VECT_INVERT_ARR(first, last) \</span>
594
00585 <span class="preprocessor"> sse2_invert_arr(first, last);</span>
595
00586 <span class="preprocessor"></span>
596
<a name="l00587"></a><a class="code" href="a00116.html#a8">00587</a> <span class="preprocessor">#define VECT_AND_ARR(dst, src, src_end) \</span>
597
00588 <span class="preprocessor"> sse2_and_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
598
00589 <span class="preprocessor"></span>
599
<a name="l00590"></a><a class="code" href="a00116.html#a9">00590</a> <span class="preprocessor">#define VECT_OR_ARR(dst, src, src_end) \</span>
600
00591 <span class="preprocessor"> sse2_or_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
601
00592 <span class="preprocessor"></span>
602
<a name="l00593"></a><a class="code" href="a00116.html#a10">00593</a> <span class="preprocessor">#define VECT_SUB_ARR(dst, src, src_end) \</span>
603
00594 <span class="preprocessor"> sse2_sub_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
604
00595 <span class="preprocessor"></span>
605
<a name="l00596"></a><a class="code" href="a00116.html#a11">00596</a> <span class="preprocessor">#define VECT_XOR_ARR(dst, src, src_end) \</span>
606
00597 <span class="preprocessor"> sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
607
00598 <span class="preprocessor"></span>
608
<a name="l00599"></a><a class="code" href="a00116.html#a12">00599</a> <span class="preprocessor">#define VECT_COPY_BLOCK(dst, src, src_end) \</span>
609
00600 <span class="preprocessor"> sse2_copy_block((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
610
00601 <span class="preprocessor"></span>
611
<a name="l00602"></a><a class="code" href="a00116.html#a13">00602</a> <span class="preprocessor">#define VECT_SET_BLOCK(dst, dst_end, value) \</span>
612
00603 <span class="preprocessor"> sse2_set_block((__m128i*) dst, (__m128i*) (dst_end), (value))</span>
613
00604 <span class="preprocessor"></span>
614
00605 } <span class="comment">// namespace</span>
616
00607 <span class="preprocessor">#endif</span>
617
</pre></div><hr size="1"><address style="align: right;"><small>Generated on Sun Aug 5 14:12:26 2007 for BitMagic by
618
<a href="http://www.doxygen.org/index.html">
619
<img src="doxygen.png" alt="doxygen" align="middle" border="0"></a> 1.4.1 </small></address>