~ubuntu-branches/ubuntu/jaunty/bmagic/jaunty

« back to all changes in this revision

Viewing changes to html/a00142.html

  • Committer: Bazaar Package Importer
  • Author(s): Andres Salomon
  • Date: 2008-01-05 23:58:56 UTC
  • mfrom: (1.1.1 upstream)
  • Revision ID: james.westby@ubuntu.com-20080105235856-2kmxhxkz14qjy9ia
Tags: 3.5.0-1
* New upstream release.
* Add tcpp.dpatch.  This stops tests/stress/t.cpp from including
  ncbi_pch.hpp.  As far as I can tell, NCBI is not used at all, I have
  no idea where that came from..
* Silence some lintian warnings; binary-arch-rules-but-pkg-is-arch-indep
  and ancient-standards-version.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
 
2
<html><head><meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1">
 
3
<title>BitMagic: bmsse2.h Source File</title>
 
4
<link href="doxygen.css" rel="stylesheet" type="text/css">
 
5
</head><body>
 
6
<!-- Generated by Doxygen 1.4.1 -->
 
7
<div class="qindex"><a class="qindex" href="index.html">Main&nbsp;Page</a> | <a class="qindex" href="modules.html">Modules</a> | <a class="qindex" href="namespaces.html">Namespace List</a> | <a class="qindex" href="hierarchy.html">Class&nbsp;Hierarchy</a> | <a class="qindex" href="classes.html">Alphabetical&nbsp;List</a> | <a class="qindex" href="annotated.html">Data&nbsp;Structures</a> | <a class="qindex" href="dirs.html">Directories</a> | <a class="qindex" href="files.html">File&nbsp;List</a> | <a class="qindex" href="namespacemembers.html">Namespace&nbsp;Members</a> | <a class="qindex" href="functions.html">Data&nbsp;Fields</a> | <a class="qindex" href="globals.html">Globals</a> | <a class="qindex" href="examples.html">Examples</a></div>
 
8
<div class="nav">
 
9
<a class="el" href="dir_000000.html">src</a></div>
 
10
<h1>bmsse2.h</h1><a href="a00116.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment">00001 <span class="comment">/*</span>
 
11
00002 <span class="comment">Copyright(c) 2002-2005 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)</span>
 
12
00003 <span class="comment"></span>
 
13
00004 <span class="comment">Permission is hereby granted, free of charge, to any person </span>
 
14
00005 <span class="comment">obtaining a copy of this software and associated documentation </span>
 
15
00006 <span class="comment">files (the "Software"), to deal in the Software without restriction, </span>
 
16
00007 <span class="comment">including without limitation the rights to use, copy, modify, merge, </span>
 
17
00008 <span class="comment">publish, distribute, sublicense, and/or sell copies of the Software, </span>
 
18
00009 <span class="comment">and to permit persons to whom the Software is furnished to do so, </span>
 
19
00010 <span class="comment">subject to the following conditions:</span>
 
20
00011 <span class="comment"></span>
 
21
00012 <span class="comment">The above copyright notice and this permission notice shall be included </span>
 
22
00013 <span class="comment">in all copies or substantial portions of the Software.</span>
 
23
00014 <span class="comment"></span>
 
24
00015 <span class="comment">THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, </span>
 
25
00016 <span class="comment">EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES </span>
 
26
00017 <span class="comment">OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. </span>
 
27
00018 <span class="comment">IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, </span>
 
28
00019 <span class="comment">DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, </span>
 
29
00020 <span class="comment">ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR </span>
 
30
00021 <span class="comment">OTHER DEALINGS IN THE SOFTWARE.</span>
 
31
00022 <span class="comment"></span>
 
32
00023 <span class="comment">For more information please visit:  http://bmagic.sourceforge.net</span>
 
33
00024 <span class="comment"></span>
 
34
00025 <span class="comment">*/</span>
 
35
00026 
 
36
00027 
 
37
00028 <span class="preprocessor">#ifndef BMSSE2__H__INCLUDED__</span>
 
38
00029 <span class="preprocessor"></span><span class="preprocessor">#define BMSSE2__H__INCLUDED__</span>
 
39
00030 <span class="preprocessor"></span>
 
40
00031 
 
41
00032 <span class="comment">//    Header implements processor specific intrinsics declarations for SSE2</span>
 
42
00033 <span class="comment">//    instruction set</span>
 
43
00034 <span class="preprocessor">#include&lt;emmintrin.h&gt;</span>
 
44
00035 
 
45
00036 
 
46
00037 
 
47
00038 <span class="keyword">namespace </span>bm
 
48
00039 {
 
49
00040 <span class="comment"></span>
 
50
00041 <span class="comment">/** @defgroup SSE2 Processor specific optimizations for SSE2 instructions</span>
 
51
00042 <span class="comment"> *  @ingroup bmagic</span>
 
52
00043 <span class="comment"> */</span>
 
53
00044 
 
54
00045 <span class="comment"></span>
 
55
00046 <span class="comment">/*! </span>
 
56
00047 <span class="comment">  @brief SSE2 reinitialization guard class</span>
 
57
00048 <span class="comment"></span>
 
58
00049 <span class="comment">  SSE2 requires calling _mm_empty() if we are intermixing</span>
 
59
00050 <span class="comment">  MMX integer commands with floating-point arithmetic.</span>
 
60
00051 <span class="comment">  This class guards critical code fragments where SSE2 integer</span>
 
61
00052 <span class="comment">  is used.</span>
 
62
00053 <span class="comment"></span>
 
63
00054 <span class="comment">  @ingroup SSE2</span>
 
64
00055 <span class="comment"></span>
 
65
00056 <span class="comment">*/</span>
 
66
<a name="l00057"></a><a class="code" href="a00110.html">00057</a> <span class="keyword">class </span><a class="code" href="a00110.html">sse2_empty_guard</a>
 
67
00058 {
 
68
00059 <span class="keyword">public</span>:
 
69
<a name="l00060"></a><a class="code" href="a00110.html#a0">00060</a>     <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> <a class="code" href="a00110.html#a0">sse2_empty_guard</a>() 
 
70
00061     {
 
71
00062         _mm_empty();
 
72
00063     }
 
73
00064 
 
74
<a name="l00065"></a><a class="code" href="a00110.html#a1">00065</a>     <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> <a class="code" href="a00110.html#a1">~sse2_empty_guard</a>() 
 
75
00066     {
 
76
00067         _mm_empty();
 
77
00068     }
 
78
00069 };
 
79
00070 
 
80
00071 <span class="comment">/*</span>
 
81
00072 <span class="comment"># ifndef BM_SET_MMX_GUARD</span>
 
82
00073 <span class="comment">#  define BM_SET_MMX_GUARD  sse2_empty_guard  bm_mmx_guard_;</span>
 
83
00074 <span class="comment"># endif</span>
 
84
00075 <span class="comment">*/</span>
 
85
00076 <span class="comment"></span>
 
86
00077 <span class="comment">/*! </span>
 
87
00078 <span class="comment">    @brief XOR array elements to specified mask</span>
 
88
00079 <span class="comment">    *dst = *src ^ mask</span>
 
89
00080 <span class="comment"></span>
 
90
00081 <span class="comment">    @ingroup SSE2</span>
 
91
00082 <span class="comment">*/</span>
 
92
00083 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> 
 
93
<a name="l00084"></a><a class="code" href="a00136.html#ga0">00084</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga0">sse2_xor_arr_2_mask</a>(__m128i* BMRESTRICT dst, 
 
94
00085                          <span class="keyword">const</span> __m128i* BMRESTRICT src, 
 
95
00086                          <span class="keyword">const</span> __m128i* BMRESTRICT src_end,
 
96
00087                          bm::word_t mask)
 
97
00088 {
 
98
00089      __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
 
99
00090      <span class="keywordflow">do</span>
 
100
00091      {
 
101
00092         __m128i xmm1 = _mm_load_si128(src);
 
102
00093 
 
103
00094         xmm1 = _mm_xor_si128(xmm1, xmm2);
 
104
00095         _mm_store_si128(dst, xmm1);
 
105
00096         ++dst;
 
106
00097         ++src;
 
107
00098 
 
108
00099      } <span class="keywordflow">while</span> (src &lt; src_end);
 
109
00100 }
 
110
00101 <span class="comment"></span>
 
111
00102 <span class="comment">/*! </span>
 
112
00103 <span class="comment">    @brief Inverts array elements and ANDs them with the specified mask</span>
 
113
00104 <span class="comment">    *dst = ~*src &amp; mask</span>
 
114
00105 <span class="comment"></span>
 
115
00106 <span class="comment">    @ingroup SSE2</span>
 
116
00107 <span class="comment">*/</span>
 
117
00108 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> 
 
118
<a name="l00109"></a><a class="code" href="a00136.html#ga1">00109</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga1">sse2_andnot_arr_2_mask</a>(__m128i* BMRESTRICT dst, 
 
119
00110                             <span class="keyword">const</span> __m128i* BMRESTRICT src, 
 
120
00111                             <span class="keyword">const</span> __m128i* BMRESTRICT src_end,
 
121
00112                             bm::word_t mask)
 
122
00113 {
 
123
00114      __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
 
124
00115      <span class="keywordflow">do</span>
 
125
00116      {
 
126
00117         <span class="comment">//_mm_prefetch((const char*)(src)+1024, _MM_HINT_NTA);</span>
 
127
00118         <span class="comment">//_mm_prefetch((const char*)(src)+1088, _MM_HINT_NTA);</span>
 
128
00119 
 
129
00120         __m128i xmm1 = _mm_load_si128(src);
 
130
00121 
 
131
00122         xmm1 = _mm_andnot_si128(xmm1, xmm2); <span class="comment">// xmm1 = (~xmm1) &amp; xmm2 </span>
 
132
00123         _mm_store_si128(dst, xmm1);
 
133
00124         ++dst;
 
134
00125         ++src;
 
135
00126 
 
136
00127      } <span class="keywordflow">while</span> (src &lt; src_end);
 
137
00128 }
 
138
00129 <span class="comment"></span>
 
139
00130 <span class="comment">/*! </span>
 
140
00131 <span class="comment">    @brief AND array elements against another array</span>
 
141
00132 <span class="comment">    *dst &amp;= *src</span>
 
142
00133 <span class="comment"></span>
 
143
00134 <span class="comment">    @ingroup SSE2</span>
 
144
00135 <span class="comment">*/</span>
 
145
00136 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> 
 
146
<a name="l00137"></a><a class="code" href="a00136.html#ga2">00137</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga2">sse2_and_arr</a>(__m128i* BMRESTRICT dst, 
 
147
00138                   <span class="keyword">const</span> __m128i* BMRESTRICT src, 
 
148
00139                   <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
 
149
00140 {
 
150
00141     __m128i xmm1, xmm2;
 
151
00142     <span class="keywordflow">do</span>
 
152
00143     {
 
153
00144         _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512,  _MM_HINT_NTA);
 
154
00145     
 
155
00146         xmm1 = _mm_load_si128(src++);
 
156
00147         xmm2 = _mm_load_si128(dst);
 
157
00148         xmm1 = _mm_and_si128(xmm1, xmm2);
 
158
00149         _mm_store_si128(dst++, xmm1);
 
159
00150         
 
160
00151         xmm1 = _mm_load_si128(src++);
 
161
00152         xmm2 = _mm_load_si128(dst);
 
162
00153         xmm1 = _mm_and_si128(xmm1, xmm2);
 
163
00154         _mm_store_si128(dst++, xmm1);
 
164
00155 
 
165
00156         xmm1 = _mm_load_si128(src++);
 
166
00157         xmm2 = _mm_load_si128(dst);
 
167
00158         xmm1 = _mm_and_si128(xmm1, xmm2);
 
168
00159         _mm_store_si128(dst++, xmm1);
 
169
00160 
 
170
00161         xmm1 = _mm_load_si128(src++);
 
171
00162         xmm2 = _mm_load_si128(dst);
 
172
00163         xmm1 = _mm_and_si128(xmm1, xmm2);
 
173
00164         _mm_store_si128(dst++, xmm1);
 
174
00165 
 
175
00166     } <span class="keywordflow">while</span> (src &lt; src_end);
 
176
00167 
 
177
00168 }
 
178
00169 
 
179
00170 
 
180
00171 <span class="comment"></span>
 
181
00172 <span class="comment">/*! </span>
 
182
00173 <span class="comment">    @brief OR array elements against another array</span>
 
183
00174 <span class="comment">    *dst |= *src</span>
 
184
00175 <span class="comment"></span>
 
185
00176 <span class="comment">    @ingroup SSE2</span>
 
186
00177 <span class="comment">*/</span>
 
187
00178 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> 
 
188
<a name="l00179"></a><a class="code" href="a00136.html#ga3">00179</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga3">sse2_or_arr</a>(__m128i* BMRESTRICT dst, 
 
189
00180                  <span class="keyword">const</span> __m128i* BMRESTRICT src, 
 
190
00181                  <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
 
191
00182 {
 
192
00183     __m128i xmm1, xmm2;
 
193
00184     <span class="keywordflow">do</span>
 
194
00185     {
 
195
00186         _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512,  _MM_HINT_NTA);
 
196
00187     
 
197
00188         xmm1 = _mm_load_si128(src++);
 
198
00189         xmm2 = _mm_load_si128(dst);
 
199
00190         xmm1 = _mm_or_si128(xmm1, xmm2);
 
200
00191         _mm_store_si128(dst++, xmm1);
 
201
00192         
 
202
00193         xmm1 = _mm_load_si128(src++);
 
203
00194         xmm2 = _mm_load_si128(dst);
 
204
00195         xmm1 = _mm_or_si128(xmm1, xmm2);
 
205
00196         _mm_store_si128(dst++, xmm1);
 
206
00197 
 
207
00198         xmm1 = _mm_load_si128(src++);
 
208
00199         xmm2 = _mm_load_si128(dst);
 
209
00200         xmm1 = _mm_or_si128(xmm1, xmm2);
 
210
00201         _mm_store_si128(dst++, xmm1);
 
211
00202 
 
212
00203         xmm1 = _mm_load_si128(src++);
 
213
00204         xmm2 = _mm_load_si128(dst);
 
214
00205         xmm1 = _mm_or_si128(xmm1, xmm2);
 
215
00206         _mm_store_si128(dst++, xmm1);
 
216
00207 
 
217
00208     } <span class="keywordflow">while</span> (src &lt; src_end);
 
218
00209 }
 
219
00210 <span class="comment"></span>
 
220
00211 <span class="comment">/*! </span>
 
221
00212 <span class="comment">    @brief XOR array elements against another array</span>
 
222
00213 <span class="comment">    *dst ^= *src</span>
 
223
00214 <span class="comment"></span>
 
224
00215 <span class="comment">    @ingroup SSE2</span>
 
225
00216 <span class="comment">*/</span>
 
226
00217 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> 
 
227
<a name="l00218"></a><a class="code" href="a00136.html#ga4">00218</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga4">sse2_xor_arr</a>(__m128i* BMRESTRICT dst, 
 
228
00219                   <span class="keyword">const</span> __m128i* BMRESTRICT src, 
 
229
00220                   <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
 
230
00221 {
 
231
00222     __m128i xmm1, xmm2;
 
232
00223     <span class="keywordflow">do</span>
 
233
00224     {
 
234
00225         _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512,  _MM_HINT_NTA);
 
235
00226     
 
236
00227         xmm1 = _mm_load_si128(src++);
 
237
00228         xmm2 = _mm_load_si128(dst);
 
238
00229         xmm1 = _mm_xor_si128(xmm1, xmm2);
 
239
00230         _mm_store_si128(dst++, xmm1);
 
240
00231         
 
241
00232         xmm1 = _mm_load_si128(src++);
 
242
00233         xmm2 = _mm_load_si128(dst);
 
243
00234         xmm1 = _mm_xor_si128(xmm1, xmm2);
 
244
00235         _mm_store_si128(dst++, xmm1);
 
245
00236 
 
246
00237         xmm1 = _mm_load_si128(src++);
 
247
00238         xmm2 = _mm_load_si128(dst);
 
248
00239         xmm1 = _mm_xor_si128(xmm1, xmm2);
 
249
00240         _mm_store_si128(dst++, xmm1);
 
250
00241 
 
251
00242         xmm1 = _mm_load_si128(src++);
 
252
00243         xmm2 = _mm_load_si128(dst);
 
253
00244         xmm1 = _mm_xor_si128(xmm1, xmm2);
 
254
00245         _mm_store_si128(dst++, xmm1);
 
255
00246 
 
256
00247     } <span class="keywordflow">while</span> (src &lt; src_end);
 
257
00248 }
 
258
00249 
 
259
00250 <span class="comment"></span>
 
260
00251 <span class="comment">/*! </span>
 
261
00252 <span class="comment">    @brief AND-NOT (SUB) array elements against another array</span>
 
262
00253 <span class="comment">    *dst &amp;= ~*src</span>
 
263
00254 <span class="comment"></span>
 
264
00255 <span class="comment">    @ingroup SSE2</span>
 
265
00256 <span class="comment">*/</span>
 
266
00257 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> 
 
267
<a name="l00258"></a><a class="code" href="a00136.html#ga5">00258</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga5">sse2_sub_arr</a>(__m128i* BMRESTRICT dst, 
 
268
00259                  <span class="keyword">const</span> __m128i* BMRESTRICT src, 
 
269
00260                  <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
 
270
00261 {
 
271
00262     __m128i xmm1, xmm2;
 
272
00263     <span class="keywordflow">do</span>
 
273
00264     {
 
274
00265         _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512,  _MM_HINT_NTA);
 
275
00266     
 
276
00267         xmm1 = _mm_load_si128(src++);
 
277
00268         xmm2 = _mm_load_si128(dst);
 
278
00269         xmm1 = _mm_andnot_si128(xmm1, xmm2);
 
279
00270         _mm_store_si128(dst++, xmm1);
 
280
00271         
 
281
00272         xmm1 = _mm_load_si128(src++);
 
282
00273         xmm2 = _mm_load_si128(dst);
 
283
00274         xmm1 = _mm_andnot_si128(xmm1, xmm2);
 
284
00275         _mm_store_si128(dst++, xmm1);
 
285
00276 
 
286
00277         xmm1 = _mm_load_si128(src++);
 
287
00278         xmm2 = _mm_load_si128(dst);
 
288
00279         xmm1 = _mm_andnot_si128(xmm1, xmm2);
 
289
00280         _mm_store_si128(dst++, xmm1);
 
290
00281 
 
291
00282         xmm1 = _mm_load_si128(src++);
 
292
00283         xmm2 = _mm_load_si128(dst);
 
293
00284         xmm1 = _mm_andnot_si128(xmm1, xmm2);
 
294
00285         _mm_store_si128(dst++, xmm1);
 
295
00286 
 
296
00287     } <span class="keywordflow">while</span> (src &lt; src_end);    
 
297
00288 }
 
298
00289 <span class="comment"></span>
 
299
00290 <span class="comment">/*! </span>
 
300
00291 <span class="comment">    @brief SSE2 block memset</span>
 
301
00292 <span class="comment">    *dst = value</span>
 
302
00293 <span class="comment"></span>
 
303
00294 <span class="comment">    @ingroup SSE2</span>
 
304
00295 <span class="comment">*/</span>
 
305
00296 
 
306
00297 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> 
 
307
<a name="l00298"></a><a class="code" href="a00136.html#ga6">00298</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga6">sse2_set_block</a>(__m128i* BMRESTRICT dst, 
 
308
00299                     __m128i* BMRESTRICT dst_end, 
 
309
00300                     bm::word_t value)
 
310
00301 {
 
311
00302     __m128i xmm0 = _mm_set_epi32 (value, value, value, value);
 
312
00303     <span class="keywordflow">do</span>
 
313
00304     {            
 
314
00305         _mm_store_si128(dst, xmm0);
 
315
00306 <span class="comment">/*        </span>
 
316
00307 <span class="comment">        _mm_store_si128(dst+1, xmm0);</span>
 
317
00308 <span class="comment">        _mm_store_si128(dst+2, xmm0);</span>
 
318
00309 <span class="comment">        _mm_store_si128(dst+3, xmm0);</span>
 
319
00310 <span class="comment"></span>
 
320
00311 <span class="comment">        _mm_store_si128(dst+4, xmm0);</span>
 
321
00312 <span class="comment">        _mm_store_si128(dst+5, xmm0);</span>
 
322
00313 <span class="comment">        _mm_store_si128(dst+6, xmm0);</span>
 
323
00314 <span class="comment">        _mm_store_si128(dst+7, xmm0);</span>
 
324
00315 <span class="comment"></span>
 
325
00316 <span class="comment">        dst += 8;</span>
 
326
00317 <span class="comment">*/</span>        
 
327
00318     } <span class="keywordflow">while</span> (++dst &lt; dst_end);
 
328
00319     
 
329
00320     _mm_sfence();
 
330
00321 }
 
331
00322 <span class="comment"></span>
 
332
00323 <span class="comment">/*! </span>
 
333
00324 <span class="comment">    @brief SSE2 block copy</span>
 
334
00325 <span class="comment">    *dst = *src</span>
 
335
00326 <span class="comment"></span>
 
336
00327 <span class="comment">    @ingroup SSE2</span>
 
337
00328 <span class="comment">*/</span>
 
338
00329 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> 
 
339
<a name="l00330"></a><a class="code" href="a00136.html#ga7">00330</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga7">sse2_copy_block</a>(__m128i* BMRESTRICT dst, 
 
340
00331                      <span class="keyword">const</span> __m128i* BMRESTRICT src, 
 
341
00332                      <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
 
342
00333 {
 
343
00334     __m128i xmm0, xmm1, xmm2, xmm3;
 
344
00335     <span class="keywordflow">do</span>
 
345
00336     {
 
346
00337         _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512,  _MM_HINT_NTA);
 
347
00338     
 
348
00339         xmm0 = _mm_load_si128(src+0);
 
349
00340         xmm1 = _mm_load_si128(src+1);
 
350
00341         xmm2 = _mm_load_si128(src+2);
 
351
00342         xmm3 = _mm_load_si128(src+3);
 
352
00343         
 
353
00344         _mm_store_si128(dst+0, xmm0);
 
354
00345         _mm_store_si128(dst+1, xmm1);
 
355
00346         _mm_store_si128(dst+2, xmm2);
 
356
00347         _mm_store_si128(dst+3, xmm3);
 
357
00348         
 
358
00349         xmm0 = _mm_load_si128(src+4);
 
359
00350         xmm1 = _mm_load_si128(src+5);
 
360
00351         xmm2 = _mm_load_si128(src+6);
 
361
00352         xmm3 = _mm_load_si128(src+7);
 
362
00353         
 
363
00354         _mm_store_si128(dst+4, xmm0);
 
364
00355         _mm_store_si128(dst+5, xmm1);
 
365
00356         _mm_store_si128(dst+6, xmm2);
 
366
00357         _mm_store_si128(dst+7, xmm3);
 
367
00358         
 
368
00359         src += 8;
 
369
00360         dst += 8;
 
370
00361         
 
371
00362     } <span class="keywordflow">while</span> (src &lt; src_end);    
 
372
00363 }
 
373
00364 
 
374
00365 <span class="comment"></span>
 
375
00366 <span class="comment">/*! </span>
 
376
00367 <span class="comment">    @brief Invert array elements</span>
 
377
00368 <span class="comment">    *dst = ~*dst</span>
 
378
00369 <span class="comment">    or</span>
 
379
00370 <span class="comment">    *dst ^= 0xFFFFFFFF </span>
 
380
00371 <span class="comment"></span>
 
381
00372 <span class="comment">    @ingroup SSE2</span>
 
382
00373 <span class="comment">*/</span>
 
383
00374 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> 
 
384
<a name="l00375"></a><a class="code" href="a00136.html#ga8">00375</a> <span class="keywordtype">void</span> <a class="code" href="a00136.html#ga8">sse2_invert_arr</a>(bm::word_t* first, bm::word_t* last)
 
385
00376 {
 
386
00377     __m128i xmm1 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 
 
387
00378                                  0xFFFFFFFF, 0xFFFFFFFF);
 
388
00379     __m128i* wrd_ptr = (__m128i*)first;
 
389
00380 
 
390
00381     <span class="keywordflow">do</span> 
 
391
00382     {
 
392
00383         _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(wrd_ptr)+512,  _MM_HINT_NTA);
 
393
00384         
 
394
00385         __m128i xmm0 = _mm_load_si128(wrd_ptr);
 
395
00386         xmm0 = _mm_xor_si128(xmm0, xmm1);
 
396
00387         _mm_store_si128(wrd_ptr, xmm0);
 
397
00388         ++wrd_ptr;
 
398
00389     } <span class="keywordflow">while</span> (wrd_ptr &lt; (__m128i*)last);
 
399
00390 }
 
400
00391 
 
401
00392 
 
402
00393 <span class="comment"></span>
 
403
00394 <span class="comment">/*!</span>
 
404
00395 <span class="comment">    SSE2 optimized bitcounting function implements parallel bitcounting</span>
 
405
00396 <span class="comment">    algorithm for SSE2 instruction set.</span>
 
406
00397 <span class="comment"></span>
 
407
00398 <span class="comment">&lt;pre&gt;</span>
 
408
00399 <span class="comment">unsigned CalcBitCount32(unsigned b)</span>
 
409
00400 <span class="comment">{</span>
 
410
00401 <span class="comment">    b = (b &amp; 0x55555555) + (b &gt;&gt; 1 &amp; 0x55555555);</span>
 
411
00402 <span class="comment">    b = (b &amp; 0x33333333) + (b &gt;&gt; 2 &amp; 0x33333333);</span>
 
412
00403 <span class="comment">    b = (b + (b &gt;&gt; 4)) &amp; 0x0F0F0F0F;</span>
 
413
00404 <span class="comment">    b = b + (b &gt;&gt; 8);</span>
 
414
00405 <span class="comment">    b = (b + (b &gt;&gt; 16)) &amp; 0x0000003F;</span>
 
415
00406 <span class="comment">    return b;</span>
 
416
00407 <span class="comment">}</span>
 
417
00408 <span class="comment">&lt;/pre&gt;</span>
 
418
00409 <span class="comment"></span>
 
419
00410 <span class="comment">    @ingroup SSE2</span>
 
420
00411 <span class="comment"></span>
 
421
00412 <span class="comment">*/</span>
 
422
00413 <span class="keyword">inline</span> 
 
423
<a name="l00414"></a><a class="code" href="a00136.html#ga9">00414</a> bm::id_t <a class="code" href="a00136.html#ga9">sse2_bit_count</a>(<span class="keyword">const</span> __m128i* block, <span class="keyword">const</span> __m128i* block_end)
 
424
00415 {
 
425
00416     <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu1 = 0x55555555;
 
426
00417     <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu2 = 0x33333333;
 
427
00418     <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu3 = 0x0F0F0F0F;
 
428
00419     <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu4 = 0x0000003F;
 
429
00420 
 
430
00421     <span class="comment">// Loading masks</span>
 
431
00422     __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
 
432
00423     __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
 
433
00424     __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
 
434
00425     __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
 
435
00426     __m128i mcnt;
 
436
00427     mcnt = _mm_xor_si128(m1, m1); <span class="comment">// cnt = 0</span>
 
437
00428 
 
438
00429     __m128i tmp1, tmp2;
 
439
00430     <span class="keywordflow">do</span>
 
440
00431     {        
 
441
00432         __m128i b = _mm_load_si128(block);
 
442
00433         ++block;
 
443
00434 
 
444
00435         <span class="comment">// b = (b &amp; 0x55555555) + (b &gt;&gt; 1 &amp; 0x55555555);</span>
 
445
00436         tmp1 = _mm_srli_epi32(b, 1);                    <span class="comment">// tmp1 = (b &gt;&gt; 1 &amp; 0x55555555)</span>
 
446
00437         tmp1 = _mm_and_si128(tmp1, m1); 
 
447
00438         tmp2 = _mm_and_si128(b, m1);                    <span class="comment">// tmp2 = (b &amp; 0x55555555)</span>
 
448
00439         b    = _mm_add_epi32(tmp1, tmp2);               <span class="comment">//  b = tmp1 + tmp2</span>
 
449
00440 
 
450
00441         <span class="comment">// b = (b &amp; 0x33333333) + (b &gt;&gt; 2 &amp; 0x33333333);</span>
 
451
00442         tmp1 = _mm_srli_epi32(b, 2);                    <span class="comment">// (b &gt;&gt; 2 &amp; 0x33333333)</span>
 
452
00443         tmp1 = _mm_and_si128(tmp1, m2); 
 
453
00444         tmp2 = _mm_and_si128(b, m2);                    <span class="comment">// (b &amp; 0x33333333)</span>
 
454
00445         b    = _mm_add_epi32(tmp1, tmp2);               <span class="comment">// b = tmp1 + tmp2</span>
 
455
00446 
 
456
00447         <span class="comment">// b = (b + (b &gt;&gt; 4)) &amp; 0x0F0F0F0F;</span>
 
457
00448         tmp1 = _mm_srli_epi32(b, 4);                    <span class="comment">// tmp1 = b &gt;&gt; 4</span>
 
458
00449         b = _mm_add_epi32(b, tmp1);                     <span class="comment">// b = b + (b &gt;&gt; 4)</span>
 
459
00450         b = _mm_and_si128(b, m3);                       <span class="comment">//           &amp; 0x0F0F0F0F</span>
 
460
00451 
 
461
00452         <span class="comment">// b = b + (b &gt;&gt; 8);</span>
 
462
00453         tmp1 = _mm_srli_epi32 (b, 8);                   <span class="comment">// tmp1 = b &gt;&gt; 8</span>
 
463
00454         b = _mm_add_epi32(b, tmp1);                     <span class="comment">// b = b + (b &gt;&gt; 8)</span>
 
464
00455 
 
465
00456         <span class="comment">// b = (b + (b &gt;&gt; 16)) &amp; 0x0000003F;</span>
 
466
00457         tmp1 = _mm_srli_epi32 (b, 16);                  <span class="comment">// b &gt;&gt; 16</span>
 
467
00458         b = _mm_add_epi32(b, tmp1);                     <span class="comment">// b + (b &gt;&gt; 16)</span>
 
468
00459         b = _mm_and_si128(b, m4);                       <span class="comment">// (b + (b &gt;&gt; 16)) &amp; 0x0000003F;</span>
 
469
00460 
 
470
00461         mcnt = _mm_add_epi32(mcnt, b);                  <span class="comment">// mcnt += b</span>
 
471
00462 
 
472
00463     } <span class="keywordflow">while</span> (block &lt; block_end);
 
473
00464 
 
474
00465     __declspec(align(16)) bm::id_t tcnt[4];
 
475
00466     _mm_store_si128((__m128i*)tcnt, mcnt);
 
476
00467 
 
477
00468     <span class="keywordflow">return</span> tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
 
478
00469 }
 
479
00470 
 
480
00471 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> 
 
481
<a name="l00472"></a><a class="code" href="a00129.html#a173">00472</a> __m128i <a class="code" href="a00129.html#a173">sse2_and</a>(__m128i a, __m128i b)
 
482
00473 {
 
483
00474     <span class="keywordflow">return</span> _mm_and_si128(a, b);
 
484
00475 }
 
485
00476 
 
486
00477 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> 
 
487
<a name="l00478"></a><a class="code" href="a00129.html#a174">00478</a> __m128i <a class="code" href="a00129.html#a174">sse2_or</a>(__m128i a, __m128i b)
 
488
00479 {
 
489
00480     <span class="keywordflow">return</span> _mm_or_si128(a, b);
 
490
00481 }
 
491
00482 
 
492
00483 
 
493
00484 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> 
 
494
<a name="l00485"></a><a class="code" href="a00129.html#a175">00485</a> __m128i <a class="code" href="a00129.html#a175">sse2_xor</a>(__m128i a, __m128i b)
 
495
00486 {
 
496
00487     <span class="keywordflow">return</span> _mm_xor_si128(a, b);
 
497
00488 }
 
498
00489 
 
499
00490 <a class="code" href="a00114.html#a12">BMFORCEINLINE</a> 
 
500
<a name="l00491"></a><a class="code" href="a00129.html#a176">00491</a> __m128i <a class="code" href="a00129.html#a176">sse2_sub</a>(__m128i a, __m128i b)
 
501
00492 {
 
502
00493     <span class="keywordflow">return</span> _mm_andnot_si128(b, a);
 
503
00494 }
 
504
00495 
 
505
00496 
 
506
00497 <span class="keyword">template</span>&lt;<span class="keyword">class</span> Func&gt;
 
507
<a name="l00498"></a><a class="code" href="a00129.html#a177">00498</a> bm::id_t <a class="code" href="a00129.html#a177">sse2_bit_count_op</a>(<span class="keyword">const</span> __m128i* BMRESTRICT block, 
 
508
00499                            <span class="keyword">const</span> __m128i* BMRESTRICT block_end,
 
509
00500                            <span class="keyword">const</span> __m128i* BMRESTRICT mask_block,
 
510
00501                            Func sse2_func)
 
511
00502 {
 
512
00503     <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu1 = 0x55555555;
 
513
00504     <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu2 = 0x33333333;
 
514
00505     <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu3 = 0x0F0F0F0F;
 
515
00506     <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu4 = 0x0000003F;
 
516
00507 
 
517
00508     <span class="comment">// Loading masks</span>
 
518
00509     __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
 
519
00510     __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
 
520
00511     __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
 
521
00512     __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
 
522
00513     __m128i mcnt;
 
523
00514     mcnt = _mm_xor_si128(m1, m1); <span class="comment">// cnt = 0</span>
 
524
00515     <span class="keywordflow">do</span>
 
525
00516     {
 
526
00517         __m128i tmp1, tmp2;
 
527
00518         __m128i b = _mm_load_si128(block++);
 
528
00519 
 
529
00520         tmp1 = _mm_load_si128(mask_block++);
 
530
00521         
 
531
00522         b = sse2_func(b, tmp1);
 
532
00523                         
 
533
00524         <span class="comment">// b = (b &amp; 0x55555555) + (b &gt;&gt; 1 &amp; 0x55555555);</span>
 
534
00525         tmp1 = _mm_srli_epi32(b, 1);                    <span class="comment">// tmp1 = (b &gt;&gt; 1 &amp; 0x55555555)</span>
 
535
00526         tmp1 = _mm_and_si128(tmp1, m1); 
 
536
00527         tmp2 = _mm_and_si128(b, m1);                    <span class="comment">// tmp2 = (b &amp; 0x55555555)</span>
 
537
00528         b    = _mm_add_epi32(tmp1, tmp2);               <span class="comment">//  b = tmp1 + tmp2</span>
 
538
00529 
 
539
00530         <span class="comment">// b = (b &amp; 0x33333333) + (b &gt;&gt; 2 &amp; 0x33333333);</span>
 
540
00531         tmp1 = _mm_srli_epi32(b, 2);                    <span class="comment">// (b &gt;&gt; 2 &amp; 0x33333333)</span>
 
541
00532         tmp1 = _mm_and_si128(tmp1, m2); 
 
542
00533         tmp2 = _mm_and_si128(b, m2);                    <span class="comment">// (b &amp; 0x33333333)</span>
 
543
00534         b    = _mm_add_epi32(tmp1, tmp2);               <span class="comment">// b = tmp1 + tmp2</span>
 
544
00535 
 
545
00536         <span class="comment">// b = (b + (b &gt;&gt; 4)) &amp; 0x0F0F0F0F;</span>
 
546
00537         tmp1 = _mm_srli_epi32(b, 4);                    <span class="comment">// tmp1 = b &gt;&gt; 4</span>
 
547
00538         b = _mm_add_epi32(b, tmp1);                     <span class="comment">// b = b + (b &gt;&gt; 4)</span>
 
548
00539         b = _mm_and_si128(b, m3);                       <span class="comment">//           &amp; 0x0F0F0F0F</span>
 
549
00540 
 
550
00541         <span class="comment">// b = b + (b &gt;&gt; 8);</span>
 
551
00542         tmp1 = _mm_srli_epi32 (b, 8);                   <span class="comment">// tmp1 = b &gt;&gt; 8</span>
 
552
00543         b = _mm_add_epi32(b, tmp1);                     <span class="comment">// b = b + (b &gt;&gt; 8)</span>
 
553
00544         
 
554
00545         <span class="comment">// b = (b + (b &gt;&gt; 16)) &amp; 0x0000003F;</span>
 
555
00546         tmp1 = _mm_srli_epi32 (b, 16);                  <span class="comment">// b &gt;&gt; 16</span>
 
556
00547         b = _mm_add_epi32(b, tmp1);                     <span class="comment">// b + (b &gt;&gt; 16)</span>
 
557
00548         b = _mm_and_si128(b, m4);                       <span class="comment">// b = b &amp; 0x0000003F</span>
 
558
00549 
 
559
00550         mcnt = _mm_add_epi32(mcnt, b);                  <span class="comment">// mcnt += b</span>
 
560
00551 
 
561
00552     } <span class="keywordflow">while</span> (block &lt; block_end);
 
562
00553 
 
563
00554     __declspec(align(16)) bm::id_t tcnt[4];
 
564
00555     _mm_store_si128((__m128i*)tcnt, mcnt);
 
565
00556 
 
566
00557     <span class="keywordflow">return</span> tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
 
567
00558 }
 
568
00559 
 
569
00560 
 
570
00561 
 
571
00562 
 
572
<a name="l00563"></a><a class="code" href="a00116.html#a0">00563</a> <span class="preprocessor">#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\</span>
 
573
00564 <span class="preprocessor">    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)</span>
 
574
00565 <span class="preprocessor"></span>
 
575
<a name="l00566"></a><a class="code" href="a00116.html#a1">00566</a> <span class="preprocessor">#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\</span>
 
576
00567 <span class="preprocessor">    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)</span>
 
577
00568 <span class="preprocessor"></span>
 
578
<a name="l00569"></a><a class="code" href="a00116.html#a2">00569</a> <span class="preprocessor">#define VECT_BITCOUNT(first, last) \</span>
 
579
00570 <span class="preprocessor">    sse2_bit_count((__m128i*) (first), (__m128i*) (last)) </span>
 
580
00571 <span class="preprocessor"></span>
 
581
<a name="l00572"></a><a class="code" href="a00116.html#a3">00572</a> <span class="preprocessor">#define VECT_BITCOUNT_AND(first, last, mask) \</span>
 
582
00573 <span class="preprocessor">    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and) </span>
 
583
00574 <span class="preprocessor"></span>
 
584
<a name="l00575"></a><a class="code" href="a00116.html#a4">00575</a> <span class="preprocessor">#define VECT_BITCOUNT_OR(first, last, mask) \</span>
 
585
00576 <span class="preprocessor">    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or) </span>
 
586
00577 <span class="preprocessor"></span>
 
587
<a name="l00578"></a><a class="code" href="a00116.html#a5">00578</a> <span class="preprocessor">#define VECT_BITCOUNT_XOR(first, last, mask) \</span>
 
588
00579 <span class="preprocessor">    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor) </span>
 
589
00580 <span class="preprocessor"></span>
 
590
<a name="l00581"></a><a class="code" href="a00116.html#a6">00581</a> <span class="preprocessor">#define VECT_BITCOUNT_SUB(first, last, mask) \</span>
 
591
00582 <span class="preprocessor">    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub) </span>
 
592
00583 <span class="preprocessor"></span>
 
593
<a name="l00584"></a><a class="code" href="a00116.html#a7">00584</a> <span class="preprocessor">#define VECT_INVERT_ARR(first, last) \</span>
 
594
00585 <span class="preprocessor">    sse2_invert_arr(first, last);</span>
 
595
00586 <span class="preprocessor"></span>
 
596
<a name="l00587"></a><a class="code" href="a00116.html#a8">00587</a> <span class="preprocessor">#define VECT_AND_ARR(dst, src, src_end) \</span>
 
597
00588 <span class="preprocessor">    sse2_and_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
 
598
00589 <span class="preprocessor"></span>
 
599
<a name="l00590"></a><a class="code" href="a00116.html#a9">00590</a> <span class="preprocessor">#define VECT_OR_ARR(dst, src, src_end) \</span>
 
600
00591 <span class="preprocessor">    sse2_or_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
 
601
00592 <span class="preprocessor"></span>
 
602
<a name="l00593"></a><a class="code" href="a00116.html#a10">00593</a> <span class="preprocessor">#define VECT_SUB_ARR(dst, src, src_end) \</span>
 
603
00594 <span class="preprocessor">    sse2_sub_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
 
604
00595 <span class="preprocessor"></span>
 
605
<a name="l00596"></a><a class="code" href="a00116.html#a11">00596</a> <span class="preprocessor">#define VECT_XOR_ARR(dst, src, src_end) \</span>
 
606
00597 <span class="preprocessor">    sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
 
607
00598 <span class="preprocessor"></span>
 
608
<a name="l00599"></a><a class="code" href="a00116.html#a12">00599</a> <span class="preprocessor">#define VECT_COPY_BLOCK(dst, src, src_end) \</span>
 
609
00600 <span class="preprocessor">    sse2_copy_block((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
 
610
00601 <span class="preprocessor"></span>
 
611
<a name="l00602"></a><a class="code" href="a00116.html#a13">00602</a> <span class="preprocessor">#define VECT_SET_BLOCK(dst, dst_end, value) \</span>
 
612
00603 <span class="preprocessor">    sse2_set_block((__m128i*) dst, (__m128i*) (dst_end), (value))</span>
 
613
00604 <span class="preprocessor"></span>
 
614
00605 } <span class="comment">// namespace</span>
 
615
00606 
 
616
00607 <span class="preprocessor">#endif</span>
 
617
</pre></div><hr size="1"><address style="text-align: right;"><small>Generated on Sun Aug 5 14:12:26 2007 for BitMagic by&nbsp;
 
618
<a href="http://www.doxygen.org/index.html">
 
619
<img src="doxygen.png" alt="doxygen" align="middle" border="0"></a> 1.4.1 </small></address>
 
620
</body>
 
621
</html>