1
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
<html><head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
3
<title>BitMagic: bmsse2.h Source File</title>
4
<link href="tabs.css" rel="stylesheet" type="text/css">
5
<link href="doxygen.css" rel="stylesheet" type="text/css">
7
<!-- Generated by Doxygen 1.5.9 -->
8
<div class="navigation" id="top">
11
<li><a href="index.html"><span>Main Page</span></a></li>
12
<li><a href="modules.html"><span>Modules</span></a></li>
13
<li><a href="namespaces.html"><span>Namespaces</span></a></li>
14
<li><a href="annotated.html"><span>Data Structures</span></a></li>
15
<li class="current"><a href="files.html"><span>Files</span></a></li>
16
<li><a href="examples.html"><span>Examples</span></a></li>
21
<li><a href="files.html"><span>File List</span></a></li>
22
<li><a href="globals.html"><span>Globals</span></a></li>
25
<h1>bmsse2.h</h1><a href="a00065.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="comment">/*</span>
26
<a name="l00002"></a>00002 <span class="comment">Copyright(c) 2002-2005 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)</span>
27
<a name="l00003"></a>00003 <span class="comment"></span>
28
<a name="l00004"></a>00004 <span class="comment">Permission is hereby granted, free of charge, to any person </span>
29
<a name="l00005"></a>00005 <span class="comment">obtaining a copy of this software and associated documentation </span>
30
<a name="l00006"></a>00006 <span class="comment">files (the "Software"), to deal in the Software without restriction, </span>
31
<a name="l00007"></a>00007 <span class="comment">including without limitation the rights to use, copy, modify, merge, </span>
32
<a name="l00008"></a>00008 <span class="comment">publish, distribute, sublicense, and/or sell copies of the Software, </span>
33
<a name="l00009"></a>00009 <span class="comment">and to permit persons to whom the Software is furnished to do so, </span>
34
<a name="l00010"></a>00010 <span class="comment">subject to the following conditions:</span>
35
<a name="l00011"></a>00011 <span class="comment"></span>
36
<a name="l00012"></a>00012 <span class="comment">The above copyright notice and this permission notice shall be included </span>
37
<a name="l00013"></a>00013 <span class="comment">in all copies or substantial portions of the Software.</span>
38
<a name="l00014"></a>00014 <span class="comment"></span>
39
<a name="l00015"></a>00015 <span class="comment">THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, </span>
40
<a name="l00016"></a>00016 <span class="comment">EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES </span>
41
<a name="l00017"></a>00017 <span class="comment">OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. </span>
42
<a name="l00018"></a>00018 <span class="comment">IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, </span>
43
<a name="l00019"></a>00019 <span class="comment">DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, </span>
44
<a name="l00020"></a>00020 <span class="comment">ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR </span>
45
<a name="l00021"></a>00021 <span class="comment">OTHER DEALINGS IN THE SOFTWARE.</span>
46
<a name="l00022"></a>00022 <span class="comment"></span>
47
<a name="l00023"></a>00023 <span class="comment">For more information please visit: http://bmagic.sourceforge.net</span>
48
<a name="l00024"></a>00024 <span class="comment"></span>
49
<a name="l00025"></a>00025 <span class="comment">*/</span>
50
<a name="l00026"></a>00026
51
<a name="l00027"></a>00027
52
<a name="l00028"></a>00028 <span class="preprocessor">#ifndef BMSSE2__H__INCLUDED__</span>
53
<a name="l00029"></a>00029 <span class="preprocessor"></span><span class="preprocessor">#define BMSSE2__H__INCLUDED__</span>
54
<a name="l00030"></a>00030 <span class="preprocessor"></span>
55
<a name="l00031"></a>00031
56
<a name="l00032"></a>00032 <span class="comment">// Header implements processor specific intrinsics declarations for SSE2</span>
57
<a name="l00033"></a>00033 <span class="comment">// instruction set</span>
58
<a name="l00034"></a>00034 <span class="preprocessor">#include&lt;emmintrin.h&gt;</span>
59
<a name="l00035"></a>00035
60
<a name="l00036"></a>00036
61
<a name="l00037"></a>00037
62
<a name="l00038"></a>00038 <span class="keyword">namespace </span>bm
63
<a name="l00039"></a>00039 {
64
<a name="l00040"></a>00040 <span class="comment"></span>
65
<a name="l00041"></a>00041 <span class="comment">/** @defgroup SSE2 Processor specific optimizations for SSE2 instructions</span>
66
<a name="l00042"></a>00042 <span class="comment"> * @ingroup bmagic</span>
67
<a name="l00043"></a>00043 <span class="comment"> */</span>
68
<a name="l00044"></a>00044
69
<a name="l00045"></a>00045 <span class="comment"></span>
70
<a name="l00046"></a>00046 <span class="comment">/*! </span>
71
<a name="l00047"></a>00047 <span class="comment"> @brief SSE2 reinitialization guard class</span>
72
<a name="l00048"></a>00048 <span class="comment"></span>
73
<a name="l00049"></a>00049 <span class="comment"> SSE2 code must call _mm_empty() when intermixing</span>
74
<a name="l00050"></a>00050 <span class="comment"> MMX integer commands with floating-point arithmetic.</span>
75
<a name="l00051"></a>00051 <span class="comment"> This class guards critical code fragments where SSE2 integer</span>
76
<a name="l00052"></a>00052 <span class="comment"> is used.</span>
77
<a name="l00053"></a>00053 <span class="comment"></span>
78
<a name="l00054"></a>00054 <span class="comment"> @ingroup SSE2</span>
79
<a name="l00055"></a>00055 <span class="comment"></span>
80
<a name="l00056"></a>00056 <span class="comment">*/</span>
81
<a name="l00057"></a><a class="code" href="a00058.html">00057</a> <span class="keyword">class </span><a class="code" href="a00058.html" title="SSE2 reinitialization guard class.">sse2_empty_guard</a>
82
<a name="l00058"></a>00058 {
83
<a name="l00059"></a>00059 <span class="keyword">public</span>:
84
<a name="l00060"></a><a class="code" href="a00058.html#19cf6cdfabf3f99e8bf8c82dda321c2b">00060</a> <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a> <a class="code" href="a00058.html#19cf6cdfabf3f99e8bf8c82dda321c2b">sse2_empty_guard</a>()
85
<a name="l00061"></a>00061 {
86
<a name="l00062"></a>00062 _mm_empty();
87
<a name="l00063"></a>00063 }
88
<a name="l00064"></a>00064
89
<a name="l00065"></a><a class="code" href="a00058.html#aaf577cfb468058b167b7eb9358f1a02">00065</a> <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a> <a class="code" href="a00058.html#aaf577cfb468058b167b7eb9358f1a02">~sse2_empty_guard</a>()
90
<a name="l00066"></a>00066 {
91
<a name="l00067"></a>00067 _mm_empty();
92
<a name="l00068"></a>00068 }
93
<a name="l00069"></a>00069 };
94
<a name="l00070"></a>00070
95
<a name="l00071"></a>00071 <span class="comment">/*</span>
96
<a name="l00072"></a>00072 <span class="comment"># ifndef BM_SET_MMX_GUARD</span>
97
<a name="l00073"></a>00073 <span class="comment"># define BM_SET_MMX_GUARD sse2_empty_guard bm_mmx_guard_;</span>
98
<a name="l00074"></a>00074 <span class="comment"># endif</span>
99
<a name="l00075"></a>00075 <span class="comment">*/</span>
100
<a name="l00076"></a>00076 <span class="comment"></span>
101
<a name="l00077"></a>00077 <span class="comment">/*! </span>
102
<a name="l00078"></a>00078 <span class="comment"> @brief XOR array elements to specified mask</span>
103
<a name="l00079"></a>00079 <span class="comment"> *dst = *src ^ mask</span>
104
<a name="l00080"></a>00080 <span class="comment"></span>
105
<a name="l00081"></a>00081 <span class="comment"> @ingroup SSE2</span>
106
<a name="l00082"></a>00082 <span class="comment">*/</span>
107
<a name="l00083"></a>00083 <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
108
<a name="l00084"></a><a class="code" href="a00085.html#g75c6ddeb0d8a279caa92341878309b50">00084</a> <span class="keywordtype">void</span> <a class="code" href="a00085.html#g75c6ddeb0d8a279caa92341878309b50" title="XOR array elements to specified mask dst = *src ^ mask.">sse2_xor_arr_2_mask</a>(__m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> dst,
109
<a name="l00085"></a>00085 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src,
110
<a name="l00086"></a>00086 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src_end,
111
<a name="l00087"></a>00087 <a class="code" href="a00078.html#17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a> mask)
112
<a name="l00088"></a>00088 {
113
<a name="l00089"></a>00089 __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
114
<a name="l00090"></a>00090 <span class="keywordflow">do</span>
115
<a name="l00091"></a>00091 {
116
<a name="l00092"></a>00092 __m128i xmm1 = _mm_load_si128(src);
117
<a name="l00093"></a>00093
118
<a name="l00094"></a>00094 xmm1 = _mm_xor_si128(xmm1, xmm2);
119
<a name="l00095"></a>00095 _mm_store_si128(dst, xmm1);
120
<a name="l00096"></a>00096 ++dst;
121
<a name="l00097"></a>00097 ++src;
122
<a name="l00098"></a>00098
123
<a name="l00099"></a>00099 } <span class="keywordflow">while</span> (src < src_end);
124
<a name="l00100"></a>00100 }
125
<a name="l00101"></a>00101 <span class="comment"></span>
126
<a name="l00102"></a>00102 <span class="comment">/*! </span>
127
<a name="l00103"></a>00103 <span class="comment"> @brief Inverts array elements and ANDs them with the specified mask</span>
128
<a name="l00104"></a>00104 <span class="comment"> *dst = ~*src & mask</span>
129
<a name="l00105"></a>00105 <span class="comment"></span>
130
<a name="l00106"></a>00106 <span class="comment"> @ingroup SSE2</span>
131
<a name="l00107"></a>00107 <span class="comment">*/</span>
132
<a name="l00108"></a>00108 <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
133
<a name="l00109"></a><a class="code" href="a00085.html#gb7b21f448684c4d84927792661e67ed5">00109</a> <span class="keywordtype">void</span> <a class="code" href="a00085.html#gb7b21f448684c4d84927792661e67ed5" title="Inverts array elements and ANDs them with the specified mask dst = ~*src &amp; mask.">sse2_andnot_arr_2_mask</a>(__m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> dst,
134
<a name="l00110"></a>00110 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src,
135
<a name="l00111"></a>00111 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src_end,
136
<a name="l00112"></a>00112 <a class="code" href="a00078.html#17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a> mask)
137
<a name="l00113"></a>00113 {
138
<a name="l00114"></a>00114 __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
139
<a name="l00115"></a>00115 <span class="keywordflow">do</span>
140
<a name="l00116"></a>00116 {
141
<a name="l00117"></a>00117 <span class="comment">//_mm_prefetch((const char*)(src)+1024, _MM_HINT_NTA);</span>
142
<a name="l00118"></a>00118 <span class="comment">//_mm_prefetch((const char*)(src)+1088, _MM_HINT_NTA);</span>
143
<a name="l00119"></a>00119
144
<a name="l00120"></a>00120 __m128i xmm1 = _mm_load_si128(src);
145
<a name="l00121"></a>00121
146
<a name="l00122"></a>00122 xmm1 = _mm_andnot_si128(xmm1, xmm2); <span class="comment">// xmm1 = (~xmm1) & xmm2 </span>
147
<a name="l00123"></a>00123 _mm_store_si128(dst, xmm1);
148
<a name="l00124"></a>00124 ++dst;
149
<a name="l00125"></a>00125 ++src;
150
<a name="l00126"></a>00126
151
<a name="l00127"></a>00127 } <span class="keywordflow">while</span> (src < src_end);
152
<a name="l00128"></a>00128 }
153
<a name="l00129"></a>00129 <span class="comment"></span>
154
<a name="l00130"></a>00130 <span class="comment">/*! </span>
155
<a name="l00131"></a>00131 <span class="comment"> @brief AND array elements against another array</span>
156
<a name="l00132"></a>00132 <span class="comment"> *dst &= *src</span>
157
<a name="l00133"></a>00133 <span class="comment"></span>
158
<a name="l00134"></a>00134 <span class="comment"> @ingroup SSE2</span>
159
<a name="l00135"></a>00135 <span class="comment">*/</span>
160
<a name="l00136"></a>00136 <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
161
<a name="l00137"></a><a class="code" href="a00085.html#g795b544f311409a55da4ee61a3cd939a">00137</a> <span class="keywordtype">void</span> <a class="code" href="a00085.html#g795b544f311409a55da4ee61a3cd939a" title="AND array elements against another array dst &amp;= *src.">sse2_and_arr</a>(__m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> dst,
162
<a name="l00138"></a>00138 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src,
163
<a name="l00139"></a>00139 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src_end)
164
<a name="l00140"></a>00140 {
165
<a name="l00141"></a>00141 __m128i xmm1, xmm2;
166
<a name="l00142"></a>00142 <span class="keywordflow">do</span>
167
<a name="l00143"></a>00143 {
168
<a name="l00144"></a>00144 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
169
<a name="l00145"></a>00145
170
<a name="l00146"></a>00146 xmm1 = _mm_load_si128(src++);
171
<a name="l00147"></a>00147 xmm2 = _mm_load_si128(dst);
172
<a name="l00148"></a>00148 xmm1 = _mm_and_si128(xmm1, xmm2);
173
<a name="l00149"></a>00149 _mm_store_si128(dst++, xmm1);
174
<a name="l00150"></a>00150
175
<a name="l00151"></a>00151 xmm1 = _mm_load_si128(src++);
176
<a name="l00152"></a>00152 xmm2 = _mm_load_si128(dst);
177
<a name="l00153"></a>00153 xmm1 = _mm_and_si128(xmm1, xmm2);
178
<a name="l00154"></a>00154 _mm_store_si128(dst++, xmm1);
179
<a name="l00155"></a>00155
180
<a name="l00156"></a>00156 xmm1 = _mm_load_si128(src++);
181
<a name="l00157"></a>00157 xmm2 = _mm_load_si128(dst);
182
<a name="l00158"></a>00158 xmm1 = _mm_and_si128(xmm1, xmm2);
183
<a name="l00159"></a>00159 _mm_store_si128(dst++, xmm1);
184
<a name="l00160"></a>00160
185
<a name="l00161"></a>00161 xmm1 = _mm_load_si128(src++);
186
<a name="l00162"></a>00162 xmm2 = _mm_load_si128(dst);
187
<a name="l00163"></a>00163 xmm1 = _mm_and_si128(xmm1, xmm2);
188
<a name="l00164"></a>00164 _mm_store_si128(dst++, xmm1);
189
<a name="l00165"></a>00165
190
<a name="l00166"></a>00166 } <span class="keywordflow">while</span> (src < src_end);
191
<a name="l00167"></a>00167
192
<a name="l00168"></a>00168 }
193
<a name="l00169"></a>00169
194
<a name="l00170"></a>00170
195
<a name="l00171"></a>00171 <span class="comment"></span>
196
<a name="l00172"></a>00172 <span class="comment">/*! </span>
197
<a name="l00173"></a>00173 <span class="comment"> @brief OR array elements against another array</span>
198
<a name="l00174"></a>00174 <span class="comment"> *dst |= *src</span>
199
<a name="l00175"></a>00175 <span class="comment"></span>
200
<a name="l00176"></a>00176 <span class="comment"> @ingroup SSE2</span>
201
<a name="l00177"></a>00177 <span class="comment">*/</span>
202
<a name="l00178"></a>00178 <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
203
<a name="l00179"></a><a class="code" href="a00085.html#g3a7d61e4e8ad8791ab38fd1c3436aa67">00179</a> <span class="keywordtype">void</span> <a class="code" href="a00085.html#g3a7d61e4e8ad8791ab38fd1c3436aa67" title="OR array elements against another array dst |= *src.">sse2_or_arr</a>(__m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> dst,
204
<a name="l00180"></a>00180 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src,
205
<a name="l00181"></a>00181 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src_end)
206
<a name="l00182"></a>00182 {
207
<a name="l00183"></a>00183 __m128i xmm1, xmm2;
208
<a name="l00184"></a>00184 <span class="keywordflow">do</span>
209
<a name="l00185"></a>00185 {
210
<a name="l00186"></a>00186 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
211
<a name="l00187"></a>00187
212
<a name="l00188"></a>00188 xmm1 = _mm_load_si128(src++);
213
<a name="l00189"></a>00189 xmm2 = _mm_load_si128(dst);
214
<a name="l00190"></a>00190 xmm1 = _mm_or_si128(xmm1, xmm2);
215
<a name="l00191"></a>00191 _mm_store_si128(dst++, xmm1);
216
<a name="l00192"></a>00192
217
<a name="l00193"></a>00193 xmm1 = _mm_load_si128(src++);
218
<a name="l00194"></a>00194 xmm2 = _mm_load_si128(dst);
219
<a name="l00195"></a>00195 xmm1 = _mm_or_si128(xmm1, xmm2);
220
<a name="l00196"></a>00196 _mm_store_si128(dst++, xmm1);
221
<a name="l00197"></a>00197
222
<a name="l00198"></a>00198 xmm1 = _mm_load_si128(src++);
223
<a name="l00199"></a>00199 xmm2 = _mm_load_si128(dst);
224
<a name="l00200"></a>00200 xmm1 = _mm_or_si128(xmm1, xmm2);
225
<a name="l00201"></a>00201 _mm_store_si128(dst++, xmm1);
226
<a name="l00202"></a>00202
227
<a name="l00203"></a>00203 xmm1 = _mm_load_si128(src++);
228
<a name="l00204"></a>00204 xmm2 = _mm_load_si128(dst);
229
<a name="l00205"></a>00205 xmm1 = _mm_or_si128(xmm1, xmm2);
230
<a name="l00206"></a>00206 _mm_store_si128(dst++, xmm1);
231
<a name="l00207"></a>00207
232
<a name="l00208"></a>00208 } <span class="keywordflow">while</span> (src < src_end);
233
<a name="l00209"></a>00209 }
234
<a name="l00210"></a>00210 <span class="comment"></span>
235
<a name="l00211"></a>00211 <span class="comment">/*! </span>
236
<a name="l00212"></a>00212 <span class="comment"> @brief XOR array elements against another array</span>
237
<a name="l00213"></a>00213 <span class="comment"> *dst ^= *src</span>
238
<a name="l00214"></a>00214 <span class="comment"></span>
239
<a name="l00215"></a>00215 <span class="comment"> @ingroup SSE2</span>
240
<a name="l00216"></a>00216 <span class="comment">*/</span>
241
<a name="l00217"></a>00217 <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
242
<a name="l00218"></a><a class="code" href="a00085.html#gf1a5ad26557cc4d71d7421c35a8445fe">00218</a> <span class="keywordtype">void</span> <a class="code" href="a00085.html#gf1a5ad26557cc4d71d7421c35a8445fe" title="XOR array elements against another array dst ^= *src.">sse2_xor_arr</a>(__m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> dst,
243
<a name="l00219"></a>00219 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src,
244
<a name="l00220"></a>00220 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src_end)
245
<a name="l00221"></a>00221 {
246
<a name="l00222"></a>00222 __m128i xmm1, xmm2;
247
<a name="l00223"></a>00223 <span class="keywordflow">do</span>
248
<a name="l00224"></a>00224 {
249
<a name="l00225"></a>00225 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
250
<a name="l00226"></a>00226
251
<a name="l00227"></a>00227 xmm1 = _mm_load_si128(src++);
252
<a name="l00228"></a>00228 xmm2 = _mm_load_si128(dst);
253
<a name="l00229"></a>00229 xmm1 = _mm_xor_si128(xmm1, xmm2);
254
<a name="l00230"></a>00230 _mm_store_si128(dst++, xmm1);
255
<a name="l00231"></a>00231
256
<a name="l00232"></a>00232 xmm1 = _mm_load_si128(src++);
257
<a name="l00233"></a>00233 xmm2 = _mm_load_si128(dst);
258
<a name="l00234"></a>00234 xmm1 = _mm_xor_si128(xmm1, xmm2);
259
<a name="l00235"></a>00235 _mm_store_si128(dst++, xmm1);
260
<a name="l00236"></a>00236
261
<a name="l00237"></a>00237 xmm1 = _mm_load_si128(src++);
262
<a name="l00238"></a>00238 xmm2 = _mm_load_si128(dst);
263
<a name="l00239"></a>00239 xmm1 = _mm_xor_si128(xmm1, xmm2);
264
<a name="l00240"></a>00240 _mm_store_si128(dst++, xmm1);
265
<a name="l00241"></a>00241
266
<a name="l00242"></a>00242 xmm1 = _mm_load_si128(src++);
267
<a name="l00243"></a>00243 xmm2 = _mm_load_si128(dst);
268
<a name="l00244"></a>00244 xmm1 = _mm_xor_si128(xmm1, xmm2);
269
<a name="l00245"></a>00245 _mm_store_si128(dst++, xmm1);
270
<a name="l00246"></a>00246
271
<a name="l00247"></a>00247 } <span class="keywordflow">while</span> (src < src_end);
272
<a name="l00248"></a>00248 }
273
<a name="l00249"></a>00249
274
<a name="l00250"></a>00250 <span class="comment"></span>
275
<a name="l00251"></a>00251 <span class="comment">/*! </span>
276
<a name="l00252"></a>00252 <span class="comment"> @brief AND-NOT (SUB) array elements against another array</span>
277
<a name="l00253"></a>00253 <span class="comment"> *dst &= ~*src</span>
278
<a name="l00254"></a>00254 <span class="comment"></span>
279
<a name="l00255"></a>00255 <span class="comment"> @ingroup SSE2</span>
280
<a name="l00256"></a>00256 <span class="comment">*/</span>
281
<a name="l00257"></a>00257 <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
282
<a name="l00258"></a><a class="code" href="a00085.html#gc99f3b138f8a5e8ffb1296b129f618f0">00258</a> <span class="keywordtype">void</span> <a class="code" href="a00085.html#gc99f3b138f8a5e8ffb1296b129f618f0" title="AND-NOT (SUB) array elements against another array dst &amp;= ~*src.">sse2_sub_arr</a>(__m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> dst,
283
<a name="l00259"></a>00259 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src,
284
<a name="l00260"></a>00260 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src_end)
285
<a name="l00261"></a>00261 {
286
<a name="l00262"></a>00262 __m128i xmm1, xmm2;
287
<a name="l00263"></a>00263 <span class="keywordflow">do</span>
288
<a name="l00264"></a>00264 {
289
<a name="l00265"></a>00265 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
290
<a name="l00266"></a>00266
291
<a name="l00267"></a>00267 xmm1 = _mm_load_si128(src++);
292
<a name="l00268"></a>00268 xmm2 = _mm_load_si128(dst);
293
<a name="l00269"></a>00269 xmm1 = _mm_andnot_si128(xmm1, xmm2);
294
<a name="l00270"></a>00270 _mm_store_si128(dst++, xmm1);
295
<a name="l00271"></a>00271
296
<a name="l00272"></a>00272 xmm1 = _mm_load_si128(src++);
297
<a name="l00273"></a>00273 xmm2 = _mm_load_si128(dst);
298
<a name="l00274"></a>00274 xmm1 = _mm_andnot_si128(xmm1, xmm2);
299
<a name="l00275"></a>00275 _mm_store_si128(dst++, xmm1);
300
<a name="l00276"></a>00276
301
<a name="l00277"></a>00277 xmm1 = _mm_load_si128(src++);
302
<a name="l00278"></a>00278 xmm2 = _mm_load_si128(dst);
303
<a name="l00279"></a>00279 xmm1 = _mm_andnot_si128(xmm1, xmm2);
304
<a name="l00280"></a>00280 _mm_store_si128(dst++, xmm1);
305
<a name="l00281"></a>00281
306
<a name="l00282"></a>00282 xmm1 = _mm_load_si128(src++);
307
<a name="l00283"></a>00283 xmm2 = _mm_load_si128(dst);
308
<a name="l00284"></a>00284 xmm1 = _mm_andnot_si128(xmm1, xmm2);
309
<a name="l00285"></a>00285 _mm_store_si128(dst++, xmm1);
310
<a name="l00286"></a>00286
311
<a name="l00287"></a>00287 } <span class="keywordflow">while</span> (src < src_end);
312
<a name="l00288"></a>00288 }
313
<a name="l00289"></a>00289 <span class="comment"></span>
314
<a name="l00290"></a>00290 <span class="comment">/*! </span>
315
<a name="l00291"></a>00291 <span class="comment"> @brief SSE2 block memset</span>
316
<a name="l00292"></a>00292 <span class="comment"> *dst = value</span>
317
<a name="l00293"></a>00293 <span class="comment"></span>
318
<a name="l00294"></a>00294 <span class="comment"> @ingroup SSE2</span>
319
<a name="l00295"></a>00295 <span class="comment">*/</span>
320
<a name="l00296"></a>00296
321
<a name="l00297"></a>00297 <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
322
<a name="l00298"></a><a class="code" href="a00085.html#g302f4fcd0abf355957b305d16d04f452">00298</a> <span class="keywordtype">void</span> <a class="code" href="a00085.html#g302f4fcd0abf355957b305d16d04f452" title="SSE2 block memset dst = value.">sse2_set_block</a>(__m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> dst,
323
<a name="l00299"></a>00299 __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> dst_end,
324
<a name="l00300"></a>00300 <a class="code" href="a00078.html#17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a> value)
325
<a name="l00301"></a>00301 {
326
<a name="l00302"></a>00302 __m128i xmm0 = _mm_set_epi32 (value, value, value, value);
327
<a name="l00303"></a>00303 <span class="keywordflow">do</span>
328
<a name="l00304"></a>00304 {
329
<a name="l00305"></a>00305 _mm_store_si128(dst, xmm0);
330
<a name="l00306"></a>00306 <span class="comment">/* </span>
331
<a name="l00307"></a>00307 <span class="comment"> _mm_store_si128(dst+1, xmm0);</span>
332
<a name="l00308"></a>00308 <span class="comment"> _mm_store_si128(dst+2, xmm0);</span>
333
<a name="l00309"></a>00309 <span class="comment"> _mm_store_si128(dst+3, xmm0);</span>
334
<a name="l00310"></a>00310 <span class="comment"></span>
335
<a name="l00311"></a>00311 <span class="comment"> _mm_store_si128(dst+4, xmm0);</span>
336
<a name="l00312"></a>00312 <span class="comment"> _mm_store_si128(dst+5, xmm0);</span>
337
<a name="l00313"></a>00313 <span class="comment"> _mm_store_si128(dst+6, xmm0);</span>
338
<a name="l00314"></a>00314 <span class="comment"> _mm_store_si128(dst+7, xmm0);</span>
339
<a name="l00315"></a>00315 <span class="comment"></span>
340
<a name="l00316"></a>00316 <span class="comment"> dst += 8;</span>
341
<a name="l00317"></a>00317 <span class="comment">*/</span>
342
<a name="l00318"></a>00318 } <span class="keywordflow">while</span> (++dst < dst_end);
343
<a name="l00319"></a>00319
344
<a name="l00320"></a>00320 _mm_sfence();
345
<a name="l00321"></a>00321 }
346
<a name="l00322"></a>00322 <span class="comment"></span>
347
<a name="l00323"></a>00323 <span class="comment">/*! </span>
348
<a name="l00324"></a>00324 <span class="comment"> @brief SSE2 block copy</span>
349
<a name="l00325"></a>00325 <span class="comment"> *dst = *src</span>
350
<a name="l00326"></a>00326 <span class="comment"></span>
351
<a name="l00327"></a>00327 <span class="comment"> @ingroup SSE2</span>
352
<a name="l00328"></a>00328 <span class="comment">*/</span>
353
<a name="l00329"></a>00329 <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
354
<a name="l00330"></a><a class="code" href="a00085.html#g571dd54af5c555cad9dfa6bef4561777">00330</a> <span class="keywordtype">void</span> <a class="code" href="a00085.html#g571dd54af5c555cad9dfa6bef4561777" title="SSE2 block copy dst = *src.">sse2_copy_block</a>(__m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> dst,
355
<a name="l00331"></a>00331 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src,
356
<a name="l00332"></a>00332 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> src_end)
357
<a name="l00333"></a>00333 {
358
<a name="l00334"></a>00334 __m128i xmm0, xmm1, xmm2, xmm3;
359
<a name="l00335"></a>00335 <span class="keywordflow">do</span>
360
<a name="l00336"></a>00336 {
361
<a name="l00337"></a>00337 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
362
<a name="l00338"></a>00338
363
<a name="l00339"></a>00339 xmm0 = _mm_load_si128(src+0);
364
<a name="l00340"></a>00340 xmm1 = _mm_load_si128(src+1);
365
<a name="l00341"></a>00341 xmm2 = _mm_load_si128(src+2);
366
<a name="l00342"></a>00342 xmm3 = _mm_load_si128(src+3);
367
<a name="l00343"></a>00343
368
<a name="l00344"></a>00344 _mm_store_si128(dst+0, xmm0);
369
<a name="l00345"></a>00345 _mm_store_si128(dst+1, xmm1);
370
<a name="l00346"></a>00346 _mm_store_si128(dst+2, xmm2);
371
<a name="l00347"></a>00347 _mm_store_si128(dst+3, xmm3);
372
<a name="l00348"></a>00348
373
<a name="l00349"></a>00349 xmm0 = _mm_load_si128(src+4);
374
<a name="l00350"></a>00350 xmm1 = _mm_load_si128(src+5);
375
<a name="l00351"></a>00351 xmm2 = _mm_load_si128(src+6);
376
<a name="l00352"></a>00352 xmm3 = _mm_load_si128(src+7);
377
<a name="l00353"></a>00353
378
<a name="l00354"></a>00354 _mm_store_si128(dst+4, xmm0);
379
<a name="l00355"></a>00355 _mm_store_si128(dst+5, xmm1);
380
<a name="l00356"></a>00356 _mm_store_si128(dst+6, xmm2);
381
<a name="l00357"></a>00357 _mm_store_si128(dst+7, xmm3);
382
<a name="l00358"></a>00358
383
<a name="l00359"></a>00359 src += 8;
384
<a name="l00360"></a>00360 dst += 8;
385
<a name="l00361"></a>00361
386
<a name="l00362"></a>00362 } <span class="keywordflow">while</span> (src < src_end);
387
<a name="l00363"></a>00363 }
388
<a name="l00364"></a>00364
389
<a name="l00365"></a>00365 <span class="comment"></span>
390
<a name="l00366"></a>00366 <span class="comment">/*! </span>
391
<a name="l00367"></a>00367 <span class="comment"> @brief Invert array elements</span>
392
<a name="l00368"></a>00368 <span class="comment"> *dst = ~*dst</span>
393
<a name="l00369"></a>00369 <span class="comment"> or</span>
394
<a name="l00370"></a>00370 <span class="comment"> *dst ^= 0xFFFFFFFF </span>
395
<a name="l00371"></a>00371 <span class="comment"></span>
396
<a name="l00372"></a>00372 <span class="comment"> @ingroup SSE2</span>
397
<a name="l00373"></a>00373 <span class="comment">*/</span>
398
<a name="l00374"></a>00374 <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
399
<a name="l00375"></a><a class="code" href="a00085.html#g8d506147673d88005f92caee7f5dd23a">00375</a> <span class="keywordtype">void</span> <a class="code" href="a00085.html#g8d506147673d88005f92caee7f5dd23a" title="Invert array elements dst = ~*dst or dst ^= 0xFFFFFFFF.">sse2_invert_arr</a>(<a class="code" href="a00078.html#17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a>* first, <a class="code" href="a00078.html#17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a>* last)
400
<a name="l00376"></a>00376 {
401
<a name="l00377"></a>00377 __m128i xmm1 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF,
402
<a name="l00378"></a>00378 0xFFFFFFFF, 0xFFFFFFFF);
403
<a name="l00379"></a>00379 __m128i* wrd_ptr = (__m128i*)first;
404
<a name="l00380"></a>00380
405
<a name="l00381"></a>00381 <span class="keywordflow">do</span>
406
<a name="l00382"></a>00382 {
407
<a name="l00383"></a>00383 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(wrd_ptr)+512, _MM_HINT_NTA);
408
<a name="l00384"></a>00384
409
<a name="l00385"></a>00385 __m128i xmm0 = _mm_load_si128(wrd_ptr);
410
<a name="l00386"></a>00386 xmm0 = _mm_xor_si128(xmm0, xmm1);
411
<a name="l00387"></a>00387 _mm_store_si128(wrd_ptr, xmm0);
412
<a name="l00388"></a>00388 ++wrd_ptr;
413
<a name="l00389"></a>00389 } <span class="keywordflow">while</span> (wrd_ptr < (__m128i*)last);
414
<a name="l00390"></a>00390 }
415
<a name="l00391"></a>00391
416
<a name="l00392"></a>00392
417
<a name="l00393"></a>00393 <span class="comment"></span>
418
<a name="l00394"></a>00394 <span class="comment">/*!</span>
419
<a name="l00395"></a>00395 <span class="comment"> SSE2 optimized bitcounting function implements parallel bitcounting</span>
420
<a name="l00396"></a>00396 <span class="comment"> algorithm for SSE2 instruction set.</span>
421
<a name="l00397"></a>00397 <span class="comment"></span>
422
<a name="l00398"></a>00398 <span class="comment">&lt;pre&gt;</span>
423
<a name="l00399"></a>00399 <span class="comment">unsigned CalcBitCount32(unsigned b)</span>
424
<a name="l00400"></a>00400 <span class="comment">{</span>
425
<a name="l00401"></a>00401 <span class="comment"> b = (b &amp; 0x55555555) + (b &gt;&gt; 1 &amp; 0x55555555);</span>
426
<a name="l00402"></a>00402 <span class="comment"> b = (b &amp; 0x33333333) + (b &gt;&gt; 2 &amp; 0x33333333);</span>
427
<a name="l00403"></a>00403 <span class="comment"> b = (b + (b &gt;&gt; 4)) &amp; 0x0F0F0F0F;</span>
428
<a name="l00404"></a>00404 <span class="comment"> b = b + (b &gt;&gt; 8);</span>
429
<a name="l00405"></a>00405 <span class="comment"> b = (b + (b &gt;&gt; 16)) &amp; 0x0000003F;</span>
430
<a name="l00406"></a>00406 <span class="comment"> return b;</span>
431
<a name="l00407"></a>00407 <span class="comment">}</span>
432
<a name="l00408"></a>00408 <span class="comment">&lt;/pre&gt;</span>
433
<a name="l00409"></a>00409 <span class="comment"></span>
434
<a name="l00410"></a>00410 <span class="comment"> @ingroup SSE2</span>
435
<a name="l00411"></a>00411 <span class="comment"></span>
436
<a name="l00412"></a>00412 <span class="comment">*/</span>
437
<a name="l00413"></a>00413 <span class="keyword">inline</span>
438
<a name="l00414"></a><a class="code" href="a00085.html#g0f7e0b2eb9ac7b2c6a8cd3b8f15b071f">00414</a> <a class="code" href="a00078.html#a3824d882a037396370b16f2f0a8bf37">bm::id_t</a> <a class="code" href="a00085.html#g0f7e0b2eb9ac7b2c6a8cd3b8f15b071f">sse2_bit_count</a>(<span class="keyword">const</span> __m128i* block, <span class="keyword">const</span> __m128i* block_end)
439
<a name="l00415"></a>00415 {
440
<a name="l00416"></a>00416 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu1 = 0x55555555;
441
<a name="l00417"></a>00417 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu2 = 0x33333333;
442
<a name="l00418"></a>00418 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu3 = 0x0F0F0F0F;
443
<a name="l00419"></a>00419 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu4 = 0x0000003F;
444
<a name="l00420"></a>00420
445
<a name="l00421"></a>00421 <span class="comment">// Loading masks</span>
446
<a name="l00422"></a>00422 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
447
<a name="l00423"></a>00423 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
448
<a name="l00424"></a>00424 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
449
<a name="l00425"></a>00425 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
450
<a name="l00426"></a>00426 __m128i mcnt;
451
<a name="l00427"></a>00427 mcnt = _mm_xor_si128(m1, m1); <span class="comment">// cnt = 0</span>
452
<a name="l00428"></a>00428
453
<a name="l00429"></a>00429 __m128i tmp1, tmp2;
454
<a name="l00430"></a>00430 <span class="keywordflow">do</span>
455
<a name="l00431"></a>00431 {
456
<a name="l00432"></a>00432 __m128i b = _mm_load_si128(block);
457
<a name="l00433"></a>00433 ++block;
458
<a name="l00434"></a>00434
459
<a name="l00435"></a>00435 <span class="comment">// b = (b & 0x55555555) + (b >> 1 & 0x55555555);</span>
460
<a name="l00436"></a>00436 tmp1 = _mm_srli_epi32(b, 1); <span class="comment">// tmp1 = (b >> 1 & 0x55555555)</span>
461
<a name="l00437"></a>00437 tmp1 = _mm_and_si128(tmp1, m1);
462
<a name="l00438"></a>00438 tmp2 = _mm_and_si128(b, m1); <span class="comment">// tmp2 = (b & 0x55555555)</span>
463
<a name="l00439"></a>00439 b = _mm_add_epi32(tmp1, tmp2); <span class="comment">// b = tmp1 + tmp2</span>
464
<a name="l00440"></a>00440
465
<a name="l00441"></a>00441 <span class="comment">// b = (b & 0x33333333) + (b >> 2 & 0x33333333);</span>
466
<a name="l00442"></a>00442 tmp1 = _mm_srli_epi32(b, 2); <span class="comment">// (b >> 2 & 0x33333333)</span>
467
<a name="l00443"></a>00443 tmp1 = _mm_and_si128(tmp1, m2);
468
<a name="l00444"></a>00444 tmp2 = _mm_and_si128(b, m2); <span class="comment">// (b & 0x33333333)</span>
469
<a name="l00445"></a>00445 b = _mm_add_epi32(tmp1, tmp2); <span class="comment">// b = tmp1 + tmp2</span>
470
<a name="l00446"></a>00446
471
<a name="l00447"></a>00447 <span class="comment">// b = (b + (b >> 4)) & 0x0F0F0F0F;</span>
472
<a name="l00448"></a>00448 tmp1 = _mm_srli_epi32(b, 4); <span class="comment">// tmp1 = b >> 4</span>
473
<a name="l00449"></a>00449 b = _mm_add_epi32(b, tmp1); <span class="comment">// b = b + (b >> 4)</span>
474
<a name="l00450"></a>00450 b = _mm_and_si128(b, m3); <span class="comment">// & 0x0F0F0F0F</span>
475
<a name="l00451"></a>00451
476
<a name="l00452"></a>00452 <span class="comment">// b = b + (b >> 8);</span>
477
<a name="l00453"></a>00453 tmp1 = _mm_srli_epi32 (b, 8); <span class="comment">// tmp1 = b >> 8</span>
478
<a name="l00454"></a>00454 b = _mm_add_epi32(b, tmp1); <span class="comment">// b = b + (b >> 8)</span>
479
<a name="l00455"></a>00455
480
<a name="l00456"></a>00456 <span class="comment">// b = (b + (b >> 16)) & 0x0000003F;</span>
481
<a name="l00457"></a>00457 tmp1 = _mm_srli_epi32 (b, 16); <span class="comment">// b >> 16</span>
482
<a name="l00458"></a>00458 b = _mm_add_epi32(b, tmp1); <span class="comment">// b + (b >> 16)</span>
483
<a name="l00459"></a>00459 b = _mm_and_si128(b, m4); <span class="comment">// (b >> 16) & 0x0000003F;</span>
484
<a name="l00460"></a>00460
485
<a name="l00461"></a>00461 mcnt = _mm_add_epi32(mcnt, b); <span class="comment">// mcnt += b</span>
486
<a name="l00462"></a>00462
487
<a name="l00463"></a>00463 } <span class="keywordflow">while</span> (block &lt; block_end);
488
<a name="l00464"></a>00464
489
<a name="l00465"></a>00465 __declspec(align(16)) <a class="code" href="a00078.html#a3824d882a037396370b16f2f0a8bf37">bm::id_t</a> tcnt[4];
490
<a name="l00466"></a>00466 _mm_store_si128((__m128i*)tcnt, mcnt);
491
<a name="l00467"></a>00467
492
<a name="l00468"></a>00468 <span class="keywordflow">return</span> tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
493
<a name="l00469"></a>00469 }
494
<a name="l00470"></a>00470
495
<a name="l00471"></a>00471 <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
496
<a name="l00472"></a><a class="code" href="a00078.html#c0c75fb7b3dc61602843ac4e1b9b7ef5">00472</a> __m128i <a class="code" href="a00078.html#c0c75fb7b3dc61602843ac4e1b9b7ef5">sse2_and</a>(__m128i a, __m128i b)
497
<a name="l00473"></a>00473 {
498
<a name="l00474"></a>00474 <span class="keywordflow">return</span> _mm_and_si128(a, b);
499
<a name="l00475"></a>00475 }
500
<a name="l00476"></a>00476
501
<a name="l00477"></a>00477 <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
502
<a name="l00478"></a><a class="code" href="a00078.html#dea798a9a95a04845c33876087a2f46b">00478</a> __m128i <a class="code" href="a00078.html#dea798a9a95a04845c33876087a2f46b">sse2_or</a>(__m128i a, __m128i b)
503
<a name="l00479"></a>00479 {
504
<a name="l00480"></a>00480 <span class="keywordflow">return</span> _mm_or_si128(a, b);
505
<a name="l00481"></a>00481 }
506
<a name="l00482"></a>00482
507
<a name="l00483"></a>00483
508
<a name="l00484"></a>00484 <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
509
<a name="l00485"></a><a class="code" href="a00078.html#6f5de19ee3e1be05037908b4777c4da8">00485</a> __m128i <a class="code" href="a00078.html#6f5de19ee3e1be05037908b4777c4da8">sse2_xor</a>(__m128i a, __m128i b)
510
<a name="l00486"></a>00486 {
511
<a name="l00487"></a>00487 <span class="keywordflow">return</span> _mm_xor_si128(a, b);
512
<a name="l00488"></a>00488 }
513
<a name="l00489"></a>00489
514
<a name="l00490"></a>00490 <a class="code" href="a00063.html#938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
515
<a name="l00491"></a><a class="code" href="a00078.html#b3e6d46fcba1bc2a1a5390c10f571382">00491</a> __m128i <a class="code" href="a00078.html#b3e6d46fcba1bc2a1a5390c10f571382">sse2_sub</a>(__m128i a, __m128i b)
516
<a name="l00492"></a>00492 {
517
<a name="l00493"></a>00493 <span class="keywordflow">return</span> _mm_andnot_si128(b, a);
518
<a name="l00494"></a>00494 }
519
<a name="l00495"></a>00495
520
<a name="l00496"></a>00496
521
<a name="l00497"></a>00497 <span class="keyword">template</span>&lt;<span class="keyword">class</span> Func&gt;
522
<a name="l00498"></a><a class="code" href="a00078.html#8831a88ef538b3997d04d2ebf513160d">00498</a> <a class="code" href="a00078.html#a3824d882a037396370b16f2f0a8bf37">bm::id_t</a> <a class="code" href="a00078.html#8831a88ef538b3997d04d2ebf513160d">sse2_bit_count_op</a>(<span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> block,
523
<a name="l00499"></a>00499 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> block_end,
524
<a name="l00500"></a>00500 <span class="keyword">const</span> __m128i* <a class="code" href="a00063.html#3d4b22f50e6783522689451783907dc7">BMRESTRICT</a> mask_block,
525
<a name="l00501"></a>00501 Func sse2_func)
526
<a name="l00502"></a>00502 {
527
<a name="l00503"></a>00503 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu1 = 0x55555555;
528
<a name="l00504"></a>00504 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu2 = 0x33333333;
529
<a name="l00505"></a>00505 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu3 = 0x0F0F0F0F;
530
<a name="l00506"></a>00506 <span class="keyword">const</span> <span class="keywordtype">unsigned</span> mu4 = 0x0000003F;
531
<a name="l00507"></a>00507
532
<a name="l00508"></a>00508 <span class="comment">// Loading masks</span>
533
<a name="l00509"></a>00509 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
534
<a name="l00510"></a>00510 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
535
<a name="l00511"></a>00511 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
536
<a name="l00512"></a>00512 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
537
<a name="l00513"></a>00513 __m128i mcnt;
538
<a name="l00514"></a>00514 mcnt = _mm_xor_si128(m1, m1); <span class="comment">// cnt = 0</span>
539
<a name="l00515"></a>00515 <span class="keywordflow">do</span>
540
<a name="l00516"></a>00516 {
541
<a name="l00517"></a>00517 __m128i tmp1, tmp2;
542
<a name="l00518"></a>00518 __m128i b = _mm_load_si128(block++);
543
<a name="l00519"></a>00519
544
<a name="l00520"></a>00520 tmp1 = _mm_load_si128(mask_block++);
545
<a name="l00521"></a>00521
546
<a name="l00522"></a>00522 b = sse2_func(b, tmp1);
547
<a name="l00523"></a>00523
548
<a name="l00524"></a>00524 <span class="comment">// b = (b & 0x55555555) + (b >> 1 & 0x55555555);</span>
549
<a name="l00525"></a>00525 tmp1 = _mm_srli_epi32(b, 1); <span class="comment">// tmp1 = (b >> 1 & 0x55555555)</span>
550
<a name="l00526"></a>00526 tmp1 = _mm_and_si128(tmp1, m1);
551
<a name="l00527"></a>00527 tmp2 = _mm_and_si128(b, m1); <span class="comment">// tmp2 = (b & 0x55555555)</span>
552
<a name="l00528"></a>00528 b = _mm_add_epi32(tmp1, tmp2); <span class="comment">// b = tmp1 + tmp2</span>
553
<a name="l00529"></a>00529
554
<a name="l00530"></a>00530 <span class="comment">// b = (b & 0x33333333) + (b >> 2 & 0x33333333);</span>
555
<a name="l00531"></a>00531 tmp1 = _mm_srli_epi32(b, 2); <span class="comment">// (b >> 2 & 0x33333333)</span>
556
<a name="l00532"></a>00532 tmp1 = _mm_and_si128(tmp1, m2);
557
<a name="l00533"></a>00533 tmp2 = _mm_and_si128(b, m2); <span class="comment">// (b & 0x33333333)</span>
558
<a name="l00534"></a>00534 b = _mm_add_epi32(tmp1, tmp2); <span class="comment">// b = tmp1 + tmp2</span>
559
<a name="l00535"></a>00535
560
<a name="l00536"></a>00536 <span class="comment">// b = (b + (b >> 4)) & 0x0F0F0F0F;</span>
561
<a name="l00537"></a>00537 tmp1 = _mm_srli_epi32(b, 4); <span class="comment">// tmp1 = b >> 4</span>
562
<a name="l00538"></a>00538 b = _mm_add_epi32(b, tmp1); <span class="comment">// b = b + (b >> 4)</span>
563
<a name="l00539"></a>00539 b = _mm_and_si128(b, m3); <span class="comment">// & 0x0F0F0F0F</span>
564
<a name="l00540"></a>00540
565
<a name="l00541"></a>00541 <span class="comment">// b = b + (b >> 8);</span>
566
<a name="l00542"></a>00542 tmp1 = _mm_srli_epi32 (b, 8); <span class="comment">// tmp1 = b >> 8</span>
567
<a name="l00543"></a>00543 b = _mm_add_epi32(b, tmp1); <span class="comment">// b = b + (b >> 8)</span>
568
<a name="l00544"></a>00544
569
<a name="l00545"></a>00545 <span class="comment">// b = (b + (b >> 16)) & 0x0000003F;</span>
570
<a name="l00546"></a>00546 tmp1 = _mm_srli_epi32 (b, 16); <span class="comment">// b >> 16</span>
571
<a name="l00547"></a>00547 b = _mm_add_epi32(b, tmp1); <span class="comment">// b + (b >> 16)</span>
572
<a name="l00548"></a>00548 b = _mm_and_si128(b, m4); <span class="comment">// (b >> 16) & 0x0000003F;</span>
573
<a name="l00549"></a>00549
574
<a name="l00550"></a>00550 mcnt = _mm_add_epi32(mcnt, b); <span class="comment">// mcnt += b</span>
575
<a name="l00551"></a>00551
576
<a name="l00552"></a>00552 } <span class="keywordflow">while</span> (block &lt; block_end);
577
<a name="l00553"></a>00553
578
<a name="l00554"></a>00554 __declspec(align(16)) <a class="code" href="a00078.html#a3824d882a037396370b16f2f0a8bf37">bm::id_t</a> tcnt[4];
579
<a name="l00555"></a>00555 _mm_store_si128((__m128i*)tcnt, mcnt);
580
<a name="l00556"></a>00556
581
<a name="l00557"></a>00557 <span class="keywordflow">return</span> tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
582
<a name="l00558"></a>00558 }
583
<a name="l00559"></a>00559
584
<a name="l00560"></a>00560
585
<a name="l00561"></a>00561
586
<a name="l00562"></a>00562
587
<a name="l00563"></a><a class="code" href="a00065.html#9b8caa31ab90cbfbc99b5728352e5456">00563</a> <span class="preprocessor">#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\</span>
588
<a name="l00564"></a>00564 <span class="preprocessor"> sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)</span>
589
<a name="l00565"></a>00565 <span class="preprocessor"></span>
590
<a name="l00566"></a><a class="code" href="a00065.html#7d425b68d8217dc37d33c660e39a61e1">00566</a> <span class="preprocessor">#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\</span>
591
<a name="l00567"></a>00567 <span class="preprocessor"> sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)</span>
592
<a name="l00568"></a>00568 <span class="preprocessor"></span>
593
<a name="l00569"></a><a class="code" href="a00065.html#e022e7bc509d97359a348393a6b25b99">00569</a> <span class="preprocessor">#define VECT_BITCOUNT(first, last) \</span>
594
<a name="l00570"></a>00570 <span class="preprocessor"> sse2_bit_count((__m128i*) (first), (__m128i*) (last)) </span>
595
<a name="l00571"></a>00571 <span class="preprocessor"></span>
596
<a name="l00572"></a><a class="code" href="a00065.html#3459200618dab65142340f1abd760eb9">00572</a> <span class="preprocessor">#define VECT_BITCOUNT_AND(first, last, mask) \</span>
597
<a name="l00573"></a>00573 <span class="preprocessor"> sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and) </span>
598
<a name="l00574"></a>00574 <span class="preprocessor"></span>
599
<a name="l00575"></a><a class="code" href="a00065.html#07583d48db4ede7eadbc3156e9dd7d3b">00575</a> <span class="preprocessor">#define VECT_BITCOUNT_OR(first, last, mask) \</span>
600
<a name="l00576"></a>00576 <span class="preprocessor"> sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or) </span>
601
<a name="l00577"></a>00577 <span class="preprocessor"></span>
602
<a name="l00578"></a><a class="code" href="a00065.html#9954c72d227a3ca78df22a56efad6baf">00578</a> <span class="preprocessor">#define VECT_BITCOUNT_XOR(first, last, mask) \</span>
603
<a name="l00579"></a>00579 <span class="preprocessor"> sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor) </span>
604
<a name="l00580"></a>00580 <span class="preprocessor"></span>
605
<a name="l00581"></a><a class="code" href="a00065.html#98aaf230431767be4838323266d0b493">00581</a> <span class="preprocessor">#define VECT_BITCOUNT_SUB(first, last, mask) \</span>
606
<a name="l00582"></a>00582 <span class="preprocessor"> sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub) </span>
607
<a name="l00583"></a>00583 <span class="preprocessor"></span>
608
<a name="l00584"></a><a class="code" href="a00065.html#a31c7d6d7d4cd67e1cc86c08221446ce">00584</a> <span class="preprocessor">#define VECT_INVERT_ARR(first, last) \</span>
609
<a name="l00585"></a>00585 <span class="preprocessor"> sse2_invert_arr(first, last);</span>
610
<a name="l00586"></a>00586 <span class="preprocessor"></span>
611
<a name="l00587"></a><a class="code" href="a00065.html#a6a3504bb5449abf6f5ec084b2ddfac6">00587</a> <span class="preprocessor">#define VECT_AND_ARR(dst, src, src_end) \</span>
612
<a name="l00588"></a>00588 <span class="preprocessor"> sse2_and_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
613
<a name="l00589"></a>00589 <span class="preprocessor"></span>
614
<a name="l00590"></a><a class="code" href="a00065.html#e3cdeec275429ea0d60668da63e96df8">00590</a> <span class="preprocessor">#define VECT_OR_ARR(dst, src, src_end) \</span>
615
<a name="l00591"></a>00591 <span class="preprocessor"> sse2_or_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
616
<a name="l00592"></a>00592 <span class="preprocessor"></span>
617
<a name="l00593"></a><a class="code" href="a00065.html#f079345cbcb256bb6216c0446fd775f2">00593</a> <span class="preprocessor">#define VECT_SUB_ARR(dst, src, src_end) \</span>
618
<a name="l00594"></a>00594 <span class="preprocessor"> sse2_sub_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
619
<a name="l00595"></a>00595 <span class="preprocessor"></span>
620
<a name="l00596"></a><a class="code" href="a00065.html#1f520ac49859f992de5c00f43b851179">00596</a> <span class="preprocessor">#define VECT_XOR_ARR(dst, src, src_end) \</span>
621
<a name="l00597"></a>00597 <span class="preprocessor"> sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
622
<a name="l00598"></a>00598 <span class="preprocessor"></span>
623
<a name="l00599"></a><a class="code" href="a00065.html#72b713f259e0542169a1228bebadcede">00599</a> <span class="preprocessor">#define VECT_COPY_BLOCK(dst, src, src_end) \</span>
624
<a name="l00600"></a>00600 <span class="preprocessor"> sse2_copy_block((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
625
<a name="l00601"></a>00601 <span class="preprocessor"></span>
626
<a name="l00602"></a><a class="code" href="a00065.html#cefd3b9d39a2e2fb0c9d2aa5c3812e0b">00602</a> <span class="preprocessor">#define VECT_SET_BLOCK(dst, dst_end, value) \</span>
627
<a name="l00603"></a>00603 <span class="preprocessor"> sse2_set_block((__m128i*) dst, (__m128i*) (dst_end), (value))</span>
628
<a name="l00604"></a>00604 <span class="preprocessor"></span>
629
<a name="l00605"></a>00605 } <span class="comment">// namespace</span>
630
<a name="l00606"></a>00606
631
<a name="l00607"></a>00607 <span class="preprocessor">#endif</span>
633
<hr size="1"><address style="text-align: right;"><small>Generated on Sun Jul 12 10:40:29 2009 for BitMagic by
634
<a href="http://www.doxygen.org/index.html">
635
<img src="doxygen.png" alt="doxygen" align="middle" border="0"></a> 1.5.9 </small></address>