1
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
2
<html xmlns="http://www.w3.org/1999/xhtml">
4
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/>
5
<title>BitMagic: bmsse4.h Source File</title>
6
<link href="tabs.css" rel="stylesheet" type="text/css"/>
7
<link href="doxygen.css" rel="stylesheet" type="text/css"/>
10
<!-- Generated by Doxygen 1.6.1 -->
11
<div class="navigation" id="top">
14
<li><a href="index.html"><span>Main Page</span></a></li>
15
<li><a href="modules.html"><span>Modules</span></a></li>
16
<li><a href="namespaces.html"><span>Namespaces</span></a></li>
17
<li><a href="annotated.html"><span>Data Structures</span></a></li>
18
<li class="current"><a href="files.html"><span>Files</span></a></li>
19
<li><a href="examples.html"><span>Examples</span></a></li>
24
<li><a href="files.html"><span>File List</span></a></li>
25
<li><a href="globals.html"><span>Globals</span></a></li>
28
<h1>bmsse4.h</h1><a href="a00094.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="preprocessor">#ifndef BMSSE4__H__INCLUDED__</span>
29
<a name="l00002"></a>00002 <span class="preprocessor"></span><span class="preprocessor">#define BMSSE4__H__INCLUDED__</span>
30
<a name="l00003"></a>00003 <span class="preprocessor"></span><span class="comment">/*</span>
31
<a name="l00004"></a>00004 <span class="comment">Copyright(c) 2009 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)</span>
32
<a name="l00005"></a>00005 <span class="comment"></span>
33
<a name="l00006"></a>00006 <span class="comment">Permission is hereby granted, free of charge, to any person </span>
34
<a name="l00007"></a>00007 <span class="comment">obtaining a copy of this software and associated documentation </span>
35
<a name="l00008"></a>00008 <span class="comment">files (the "Software"), to deal in the Software without restriction, </span>
36
<a name="l00009"></a>00009 <span class="comment">including without limitation the rights to use, copy, modify, merge, </span>
37
<a name="l00010"></a>00010 <span class="comment">publish, distribute, sublicense, and/or sell copies of the Software, </span>
38
<a name="l00011"></a>00011 <span class="comment">and to permit persons to whom the Software is furnished to do so, </span>
39
<a name="l00012"></a>00012 <span class="comment">subject to the following conditions:</span>
40
<a name="l00013"></a>00013 <span class="comment"></span>
41
<a name="l00014"></a>00014 <span class="comment">The above copyright notice and this permission notice shall be included </span>
42
<a name="l00015"></a>00015 <span class="comment">in all copies or substantial portions of the Software.</span>
43
<a name="l00016"></a>00016 <span class="comment"></span>
44
<a name="l00017"></a>00017 <span class="comment">THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, </span>
45
<a name="l00018"></a>00018 <span class="comment">EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES </span>
46
<a name="l00019"></a>00019 <span class="comment">OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. </span>
47
<a name="l00020"></a>00020 <span class="comment">IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, </span>
48
<a name="l00021"></a>00021 <span class="comment">DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, </span>
49
<a name="l00022"></a>00022 <span class="comment">ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR </span>
50
<a name="l00023"></a>00023 <span class="comment">OTHER DEALINGS IN THE SOFTWARE.</span>
51
<a name="l00024"></a>00024 <span class="comment"></span>
52
<a name="l00025"></a>00025 <span class="comment">For more information please visit: http://bmagic.sourceforge.net</span>
53
<a name="l00026"></a>00026 <span class="comment"></span>
54
<a name="l00027"></a>00027 <span class="comment">*/</span>
55
<a name="l00028"></a>00028
56
<a name="l00029"></a>00029
57
<a name="l00030"></a>00030
58
<a name="l00031"></a>00031 <span class="comment">// Header implements processor specific intrinsics declarations for SSE4.2</span>
59
<a name="l00032"></a>00032 <span class="comment">// instruction set</span>
60
<a name="l00033"></a>00033 <span class="preprocessor">#include&lt;mmintrin.h&gt;</span>
61
<a name="l00034"></a>00034 <span class="preprocessor">#include&lt;emmintrin.h&gt;</span>
62
<a name="l00035"></a>00035 <span class="preprocessor">#include&lt;smmintrin.h&gt;</span>
63
<a name="l00036"></a>00036
64
<a name="l00037"></a>00037 <span class="preprocessor">#include "<a class="code" href="a00089.html">bmdef.h</a>"</span>
65
<a name="l00038"></a>00038 <span class="preprocessor">#include "<a class="code" href="a00095.html">bmsse_util.h</a>"</span>
66
<a name="l00039"></a>00039
67
<a name="l00040"></a>00040 <span class="keyword">namespace </span>bm
68
<a name="l00041"></a>00041 {
69
<a name="l00042"></a>00042 <span class="comment"></span>
70
<a name="l00043"></a>00043 <span class="comment">/** @defgroup SSE4 Processor specific optimizations for SSE4.2 instructions</span>
71
<a name="l00044"></a>00044 <span class="comment"> * @ingroup bmagic</span>
72
<a name="l00045"></a>00045 <span class="comment"> */</span>
73
<a name="l00046"></a>00046
74
<a name="l00047"></a>00047
75
<a name="l00048"></a>00048 <span class="comment"></span>
76
<a name="l00049"></a>00049 <span class="comment">/*!</span>
77
<a name="l00050"></a>00050 <span class="comment"> SSE4.2 optimized bitcounting.</span>
78
<a name="l00051"></a>00051 <span class="comment"> @ingroup SSE4</span>
79
<a name="l00052"></a>00052 <span class="comment">*/</span>
80
<a name="l00053"></a>00053 <span class="keyword">inline</span>
81
<a name="l00054"></a><a class="code" href="a00117.html#ga266ed6594dc786e46c1f353443dc41fe">00054</a> <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> <a class="code" href="a00117.html#ga266ed6594dc786e46c1f353443dc41fe">sse4_bit_count</a>(<span class="keyword">const</span> __m128i* block, <span class="keyword">const</span> __m128i* block_end)
82
<a name="l00055"></a>00055 {
83
<a name="l00056"></a>00056 <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> count = 0;
84
<a name="l00057"></a>00057 <span class="keywordflow">do</span>
85
<a name="l00058"></a>00058 {
86
<a name="l00059"></a>00059 <span class="keywordtype">unsigned</span>* b = (<span class="keywordtype">unsigned</span>*) block;
87
<a name="l00060"></a>00060 count += _mm_popcnt_u32(b[0]) +
88
<a name="l00061"></a>00061 _mm_popcnt_u32(b[1]) +
89
<a name="l00062"></a>00062 _mm_popcnt_u32(b[2]) +
90
<a name="l00063"></a>00063 _mm_popcnt_u32(b[3]);
91
<a name="l00064"></a>00064 } <span class="keywordflow">while</span> (++block &lt; block_end);
92
<a name="l00065"></a>00065 <span class="keywordflow">return</span> count;
93
<a name="l00066"></a>00066 }
94
<a name="l00067"></a>00067 <span class="comment"></span>
95
<a name="l00068"></a>00068 <span class="comment">/*!</span>
96
<a name="l00069"></a>00069 <span class="comment">\internal</span>
97
<a name="l00070"></a>00070 <span class="comment">*/</span>
98
<a name="l00071"></a>00071 <a class="code" href="a00089.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
99
<a name="l00072"></a><a class="code" href="a00110.html#a6d8f327a7d19c11125ac77383a34e986">00072</a> <span class="keywordtype">unsigned</span> <a class="code" href="a00110.html#a6d8f327a7d19c11125ac77383a34e986">op_xor</a>(<span class="keywordtype">unsigned</span> a, <span class="keywordtype">unsigned</span> b)
100
<a name="l00073"></a>00073 {
101
<a name="l00074"></a>00074 <span class="keywordtype">unsigned</span> ret = (a ^ b);
102
<a name="l00075"></a>00075 <span class="keywordflow">return</span> ret;
103
<a name="l00076"></a>00076 }
104
<a name="l00077"></a>00077 <span class="comment"></span>
105
<a name="l00078"></a>00078 <span class="comment">/*!</span>
106
<a name="l00079"></a>00079 <span class="comment">\internal</span>
107
<a name="l00080"></a>00080 <span class="comment">*/</span>
108
<a name="l00081"></a>00081 <a class="code" href="a00089.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
109
<a name="l00082"></a><a class="code" href="a00110.html#a0ed9a1bcd143f7763b4b59ef442f6994">00082</a> <span class="keywordtype">unsigned</span> <a class="code" href="a00110.html#a0ed9a1bcd143f7763b4b59ef442f6994">op_or</a>(<span class="keywordtype">unsigned</span> a, <span class="keywordtype">unsigned</span> b)
110
<a name="l00083"></a>00083 {
111
<a name="l00084"></a>00084 <span class="keywordflow">return</span> (a | b);
112
<a name="l00085"></a>00085 }
113
<a name="l00086"></a>00086 <span class="comment"></span>
114
<a name="l00087"></a>00087 <span class="comment">/*!</span>
115
<a name="l00088"></a>00088 <span class="comment">\internal</span>
116
<a name="l00089"></a>00089 <span class="comment">*/</span>
117
<a name="l00090"></a>00090 <a class="code" href="a00089.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
118
<a name="l00091"></a><a class="code" href="a00110.html#a515b2862b33efd4e36d72e4ba7729199">00091</a> <span class="keywordtype">unsigned</span> <a class="code" href="a00110.html#a515b2862b33efd4e36d72e4ba7729199">op_and</a>(<span class="keywordtype">unsigned</span> a, <span class="keywordtype">unsigned</span> b)
119
<a name="l00092"></a>00092 {
120
<a name="l00093"></a>00093 <span class="keywordflow">return</span> (a &amp; b);
121
<a name="l00094"></a>00094 }
122
<a name="l00095"></a>00095
123
<a name="l00096"></a>00096
124
<a name="l00097"></a>00097 <span class="keyword">template</span>&lt;<span class="keyword">class</span> Func&gt;
125
<a name="l00098"></a><a class="code" href="a00110.html#a295c6323a972a90e28c36e6012427a26">00098</a> <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> <a class="code" href="a00110.html#a295c6323a972a90e28c36e6012427a26">sse4_bit_count_op</a>(<span class="keyword">const</span> __m128i* BMRESTRICT block,
126
<a name="l00099"></a>00099 <span class="keyword">const</span> __m128i* BMRESTRICT block_end,
127
<a name="l00100"></a>00100 <span class="keyword">const</span> __m128i* BMRESTRICT mask_block,
128
<a name="l00101"></a>00101 Func sse2_func)
129
<a name="l00102"></a>00102 {
130
<a name="l00103"></a>00103 <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> count = 0;
131
<a name="l00104"></a>00104 <span class="keywordflow">do</span>
132
<a name="l00105"></a>00105 {
133
<a name="l00106"></a>00106 __m128i tmp0 = _mm_load_si128(block);
134
<a name="l00107"></a>00107 __m128i tmp1 = _mm_load_si128(mask_block);
135
<a name="l00108"></a>00108 __m128i b = sse2_func(tmp0, tmp1);
136
<a name="l00109"></a>00109
137
<a name="l00110"></a>00110 count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
138
<a name="l00111"></a>00111 count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
139
<a name="l00112"></a>00112 count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
140
<a name="l00113"></a>00113 count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
141
<a name="l00114"></a>00114
142
<a name="l00115"></a>00115 ++block; ++mask_block;
143
<a name="l00116"></a>00116 } <span class="keywordflow">while</span> (block &lt; block_end);
144
<a name="l00117"></a>00117 <span class="keywordflow">return</span> count;
145
<a name="l00118"></a>00118 }
146
<a name="l00119"></a>00119
147
<a name="l00120"></a>00120 <span class="keyword">template</span>&lt;<span class="keyword">class</span> Func&gt;
148
<a name="l00121"></a><a class="code" href="a00110.html#ab297c48a2b5057f32db00a8e8631d7a3">00121</a> <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> <a class="code" href="a00110.html#ab297c48a2b5057f32db00a8e8631d7a3">sse4_bit_count_op2</a>(<span class="keyword">const</span> __m128i* BMRESTRICT block,
149
<a name="l00122"></a>00122 <span class="keyword">const</span> __m128i* BMRESTRICT block_end,
150
<a name="l00123"></a>00123 <span class="keyword">const</span> __m128i* BMRESTRICT mask_block,
151
<a name="l00124"></a>00124 Func op_func)
152
<a name="l00125"></a>00125 {
153
<a name="l00126"></a>00126 <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> count = 0;
154
<a name="l00127"></a>00127 <span class="keywordflow">do</span>
155
<a name="l00128"></a>00128 {
156
<a name="l00129"></a>00129 <span class="keywordtype">unsigned</span> *r1 = (<span class="keywordtype">unsigned</span>*) block;
157
<a name="l00130"></a>00130 <span class="keywordtype">unsigned</span> *r2 = (<span class="keywordtype">unsigned</span>*) mask_block;
158
<a name="l00131"></a>00131
159
<a name="l00132"></a>00132 count += _mm_popcnt_u32(op_func(r1[0], r2[0]));
160
<a name="l00133"></a>00133 count += _mm_popcnt_u32(op_func(r1[1], r2[1]));
161
<a name="l00134"></a>00134 count += _mm_popcnt_u32(op_func(r1[2], r2[2]));
162
<a name="l00135"></a>00135 count += _mm_popcnt_u32(op_func(r1[3], r2[3]));
163
<a name="l00136"></a>00136
164
<a name="l00137"></a>00137 ++mask_block;
165
<a name="l00138"></a>00138
166
<a name="l00139"></a>00139 } <span class="keywordflow">while</span> (++block &lt; block_end);
167
<a name="l00140"></a>00140 <span class="keywordflow">return</span> count;
168
<a name="l00141"></a>00141
169
<a name="l00142"></a>00142 }
170
<a name="l00143"></a>00143
171
<a name="l00144"></a>00144
172
<a name="l00145"></a>00145
173
<a name="l00146"></a><a class="code" href="a00094.html#a9b8caa31ab90cbfbc99b5728352e5456">00146</a> <span class="preprocessor">#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\</span>
174
<a name="l00147"></a>00147 <span class="preprocessor"> sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)</span>
175
<a name="l00148"></a>00148 <span class="preprocessor"></span>
176
<a name="l00149"></a><a class="code" href="a00094.html#a7d425b68d8217dc37d33c660e39a61e1">00149</a> <span class="preprocessor">#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\</span>
177
<a name="l00150"></a>00150 <span class="preprocessor"> sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)</span>
178
<a name="l00151"></a>00151 <span class="preprocessor"></span>
179
<a name="l00152"></a><a class="code" href="a00094.html#ae022e7bc509d97359a348393a6b25b99">00152</a> <span class="preprocessor">#define VECT_BITCOUNT(first, last) \</span>
180
<a name="l00153"></a>00153 <span class="preprocessor"> sse4_bit_count((__m128i*) (first), (__m128i*) (last)) </span>
181
<a name="l00154"></a>00154 <span class="preprocessor"></span>
182
<a name="l00155"></a><a class="code" href="a00094.html#a3459200618dab65142340f1abd760eb9">00155</a> <span class="preprocessor">#define VECT_BITCOUNT_AND(first, last, mask) \</span>
183
<a name="l00156"></a>00156 <span class="preprocessor"> sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and) </span>
184
<a name="l00157"></a>00157 <span class="preprocessor"></span>
185
<a name="l00158"></a><a class="code" href="a00094.html#a07583d48db4ede7eadbc3156e9dd7d3b">00158</a> <span class="preprocessor">#define VECT_BITCOUNT_OR(first, last, mask) \</span>
186
<a name="l00159"></a>00159 <span class="preprocessor"> sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or) </span>
187
<a name="l00160"></a>00160 <span class="preprocessor"></span>
188
<a name="l00161"></a><a class="code" href="a00094.html#a9954c72d227a3ca78df22a56efad6baf">00161</a> <span class="preprocessor">#define VECT_BITCOUNT_XOR(first, last, mask) \</span>
189
<a name="l00162"></a>00162 <span class="preprocessor"> sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor) </span>
190
<a name="l00163"></a>00163 <span class="preprocessor"></span>
191
<a name="l00164"></a><a class="code" href="a00094.html#a98aaf230431767be4838323266d0b493">00164</a> <span class="preprocessor">#define VECT_BITCOUNT_SUB(first, last, mask) \</span>
192
<a name="l00165"></a>00165 <span class="preprocessor"> sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub) </span>
193
<a name="l00166"></a>00166 <span class="preprocessor"></span>
194
<a name="l00167"></a><a class="code" href="a00094.html#aa31c7d6d7d4cd67e1cc86c08221446ce">00167</a> <span class="preprocessor">#define VECT_INVERT_ARR(first, last) \</span>
195
<a name="l00168"></a>00168 <span class="preprocessor"> sse2_invert_arr(first, last);</span>
196
<a name="l00169"></a>00169 <span class="preprocessor"></span>
197
<a name="l00170"></a><a class="code" href="a00094.html#aa6a3504bb5449abf6f5ec084b2ddfac6">00170</a> <span class="preprocessor">#define VECT_AND_ARR(dst, src, src_end) \</span>
198
<a name="l00171"></a>00171 <span class="preprocessor"> sse2_and_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
199
<a name="l00172"></a>00172 <span class="preprocessor"></span>
200
<a name="l00173"></a><a class="code" href="a00094.html#ae3cdeec275429ea0d60668da63e96df8">00173</a> <span class="preprocessor">#define VECT_OR_ARR(dst, src, src_end) \</span>
201
<a name="l00174"></a>00174 <span class="preprocessor"> sse2_or_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
202
<a name="l00175"></a>00175 <span class="preprocessor"></span>
203
<a name="l00176"></a><a class="code" href="a00094.html#af079345cbcb256bb6216c0446fd775f2">00176</a> <span class="preprocessor">#define VECT_SUB_ARR(dst, src, src_end) \</span>
204
<a name="l00177"></a>00177 <span class="preprocessor"> sse2_sub_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
205
<a name="l00178"></a>00178 <span class="preprocessor"></span>
206
<a name="l00179"></a><a class="code" href="a00094.html#a1f520ac49859f992de5c00f43b851179">00179</a> <span class="preprocessor">#define VECT_XOR_ARR(dst, src, src_end) \</span>
207
<a name="l00180"></a>00180 <span class="preprocessor"> sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
208
<a name="l00181"></a>00181 <span class="preprocessor"></span>
209
<a name="l00182"></a><a class="code" href="a00094.html#a72b713f259e0542169a1228bebadcede">00182</a> <span class="preprocessor">#define VECT_COPY_BLOCK(dst, src, src_end) \</span>
210
<a name="l00183"></a>00183 <span class="preprocessor"> sse2_copy_block((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
211
<a name="l00184"></a>00184 <span class="preprocessor"></span>
212
<a name="l00185"></a><a class="code" href="a00094.html#acefd3b9d39a2e2fb0c9d2aa5c3812e0b">00185</a> <span class="preprocessor">#define VECT_SET_BLOCK(dst, dst_end, value) \</span>
213
<a name="l00186"></a>00186 <span class="preprocessor"> sse2_set_block((__m128i*) dst, (__m128i*) (dst_end), (value))</span>
214
<a name="l00187"></a>00187 <span class="preprocessor"></span>
215
<a name="l00188"></a>00188
216
<a name="l00189"></a>00189
217
<a name="l00190"></a>00190
218
<a name="l00191"></a>00191 <span class="comment"></span>
219
<a name="l00192"></a>00192 <span class="comment">/*!</span>
220
<a name="l00193"></a>00193 <span class="comment"> SSE4.2 optimized bitcounting and number of GAPs</span>
221
<a name="l00194"></a>00194 <span class="comment"> @ingroup SSE4</span>
222
<a name="l00195"></a>00195 <span class="comment">*/</span>
223
<a name="l00196"></a>00196
224
<a name="l00197"></a>00197
225
<a name="l00198"></a>00198 <span class="keyword">inline</span>
226
<a name="l00199"></a><a class="code" href="a00117.html#gad686391db4cdc6382ae6785f62981474">00199</a> <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> <a class="code" href="a00117.html#gad686391db4cdc6382ae6785f62981474">sse4_bit_block_calc_count_change</a>(<span class="keyword">const</span> __m128i* BMRESTRICT block,
227
<a name="l00200"></a>00200 <span class="keyword">const</span> __m128i* BMRESTRICT block_end,
228
<a name="l00201"></a>00201 <span class="keywordtype">unsigned</span>* BMRESTRICT bit_count)
229
<a name="l00202"></a>00202 {
230
<a name="l00203"></a>00203 <span class="comment">// __m128i mask1 = _mm_set_epi32(0x1, 0x1, 0x1, 0x1);</span>
231
<a name="l00204"></a>00204 <span class="keyword">register</span> <span class="keywordtype">int</span> count = (block_end - block)*4;
232
<a name="l00205"></a>00205
233
<a name="l00206"></a>00206 <span class="keyword">register</span> <a class="code" href="a00110.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a> w0, w_prev;
234
<a name="l00207"></a>00207 <span class="keyword">const</span> <span class="keywordtype">int</span> w_shift = <span class="keyword">sizeof</span>(w0) * 8 - 1;
235
<a name="l00208"></a>00208 <span class="keywordtype">bool</span> first_word = <span class="keyword">true</span>;
236
<a name="l00209"></a>00209 *bit_count = 0;
237
<a name="l00210"></a>00210
238
<a name="l00211"></a>00211 <span class="comment">// first word</span>
239
<a name="l00212"></a>00212 {
240
<a name="l00213"></a>00213 <a class="code" href="a00110.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a> w;
241
<a name="l00214"></a>00214 <span class="keyword">const</span> <a class="code" href="a00110.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a>* blk = (<span class="keyword">const</span> <a class="code" href="a00110.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a>*) block;
242
<a name="l00215"></a>00215 w = w0 = blk[0];
243
<a name="l00216"></a>00216 *bit_count += _mm_popcnt_u32(w);
244
<a name="l00217"></a>00217 w ^= (w >> 1);
245
<a name="l00218"></a>00218 count += _mm_popcnt_u32(w);
246
<a name="l00219"></a>00219 count -= (w_prev = (w0 >> w_shift));
247
<a name="l00220"></a>00220 }
248
<a name="l00221"></a>00221
249
<a name="l00222"></a>00222 <span class="keywordflow">do</span>
250
<a name="l00223"></a>00223 {
251
<a name="l00224"></a>00224 __m128i b = _mm_load_si128(block);
252
<a name="l00225"></a>00225 __m128i tmp2 = _mm_xor_si128(b, _mm_srli_epi32(b, 1)); <span class="comment">// tmp2=(b >> 1) ^ b;</span>
253
<a name="l00226"></a>00226 __m128i tmp3 = _mm_srli_epi32(b, w_shift); <span class="comment">// tmp3 = w0 >> w_shift</span>
254
<a name="l00227"></a>00227 <span class="comment">// __m128i tmp4 = _mm_and_si128(b, mask1); // tmp4 = w0 &amp; 1 </span>
255
<a name="l00228"></a>00228
256
<a name="l00229"></a>00229 <span class="comment">// ---------------------------------------------------------------------</span>
257
<a name="l00230"></a>00230 {
258
<a name="l00231"></a>00231 <span class="keywordflow">if</span> (first_word)
259
<a name="l00232"></a>00232 {
260
<a name="l00233"></a>00233 first_word = <span class="keyword">false</span>;
261
<a name="l00234"></a>00234 }
262
<a name="l00235"></a>00235 <span class="keywordflow">else</span>
263
<a name="l00236"></a>00236 {
264
<a name="l00237"></a>00237 w0 = _mm_extract_epi32(b, 0);
265
<a name="l00238"></a>00238 <span class="keywordflow">if</span> (w0)
266
<a name="l00239"></a>00239 {
267
<a name="l00240"></a>00240 *bit_count += _mm_popcnt_u32(w0);
268
<a name="l00241"></a>00241 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 0));
269
<a name="l00242"></a>00242 count -= !(w_prev ^ (w0 &amp; 1));
270
<a name="l00243"></a>00243 count -= w_prev = _mm_extract_epi32(tmp3, 0);
271
<a name="l00244"></a>00244 }
272
<a name="l00245"></a>00245 <span class="keywordflow">else</span>
273
<a name="l00246"></a>00246 {
274
<a name="l00247"></a>00247 count -= !w_prev; w_prev ^= w_prev;
275
<a name="l00248"></a>00248 }
276
<a name="l00249"></a>00249 }
277
<a name="l00250"></a>00250 w0 = _mm_extract_epi32(b, 1);
278
<a name="l00251"></a>00251 <span class="keywordflow">if</span> (w0)
279
<a name="l00252"></a>00252 {
280
<a name="l00253"></a>00253 *bit_count += _mm_popcnt_u32(w0);
281
<a name="l00254"></a>00254 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 1));
282
<a name="l00255"></a>00255 count -= !(w_prev ^ (w0 &amp; 1));
283
<a name="l00256"></a>00256 count -= w_prev = _mm_extract_epi32(tmp3, 1);
284
<a name="l00257"></a>00257 }
285
<a name="l00258"></a>00258 <span class="keywordflow">else</span>
286
<a name="l00259"></a>00259 {
287
<a name="l00260"></a>00260 count -= !w_prev; w_prev ^= w_prev;
288
<a name="l00261"></a>00261 }
289
<a name="l00262"></a>00262 w0 = _mm_extract_epi32(b, 2);
290
<a name="l00263"></a>00263 <span class="keywordflow">if</span> (w0)
291
<a name="l00264"></a>00264 {
292
<a name="l00265"></a>00265 *bit_count += _mm_popcnt_u32(w0);
293
<a name="l00266"></a>00266 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 2));
294
<a name="l00267"></a>00267 count -= !(w_prev ^ (w0 &amp; 1));
295
<a name="l00268"></a>00268 count -= w_prev = _mm_extract_epi32(tmp3, 2);
296
<a name="l00269"></a>00269 }
297
<a name="l00270"></a>00270 <span class="keywordflow">else</span>
298
<a name="l00271"></a>00271 {
299
<a name="l00272"></a>00272 count -= !w_prev; w_prev ^= w_prev;
300
<a name="l00273"></a>00273 }
301
<a name="l00274"></a>00274 w0 = _mm_extract_epi32(b, 3);
302
<a name="l00275"></a>00275 <span class="keywordflow">if</span> (w0)
303
<a name="l00276"></a>00276 {
304
<a name="l00277"></a>00277 *bit_count += _mm_popcnt_u32(w0);
305
<a name="l00278"></a>00278 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 3));
306
<a name="l00279"></a>00279 count -= !(w_prev ^ (w0 &amp; 1));
307
<a name="l00280"></a>00280 count -= w_prev = _mm_extract_epi32(tmp3, 3);
308
<a name="l00281"></a>00281 }
309
<a name="l00282"></a>00282 <span class="keywordflow">else</span>
310
<a name="l00283"></a>00283 {
311
<a name="l00284"></a>00284 count -= !w_prev; w_prev ^= w_prev;
312
<a name="l00285"></a>00285 }
313
<a name="l00286"></a>00286 }
314
<a name="l00287"></a>00287 } <span class="keywordflow">while</span> (++block &lt; block_end);
315
<a name="l00288"></a>00288
316
<a name="l00289"></a>00289 <span class="keywordflow">return</span> count;
317
<a name="l00290"></a>00290 }
318
<a name="l00291"></a>00291
319
<a name="l00292"></a>00292
320
<a name="l00293"></a>00293
321
<a name="l00294"></a>00294 } <span class="comment">// namespace</span>
322
<a name="l00295"></a>00295
323
<a name="l00296"></a>00296
324
<a name="l00297"></a>00297
325
<a name="l00298"></a>00298
326
<a name="l00299"></a>00299 <span class="preprocessor">#endif</span>
328
<hr size="1"/><address style="text-align: right;"><small>Generated on Sun Nov 22 10:49:35 2009 for BitMagic by
329
<a href="http://www.doxygen.org/index.html">
330
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.6.1 </small></address>