~ubuntu-branches/ubuntu/trusty/bmagic/trusty-proposed

« back to all changes in this revision

Viewing changes to doc/html/a00094_source.html

  • Committer: Bazaar Package Importer
  • Author(s): Roberto C. Sanchez
  • Date: 2009-12-09 16:02:55 UTC
  • mfrom: (4.1.5 sid)
  • Revision ID: james.westby@ubuntu.com-20091209160255-hu2bnjrbncl92z5w
Tags: 3.6.1-1
New upstream release

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 
2
<html xmlns="http://www.w3.org/1999/xhtml">
 
3
<head>
 
4
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/>
 
5
<title>BitMagic: bmsse4.h Source File</title>
 
6
<link href="tabs.css" rel="stylesheet" type="text/css"/>
 
7
<link href="doxygen.css" rel="stylesheet" type="text/css"/>
 
8
</head>
 
9
<body>
 
10
<!-- Generated by Doxygen 1.6.1 -->
 
11
<div class="navigation" id="top">
 
12
  <div class="tabs">
 
13
    <ul>
 
14
      <li><a href="index.html"><span>Main&nbsp;Page</span></a></li>
 
15
      <li><a href="modules.html"><span>Modules</span></a></li>
 
16
      <li><a href="namespaces.html"><span>Namespaces</span></a></li>
 
17
      <li><a href="annotated.html"><span>Data&nbsp;Structures</span></a></li>
 
18
      <li class="current"><a href="files.html"><span>Files</span></a></li>
 
19
      <li><a href="examples.html"><span>Examples</span></a></li>
 
20
    </ul>
 
21
  </div>
 
22
  <div class="tabs">
 
23
    <ul>
 
24
      <li><a href="files.html"><span>File&nbsp;List</span></a></li>
 
25
      <li><a href="globals.html"><span>Globals</span></a></li>
 
26
    </ul>
 
27
  </div>
 
28
<h1>bmsse4.h</h1><a href="a00094.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="preprocessor">#ifndef BMSSE4__H__INCLUDED__</span>
 
29
<a name="l00002"></a>00002 <span class="preprocessor"></span><span class="preprocessor">#define BMSSE4__H__INCLUDED__</span>
 
30
<a name="l00003"></a>00003 <span class="preprocessor"></span><span class="comment">/*</span>
 
31
<a name="l00004"></a>00004 <span class="comment">Copyright(c) 2009 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)</span>
 
32
<a name="l00005"></a>00005 <span class="comment"></span>
 
33
<a name="l00006"></a>00006 <span class="comment">Permission is hereby granted, free of charge, to any person </span>
 
34
<a name="l00007"></a>00007 <span class="comment">obtaining a copy of this software and associated documentation </span>
 
35
<a name="l00008"></a>00008 <span class="comment">files (the &quot;Software&quot;), to deal in the Software without restriction, </span>
 
36
<a name="l00009"></a>00009 <span class="comment">including without limitation the rights to use, copy, modify, merge, </span>
 
37
<a name="l00010"></a>00010 <span class="comment">publish, distribute, sublicense, and/or sell copies of the Software, </span>
 
38
<a name="l00011"></a>00011 <span class="comment">and to permit persons to whom the Software is furnished to do so, </span>
 
39
<a name="l00012"></a>00012 <span class="comment">subject to the following conditions:</span>
 
40
<a name="l00013"></a>00013 <span class="comment"></span>
 
41
<a name="l00014"></a>00014 <span class="comment">The above copyright notice and this permission notice shall be included </span>
 
42
<a name="l00015"></a>00015 <span class="comment">in all copies or substantial portions of the Software.</span>
 
43
<a name="l00016"></a>00016 <span class="comment"></span>
 
44
<a name="l00017"></a>00017 <span class="comment">THE SOFTWARE IS PROVIDED &quot;AS IS&quot;, WITHOUT WARRANTY OF ANY KIND, </span>
 
45
<a name="l00018"></a>00018 <span class="comment">EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES </span>
 
46
<a name="l00019"></a>00019 <span class="comment">OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. </span>
 
47
<a name="l00020"></a>00020 <span class="comment">IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, </span>
 
48
<a name="l00021"></a>00021 <span class="comment">DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, </span>
 
49
<a name="l00022"></a>00022 <span class="comment">ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR </span>
 
50
<a name="l00023"></a>00023 <span class="comment">OTHER DEALINGS IN THE SOFTWARE.</span>
 
51
<a name="l00024"></a>00024 <span class="comment"></span>
 
52
<a name="l00025"></a>00025 <span class="comment">For more information please visit:  http://bmagic.sourceforge.net</span>
 
53
<a name="l00026"></a>00026 <span class="comment"></span>
 
54
<a name="l00027"></a>00027 <span class="comment">*/</span>
 
55
<a name="l00028"></a>00028 
 
56
<a name="l00029"></a>00029 
 
57
<a name="l00030"></a>00030 
 
58
<a name="l00031"></a>00031 <span class="comment">//    Header implements processor specific intrinsics declarations for SSE4.2</span>
 
59
<a name="l00032"></a>00032 <span class="comment">//    instruction set</span>
 
60
<a name="l00033"></a>00033 <span class="preprocessor">#include&lt;mmintrin.h&gt;</span>
 
61
<a name="l00034"></a>00034 <span class="preprocessor">#include&lt;emmintrin.h&gt;</span>
 
62
<a name="l00035"></a>00035 <span class="preprocessor">#include&lt;smmintrin.h&gt;</span>
 
63
<a name="l00036"></a>00036 
 
64
<a name="l00037"></a>00037 <span class="preprocessor">#include &quot;<a class="code" href="a00089.html">bmdef.h</a>&quot;</span>
 
65
<a name="l00038"></a>00038 <span class="preprocessor">#include &quot;<a class="code" href="a00095.html">bmsse_util.h</a>&quot;</span>
 
66
<a name="l00039"></a>00039 
 
67
<a name="l00040"></a>00040 <span class="keyword">namespace </span>bm
 
68
<a name="l00041"></a>00041 {
 
69
<a name="l00042"></a>00042 <span class="comment"></span>
 
70
<a name="l00043"></a>00043 <span class="comment">/** @defgroup SSE4 Processor specific optimizations for SSE4.2 instructions</span>
 
71
<a name="l00044"></a>00044 <span class="comment"> *  @ingroup bmagic</span>
 
72
<a name="l00045"></a>00045 <span class="comment"> */</span>
 
73
<a name="l00046"></a>00046 
 
74
<a name="l00047"></a>00047 
 
75
<a name="l00048"></a>00048 <span class="comment"></span>
 
76
<a name="l00049"></a>00049 <span class="comment">/*!</span>
 
77
<a name="l00050"></a>00050 <span class="comment">    SSE4.2 optimized bitcounting.</span>
 
78
<a name="l00051"></a>00051 <span class="comment">    @ingroup SSE4</span>
 
79
<a name="l00052"></a>00052 <span class="comment">*/</span>
 
80
<a name="l00053"></a>00053 <span class="keyword">inline</span> 
 
81
<a name="l00054"></a><a class="code" href="a00117.html#ga266ed6594dc786e46c1f353443dc41fe">00054</a> <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> <a class="code" href="a00117.html#ga266ed6594dc786e46c1f353443dc41fe">sse4_bit_count</a>(<span class="keyword">const</span> __m128i* block, <span class="keyword">const</span> __m128i* block_end)
 
82
<a name="l00055"></a>00055 {
 
83
<a name="l00056"></a>00056     <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> count = 0;
 
84
<a name="l00057"></a>00057     <span class="keywordflow">do</span>
 
85
<a name="l00058"></a>00058     {
 
86
<a name="l00059"></a>00059         <span class="keywordtype">unsigned</span>* b = (<span class="keywordtype">unsigned</span>*) block;
 
87
<a name="l00060"></a>00060         count += _mm_popcnt_u32(b[0]) +
 
88
<a name="l00061"></a>00061                  _mm_popcnt_u32(b[1]) +
 
89
<a name="l00062"></a>00062                  _mm_popcnt_u32(b[2]) +
 
90
<a name="l00063"></a>00063                  _mm_popcnt_u32(b[3]);
 
91
<a name="l00064"></a>00064     } <span class="keywordflow">while</span> (++block &lt; block_end);
 
92
<a name="l00065"></a>00065     <span class="keywordflow">return</span> count;
 
93
<a name="l00066"></a>00066 }
 
94
<a name="l00067"></a>00067 <span class="comment"></span>
 
95
<a name="l00068"></a>00068 <span class="comment">/*!</span>
 
96
<a name="l00069"></a>00069 <span class="comment">\internal</span>
 
97
<a name="l00070"></a>00070 <span class="comment">*/</span>
 
98
<a name="l00071"></a>00071 <a class="code" href="a00089.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a> 
 
99
<a name="l00072"></a><a class="code" href="a00110.html#a6d8f327a7d19c11125ac77383a34e986">00072</a> <span class="keywordtype">unsigned</span> <a class="code" href="a00110.html#a6d8f327a7d19c11125ac77383a34e986">op_xor</a>(<span class="keywordtype">unsigned</span> a, <span class="keywordtype">unsigned</span> b)
 
100
<a name="l00073"></a>00073 {
 
101
<a name="l00074"></a>00074     <span class="keywordtype">unsigned</span> ret = (a ^ b);
 
102
<a name="l00075"></a>00075     <span class="keywordflow">return</span> ret;
 
103
<a name="l00076"></a>00076 }
 
104
<a name="l00077"></a>00077 <span class="comment"></span>
 
105
<a name="l00078"></a>00078 <span class="comment">/*!</span>
 
106
<a name="l00079"></a>00079 <span class="comment">\internal</span>
 
107
<a name="l00080"></a>00080 <span class="comment">*/</span>
 
108
<a name="l00081"></a>00081 <a class="code" href="a00089.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a> 
 
109
<a name="l00082"></a><a class="code" href="a00110.html#a0ed9a1bcd143f7763b4b59ef442f6994">00082</a> <span class="keywordtype">unsigned</span> <a class="code" href="a00110.html#a0ed9a1bcd143f7763b4b59ef442f6994">op_or</a>(<span class="keywordtype">unsigned</span> a, <span class="keywordtype">unsigned</span> b)
 
110
<a name="l00083"></a>00083 {
 
111
<a name="l00084"></a>00084     <span class="keywordflow">return</span> (a | b);
 
112
<a name="l00085"></a>00085 }
 
113
<a name="l00086"></a>00086 <span class="comment"></span>
 
114
<a name="l00087"></a>00087 <span class="comment">/*!</span>
 
115
<a name="l00088"></a>00088 <span class="comment">\internal</span>
 
116
<a name="l00089"></a>00089 <span class="comment">*/</span>
 
117
<a name="l00090"></a>00090 <a class="code" href="a00089.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a> 
 
118
<a name="l00091"></a><a class="code" href="a00110.html#a515b2862b33efd4e36d72e4ba7729199">00091</a> <span class="keywordtype">unsigned</span> <a class="code" href="a00110.html#a515b2862b33efd4e36d72e4ba7729199">op_and</a>(<span class="keywordtype">unsigned</span> a, <span class="keywordtype">unsigned</span> b)
 
119
<a name="l00092"></a>00092 {
 
120
<a name="l00093"></a>00093     <span class="keywordflow">return</span> (a &amp; b);
 
121
<a name="l00094"></a>00094 }
 
122
<a name="l00095"></a>00095 
 
123
<a name="l00096"></a>00096 
 
124
<a name="l00097"></a>00097 <span class="keyword">template</span>&lt;<span class="keyword">class</span> Func&gt;
 
125
<a name="l00098"></a><a class="code" href="a00110.html#a295c6323a972a90e28c36e6012427a26">00098</a> <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> <a class="code" href="a00110.html#a295c6323a972a90e28c36e6012427a26">sse4_bit_count_op</a>(<span class="keyword">const</span> __m128i* BMRESTRICT block, 
 
126
<a name="l00099"></a>00099                            <span class="keyword">const</span> __m128i* BMRESTRICT block_end,
 
127
<a name="l00100"></a>00100                            <span class="keyword">const</span> __m128i* BMRESTRICT mask_block,
 
128
<a name="l00101"></a>00101                            Func sse2_func)
 
129
<a name="l00102"></a>00102 {
 
130
<a name="l00103"></a>00103     <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> count = 0;
 
131
<a name="l00104"></a>00104     <span class="keywordflow">do</span>
 
132
<a name="l00105"></a>00105     {
 
133
<a name="l00106"></a>00106         __m128i tmp0 = _mm_load_si128(block);
 
134
<a name="l00107"></a>00107         __m128i tmp1 = _mm_load_si128(mask_block);        
 
135
<a name="l00108"></a>00108         __m128i b = sse2_func(tmp0, tmp1);
 
136
<a name="l00109"></a>00109 
 
137
<a name="l00110"></a>00110         count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
 
138
<a name="l00111"></a>00111         count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
 
139
<a name="l00112"></a>00112         count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
 
140
<a name="l00113"></a>00113         count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
 
141
<a name="l00114"></a>00114 
 
142
<a name="l00115"></a>00115         ++block; ++mask_block;
 
143
<a name="l00116"></a>00116     } <span class="keywordflow">while</span> (block &lt; block_end);
 
144
<a name="l00117"></a>00117     <span class="keywordflow">return</span> count;
 
145
<a name="l00118"></a>00118 }
 
146
<a name="l00119"></a>00119 
 
147
<a name="l00120"></a>00120 <span class="keyword">template</span>&lt;<span class="keyword">class</span> Func&gt;
 
148
<a name="l00121"></a><a class="code" href="a00110.html#ab297c48a2b5057f32db00a8e8631d7a3">00121</a> <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> <a class="code" href="a00110.html#ab297c48a2b5057f32db00a8e8631d7a3">sse4_bit_count_op2</a>(<span class="keyword">const</span> __m128i* BMRESTRICT block, 
 
149
<a name="l00122"></a>00122                             <span class="keyword">const</span> __m128i* BMRESTRICT block_end,
 
150
<a name="l00123"></a>00123                             <span class="keyword">const</span> __m128i* BMRESTRICT mask_block,
 
151
<a name="l00124"></a>00124                            Func op_func)
 
152
<a name="l00125"></a>00125 {
 
153
<a name="l00126"></a>00126     <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> count = 0;
 
154
<a name="l00127"></a>00127     <span class="keywordflow">do</span>
 
155
<a name="l00128"></a>00128     {
 
156
<a name="l00129"></a>00129         <span class="keywordtype">unsigned</span> *r1 = (<span class="keywordtype">unsigned</span>*) block;
 
157
<a name="l00130"></a>00130         <span class="keywordtype">unsigned</span> *r2 = (<span class="keywordtype">unsigned</span>*) mask_block;
 
158
<a name="l00131"></a>00131 
 
159
<a name="l00132"></a>00132         count += _mm_popcnt_u32(op_func(r1[0], r2[0]));
 
160
<a name="l00133"></a>00133         count += _mm_popcnt_u32(op_func(r1[1], r2[1]));
 
161
<a name="l00134"></a>00134         count += _mm_popcnt_u32(op_func(r1[2], r2[2]));
 
162
<a name="l00135"></a>00135         count += _mm_popcnt_u32(op_func(r1[3], r2[3]));
 
163
<a name="l00136"></a>00136 
 
164
<a name="l00137"></a>00137         ++mask_block;
 
165
<a name="l00138"></a>00138 
 
166
<a name="l00139"></a>00139     } <span class="keywordflow">while</span> (++block &lt; block_end);
 
167
<a name="l00140"></a>00140     <span class="keywordflow">return</span> count;
 
168
<a name="l00141"></a>00141 
 
169
<a name="l00142"></a>00142 }
 
170
<a name="l00143"></a>00143 
 
171
<a name="l00144"></a>00144 
 
172
<a name="l00145"></a>00145 
 
173
<a name="l00146"></a><a class="code" href="a00094.html#a9b8caa31ab90cbfbc99b5728352e5456">00146</a> <span class="preprocessor">#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\</span>
 
174
<a name="l00147"></a>00147 <span class="preprocessor">    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)</span>
 
175
<a name="l00148"></a>00148 <span class="preprocessor"></span>
 
176
<a name="l00149"></a><a class="code" href="a00094.html#a7d425b68d8217dc37d33c660e39a61e1">00149</a> <span class="preprocessor">#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\</span>
 
177
<a name="l00150"></a>00150 <span class="preprocessor">    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)</span>
 
178
<a name="l00151"></a>00151 <span class="preprocessor"></span>
 
179
<a name="l00152"></a><a class="code" href="a00094.html#ae022e7bc509d97359a348393a6b25b99">00152</a> <span class="preprocessor">#define VECT_BITCOUNT(first, last) \</span>
 
180
<a name="l00153"></a>00153 <span class="preprocessor">    sse4_bit_count((__m128i*) (first), (__m128i*) (last)) </span>
 
181
<a name="l00154"></a>00154 <span class="preprocessor"></span>
 
182
<a name="l00155"></a><a class="code" href="a00094.html#a3459200618dab65142340f1abd760eb9">00155</a> <span class="preprocessor">#define VECT_BITCOUNT_AND(first, last, mask) \</span>
 
183
<a name="l00156"></a>00156 <span class="preprocessor">    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and) </span>
 
184
<a name="l00157"></a>00157 <span class="preprocessor"></span>
 
185
<a name="l00158"></a><a class="code" href="a00094.html#a07583d48db4ede7eadbc3156e9dd7d3b">00158</a> <span class="preprocessor">#define VECT_BITCOUNT_OR(first, last, mask) \</span>
 
186
<a name="l00159"></a>00159 <span class="preprocessor">    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or) </span>
 
187
<a name="l00160"></a>00160 <span class="preprocessor"></span>
 
188
<a name="l00161"></a><a class="code" href="a00094.html#a9954c72d227a3ca78df22a56efad6baf">00161</a> <span class="preprocessor">#define VECT_BITCOUNT_XOR(first, last, mask) \</span>
 
189
<a name="l00162"></a>00162 <span class="preprocessor">    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor) </span>
 
190
<a name="l00163"></a>00163 <span class="preprocessor"></span>
 
191
<a name="l00164"></a><a class="code" href="a00094.html#a98aaf230431767be4838323266d0b493">00164</a> <span class="preprocessor">#define VECT_BITCOUNT_SUB(first, last, mask) \</span>
 
192
<a name="l00165"></a>00165 <span class="preprocessor">    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub) </span>
 
193
<a name="l00166"></a>00166 <span class="preprocessor"></span>
 
194
<a name="l00167"></a><a class="code" href="a00094.html#aa31c7d6d7d4cd67e1cc86c08221446ce">00167</a> <span class="preprocessor">#define VECT_INVERT_ARR(first, last) \</span>
 
195
<a name="l00168"></a>00168 <span class="preprocessor">    sse2_invert_arr(first, last);</span>
 
196
<a name="l00169"></a>00169 <span class="preprocessor"></span>
 
197
<a name="l00170"></a><a class="code" href="a00094.html#aa6a3504bb5449abf6f5ec084b2ddfac6">00170</a> <span class="preprocessor">#define VECT_AND_ARR(dst, src, src_end) \</span>
 
198
<a name="l00171"></a>00171 <span class="preprocessor">    sse2_and_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
 
199
<a name="l00172"></a>00172 <span class="preprocessor"></span>
 
200
<a name="l00173"></a><a class="code" href="a00094.html#ae3cdeec275429ea0d60668da63e96df8">00173</a> <span class="preprocessor">#define VECT_OR_ARR(dst, src, src_end) \</span>
 
201
<a name="l00174"></a>00174 <span class="preprocessor">    sse2_or_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
 
202
<a name="l00175"></a>00175 <span class="preprocessor"></span>
 
203
<a name="l00176"></a><a class="code" href="a00094.html#af079345cbcb256bb6216c0446fd775f2">00176</a> <span class="preprocessor">#define VECT_SUB_ARR(dst, src, src_end) \</span>
 
204
<a name="l00177"></a>00177 <span class="preprocessor">    sse2_sub_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
 
205
<a name="l00178"></a>00178 <span class="preprocessor"></span>
 
206
<a name="l00179"></a><a class="code" href="a00094.html#a1f520ac49859f992de5c00f43b851179">00179</a> <span class="preprocessor">#define VECT_XOR_ARR(dst, src, src_end) \</span>
 
207
<a name="l00180"></a>00180 <span class="preprocessor">    sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
 
208
<a name="l00181"></a>00181 <span class="preprocessor"></span>
 
209
<a name="l00182"></a><a class="code" href="a00094.html#a72b713f259e0542169a1228bebadcede">00182</a> <span class="preprocessor">#define VECT_COPY_BLOCK(dst, src, src_end) \</span>
 
210
<a name="l00183"></a>00183 <span class="preprocessor">    sse2_copy_block((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
 
211
<a name="l00184"></a>00184 <span class="preprocessor"></span>
 
212
<a name="l00185"></a><a class="code" href="a00094.html#acefd3b9d39a2e2fb0c9d2aa5c3812e0b">00185</a> <span class="preprocessor">#define VECT_SET_BLOCK(dst, dst_end, value) \</span>
 
213
<a name="l00186"></a>00186 <span class="preprocessor">    sse2_set_block((__m128i*) dst, (__m128i*) (dst_end), (value))</span>
 
214
<a name="l00187"></a>00187 <span class="preprocessor"></span>
 
215
<a name="l00188"></a>00188 
 
216
<a name="l00189"></a>00189 
 
217
<a name="l00190"></a>00190 
 
218
<a name="l00191"></a>00191 <span class="comment"></span>
 
219
<a name="l00192"></a>00192 <span class="comment">/*!</span>
 
220
<a name="l00193"></a>00193 <span class="comment">    SSE4.2 optimized bitcounting and number of GAPs</span>
 
221
<a name="l00194"></a>00194 <span class="comment">    @ingroup SSE4</span>
 
222
<a name="l00195"></a>00195 <span class="comment">*/</span>
 
223
<a name="l00196"></a>00196 
 
224
<a name="l00197"></a>00197 
 
225
<a name="l00198"></a>00198 <span class="keyword">inline</span>
 
226
<a name="l00199"></a><a class="code" href="a00117.html#gad686391db4cdc6382ae6785f62981474">00199</a> <a class="code" href="a00110.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> <a class="code" href="a00117.html#gad686391db4cdc6382ae6785f62981474">sse4_bit_block_calc_count_change</a>(<span class="keyword">const</span> __m128i* BMRESTRICT block,
 
227
<a name="l00200"></a>00200                                           <span class="keyword">const</span> __m128i* BMRESTRICT block_end,
 
228
<a name="l00201"></a>00201                                                <span class="keywordtype">unsigned</span>* BMRESTRICT bit_count)
 
229
<a name="l00202"></a>00202 {
 
230
<a name="l00203"></a>00203 <span class="comment">//   __m128i mask1 = _mm_set_epi32(0x1, 0x1, 0x1, 0x1);</span>
 
231
<a name="l00204"></a>00204    <span class="keyword">register</span> <span class="keywordtype">int</span> count = (block_end - block)*4; 
 
232
<a name="l00205"></a>00205 
 
233
<a name="l00206"></a>00206    <span class="keyword">register</span> <a class="code" href="a00110.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a>  w0, w_prev;
 
234
<a name="l00207"></a>00207    <span class="keyword">const</span> <span class="keywordtype">int</span> w_shift = <span class="keyword">sizeof</span>(w0) * 8 - 1;
 
235
<a name="l00208"></a>00208    <span class="keywordtype">bool</span> first_word = <span class="keyword">true</span>;
 
236
<a name="l00209"></a>00209    *bit_count = 0;
 
237
<a name="l00210"></a>00210  
 
238
<a name="l00211"></a>00211    <span class="comment">// first word</span>
 
239
<a name="l00212"></a>00212    {
 
240
<a name="l00213"></a>00213        <a class="code" href="a00110.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a>  w;
 
241
<a name="l00214"></a>00214        <span class="keyword">const</span> <a class="code" href="a00110.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a>* blk = (<span class="keyword">const</span> <a class="code" href="a00110.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a>*) block;
 
242
<a name="l00215"></a>00215        w = w0 = blk[0];
 
243
<a name="l00216"></a>00216        *bit_count += _mm_popcnt_u32(w);
 
244
<a name="l00217"></a>00217        w ^= (w &gt;&gt; 1);
 
245
<a name="l00218"></a>00218        count += _mm_popcnt_u32(w);
 
246
<a name="l00219"></a>00219        count -= (w_prev = (w0 &gt;&gt; w_shift));
 
247
<a name="l00220"></a>00220    }
 
248
<a name="l00221"></a>00221 
 
249
<a name="l00222"></a>00222    <span class="keywordflow">do</span>
 
250
<a name="l00223"></a>00223    {
 
251
<a name="l00224"></a>00224        __m128i b = _mm_load_si128(block);
 
252
<a name="l00225"></a>00225        __m128i tmp2 = _mm_xor_si128(b, _mm_srli_epi32(b, 1)); <span class="comment">// tmp2=(b &gt;&gt; 1) ^ b;</span>
 
253
<a name="l00226"></a>00226        __m128i tmp3 = _mm_srli_epi32(b, w_shift); <span class="comment">// tmp3 = w0 &gt;&gt; w_shift</span>
 
254
<a name="l00227"></a>00227 <span class="comment">//       __m128i tmp4 = _mm_and_si128(b, mask1);    // tmp4 = w0 &amp; 1 </span>
 
255
<a name="l00228"></a>00228 
 
256
<a name="l00229"></a>00229        <span class="comment">// ---------------------------------------------------------------------</span>
 
257
<a name="l00230"></a>00230        {
 
258
<a name="l00231"></a>00231            <span class="keywordflow">if</span> (first_word)
 
259
<a name="l00232"></a>00232            {
 
260
<a name="l00233"></a>00233                first_word = <span class="keyword">false</span>;               
 
261
<a name="l00234"></a>00234            }
 
262
<a name="l00235"></a>00235            <span class="keywordflow">else</span>
 
263
<a name="l00236"></a>00236            {
 
264
<a name="l00237"></a>00237                w0 = _mm_extract_epi32(b, 0);
 
265
<a name="l00238"></a>00238                <span class="keywordflow">if</span> (w0)
 
266
<a name="l00239"></a>00239                {
 
267
<a name="l00240"></a>00240                    *bit_count += _mm_popcnt_u32(w0);
 
268
<a name="l00241"></a>00241                    count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 0));
 
269
<a name="l00242"></a>00242                    count -= !(w_prev ^ (w0 &amp; 1));
 
270
<a name="l00243"></a>00243                    count -= w_prev = _mm_extract_epi32(tmp3, 0);
 
271
<a name="l00244"></a>00244                }
 
272
<a name="l00245"></a>00245                <span class="keywordflow">else</span>
 
273
<a name="l00246"></a>00246                {
 
274
<a name="l00247"></a>00247                    count -= !w_prev; w_prev ^= w_prev;
 
275
<a name="l00248"></a>00248                }  
 
276
<a name="l00249"></a>00249            }
 
277
<a name="l00250"></a>00250            w0 = _mm_extract_epi32(b, 1);
 
278
<a name="l00251"></a>00251            <span class="keywordflow">if</span> (w0)
 
279
<a name="l00252"></a>00252            {
 
280
<a name="l00253"></a>00253                *bit_count += _mm_popcnt_u32(w0);
 
281
<a name="l00254"></a>00254                count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 1));
 
282
<a name="l00255"></a>00255                count -= !(w_prev ^ (w0 &amp; 1));
 
283
<a name="l00256"></a>00256                count -= w_prev = _mm_extract_epi32(tmp3, 1);                    
 
284
<a name="l00257"></a>00257            }
 
285
<a name="l00258"></a>00258            <span class="keywordflow">else</span>
 
286
<a name="l00259"></a>00259            {
 
287
<a name="l00260"></a>00260                count -= !w_prev; w_prev ^= w_prev;
 
288
<a name="l00261"></a>00261            }  
 
289
<a name="l00262"></a>00262            w0 = _mm_extract_epi32(b, 2);
 
290
<a name="l00263"></a>00263            <span class="keywordflow">if</span> (w0)
 
291
<a name="l00264"></a>00264            {
 
292
<a name="l00265"></a>00265                *bit_count += _mm_popcnt_u32(w0);
 
293
<a name="l00266"></a>00266                count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 2));
 
294
<a name="l00267"></a>00267                count -= !(w_prev ^ (w0 &amp; 1));
 
295
<a name="l00268"></a>00268                count -= w_prev = _mm_extract_epi32(tmp3, 2);                   
 
296
<a name="l00269"></a>00269            }
 
297
<a name="l00270"></a>00270            <span class="keywordflow">else</span>
 
298
<a name="l00271"></a>00271            {
 
299
<a name="l00272"></a>00272                count -= !w_prev; w_prev ^= w_prev;
 
300
<a name="l00273"></a>00273            }  
 
301
<a name="l00274"></a>00274            w0 = _mm_extract_epi32(b, 3);
 
302
<a name="l00275"></a>00275            <span class="keywordflow">if</span> (w0)
 
303
<a name="l00276"></a>00276            {
 
304
<a name="l00277"></a>00277                *bit_count += _mm_popcnt_u32(w0);
 
305
<a name="l00278"></a>00278                count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 3));
 
306
<a name="l00279"></a>00279                count -= !(w_prev ^ (w0 &amp; 1));
 
307
<a name="l00280"></a>00280                count -= w_prev = _mm_extract_epi32(tmp3, 3);                    
 
308
<a name="l00281"></a>00281            }
 
309
<a name="l00282"></a>00282            <span class="keywordflow">else</span>
 
310
<a name="l00283"></a>00283            {
 
311
<a name="l00284"></a>00284                count -= !w_prev; w_prev ^= w_prev;
 
312
<a name="l00285"></a>00285            }               
 
313
<a name="l00286"></a>00286        }
 
314
<a name="l00287"></a>00287    } <span class="keywordflow">while</span> (++block &lt; block_end);
 
315
<a name="l00288"></a>00288 
 
316
<a name="l00289"></a>00289    <span class="keywordflow">return</span> count;
 
317
<a name="l00290"></a>00290 }
 
318
<a name="l00291"></a>00291 
 
319
<a name="l00292"></a>00292 
 
320
<a name="l00293"></a>00293 
 
321
<a name="l00294"></a>00294 } <span class="comment">// namespace</span>
 
322
<a name="l00295"></a>00295 
 
323
<a name="l00296"></a>00296 
 
324
<a name="l00297"></a>00297 
 
325
<a name="l00298"></a>00298 
 
326
<a name="l00299"></a>00299 <span class="preprocessor">#endif</span>
 
327
</pre></div></div>
 
328
<hr size="1"/><address style="text-align: right;"><small>Generated on Sun Nov 22 10:49:35 2009 for BitMagic by&nbsp;
 
329
<a href="http://www.doxygen.org/index.html">
 
330
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.6.1 </small></address>
 
331
</body>
 
332
</html>