55
55
<a name="l00028"></a>00028
56
56
<a name="l00029"></a>00029
57
57
<a name="l00030"></a>00030
58
<a name="l00031"></a>00031 <span class="keyword">namespace </span>bm
59
<a name="l00032"></a>00032 {
60
<a name="l00033"></a>00033 <span class="comment"></span>
61
<a name="l00034"></a>00034 <span class="comment">/** @defgroup SSE2 Processor specific optimizations for SSE2 instructions</span>
62
<a name="l00035"></a>00035 <span class="comment"> * @ingroup bmagic</span>
63
<a name="l00036"></a>00036 <span class="comment"> */</span>
64
<a name="l00037"></a>00037
65
<a name="l00038"></a>00038 <span class="comment"></span>
66
<a name="l00039"></a>00039 <span class="comment">/*! </span>
67
<a name="l00040"></a>00040 <span class="comment"> @brief SSE2 reinitialization guard class</span>
68
<a name="l00041"></a>00041 <span class="comment"></span>
69
<a name="l00042"></a>00042 <span class="comment"> SSE2 requires calling _mm_empty() when intermixing</span>
70
<a name="l00043"></a>00043 <span class="comment"> MMX integer instructions with floating-point arithmetic.</span>
71
<a name="l00044"></a>00044 <span class="comment"> This class guards critical code fragments where SSE2 integer</span>
72
<a name="l00045"></a>00045 <span class="comment"> instructions are used.</span>
73
<a name="l00046"></a>00046 <span class="comment"></span>
74
<a name="l00047"></a>00047 <span class="comment"> @ingroup SSE2</span>
75
<a name="l00048"></a>00048 <span class="comment">*/</span>
76
<a name="l00049"></a><a class="code" href="a00084.html">00049</a> <span class="keyword">class </span><a class="code" href="a00084.html" title="SSE2 reinitialization guard class.">sse_empty_guard</a>
77
<a name="l00050"></a>00050 {
78
<a name="l00051"></a>00051 <span class="keyword">public</span>:
79
<a name="l00052"></a><a class="code" href="a00084.html#a231af2137d8bd3aefc374982804ace24">00052</a> <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a> <a class="code" href="a00084.html#a231af2137d8bd3aefc374982804ace24">sse_empty_guard</a>()
80
<a name="l00053"></a>00053 {
81
<a name="l00054"></a>00054 _mm_empty();
82
<a name="l00055"></a>00055 }
83
<a name="l00056"></a>00056
84
<a name="l00057"></a><a class="code" href="a00084.html#a5d197a685ce1f87a1cc01b047960377b">00057</a> <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a> <a class="code" href="a00084.html#a5d197a685ce1f87a1cc01b047960377b">~sse_empty_guard</a>()
85
<a name="l00058"></a>00058 {
86
<a name="l00059"></a>00059 _mm_empty();
87
<a name="l00060"></a>00060 }
88
<a name="l00061"></a>00061 };
89
<a name="l00062"></a>00062
90
<a name="l00063"></a>00063
91
<a name="l00064"></a>00064 <span class="comment"></span>
92
<a name="l00065"></a>00065 <span class="comment">/*! </span>
93
<a name="l00066"></a>00066 <span class="comment"> @brief XOR array elements to specified mask</span>
94
<a name="l00067"></a>00067 <span class="comment"> *dst = *src ^ mask</span>
95
<a name="l00068"></a>00068 <span class="comment"></span>
96
<a name="l00069"></a>00069 <span class="comment"> @ingroup SSE2</span>
97
<a name="l00070"></a>00070 <span class="comment">*/</span>
98
<a name="l00071"></a>00071 <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
99
<a name="l00072"></a><a class="code" href="a00122.html#ga75c6ddeb0d8a279caa92341878309b50">00072</a> <span class="keywordtype">void</span> <a class="code" href="a00122.html#ga75c6ddeb0d8a279caa92341878309b50" title="XOR array elements to specified mask dst = *src ^ mask.">sse2_xor_arr_2_mask</a>(__m128i* BMRESTRICT dst,
100
<a name="l00073"></a>00073 <span class="keyword">const</span> __m128i* BMRESTRICT src,
101
<a name="l00074"></a>00074 <span class="keyword">const</span> __m128i* BMRESTRICT src_end,
102
<a name="l00075"></a>00075 <a class="code" href="a00115.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a> mask)
103
<a name="l00076"></a>00076 {
104
<a name="l00077"></a>00077 __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
105
<a name="l00078"></a>00078 <span class="keywordflow">do</span>
106
<a name="l00079"></a>00079 {
107
<a name="l00080"></a>00080 __m128i xmm1 = _mm_load_si128(src);
108
<a name="l00081"></a>00081
109
<a name="l00082"></a>00082 xmm1 = _mm_xor_si128(xmm1, xmm2);
110
<a name="l00083"></a>00083 _mm_store_si128(dst, xmm1);
111
<a name="l00084"></a>00084 ++dst;
112
<a name="l00085"></a>00085 ++src;
113
<a name="l00086"></a>00086
114
<a name="l00087"></a>00087 } <span class="keywordflow">while</span> (src < src_end);
115
<a name="l00088"></a>00088 }
116
<a name="l00089"></a>00089
117
<a name="l00090"></a>00090 <span class="comment"></span>
118
<a name="l00091"></a>00091 <span class="comment">/*! </span>
119
<a name="l00092"></a>00092 <span class="comment"> @brief Inverts array elements and ANDs them with the specified mask</span>
120
<a name="l00093"></a>00093 <span class="comment"> *dst = ~*src & mask</span>
121
<a name="l00094"></a>00094 <span class="comment"></span>
122
<a name="l00095"></a>00095 <span class="comment"> @ingroup SSE2</span>
123
<a name="l00096"></a>00096 <span class="comment">*/</span>
124
<a name="l00097"></a>00097 <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
125
<a name="l00098"></a><a class="code" href="a00122.html#gab7b21f448684c4d84927792661e67ed5">00098</a> <span class="keywordtype">void</span> <a class="code" href="a00122.html#gab7b21f448684c4d84927792661e67ed5" title="Inverts array elements and ANDs them with the specified mask dst = ~*src &amp; mask.">sse2_andnot_arr_2_mask</a>(__m128i* BMRESTRICT dst,
126
<a name="l00099"></a>00099 <span class="keyword">const</span> __m128i* BMRESTRICT src,
127
<a name="l00100"></a>00100 <span class="keyword">const</span> __m128i* BMRESTRICT src_end,
128
<a name="l00101"></a>00101 <a class="code" href="a00115.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a> mask)
129
<a name="l00102"></a>00102 {
130
<a name="l00103"></a>00103 __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
131
<a name="l00104"></a>00104 <span class="keywordflow">do</span>
132
<a name="l00105"></a>00105 {
133
<a name="l00106"></a>00106 <span class="comment">//_mm_prefetch((const char*)(src)+1024, _MM_HINT_NTA);</span>
134
<a name="l00107"></a>00107 <span class="comment">//_mm_prefetch((const char*)(src)+1088, _MM_HINT_NTA);</span>
135
<a name="l00108"></a>00108
136
<a name="l00109"></a>00109 __m128i xmm1 = _mm_load_si128(src);
137
<a name="l00110"></a>00110
138
<a name="l00111"></a>00111 xmm1 = _mm_andnot_si128(xmm1, xmm2); <span class="comment">// xmm1 = (~xmm1) & xmm2 </span>
139
<a name="l00112"></a>00112 _mm_store_si128(dst, xmm1);
140
<a name="l00113"></a>00113 ++dst;
141
<a name="l00114"></a>00114 ++src;
142
<a name="l00115"></a>00115
143
<a name="l00116"></a>00116 } <span class="keywordflow">while</span> (src < src_end);
144
<a name="l00117"></a>00117 }
145
<a name="l00118"></a>00118 <span class="comment"></span>
146
<a name="l00119"></a>00119 <span class="comment">/*! </span>
147
<a name="l00120"></a>00120 <span class="comment"> @brief AND array elements against another array</span>
148
<a name="l00121"></a>00121 <span class="comment"> *dst &= *src</span>
149
<a name="l00122"></a>00122 <span class="comment"></span>
150
<a name="l00123"></a>00123 <span class="comment"> @ingroup SSE2</span>
151
<a name="l00124"></a>00124 <span class="comment">*/</span>
152
<a name="l00125"></a>00125 <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
153
<a name="l00126"></a><a class="code" href="a00122.html#ga795b544f311409a55da4ee61a3cd939a">00126</a> <span class="keywordtype">void</span> <a class="code" href="a00122.html#ga795b544f311409a55da4ee61a3cd939a" title="AND array elements against another array dst &amp;= *src.">sse2_and_arr</a>(__m128i* BMRESTRICT dst,
154
<a name="l00127"></a>00127 <span class="keyword">const</span> __m128i* BMRESTRICT src,
155
<a name="l00128"></a>00128 <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
156
<a name="l00129"></a>00129 {
157
<a name="l00130"></a>00130 __m128i xmm1, xmm2;
158
<a name="l00131"></a>00131 <span class="keywordflow">do</span>
159
<a name="l00132"></a>00132 {
160
<a name="l00133"></a>00133 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
161
<a name="l00134"></a>00134
162
<a name="l00135"></a>00135 xmm1 = _mm_load_si128(src++);
163
<a name="l00136"></a>00136 xmm2 = _mm_load_si128(dst);
164
<a name="l00137"></a>00137 xmm1 = _mm_and_si128(xmm1, xmm2);
165
<a name="l00138"></a>00138 _mm_store_si128(dst++, xmm1);
166
<a name="l00139"></a>00139
167
<a name="l00140"></a>00140 xmm1 = _mm_load_si128(src++);
168
<a name="l00141"></a>00141 xmm2 = _mm_load_si128(dst);
169
<a name="l00142"></a>00142 xmm1 = _mm_and_si128(xmm1, xmm2);
170
<a name="l00143"></a>00143 _mm_store_si128(dst++, xmm1);
171
<a name="l00144"></a>00144
172
<a name="l00145"></a>00145 xmm1 = _mm_load_si128(src++);
173
<a name="l00146"></a>00146 xmm2 = _mm_load_si128(dst);
174
<a name="l00147"></a>00147 xmm1 = _mm_and_si128(xmm1, xmm2);
175
<a name="l00148"></a>00148 _mm_store_si128(dst++, xmm1);
176
<a name="l00149"></a>00149
177
<a name="l00150"></a>00150 xmm1 = _mm_load_si128(src++);
178
<a name="l00151"></a>00151 xmm2 = _mm_load_si128(dst);
179
<a name="l00152"></a>00152 xmm1 = _mm_and_si128(xmm1, xmm2);
180
<a name="l00153"></a>00153 _mm_store_si128(dst++, xmm1);
181
<a name="l00154"></a>00154
182
<a name="l00155"></a>00155 } <span class="keywordflow">while</span> (src < src_end);
183
<a name="l00156"></a>00156
184
<a name="l00157"></a>00157 }
185
<a name="l00158"></a>00158
58
<a name="l00031"></a>00031 <span class="comment">// Header implements processor specific intrinsics declarations for SSE4.2</span>
59
<a name="l00032"></a>00032 <span class="comment">// instruction set</span>
60
<a name="l00033"></a>00033 <span class="preprocessor">#include<mmintrin.h></span>
61
<a name="l00034"></a>00034 <span class="preprocessor">#include<emmintrin.h></span>
62
<a name="l00035"></a>00035 <span class="preprocessor">#include<smmintrin.h></span>
63
<a name="l00036"></a>00036
64
<a name="l00037"></a>00037 <span class="preprocessor">#include "<a class="code" href="a00093.html">bmdef.h</a>"</span>
65
<a name="l00038"></a>00038 <span class="preprocessor">#include "<a class="code" href="a00100.html">bmsse_util.h</a>"</span>
66
<a name="l00039"></a>00039
67
<a name="l00040"></a>00040 <span class="keyword">namespace </span>bm
68
<a name="l00041"></a>00041 {
69
<a name="l00042"></a>00042 <span class="comment"></span>
70
<a name="l00043"></a>00043 <span class="comment">/** @defgroup SSE4 Processor specific optimizations for SSE4.2 instructions</span>
71
<a name="l00044"></a>00044 <span class="comment"> * @ingroup bmagic</span>
72
<a name="l00045"></a>00045 <span class="comment"> */</span>
73
<a name="l00046"></a>00046
74
<a name="l00047"></a>00047
75
<a name="l00048"></a>00048 <span class="comment"></span>
76
<a name="l00049"></a>00049 <span class="comment">/*!</span>
77
<a name="l00050"></a>00050 <span class="comment"> SSE4.2 optimized bit counting.</span>
78
<a name="l00051"></a>00051 <span class="comment"> @ingroup SSE4</span>
79
<a name="l00052"></a>00052 <span class="comment">*/</span>
80
<a name="l00053"></a>00053 <span class="keyword">inline</span>
81
<a name="l00054"></a><a class="code" href="a00124.html#ga266ed6594dc786e46c1f353443dc41fe">00054</a> <a class="code" href="a00116.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> <a class="code" href="a00124.html#ga266ed6594dc786e46c1f353443dc41fe">sse4_bit_count</a>(<span class="keyword">const</span> __m128i* block, <span class="keyword">const</span> __m128i* block_end)
82
<a name="l00055"></a>00055 {
83
<a name="l00056"></a>00056 <a class="code" href="a00116.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> count = 0;
84
<a name="l00057"></a>00057 <span class="preprocessor">#ifdef BM64_SSE4</span>
85
<a name="l00058"></a>00058 <span class="preprocessor"></span> <span class="keyword">const</span> <a class="code" href="a00116.html#a1f6a6dd108cd9e9f4fb284043ef518fe">bm::id64_t</a>* b = (<a class="code" href="a00116.html#a1f6a6dd108cd9e9f4fb284043ef518fe">bm::id64_t</a>*) block;
86
<a name="l00059"></a>00059 <span class="keyword">const</span> <a class="code" href="a00116.html#a1f6a6dd108cd9e9f4fb284043ef518fe">bm::id64_t</a>* b_end = (<a class="code" href="a00116.html#a1f6a6dd108cd9e9f4fb284043ef518fe">bm::id64_t</a>*) block_end;
87
<a name="l00060"></a>00060 <span class="keywordflow">do</span>
88
<a name="l00061"></a>00061 {
89
<a name="l00062"></a>00062 count += _mm_popcnt_u64(b[0]) +
90
<a name="l00063"></a>00063 _mm_popcnt_u64(b[1]);
91
<a name="l00064"></a>00064 b += 2;
92
<a name="l00065"></a>00065 } <span class="keywordflow">while</span> (b < b_end);
93
<a name="l00066"></a>00066 <span class="preprocessor">#else</span>
94
<a name="l00067"></a>00067 <span class="preprocessor"></span> <span class="keywordflow">do</span>
95
<a name="l00068"></a>00068 {
96
<a name="l00069"></a>00069 <span class="keyword">const</span> <span class="keywordtype">unsigned</span>* b = (<span class="keywordtype">unsigned</span>*) block;
97
<a name="l00070"></a>00070 count += _mm_popcnt_u32(b[0]) +
98
<a name="l00071"></a>00071 _mm_popcnt_u32(b[1]) +
99
<a name="l00072"></a>00072 _mm_popcnt_u32(b[2]) +
100
<a name="l00073"></a>00073 _mm_popcnt_u32(b[3]);
101
<a name="l00074"></a>00074 } <span class="keywordflow">while</span> (++block < block_end);
102
<a name="l00075"></a>00075 <span class="preprocessor">#endif </span>
103
<a name="l00076"></a>00076 <span class="preprocessor"></span> <span class="keywordflow">return</span> count;
104
<a name="l00077"></a>00077 }
105
<a name="l00078"></a>00078 <span class="comment"></span>
106
<a name="l00079"></a>00079 <span class="comment">/*!</span>
107
<a name="l00080"></a>00080 <span class="comment">\internal</span>
108
<a name="l00081"></a>00081 <span class="comment">*/</span>
109
<a name="l00082"></a>00082 <a class="code" href="a00093.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
110
<a name="l00083"></a><a class="code" href="a00116.html#a6d8f327a7d19c11125ac77383a34e986">00083</a> <span class="keywordtype">unsigned</span> <a class="code" href="a00116.html#a6d8f327a7d19c11125ac77383a34e986">op_xor</a>(<span class="keywordtype">unsigned</span> a, <span class="keywordtype">unsigned</span> b)
111
<a name="l00084"></a>00084 {
112
<a name="l00085"></a>00085 <span class="keywordtype">unsigned</span> ret = (a ^ b);
113
<a name="l00086"></a>00086 <span class="keywordflow">return</span> ret;
114
<a name="l00087"></a>00087 }
115
<a name="l00088"></a>00088 <span class="comment"></span>
116
<a name="l00089"></a>00089 <span class="comment">/*!</span>
117
<a name="l00090"></a>00090 <span class="comment">\internal</span>
118
<a name="l00091"></a>00091 <span class="comment">*/</span>
119
<a name="l00092"></a>00092 <a class="code" href="a00093.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
120
<a name="l00093"></a><a class="code" href="a00116.html#a0ed9a1bcd143f7763b4b59ef442f6994">00093</a> <span class="keywordtype">unsigned</span> <a class="code" href="a00116.html#a0ed9a1bcd143f7763b4b59ef442f6994">op_or</a>(<span class="keywordtype">unsigned</span> a, <span class="keywordtype">unsigned</span> b)
121
<a name="l00094"></a>00094 {
122
<a name="l00095"></a>00095 <span class="keywordflow">return</span> (a | b);
123
<a name="l00096"></a>00096 }
124
<a name="l00097"></a>00097 <span class="comment"></span>
125
<a name="l00098"></a>00098 <span class="comment">/*!</span>
126
<a name="l00099"></a>00099 <span class="comment">\internal</span>
127
<a name="l00100"></a>00100 <span class="comment">*/</span>
128
<a name="l00101"></a>00101 <a class="code" href="a00093.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
129
<a name="l00102"></a><a class="code" href="a00116.html#a515b2862b33efd4e36d72e4ba7729199">00102</a> <span class="keywordtype">unsigned</span> <a class="code" href="a00116.html#a515b2862b33efd4e36d72e4ba7729199">op_and</a>(<span class="keywordtype">unsigned</span> a, <span class="keywordtype">unsigned</span> b)
130
<a name="l00103"></a>00103 {
131
<a name="l00104"></a>00104 <span class="keywordflow">return</span> (a & b);
132
<a name="l00105"></a>00105 }
133
<a name="l00106"></a>00106
134
<a name="l00107"></a>00107
135
<a name="l00108"></a>00108 <span class="keyword">template</span><<span class="keyword">class</span> Func>
136
<a name="l00109"></a><a class="code" href="a00116.html#a295c6323a972a90e28c36e6012427a26">00109</a> <a class="code" href="a00116.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> <a class="code" href="a00116.html#a295c6323a972a90e28c36e6012427a26">sse4_bit_count_op</a>(<span class="keyword">const</span> __m128i* BMRESTRICT block,
137
<a name="l00110"></a>00110 <span class="keyword">const</span> __m128i* BMRESTRICT block_end,
138
<a name="l00111"></a>00111 <span class="keyword">const</span> __m128i* BMRESTRICT mask_block,
139
<a name="l00112"></a>00112 Func sse2_func)
140
<a name="l00113"></a>00113 {
141
<a name="l00114"></a>00114 <a class="code" href="a00116.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> count = 0;
142
<a name="l00115"></a>00115 <span class="preprocessor">#ifdef BM64_SSE4</span>
143
<a name="l00116"></a>00116 <span class="preprocessor"></span> <span class="keywordflow">do</span>
144
<a name="l00117"></a>00117 {
145
<a name="l00118"></a>00118 __m128i tmp0 = _mm_load_si128(block);
146
<a name="l00119"></a>00119 __m128i tmp1 = _mm_load_si128(mask_block);
147
<a name="l00120"></a>00120 __m128i b = sse2_func(tmp0, tmp1);
148
<a name="l00121"></a>00121
149
<a name="l00122"></a>00122 count += _mm_popcnt_u64(_mm_extract_epi64(b, 0));
150
<a name="l00123"></a>00123 count += _mm_popcnt_u64(_mm_extract_epi64(b, 1));
151
<a name="l00124"></a>00124
152
<a name="l00125"></a>00125 ++block; ++mask_block;
153
<a name="l00126"></a>00126 } <span class="keywordflow">while</span> (block < block_end);
154
<a name="l00127"></a>00127 <span class="preprocessor">#else </span>
155
<a name="l00128"></a>00128 <span class="preprocessor"></span> <span class="keywordflow">do</span>
156
<a name="l00129"></a>00129 {
157
<a name="l00130"></a>00130 __m128i tmp0 = _mm_load_si128(block);
158
<a name="l00131"></a>00131 __m128i tmp1 = _mm_load_si128(mask_block);
159
<a name="l00132"></a>00132 __m128i b = sse2_func(tmp0, tmp1);
160
<a name="l00133"></a>00133
161
<a name="l00134"></a>00134 count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
162
<a name="l00135"></a>00135 count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
163
<a name="l00136"></a>00136 count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
164
<a name="l00137"></a>00137 count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
165
<a name="l00138"></a>00138
166
<a name="l00139"></a>00139 ++block; ++mask_block;
167
<a name="l00140"></a>00140 } <span class="keywordflow">while</span> (block < block_end);
168
<a name="l00141"></a>00141 <span class="preprocessor">#endif</span>
169
<a name="l00142"></a>00142 <span class="preprocessor"></span>
170
<a name="l00143"></a>00143 <span class="keywordflow">return</span> count;
171
<a name="l00144"></a>00144 }
172
<a name="l00145"></a>00145
173
<a name="l00146"></a>00146 <span class="comment">/*</span>
174
<a name="l00147"></a>00147 <span class="comment">template<class Func></span>
175
<a name="l00148"></a>00148 <span class="comment">bm::id_t sse4_bit_count_op2(const __m128i* BMRESTRICT block, </span>
176
<a name="l00149"></a>00149 <span class="comment"> const __m128i* BMRESTRICT block_end,</span>
177
<a name="l00150"></a>00150 <span class="comment"> const __m128i* BMRESTRICT mask_block,</span>
178
<a name="l00151"></a>00151 <span class="comment"> Func op_func)</span>
179
<a name="l00152"></a>00152 <span class="comment">{</span>
180
<a name="l00153"></a>00153 <span class="comment"> bm::id_t count = 0;</span>
181
<a name="l00154"></a>00154 <span class="comment">#ifdef BM64_SSE4 </span>
182
<a name="l00155"></a>00155 <span class="comment"> do</span>
183
<a name="l00156"></a>00156 <span class="comment"> {</span>
184
<a name="l00157"></a>00157 <span class="comment"> unsigned *r1 = (unsigned*) block;</span>
185
<a name="l00158"></a>00158 <span class="comment"> unsigned *r2 = (unsigned*) mask_block;</span>
186
186
<a name="l00159"></a>00159 <span class="comment"></span>
187
<a name="l00160"></a>00160 <span class="comment">/*! </span>
188
<a name="l00161"></a>00161 <span class="comment"> @brief OR array elements against another array</span>
189
<a name="l00162"></a>00162 <span class="comment"> *dst |= *src</span>
190
<a name="l00163"></a>00163 <span class="comment"></span>
191
<a name="l00164"></a>00164 <span class="comment"> @ingroup SSE2</span>
192
<a name="l00165"></a>00165 <span class="comment">*/</span>
193
<a name="l00166"></a>00166 <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
194
<a name="l00167"></a><a class="code" href="a00122.html#ga3a7d61e4e8ad8791ab38fd1c3436aa67">00167</a> <span class="keywordtype">void</span> <a class="code" href="a00122.html#ga3a7d61e4e8ad8791ab38fd1c3436aa67" title="OR array elements against another array dst |= *src.">sse2_or_arr</a>(__m128i* BMRESTRICT dst,
195
<a name="l00168"></a>00168 <span class="keyword">const</span> __m128i* BMRESTRICT src,
196
<a name="l00169"></a>00169 <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
197
<a name="l00170"></a>00170 {
198
<a name="l00171"></a>00171 __m128i xmm1, xmm2;
199
<a name="l00172"></a>00172 <span class="keywordflow">do</span>
200
<a name="l00173"></a>00173 {
201
<a name="l00174"></a>00174 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
202
<a name="l00175"></a>00175
203
<a name="l00176"></a>00176 xmm1 = _mm_load_si128(src++);
204
<a name="l00177"></a>00177 xmm2 = _mm_load_si128(dst);
205
<a name="l00178"></a>00178 xmm1 = _mm_or_si128(xmm1, xmm2);
206
<a name="l00179"></a>00179 _mm_store_si128(dst++, xmm1);
207
<a name="l00180"></a>00180
208
<a name="l00181"></a>00181 xmm1 = _mm_load_si128(src++);
209
<a name="l00182"></a>00182 xmm2 = _mm_load_si128(dst);
210
<a name="l00183"></a>00183 xmm1 = _mm_or_si128(xmm1, xmm2);
211
<a name="l00184"></a>00184 _mm_store_si128(dst++, xmm1);
212
<a name="l00185"></a>00185
213
<a name="l00186"></a>00186 xmm1 = _mm_load_si128(src++);
214
<a name="l00187"></a>00187 xmm2 = _mm_load_si128(dst);
215
<a name="l00188"></a>00188 xmm1 = _mm_or_si128(xmm1, xmm2);
216
<a name="l00189"></a>00189 _mm_store_si128(dst++, xmm1);
217
<a name="l00190"></a>00190
218
<a name="l00191"></a>00191 xmm1 = _mm_load_si128(src++);
219
<a name="l00192"></a>00192 xmm2 = _mm_load_si128(dst);
220
<a name="l00193"></a>00193 xmm1 = _mm_or_si128(xmm1, xmm2);
221
<a name="l00194"></a>00194 _mm_store_si128(dst++, xmm1);
222
<a name="l00195"></a>00195
223
<a name="l00196"></a>00196 } <span class="keywordflow">while</span> (src < src_end);
224
<a name="l00197"></a>00197 }
225
<a name="l00198"></a>00198
226
<a name="l00199"></a>00199 <span class="comment"></span>
227
<a name="l00200"></a>00200 <span class="comment">/*! </span>
228
<a name="l00201"></a>00201 <span class="comment"> @brief XOR array elements against another array</span>
229
<a name="l00202"></a>00202 <span class="comment"> *dst ^= *src</span>
230
<a name="l00203"></a>00203 <span class="comment"></span>
231
<a name="l00204"></a>00204 <span class="comment"> @ingroup SSE2</span>
232
<a name="l00205"></a>00205 <span class="comment">*/</span>
233
<a name="l00206"></a>00206 <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
234
<a name="l00207"></a><a class="code" href="a00122.html#gaf1a5ad26557cc4d71d7421c35a8445fe">00207</a> <span class="keywordtype">void</span> <a class="code" href="a00122.html#gaf1a5ad26557cc4d71d7421c35a8445fe" title="XOR array elements against another array dst ^= *src.">sse2_xor_arr</a>(__m128i* BMRESTRICT dst,
235
<a name="l00208"></a>00208 <span class="keyword">const</span> __m128i* BMRESTRICT src,
236
<a name="l00209"></a>00209 <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
237
<a name="l00210"></a>00210 {
238
<a name="l00211"></a>00211 __m128i xmm1, xmm2;
239
<a name="l00212"></a>00212 <span class="keywordflow">do</span>
240
<a name="l00213"></a>00213 {
241
<a name="l00214"></a>00214 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
242
<a name="l00215"></a>00215
243
<a name="l00216"></a>00216 xmm1 = _mm_load_si128(src++);
244
<a name="l00217"></a>00217 xmm2 = _mm_load_si128(dst);
245
<a name="l00218"></a>00218 xmm1 = _mm_xor_si128(xmm1, xmm2);
246
<a name="l00219"></a>00219 _mm_store_si128(dst++, xmm1);
247
<a name="l00220"></a>00220
248
<a name="l00221"></a>00221 xmm1 = _mm_load_si128(src++);
249
<a name="l00222"></a>00222 xmm2 = _mm_load_si128(dst);
250
<a name="l00223"></a>00223 xmm1 = _mm_xor_si128(xmm1, xmm2);
251
<a name="l00224"></a>00224 _mm_store_si128(dst++, xmm1);
252
<a name="l00225"></a>00225
253
<a name="l00226"></a>00226 xmm1 = _mm_load_si128(src++);
254
<a name="l00227"></a>00227 xmm2 = _mm_load_si128(dst);
255
<a name="l00228"></a>00228 xmm1 = _mm_xor_si128(xmm1, xmm2);
256
<a name="l00229"></a>00229 _mm_store_si128(dst++, xmm1);
257
<a name="l00230"></a>00230
258
<a name="l00231"></a>00231 xmm1 = _mm_load_si128(src++);
259
<a name="l00232"></a>00232 xmm2 = _mm_load_si128(dst);
260
<a name="l00233"></a>00233 xmm1 = _mm_xor_si128(xmm1, xmm2);
261
<a name="l00234"></a>00234 _mm_store_si128(dst++, xmm1);
262
<a name="l00235"></a>00235
263
<a name="l00236"></a>00236 } <span class="keywordflow">while</span> (src < src_end);
264
<a name="l00237"></a>00237 }
265
<a name="l00238"></a>00238
187
<a name="l00160"></a>00160 <span class="comment"> count += _mm_popcnt_u32(op_func(r1[0], r2[0]));</span>
188
<a name="l00161"></a>00161 <span class="comment"> count += _mm_popcnt_u32(op_func(r1[1], r2[1]));</span>
189
<a name="l00162"></a>00162 <span class="comment"> count += _mm_popcnt_u32(op_func(r1[2], r2[2]));</span>
190
<a name="l00163"></a>00163 <span class="comment"> count += _mm_popcnt_u32(op_func(r1[3], r2[3]));</span>
191
<a name="l00164"></a>00164 <span class="comment"></span>
192
<a name="l00165"></a>00165 <span class="comment"> ++mask_block;</span>
193
<a name="l00166"></a>00166 <span class="comment"></span>
194
<a name="l00167"></a>00167 <span class="comment"> } while (++block < block_end);</span>
195
<a name="l00168"></a>00168 <span class="comment">#else</span>
196
<a name="l00169"></a>00169 <span class="comment"> do</span>
197
<a name="l00170"></a>00170 <span class="comment"> {</span>
198
<a name="l00171"></a>00171 <span class="comment"> unsigned *r1 = (unsigned*) block;</span>
199
<a name="l00172"></a>00172 <span class="comment"> unsigned *r2 = (unsigned*) mask_block;</span>
200
<a name="l00173"></a>00173 <span class="comment"></span>
201
<a name="l00174"></a>00174 <span class="comment"> count += _mm_popcnt_u32(op_func(r1[0], r2[0]));</span>
202
<a name="l00175"></a>00175 <span class="comment"> count += _mm_popcnt_u32(op_func(r1[1], r2[1]));</span>
203
<a name="l00176"></a>00176 <span class="comment"> count += _mm_popcnt_u32(op_func(r1[2], r2[2]));</span>
204
<a name="l00177"></a>00177 <span class="comment"> count += _mm_popcnt_u32(op_func(r1[3], r2[3]));</span>
205
<a name="l00178"></a>00178 <span class="comment"></span>
206
<a name="l00179"></a>00179 <span class="comment"> ++mask_block;</span>
207
<a name="l00180"></a>00180 <span class="comment"></span>
208
<a name="l00181"></a>00181 <span class="comment"> } while (++block < block_end);</span>
209
<a name="l00182"></a>00182 <span class="comment">#endif </span>
210
<a name="l00183"></a>00183 <span class="comment"> return count;</span>
211
<a name="l00184"></a>00184 <span class="comment"></span>
212
<a name="l00185"></a>00185 <span class="comment">}</span>
213
<a name="l00186"></a>00186 <span class="comment">*/</span>
214
<a name="l00187"></a>00187
215
<a name="l00188"></a>00188
216
<a name="l00189"></a><a class="code" href="a00099.html#a9b8caa31ab90cbfbc99b5728352e5456">00189</a> <span class="preprocessor">#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\</span>
217
<a name="l00190"></a>00190 <span class="preprocessor"> sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)</span>
218
<a name="l00191"></a>00191 <span class="preprocessor"></span>
219
<a name="l00192"></a><a class="code" href="a00099.html#a7d425b68d8217dc37d33c660e39a61e1">00192</a> <span class="preprocessor">#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\</span>
220
<a name="l00193"></a>00193 <span class="preprocessor"> sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)</span>
221
<a name="l00194"></a>00194 <span class="preprocessor"></span>
222
<a name="l00195"></a><a class="code" href="a00099.html#ae022e7bc509d97359a348393a6b25b99">00195</a> <span class="preprocessor">#define VECT_BITCOUNT(first, last) \</span>
223
<a name="l00196"></a>00196 <span class="preprocessor"> sse4_bit_count((__m128i*) (first), (__m128i*) (last)) </span>
224
<a name="l00197"></a>00197 <span class="preprocessor"></span>
225
<a name="l00198"></a><a class="code" href="a00099.html#a3459200618dab65142340f1abd760eb9">00198</a> <span class="preprocessor">#define VECT_BITCOUNT_AND(first, last, mask) \</span>
226
<a name="l00199"></a>00199 <span class="preprocessor"> sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and) </span>
227
<a name="l00200"></a>00200 <span class="preprocessor"></span>
228
<a name="l00201"></a><a class="code" href="a00099.html#a07583d48db4ede7eadbc3156e9dd7d3b">00201</a> <span class="preprocessor">#define VECT_BITCOUNT_OR(first, last, mask) \</span>
229
<a name="l00202"></a>00202 <span class="preprocessor"> sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or) </span>
230
<a name="l00203"></a>00203 <span class="preprocessor"></span>
231
<a name="l00204"></a><a class="code" href="a00099.html#a9954c72d227a3ca78df22a56efad6baf">00204</a> <span class="preprocessor">#define VECT_BITCOUNT_XOR(first, last, mask) \</span>
232
<a name="l00205"></a>00205 <span class="preprocessor"> sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor) </span>
233
<a name="l00206"></a>00206 <span class="preprocessor"></span>
234
<a name="l00207"></a><a class="code" href="a00099.html#a98aaf230431767be4838323266d0b493">00207</a> <span class="preprocessor">#define VECT_BITCOUNT_SUB(first, last, mask) \</span>
235
<a name="l00208"></a>00208 <span class="preprocessor"> sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub) </span>
236
<a name="l00209"></a>00209 <span class="preprocessor"></span>
237
<a name="l00210"></a><a class="code" href="a00099.html#aa31c7d6d7d4cd67e1cc86c08221446ce">00210</a> <span class="preprocessor">#define VECT_INVERT_ARR(first, last) \</span>
238
<a name="l00211"></a>00211 <span class="preprocessor"> sse2_invert_arr(first, last);</span>
239
<a name="l00212"></a>00212 <span class="preprocessor"></span>
240
<a name="l00213"></a><a class="code" href="a00099.html#aa6a3504bb5449abf6f5ec084b2ddfac6">00213</a> <span class="preprocessor">#define VECT_AND_ARR(dst, src, src_end) \</span>
241
<a name="l00214"></a>00214 <span class="preprocessor"> sse2_and_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
242
<a name="l00215"></a>00215 <span class="preprocessor"></span>
243
<a name="l00216"></a><a class="code" href="a00099.html#ae3cdeec275429ea0d60668da63e96df8">00216</a> <span class="preprocessor">#define VECT_OR_ARR(dst, src, src_end) \</span>
244
<a name="l00217"></a>00217 <span class="preprocessor"> sse2_or_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
245
<a name="l00218"></a>00218 <span class="preprocessor"></span>
246
<a name="l00219"></a><a class="code" href="a00099.html#af079345cbcb256bb6216c0446fd775f2">00219</a> <span class="preprocessor">#define VECT_SUB_ARR(dst, src, src_end) \</span>
247
<a name="l00220"></a>00220 <span class="preprocessor"> sse2_sub_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
248
<a name="l00221"></a>00221 <span class="preprocessor"></span>
249
<a name="l00222"></a><a class="code" href="a00099.html#a1f520ac49859f992de5c00f43b851179">00222</a> <span class="preprocessor">#define VECT_XOR_ARR(dst, src, src_end) \</span>
250
<a name="l00223"></a>00223 <span class="preprocessor"> sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
251
<a name="l00224"></a>00224 <span class="preprocessor"></span>
252
<a name="l00225"></a><a class="code" href="a00099.html#a72b713f259e0542169a1228bebadcede">00225</a> <span class="preprocessor">#define VECT_COPY_BLOCK(dst, src, src_end) \</span>
253
<a name="l00226"></a>00226 <span class="preprocessor"> sse2_copy_block((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))</span>
254
<a name="l00227"></a>00227 <span class="preprocessor"></span>
255
<a name="l00228"></a><a class="code" href="a00099.html#acefd3b9d39a2e2fb0c9d2aa5c3812e0b">00228</a> <span class="preprocessor">#define VECT_SET_BLOCK(dst, dst_end, value) \</span>
256
<a name="l00229"></a>00229 <span class="preprocessor"> sse2_set_block((__m128i*) dst, (__m128i*) (dst_end), (value))</span>
257
<a name="l00230"></a>00230 <span class="preprocessor"></span>
258
<a name="l00231"></a>00231
259
<a name="l00232"></a>00232
260
<a name="l00233"></a>00233
261
<a name="l00234"></a>00234 <span class="comment"></span>
262
<a name="l00235"></a>00235 <span class="comment">/*!</span>
263
<a name="l00236"></a>00236 <span class="comment"> SSE4.2 optimized calculation of the bit count and the number of GAPs</span>
264
<a name="l00237"></a>00237 <span class="comment"> @ingroup SSE4</span>
265
<a name="l00238"></a>00238 <span class="comment">*/</span>
266
266
<a name="l00239"></a>00239
267
<a name="l00240"></a>00240 <span class="comment"></span>
268
<a name="l00241"></a>00241 <span class="comment">/*! </span>
269
<a name="l00242"></a>00242 <span class="comment"> @brief AND-NOT (SUB) array elements against another array</span>
270
<a name="l00243"></a>00243 <span class="comment"> *dst &= ~*src</span>
271
<a name="l00244"></a>00244 <span class="comment"></span>
272
<a name="l00245"></a>00245 <span class="comment"> @ingroup SSE2</span>
273
<a name="l00246"></a>00246 <span class="comment">*/</span>
274
<a name="l00247"></a>00247 <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
275
<a name="l00248"></a><a class="code" href="a00122.html#gac99f3b138f8a5e8ffb1296b129f618f0">00248</a> <span class="keywordtype">void</span> <a class="code" href="a00122.html#gac99f3b138f8a5e8ffb1296b129f618f0" title="AND-NOT (SUB) array elements against another array dst &amp;= ~*src.">sse2_sub_arr</a>(__m128i* BMRESTRICT dst,
276
<a name="l00249"></a>00249 <span class="keyword">const</span> __m128i* BMRESTRICT src,
277
<a name="l00250"></a>00250 <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
278
<a name="l00251"></a>00251 {
279
<a name="l00252"></a>00252 __m128i xmm1, xmm2;
280
<a name="l00253"></a>00253 <span class="keywordflow">do</span>
281
<a name="l00254"></a>00254 {
282
<a name="l00255"></a>00255 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
283
<a name="l00256"></a>00256
284
<a name="l00257"></a>00257 xmm1 = _mm_load_si128(src++);
285
<a name="l00258"></a>00258 xmm2 = _mm_load_si128(dst);
286
<a name="l00259"></a>00259 xmm1 = _mm_andnot_si128(xmm1, xmm2);
287
<a name="l00260"></a>00260 _mm_store_si128(dst++, xmm1);
288
<a name="l00261"></a>00261
289
<a name="l00262"></a>00262 xmm1 = _mm_load_si128(src++);
290
<a name="l00263"></a>00263 xmm2 = _mm_load_si128(dst);
291
<a name="l00264"></a>00264 xmm1 = _mm_andnot_si128(xmm1, xmm2);
292
<a name="l00265"></a>00265 _mm_store_si128(dst++, xmm1);
293
<a name="l00266"></a>00266
294
<a name="l00267"></a>00267 xmm1 = _mm_load_si128(src++);
295
<a name="l00268"></a>00268 xmm2 = _mm_load_si128(dst);
296
<a name="l00269"></a>00269 xmm1 = _mm_andnot_si128(xmm1, xmm2);
297
<a name="l00270"></a>00270 _mm_store_si128(dst++, xmm1);
267
<a name="l00240"></a>00240
268
<a name="l00241"></a>00241 <span class="keyword">inline</span>
269
<a name="l00242"></a><a class="code" href="a00124.html#gad686391db4cdc6382ae6785f62981474">00242</a> <a class="code" href="a00116.html#aa3824d882a037396370b16f2f0a8bf37">bm::id_t</a> <a class="code" href="a00124.html#gad686391db4cdc6382ae6785f62981474">sse4_bit_block_calc_count_change</a>(<span class="keyword">const</span> __m128i* BMRESTRICT block,
270
<a name="l00243"></a>00243 <span class="keyword">const</span> __m128i* BMRESTRICT block_end,
271
<a name="l00244"></a>00244 <span class="keywordtype">unsigned</span>* BMRESTRICT bit_count)
272
<a name="l00245"></a>00245 {
273
<a name="l00246"></a>00246 <span class="comment">// __m128i mask1 = _mm_set_epi32(0x1, 0x1, 0x1, 0x1);</span>
274
<a name="l00247"></a>00247 <span class="keyword">register</span> <span class="keywordtype">int</span> count = (block_end - block)*4;
275
<a name="l00248"></a>00248
276
<a name="l00249"></a>00249 <span class="keyword">register</span> <a class="code" href="a00116.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a> w0, w_prev;
277
<a name="l00250"></a>00250 <span class="keyword">const</span> <span class="keywordtype">int</span> w_shift = <span class="keyword">sizeof</span>(w0) * 8 - 1;
278
<a name="l00251"></a>00251 <span class="keywordtype">bool</span> first_word = <span class="keyword">true</span>;
279
<a name="l00252"></a>00252 *bit_count = 0;
280
<a name="l00253"></a>00253
281
<a name="l00254"></a>00254 <span class="comment">// first word</span>
282
<a name="l00255"></a>00255 {
283
<a name="l00256"></a>00256 <a class="code" href="a00116.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a> w;
284
<a name="l00257"></a>00257 <span class="keyword">const</span> <a class="code" href="a00116.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a>* blk = (<span class="keyword">const</span> <a class="code" href="a00116.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a>*) block;
285
<a name="l00258"></a>00258 w = w0 = blk[0];
286
<a name="l00259"></a>00259 *bit_count += _mm_popcnt_u32(w);
287
<a name="l00260"></a>00260 w ^= (w >> 1);
288
<a name="l00261"></a>00261 count += _mm_popcnt_u32(w);
289
<a name="l00262"></a>00262 count -= (w_prev = (w0 >> w_shift));
290
<a name="l00263"></a>00263 }
291
<a name="l00264"></a>00264
292
<a name="l00265"></a>00265 <span class="keywordflow">do</span>
293
<a name="l00266"></a>00266 {
294
<a name="l00267"></a>00267 __m128i b = _mm_load_si128(block);
295
<a name="l00268"></a>00268 __m128i tmp2 = _mm_xor_si128(b, _mm_srli_epi32(b, 1)); <span class="comment">// tmp2=(b >> 1) ^ b;</span>
296
<a name="l00269"></a>00269 __m128i tmp3 = _mm_srli_epi32(b, w_shift); <span class="comment">// tmp3 = w0 >> w_shift</span>
297
<a name="l00270"></a>00270 <span class="comment">// __m128i tmp4 = _mm_and_si128(b, mask1); // tmp4 = w0 & 1 </span>
298
298
<a name="l00271"></a>00271
299
<a name="l00272"></a>00272 xmm1 = _mm_load_si128(src++);
300
<a name="l00273"></a>00273 xmm2 = _mm_load_si128(dst);
301
<a name="l00274"></a>00274 xmm1 = _mm_andnot_si128(xmm1, xmm2);
302
<a name="l00275"></a>00275 _mm_store_si128(dst++, xmm1);
303
<a name="l00276"></a>00276
304
<a name="l00277"></a>00277 } <span class="keywordflow">while</span> (src < src_end);
305
<a name="l00278"></a>00278 }
306
<a name="l00279"></a>00279 <span class="comment"></span>
307
<a name="l00280"></a>00280 <span class="comment">/*! </span>
308
<a name="l00281"></a>00281 <span class="comment"> @brief SSE2 block memset</span>
309
<a name="l00282"></a>00282 <span class="comment"> *dst = value</span>
310
<a name="l00283"></a>00283 <span class="comment"></span>
311
<a name="l00284"></a>00284 <span class="comment"> @ingroup SSE2</span>
312
<a name="l00285"></a>00285 <span class="comment">*/</span>
313
<a name="l00286"></a>00286
314
<a name="l00287"></a>00287 <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
315
<a name="l00288"></a><a class="code" href="a00122.html#ga302f4fcd0abf355957b305d16d04f452">00288</a> <span class="keywordtype">void</span> <a class="code" href="a00122.html#ga302f4fcd0abf355957b305d16d04f452" title="SSE2 block memset dst = value.">sse2_set_block</a>(__m128i* BMRESTRICT dst,
316
<a name="l00289"></a>00289 __m128i* BMRESTRICT dst_end,
317
<a name="l00290"></a>00290 <a class="code" href="a00115.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a> value)
318
<a name="l00291"></a>00291 {
319
<a name="l00292"></a>00292 __m128i xmm0 = _mm_set_epi32 (value, value, value, value);
320
<a name="l00293"></a>00293 <span class="keywordflow">do</span>
321
<a name="l00294"></a>00294 {
322
<a name="l00295"></a>00295 _mm_store_si128(dst, xmm0);
323
<a name="l00296"></a>00296 <span class="comment">/* </span>
324
<a name="l00297"></a>00297 <span class="comment"> _mm_store_si128(dst+1, xmm0);</span>
325
<a name="l00298"></a>00298 <span class="comment"> _mm_store_si128(dst+2, xmm0);</span>
326
<a name="l00299"></a>00299 <span class="comment"> _mm_store_si128(dst+3, xmm0);</span>
327
<a name="l00300"></a>00300 <span class="comment"></span>
328
<a name="l00301"></a>00301 <span class="comment"> _mm_store_si128(dst+4, xmm0);</span>
329
<a name="l00302"></a>00302 <span class="comment"> _mm_store_si128(dst+5, xmm0);</span>
330
<a name="l00303"></a>00303 <span class="comment"> _mm_store_si128(dst+6, xmm0);</span>
331
<a name="l00304"></a>00304 <span class="comment"> _mm_store_si128(dst+7, xmm0);</span>
332
<a name="l00305"></a>00305 <span class="comment"></span>
333
<a name="l00306"></a>00306 <span class="comment"> dst += 8;</span>
334
<a name="l00307"></a>00307 <span class="comment">*/</span>
335
<a name="l00308"></a>00308 } <span class="keywordflow">while</span> (++dst < dst_end);
336
<a name="l00309"></a>00309
337
<a name="l00310"></a>00310 _mm_sfence();
338
<a name="l00311"></a>00311 }
339
<a name="l00312"></a>00312
340
<a name="l00313"></a>00313
341
<a name="l00314"></a>00314 <span class="comment"></span>
342
<a name="l00315"></a>00315 <span class="comment">/*! </span>
343
<a name="l00316"></a>00316 <span class="comment"> @brief SSE2 block copy</span>
344
<a name="l00317"></a>00317 <span class="comment"> *dst = *src</span>
345
<a name="l00318"></a>00318 <span class="comment"></span>
346
<a name="l00319"></a>00319 <span class="comment"> @ingroup SSE2</span>
347
<a name="l00320"></a>00320 <span class="comment">*/</span>
348
<a name="l00321"></a>00321 <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
349
<a name="l00322"></a><a class="code" href="a00122.html#ga571dd54af5c555cad9dfa6bef4561777">00322</a> <span class="keywordtype">void</span> <a class="code" href="a00122.html#ga571dd54af5c555cad9dfa6bef4561777" title="SSE2 block copy dst = *src.">sse2_copy_block</a>(__m128i* BMRESTRICT dst,
350
<a name="l00323"></a>00323 <span class="keyword">const</span> __m128i* BMRESTRICT src,
351
<a name="l00324"></a>00324 <span class="keyword">const</span> __m128i* BMRESTRICT src_end)
352
<a name="l00325"></a>00325 {
353
<a name="l00326"></a>00326 __m128i xmm0, xmm1, xmm2, xmm3;
354
<a name="l00327"></a>00327 <span class="keywordflow">do</span>
355
<a name="l00328"></a>00328 {
356
<a name="l00329"></a>00329 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(src)+512, _MM_HINT_NTA);
357
<a name="l00330"></a>00330
358
<a name="l00331"></a>00331 xmm0 = _mm_load_si128(src+0);
359
<a name="l00332"></a>00332 xmm1 = _mm_load_si128(src+1);
360
<a name="l00333"></a>00333 xmm2 = _mm_load_si128(src+2);
361
<a name="l00334"></a>00334 xmm3 = _mm_load_si128(src+3);
362
<a name="l00335"></a>00335
363
<a name="l00336"></a>00336 _mm_store_si128(dst+0, xmm0);
364
<a name="l00337"></a>00337 _mm_store_si128(dst+1, xmm1);
365
<a name="l00338"></a>00338 _mm_store_si128(dst+2, xmm2);
366
<a name="l00339"></a>00339 _mm_store_si128(dst+3, xmm3);
367
<a name="l00340"></a>00340
368
<a name="l00341"></a>00341 xmm0 = _mm_load_si128(src+4);
369
<a name="l00342"></a>00342 xmm1 = _mm_load_si128(src+5);
370
<a name="l00343"></a>00343 xmm2 = _mm_load_si128(src+6);
371
<a name="l00344"></a>00344 xmm3 = _mm_load_si128(src+7);
372
<a name="l00345"></a>00345
373
<a name="l00346"></a>00346 _mm_store_si128(dst+4, xmm0);
374
<a name="l00347"></a>00347 _mm_store_si128(dst+5, xmm1);
375
<a name="l00348"></a>00348 _mm_store_si128(dst+6, xmm2);
376
<a name="l00349"></a>00349 _mm_store_si128(dst+7, xmm3);
377
<a name="l00350"></a>00350
378
<a name="l00351"></a>00351 src += 8;
379
<a name="l00352"></a>00352 dst += 8;
380
<a name="l00353"></a>00353
381
<a name="l00354"></a>00354 } <span class="keywordflow">while</span> (src < src_end);
382
<a name="l00355"></a>00355 }
383
<a name="l00356"></a>00356 <span class="comment"></span>
384
<a name="l00357"></a>00357 <span class="comment">/*! </span>
385
<a name="l00358"></a>00358 <span class="comment"> @brief Invert array elements</span>
386
<a name="l00359"></a>00359 <span class="comment"> *dst = ~*dst</span>
387
<a name="l00360"></a>00360 <span class="comment"> or</span>
388
<a name="l00361"></a>00361 <span class="comment"> *dst ^= 0xFFFFFFFF (XOR with an all-ones mask)</span>
389
<a name="l00362"></a>00362 <span class="comment"></span>
390
<a name="l00363"></a>00363 <span class="comment"> @ingroup SSE2</span>
391
<a name="l00364"></a>00364 <span class="comment">*/</span>
392
<a name="l00365"></a>00365 <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
393
<a name="l00366"></a><a class="code" href="a00122.html#ga8d506147673d88005f92caee7f5dd23a">00366</a> <span class="keywordtype">void</span> <a class="code" href="a00122.html#ga8d506147673d88005f92caee7f5dd23a" title="Invert array elements dst = ~*dst or dst ^= *dst.">sse2_invert_arr</a>(<a class="code" href="a00115.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a>* first, <a class="code" href="a00115.html#a17fd5ba52db3ddda05e6f8dd5000a1a4">bm::word_t</a>* last)
394
<a name="l00367"></a>00367 {
395
<a name="l00368"></a>00368 __m128i xmm1 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF,
396
<a name="l00369"></a>00369 0xFFFFFFFF, 0xFFFFFFFF);
397
<a name="l00370"></a>00370 __m128i* wrd_ptr = (__m128i*)first;
398
<a name="l00371"></a>00371
399
<a name="l00372"></a>00372 <span class="keywordflow">do</span>
400
<a name="l00373"></a>00373 {
401
<a name="l00374"></a>00374 _mm_prefetch((<span class="keyword">const</span> <span class="keywordtype">char</span>*)(wrd_ptr)+512, _MM_HINT_NTA);
402
<a name="l00375"></a>00375
403
<a name="l00376"></a>00376 __m128i xmm0 = _mm_load_si128(wrd_ptr);
404
<a name="l00377"></a>00377 xmm0 = _mm_xor_si128(xmm0, xmm1);
405
<a name="l00378"></a>00378 _mm_store_si128(wrd_ptr, xmm0);
406
<a name="l00379"></a>00379 ++wrd_ptr;
407
<a name="l00380"></a>00380 } <span class="keywordflow">while</span> (wrd_ptr < (__m128i*)last);
408
<a name="l00381"></a>00381 }
409
<a name="l00382"></a>00382
410
<a name="l00383"></a>00383 <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
411
<a name="l00384"></a><a class="code" href="a00115.html#ac0c75fb7b3dc61602843ac4e1b9b7ef5">00384</a> __m128i <a class="code" href="a00115.html#ac0c75fb7b3dc61602843ac4e1b9b7ef5">sse2_and</a>(__m128i a, __m128i b)
412
<a name="l00385"></a>00385 {
413
<a name="l00386"></a>00386 <span class="keywordflow">return</span> _mm_and_si128(a, b);
414
<a name="l00387"></a>00387 }
415
<a name="l00388"></a>00388
416
<a name="l00389"></a>00389 <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
417
<a name="l00390"></a><a class="code" href="a00115.html#adea798a9a95a04845c33876087a2f46b">00390</a> __m128i <a class="code" href="a00115.html#adea798a9a95a04845c33876087a2f46b">sse2_or</a>(__m128i a, __m128i b)
418
<a name="l00391"></a>00391 {
419
<a name="l00392"></a>00392 <span class="keywordflow">return</span> _mm_or_si128(a, b);
420
<a name="l00393"></a>00393 }
421
<a name="l00394"></a>00394
422
<a name="l00395"></a>00395
423
<a name="l00396"></a>00396 <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
424
<a name="l00397"></a><a class="code" href="a00115.html#a6f5de19ee3e1be05037908b4777c4da8">00397</a> __m128i <a class="code" href="a00115.html#a6f5de19ee3e1be05037908b4777c4da8">sse2_xor</a>(__m128i a, __m128i b)
425
<a name="l00398"></a>00398 {
426
<a name="l00399"></a>00399 <span class="keywordflow">return</span> _mm_xor_si128(a, b);
427
<a name="l00400"></a>00400 }
428
<a name="l00401"></a>00401
429
<a name="l00402"></a>00402 <a class="code" href="a00092.html#a938734d014fb68dd8b2251fe8ec2b025">BMFORCEINLINE</a>
430
<a name="l00403"></a><a class="code" href="a00115.html#ab3e6d46fcba1bc2a1a5390c10f571382">00403</a> __m128i <a class="code" href="a00115.html#ab3e6d46fcba1bc2a1a5390c10f571382">sse2_sub</a>(__m128i a, __m128i b)
431
<a name="l00404"></a>00404 {
432
<a name="l00405"></a>00405 <span class="keywordflow">return</span> _mm_andnot_si128(b, a);
433
<a name="l00406"></a>00406 }
434
<a name="l00407"></a>00407
435
<a name="l00408"></a>00408
436
<a name="l00409"></a>00409
437
<a name="l00410"></a>00410 } <span class="comment">// namespace</span>
438
<a name="l00411"></a>00411
439
<a name="l00412"></a>00412
440
<a name="l00413"></a>00413
441
<a name="l00414"></a>00414 <span class="preprocessor">#endif</span>
299
<a name="l00272"></a>00272 <span class="comment">// ---------------------------------------------------------------------</span>
300
<a name="l00273"></a>00273 {
301
<a name="l00274"></a>00274 <span class="keywordflow">if</span> (first_word)
302
<a name="l00275"></a>00275 {
303
<a name="l00276"></a>00276 first_word = <span class="keyword">false</span>;
304
<a name="l00277"></a>00277 }
305
<a name="l00278"></a>00278 <span class="keywordflow">else</span>
306
<a name="l00279"></a>00279 {
307
<a name="l00280"></a>00280 w0 = _mm_extract_epi32(b, 0);
308
<a name="l00281"></a>00281 <span class="keywordflow">if</span> (w0)
309
<a name="l00282"></a>00282 {
310
<a name="l00283"></a>00283 *bit_count += _mm_popcnt_u32(w0);
311
<a name="l00284"></a>00284 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 0));
312
<a name="l00285"></a>00285 count -= !(w_prev ^ (w0 & 1));
313
<a name="l00286"></a>00286 count -= w_prev = _mm_extract_epi32(tmp3, 0);
314
<a name="l00287"></a>00287 }
315
<a name="l00288"></a>00288 <span class="keywordflow">else</span>
316
<a name="l00289"></a>00289 {
317
<a name="l00290"></a>00290 count -= !w_prev; w_prev ^= w_prev;
318
<a name="l00291"></a>00291 }
319
<a name="l00292"></a>00292 }
320
<a name="l00293"></a>00293 w0 = _mm_extract_epi32(b, 1);
321
<a name="l00294"></a>00294 <span class="keywordflow">if</span> (w0)
322
<a name="l00295"></a>00295 {
323
<a name="l00296"></a>00296 *bit_count += _mm_popcnt_u32(w0);
324
<a name="l00297"></a>00297 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 1));
325
<a name="l00298"></a>00298 count -= !(w_prev ^ (w0 & 1));
326
<a name="l00299"></a>00299 count -= w_prev = _mm_extract_epi32(tmp3, 1);
327
<a name="l00300"></a>00300 }
328
<a name="l00301"></a>00301 <span class="keywordflow">else</span>
329
<a name="l00302"></a>00302 {
330
<a name="l00303"></a>00303 count -= !w_prev; w_prev ^= w_prev;
331
<a name="l00304"></a>00304 }
332
<a name="l00305"></a>00305 w0 = _mm_extract_epi32(b, 2);
333
<a name="l00306"></a>00306 <span class="keywordflow">if</span> (w0)
334
<a name="l00307"></a>00307 {
335
<a name="l00308"></a>00308 *bit_count += _mm_popcnt_u32(w0);
336
<a name="l00309"></a>00309 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 2));
337
<a name="l00310"></a>00310 count -= !(w_prev ^ (w0 & 1));
338
<a name="l00311"></a>00311 count -= w_prev = _mm_extract_epi32(tmp3, 2);
339
<a name="l00312"></a>00312 }
340
<a name="l00313"></a>00313 <span class="keywordflow">else</span>
341
<a name="l00314"></a>00314 {
342
<a name="l00315"></a>00315 count -= !w_prev; w_prev ^= w_prev;
343
<a name="l00316"></a>00316 }
344
<a name="l00317"></a>00317 w0 = _mm_extract_epi32(b, 3);
345
<a name="l00318"></a>00318 <span class="keywordflow">if</span> (w0)
346
<a name="l00319"></a>00319 {
347
<a name="l00320"></a>00320 *bit_count += _mm_popcnt_u32(w0);
348
<a name="l00321"></a>00321 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 3));
349
<a name="l00322"></a>00322 count -= !(w_prev ^ (w0 & 1));
350
<a name="l00323"></a>00323 count -= w_prev = _mm_extract_epi32(tmp3, 3);
351
<a name="l00324"></a>00324 }
352
<a name="l00325"></a>00325 <span class="keywordflow">else</span>
353
<a name="l00326"></a>00326 {
354
<a name="l00327"></a>00327 count -= !w_prev; w_prev ^= w_prev;
355
<a name="l00328"></a>00328 }
356
<a name="l00329"></a>00329 }
357
<a name="l00330"></a>00330 } <span class="keywordflow">while</span> (++block < block_end);
358
<a name="l00331"></a>00331
359
<a name="l00332"></a>00332 <span class="keywordflow">return</span> count;
360
<a name="l00333"></a>00333 }
361
<a name="l00334"></a>00334
362
<a name="l00335"></a>00335
363
<a name="l00336"></a>00336
364
<a name="l00337"></a>00337 } <span class="comment">// namespace</span>
365
<a name="l00338"></a>00338
366
<a name="l00339"></a>00339
367
<a name="l00340"></a>00340
368
<a name="l00341"></a>00341
369
<a name="l00342"></a>00342 <span class="preprocessor">#endif</span>
442
370
</pre></div></div>
443
<hr size="1"/><address style="text-align: right;"><small>Generated on Fri Jan 8 20:09:19 2010 for BitMagic by
371
<hr size="1"/><address style="text-align: right;"><small>Generated on Tue Mar 23 20:47:55 2010 for BitMagic by
444
372
<a href="http://www.doxygen.org/index.html">
445
373
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.6.1 </small></address>