/*
Copyright(c) 2002-2005 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information please visit: http://bmagic.sourceforge.net
*/
#ifndef BMSSE2__H__INCLUDED__
#define BMSSE2__H__INCLUDED__

// Header implements processor-specific intrinsics declarations for the
// SSE2 instruction set.

#include <mmintrin.h>
#include <emmintrin.h>

namespace bm
{
/** @defgroup SSE2 Processor specific optimizations for SSE2 instructions
 */

/*!
    @brief SSE2 reinitialization guard class

    SSE2 requires a call to _mm_empty() when intermixing
    MMX integer commands with floating-point arithmetic.
    This class guards critical code fragments where SSE2 integer
    commands are used.

    @ingroup SSE2
*/
class sse2_empty_guard
{
public:
    BMFORCEINLINE sse2_empty_guard()
    {
        _mm_empty();
    }

    BMFORCEINLINE ~sse2_empty_guard()
    {
        _mm_empty();
    }
};

# ifndef BM_SET_MMX_GUARD
#  define BM_SET_MMX_GUARD  sse2_empty_guard  bm_mmx_guard_;
# endif
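/*
    Illustrative usage sketch (editorial, not part of the library; the
    function name and parameters below are hypothetical). The guard's
    constructor and destructor both issue _mm_empty(), so MMX state is
    cleared on entry to and exit from the guarded scope:

    \code
    void process(float* out, const unsigned* bits)
    {
        {
            BM_SET_MMX_GUARD  // expands to: sse2_empty_guard bm_mmx_guard_;
            // ... SSE2/MMX integer processing of bits ...
        }
        out[0] += 1.0f;  // x87 floating point is safe again here
    }
    \endcode
*/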
/*!
    @brief XOR array elements to specified mask
    *dst = *src ^ mask

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst,
                         const __m128i* BMRESTRICT src,
                         const __m128i* BMRESTRICT src_end,
                         bm::word_t mask)
{
     __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
     do
     {
        __m128i xmm1 = _mm_load_si128(src);

        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst, xmm1);

        ++dst;
        ++src;
     } while (src < src_end);
}
/*!
    @brief Inverts array elements and ANDs them with the specified mask
    *dst = ~*src & mask

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst,
                            const __m128i* BMRESTRICT src,
                            const __m128i* BMRESTRICT src_end,
                            bm::word_t mask)
{
     __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
     do
     {
        //_mm_prefetch((const char*)(src)+1024, _MM_HINT_NTA);
        //_mm_prefetch((const char*)(src)+1088, _MM_HINT_NTA);

        __m128i xmm1 = _mm_load_si128(src);

        xmm1 = _mm_andnot_si128(xmm1, xmm2); // xmm1 = (~xmm1) & xmm2
        _mm_store_si128(dst, xmm1);

        ++dst;
        ++src;
     } while (src < src_end);
}
/*!
    @brief AND array elements against another array
    *dst &= *src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_and_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        // loop unrolled 4x: each iteration processes 64 bytes
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
/*!
    @brief OR array elements against another array
    *dst |= *src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_or_arr(__m128i* BMRESTRICT dst,
                 const __m128i* BMRESTRICT src,
                 const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        // loop unrolled 4x: each iteration processes 64 bytes
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
/*!
    @brief XOR array elements against another array
    *dst ^= *src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_xor_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        // loop unrolled 4x: each iteration processes 64 bytes
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
/*!
    @brief AND-NOT (SUB) array elements against another array
    *dst &= ~*src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_sub_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        // loop unrolled 4x: each iteration processes 64 bytes
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2); // (~src) & dst
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
/*!
    @brief SSE2 block memset
    *dst = value

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_set_block(__m128i* BMRESTRICT dst,
                    __m128i* BMRESTRICT dst_end,
                    bm::word_t value)
{
    __m128i xmm0 = _mm_set_epi32 (value, value, value, value);
    do
    {
        // loop unrolled 8x: each iteration fills 128 bytes
        _mm_store_si128(dst,   xmm0);
        _mm_store_si128(dst+1, xmm0);
        _mm_store_si128(dst+2, xmm0);
        _mm_store_si128(dst+3, xmm0);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm0);
        _mm_store_si128(dst+6, xmm0);
        _mm_store_si128(dst+7, xmm0);

        dst += 8;
    } while (dst < dst_end);
}
/*!
    @brief SSE2 block copy
    *dst = *src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_copy_block(__m128i* BMRESTRICT dst,
                     const __m128i* BMRESTRICT src,
                     const __m128i* BMRESTRICT src_end)
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);

        src += 8;
        dst += 8;
    } while (src < src_end);
}
/*!
    @brief Invert array elements
    *first = ~*first

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_invert_arr(bm::word_t* first, bm::word_t* last)
{
    __m128i xmm1 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF,
                                 0xFFFFFFFF, 0xFFFFFFFF);
    __m128i* wrd_ptr = (__m128i*)first;
    do
    {
        _mm_prefetch((const char*)(wrd_ptr)+512, _MM_HINT_NTA);

        __m128i xmm0 = _mm_load_si128(wrd_ptr);
        xmm0 = _mm_xor_si128(xmm0, xmm1); // XOR with all-ones inverts
        _mm_store_si128(wrd_ptr, xmm0);

        ++wrd_ptr;
    } while (wrd_ptr < (__m128i*)last);
}
/*!
    SSE2-optimized bitcounting function; implements the parallel (SWAR)
    bitcounting algorithm for the SSE2 instruction set, applying the
    classic 32-bit population count to four words at a time:

    unsigned CalcBitCount32(unsigned b)
    {
        b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        b = (b + (b >> 4)) & 0x0F0F0F0F;
        b = b + (b >> 8);
        b = (b + (b >> 16)) & 0x0000003F;
        return b;
    }

    @ingroup SSE2
*/
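/*
    Quick sanity trace (editorial illustration): for b = 0x0000000F
      step 1: (0x0F & 0x55..) + ((0x0F >> 1) & 0x55..) = 0x05 + 0x05 = 0x0A
              (each 2-bit field now holds the bit count of its pair)
      step 2: (0x0A & 0x33..) + ((0x0A >> 2) & 0x33..) = 0x02 + 0x02 = 0x04
              (each 4-bit field holds the count of its nibble)
      steps 3-5 fold the byte and halfword sums together, leaving
      4 == popcount(0x0000000F).
*/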
BMFORCEINLINE
bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0

    __m128i tmp1, tmp2;
    do
    {
        __m128i b = _mm_load_si128(block++);

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);       // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);       // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2);  // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);       // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);       // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2);  // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);       // tmp1 = b >> 4
        b = _mm_add_epi32(b, tmp1);        // b = b + (b >> 4)
        b = _mm_and_si128(b, m3);          //           & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);      // tmp1 = b >> 8
        b = _mm_add_epi32(b, tmp1);        // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);     // b >> 16
        b = _mm_add_epi32(b, tmp1);        // b + (b >> 16)
        b = _mm_and_si128(b, m4);          //           & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);     // mcnt += b

    } while (block < block_end);

    __declspec(align(16)) bm::id_t tcnt[4];
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
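/*
    Usage sketch (editorial illustration; the block size of 2048 words and
    the alignment attribute are assumptions, not part of this header):

    \code
    __declspec(align(16)) bm::word_t blk[2048];
    // ... fill blk ...
    bm::id_t cnt = sse2_bit_count((const __m128i*)blk,
                                  (const __m128i*)(blk + 2048));
    \endcode
*/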
BMFORCEINLINE
__m128i sse2_and(__m128i a, __m128i b)
{
    return _mm_and_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_or(__m128i a, __m128i b)
{
    return _mm_or_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_xor(__m128i a, __m128i b)
{
    return _mm_xor_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_sub(__m128i a, __m128i b)
{
    return _mm_andnot_si128(b, a); // a & ~b (set subtraction)
}
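/*
    Note (editorial): _mm_andnot_si128(x, y) computes (~x) & y, so passing
    the operands as (b, a) yields a & ~b, i.e. "a minus b" in bit-set terms.
    For example, a = 0b1100 and b = 0b1010 gives 0b0100.
*/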
/*!
    @brief Bitcount of an operation between two blocks
    (the operation is passed in as a functor, e.g. sse2_and)

    @ingroup SSE2
*/
template<class Func>
BMFORCEINLINE
bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
                           const __m128i* BMRESTRICT block_end,
                           const __m128i* BMRESTRICT mask_block,
                           Func sse2_func)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0

    do
    {
        __m128i tmp1, tmp2;
        __m128i b = _mm_load_si128(block++);

        tmp1 = _mm_load_si128(mask_block++);

        b = sse2_func(b, tmp1); // apply the operation (AND/OR/XOR/SUB)

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);       // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);       // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2);  // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);       // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);       // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2);  // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);       // tmp1 = b >> 4
        b = _mm_add_epi32(b, tmp1);        // b = b + (b >> 4)
        b = _mm_and_si128(b, m3);          //           & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);      // tmp1 = b >> 8
        b = _mm_add_epi32(b, tmp1);        // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);     // b >> 16
        b = _mm_add_epi32(b, tmp1);        // b + (b >> 16)
        b = _mm_and_si128(b, m4);          //           & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);     // mcnt += b

    } while (block < block_end);

    __declspec(align(16)) bm::id_t tcnt[4];
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
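/*
    Usage sketch (editorial illustration; blk_a/blk_b and the block size
    are assumptions): counts bits of the intersection of two aligned
    blocks without materializing the AND result.

    \code
    bm::id_t and_cnt = sse2_bit_count_op((const __m128i*)blk_a,
                                         (const __m128i*)(blk_a + 2048),
                                         (const __m128i*)blk_b,
                                         sse2_and);
    \endcode
*/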
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_ARR(first, last) \
    sse2_invert_arr(first, last)

#define VECT_AND_ARR(dst, src, src_end) \
    sse2_and_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_OR_ARR(dst, src, src_end) \
    sse2_or_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SUB_ARR(dst, src, src_end) \
    sse2_sub_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_XOR_ARR(dst, src, src_end) \
    sse2_xor_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_COPY_BLOCK(dst, src, src_end) \
    sse2_copy_block((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SET_BLOCK(dst, dst_end, value) \
    sse2_set_block((__m128i*) (dst), (__m128i*) (dst_end), (value))
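/*
    Usage sketch (editorial illustration): the VECT_* macros are the hook
    points the library core dispatches to when SSE2 optimization is
    enabled. get_block()/get_mask() and the 2048-word block size are
    hypothetical here.

    \code
    bm::word_t* dst = get_block();        // 16-byte aligned destination
    const bm::word_t* src = get_mask();   // 16-byte aligned source
    VECT_AND_ARR(dst, src, src + 2048);   // dst &= src, 128 bits per op
    \endcode
*/

} // namespace bm

#endif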