1
/* This file is part of the Vc library.
3
Copyright (C) 2009-2011 Matthias Kretz <kretz@kde.org>
5
Vc is free software: you can redistribute it and/or modify
6
it under the terms of the GNU Lesser General Public License as
7
published by the Free Software Foundation, either version 3 of
8
the License, or (at your option) any later version.
10
Vc is distributed in the hope that it will be useful, but
11
WITHOUT ANY WARRANTY; without even the implied warranty of
12
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
GNU Lesser General Public License for more details.
15
You should have received a copy of the GNU Lesser General Public
16
License along with Vc. If not, see <http://www.gnu.org/licenses/>.
20
#ifndef SSE_VECTORHELPER_H
21
#define SSE_VECTORHELPER_H
31
// Generic SortHelper: sorts the entries of a single SIMD vector in place.
// NOTE(review): the struct braces are not visible in this chunk -- lines were
// dropped in extraction; verify against the upstream Vc vectorhelper.h.
template<typename VectorType, unsigned int Size> struct SortHelper
33
static inline Vc_CONST_L VectorType sort(VectorType) Vc_CONST_R;
35
// Specialization for the emulated 8-wide M256 type (two __m128 halves);
// takes the argument by const reference instead of by value.
template<unsigned int Size> struct SortHelper<M256, Size>
37
static inline Vc_PURE_L M256 sort(const M256 &) Vc_PURE_R;
40
// Helper macros for VectorHelper<M256>: each defines a static always-inline
// const member function that returns the given intrinsic expression.
// OP0 = nullary factory; OP2/OP3 = binary/ternary ops whose operands are
// passed as VectorTypeArg (by const reference when passing vectors by value
// is broken on the target compiler).
#define OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
41
#define OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VectorTypeArg a, VectorTypeArg b) { return code; }
42
#define OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VectorTypeArg a, VectorTypeArg b, VectorTypeArg c) { return code; }
43
// VectorHelper for the emulated 8-float vector M256 (two __m128 halves):
// load/store declarations plus bitwise ops applied to both halves.
// NOTE(review): the opening brace, the #else/#endif of the #ifdef below and
// the closing }; are not visible here -- likely lost in extraction.
template<> struct VectorHelper<M256>
45
typedef M256 VectorType;
46
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
47
typedef const VectorType &VectorTypeArg;
49
typedef const VectorType VectorTypeArg;
51
// Aligned/unaligned/streaming loads and stores; the overloads taking an
// additional VectorTypeArg m are masked stores.
template<typename A> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const float *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R;
52
static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, AlignedFlag) Vc_ALWAYS_INLINE_R;
53
static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R;
54
static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
55
static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
56
static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, VectorTypeArg m, AlignedFlag) Vc_ALWAYS_INLINE_R;
57
static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, VectorTypeArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R;
58
static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, VectorTypeArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
59
static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, VectorTypeArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
61
// Bitwise ops, applied independently to the low ([0]) and high ([1]) halves.
OP0(allone, VectorType::create(_mm_setallone_ps(), _mm_setallone_ps()))
62
OP0(zero, VectorType::create(_mm_setzero_ps(), _mm_setzero_ps()))
63
OP2(or_, VectorType::create(_mm_or_ps(a[0], b[0]), _mm_or_ps(a[1], b[1])))
64
OP2(xor_, VectorType::create(_mm_xor_ps(a[0], b[0]), _mm_xor_ps(a[1], b[1])))
65
OP2(and_, VectorType::create(_mm_and_ps(a[0], b[0]), _mm_and_ps(a[1], b[1])))
66
OP2(andnot_, VectorType::create(_mm_andnot_ps(a[0], b[0]), _mm_andnot_ps(a[1], b[1])))
67
OP3(blend, VectorType::create(_mm_blendv_ps(a[0], b[0], c[0]), _mm_blendv_ps(a[1], b[1], c[1])))
73
// Redefinitions of the OP* helper macros for the native 128-bit helpers
// below: operands are plain const VectorType values (no VectorTypeArg
// indirection needed). NOTE(review): the preceding #undef lines are not
// visible in this chunk -- verify against the upstream source.
#define OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
74
#define OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a) { return code; }
75
#define OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b) { return code; }
76
#define OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b, const VectorType c) { return code; }
78
// VectorHelper for a native __m128 (4 x float): load/store declarations and
// single-instruction bitwise ops. NOTE(review): the opening brace and closing
// }; are not visible here -- likely lost in extraction.
template<> struct VectorHelper<_M128>
80
typedef _M128 VectorType;
81
// Loads/stores with alignment and streaming policy tags; overloads taking a
// second VectorType m are masked stores.
template<typename A> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const float *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R;
82
static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, AlignedFlag) Vc_ALWAYS_INLINE_R;
83
static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, UnalignedFlag) Vc_ALWAYS_INLINE_R;
84
static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
85
static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
86
static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, const VectorType m, AlignedFlag) Vc_ALWAYS_INLINE_R;
87
static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, const VectorType m, UnalignedFlag) Vc_ALWAYS_INLINE_R;
88
static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
89
static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
91
OP0(allone, _mm_setallone_ps())
92
OP0(zero, _mm_setzero_ps())
93
OP2(or_, _mm_or_ps(a, b))
94
OP2(xor_, _mm_xor_ps(a, b))
95
OP2(and_, _mm_and_ps(a, b))
96
OP2(andnot_, _mm_andnot_ps(a, b))
97
OP3(blend, _mm_blendv_ps(a, b, c))
101
// VectorHelper for a native __m128d (2 x double); mirrors the __m128 helper
// using the _pd intrinsic family. NOTE(review): opening brace and closing };
// are not visible here -- likely lost in extraction.
template<> struct VectorHelper<_M128D>
103
typedef _M128D VectorType;
104
template<typename A> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const double *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R;
105
static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, AlignedFlag) Vc_ALWAYS_INLINE_R;
106
static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, UnalignedFlag) Vc_ALWAYS_INLINE_R;
107
static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
108
static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
109
static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, const VectorType m, AlignedFlag) Vc_ALWAYS_INLINE_R;
110
static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, const VectorType m, UnalignedFlag) Vc_ALWAYS_INLINE_R;
111
static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
112
static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
114
OP0(allone, _mm_setallone_pd())
115
OP0(zero, _mm_setzero_pd())
116
OP2(or_, _mm_or_pd(a, b))
117
OP2(xor_, _mm_xor_pd(a, b))
118
OP2(and_, _mm_and_pd(a, b))
119
OP2(andnot_, _mm_andnot_pd(a, b))
120
OP3(blend, _mm_blendv_pd(a, b, c))
123
// VectorHelper for a native __m128i (integer vector): element type is given
// by the template parameter T on each load/store. NOTE(review): opening brace
// and closing }; are not visible here -- likely lost in extraction.
template<> struct VectorHelper<_M128I>
125
typedef _M128I VectorType;
126
// Loads for each alignment/streaming policy tag.
template<typename T> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const T *x, AlignedFlag) Vc_ALWAYS_INLINE_R Vc_PURE_R;
127
template<typename T> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const T *x, UnalignedFlag) Vc_ALWAYS_INLINE_R Vc_PURE_R;
128
template<typename T> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const T *x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R Vc_PURE_R;
129
template<typename T> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R Vc_PURE_R;
130
// Plain stores, then masked stores (extra VectorType m parameter).
template<typename T> static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, AlignedFlag) Vc_ALWAYS_INLINE_R;
131
template<typename T> static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, UnalignedFlag) Vc_ALWAYS_INLINE_R;
132
template<typename T> static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
133
template<typename T> static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
134
template<typename T> static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, AlignedFlag) Vc_ALWAYS_INLINE_R;
135
template<typename T> static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, UnalignedFlag) Vc_ALWAYS_INLINE_R;
136
template<typename T> static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
137
template<typename T> static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
139
OP0(allone, _mm_setallone_si128())
140
OP0(zero, _mm_setzero_si128())
141
OP2(or_, _mm_or_si128(a, b))
142
OP2(xor_, _mm_xor_si128(a, b))
143
OP2(and_, _mm_and_si128(a, b))
144
OP2(andnot_, _mm_andnot_si128(a, b))
145
OP3(blend, _mm_blendv_epi8(a, b, c))
153
// NOTE(review): the lines below are backslash-continuation bodies of macro
// definitions (per-type operator generators that paste the intrinsic name
// from the op token and the SUFFIX macro via CAT); several of the #define
// head lines are missing from this chunk -- verify against the upstream
// vectorhelper.h before editing.
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a) { return CAT(_mm_##op##_, SUFFIX)(a); }
155
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return CAT(_mm_##op##_ , SUFFIX)(a, b); }
157
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return CAT(_mm_##op , SUFFIX)(a, b); }
158
// OPx: member named op that forwards to the intrinsic named after op2.
#define OPx(op, op2) \
159
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return CAT(_mm_##op2##_, SUFFIX)(a, b); }
161
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmp##op(const VectorType a, const VectorType b) { return CAT(_mm_cmp##op##_, SUFFIX)(a, b); }
162
// OP_CAST_: performs the op in the float domain by casting operands to
// __m128 and the result back. NOTE(review): the final continuation line(s)
// of this #define are not visible in this chunk.
#define OP_CAST_(op) \
163
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return CAT(_mm_castps_, SUFFIX)( \
164
_mm_##op##ps(CAT(CAT(_mm_cast, SUFFIX), _ps)(a), \
165
CAT(CAT(_mm_cast, SUFFIX), _ps)(b))); \
168
static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return CAT(_mm_min_, SUFFIX)(a, b); } \
169
static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return CAT(_mm_max_, SUFFIX)(a, b); }
171
// Arithmetic/comparison helper for 2 x double. NOTE(review): several lines
// (SUFFIX definition, #else/#endif pairs, closing };) are missing from this
// chunk -- verify against the upstream vectorhelper.h.
template<> struct VectorHelper<double> {
172
typedef _M128D VectorType;
173
typedef double EntryType;
176
OP_(or_) OP_(and_) OP_(xor_)
177
// Zeroes the lanes where mask is 0; mask comes in as a float vector.
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_pd(mask), a); }
178
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return CAT(_mm_set1_, SUFFIX)(a); }
179
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return CAT(_mm_set_, SUFFIX)(a, b); }
180
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); }
181
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }// set(1.); }
184
// fma with hardware FMA4-style intrinsic (_mm_macc_pd): v1 = v1 * v2 + v3.
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
185
v1 = _mm_macc_pd(v1, v2, v3);
188
// Software fused multiply-add fallback: splits each factor into a high and
// a low part so the partial products are exact, then sums in increasing
// magnitude order to get a correctly rounded v1 * v2 + v3.
static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
189
VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
190
VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
191
#if defined(VC_GCC) && VC_GCC < 0x40703
192
// GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot
193
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703
194
// empty asm acts as an optimization barrier on h1/h2 to block that fold
asm("":"+x"(h1), "+x"(h2));
196
const VectorType l1 = _mm_sub_pd(v1, h1);
197
const VectorType l2 = _mm_sub_pd(v2, h2);
198
const VectorType ll = mul(l1, l2);
199
const VectorType lh = add(mul(l1, h2), mul(h1, l2));
200
const VectorType hh = mul(h1, h2);
201
// ll < lh < hh for all entries is certain
202
const VectorType lh_lt_v3 = cmplt(abs(lh), abs(v3)); // |lh| < |v3|
203
const VectorType b = _mm_blendv_pd(v3, lh, lh_lt_v3);
204
const VectorType c = _mm_blendv_pd(lh, v3, lh_lt_v3);
205
v1 = add(add(ll, b), add(c, hh));
209
OP(add) OP(sub) OP(mul)
215
// rsqrt emulated as 1 / sqrt(x); exact division, not the fast approximation.
static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) {
216
return _mm_div_pd(one(), sqrt(x));
218
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
219
return _mm_div_pd(one(), x);
221
// NaN is the only value unordered with itself.
static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
222
return _mm_cmpunord_pd(x, x);
224
// 0 * x is NaN for inf/NaN inputs, so the ordered-compare mask is set only
// for finite lanes.
static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
225
return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x));
227
// Clears the sign bit of each lane.
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
228
return CAT(_mm_and_, SUFFIX)(a, _mm_setabsmask_pd())
232
// Horizontal reductions over both lanes.
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
233
a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
234
return _mm_cvtsd_f64(a);
236
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
237
a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
238
return _mm_cvtsd_f64(a);
240
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
241
a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
242
return _mm_cvtsd_f64(a);
244
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
245
a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
246
return _mm_cvtsd_f64(a);
249
// Round to nearest integer value (still as double lanes).
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
250
#ifdef VC_IMPL_SSE4_1
251
return _mm_round_pd(a, _MM_FROUND_NINT);
253
//XXX: slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
254
return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
259
// Arithmetic/comparison helper for 4 x float. NOTE(review): several lines
// (SUFFIX definition, #else/#endif pairs, closing };) are missing from this
// chunk -- verify against the upstream vectorhelper.h.
template<> struct VectorHelper<float> {
260
typedef float EntryType;
261
typedef _M128 VectorType;
264
OP_(or_) OP_(and_) OP_(xor_)
265
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(mask, a); }
266
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return CAT(_mm_set1_, SUFFIX)(a); }
267
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { return CAT(_mm_set_, SUFFIX)(a, b, c, d); }
268
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); }
269
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }// set(1.f); }
270
// Narrows two double vectors into the low/high halves of one float vector.
static Vc_ALWAYS_INLINE Vc_CONST _M128 concat(_M128D a, _M128D b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); }
273
// fma with hardware FMA4-style intrinsic (_mm_macc_ps): v1 = v1 * v2 + v3.
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
274
v1 = _mm_macc_ps(v1, v2, v3);
277
// Software fma fallback: widens each half to double, where the single
// multiply-add is exact enough for a correctly rounded float result.
static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
278
__m128d v1_0 = _mm_cvtps_pd(v1);
279
__m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1));
280
__m128d v2_0 = _mm_cvtps_pd(v2);
281
__m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2));
282
__m128d v3_0 = _mm_cvtps_pd(v3);
283
__m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3));
285
_mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)),
286
_mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1)));
290
OP(add) OP(sub) OP(mul)
296
static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
297
return _mm_cmpunord_ps(x, x);
299
// 0 * x is NaN for inf/NaN inputs, so ordered-compare flags finite lanes.
static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
300
return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x));
302
// Fast approximate reciprocal (about 12 bits of precision).
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
303
return _mm_rcp_ps(x);
305
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
306
return CAT(_mm_and_, SUFFIX)(a, _mm_setabsmask_ps());
310
// Horizontal reductions over all four lanes.
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
311
a = _mm_min_ps(a, _mm_movehl_ps(a, a)); // a = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
312
a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = min(a0, a1), a1, a2, a3
313
return _mm_cvtss_f32(a);
315
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
316
a = _mm_max_ps(a, _mm_movehl_ps(a, a)); // a = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
317
a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = max(a0, a1), a1, a2, a3
318
return _mm_cvtss_f32(a);
320
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
321
a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
322
a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
323
return _mm_cvtss_f32(a);
325
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
326
a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
327
a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
328
return _mm_cvtss_f32(a);
331
// Round to nearest integer value (still as float lanes).
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
332
#ifdef VC_IMPL_SSE4_1
333
return _mm_round_ps(a, _MM_FROUND_NINT);
335
//XXX slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
336
return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
341
// Helper for the emulated 8-float type: nearly every operation delegates to
// VectorHelper<float> once per __m128 half. NOTE(review): #else/#endif lines
// and some closing braces are missing from this chunk.
template<> struct VectorHelper<float8> {
342
typedef float EntryType;
343
typedef M256 VectorType;
344
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
345
typedef const VectorType &VectorTypeArg;
347
typedef const VectorType VectorTypeArg;
350
// Broadcast one value into all 8 lanes.
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) {
351
const _M128 x = _mm_set1_ps(a);
352
return VectorType::create(x, x);
354
// Four values, repeated in both halves.
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) {
355
const _M128 x = _mm_set_ps(a, b, c, d);
356
return VectorType::create(x, x);
358
// All eight lanes individually.
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,
359
const float e, const float f, const float g, const float h) {
360
return VectorType::create(_mm_set_ps(a, b, c, d), _mm_set_ps(e, f, g, h));
362
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return VectorType::create(_mm_setzero_ps(), _mm_setzero_ps()); }
363
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return set(1.f); }
365
// Generate forwarding wrappers that apply the 4-wide float implementation
// to each half: IMPL1 for unary, IMPL2 for binary operations.
#define REUSE_FLOAT_IMPL1(fun) \
366
static Vc_ALWAYS_INLINE Vc_CONST VectorType fun(VectorTypeArg x) { \
367
return VectorType::create(VectorHelper<float>::fun(x[0]), VectorHelper<float>::fun(x[1])); \
369
#define REUSE_FLOAT_IMPL2(fun) \
370
static Vc_ALWAYS_INLINE Vc_CONST VectorType fun(VectorTypeArg x, VectorTypeArg y) { \
371
return VectorType::create(VectorHelper<float>::fun(x[0], y[0]), VectorHelper<float>::fun(x[1], y[1])); \
373
REUSE_FLOAT_IMPL1(reciprocal)
374
REUSE_FLOAT_IMPL1(sqrt)
375
REUSE_FLOAT_IMPL1(rsqrt)
376
REUSE_FLOAT_IMPL1(isNaN)
377
REUSE_FLOAT_IMPL1(isFinite)
378
REUSE_FLOAT_IMPL1(abs)
379
REUSE_FLOAT_IMPL1(round)
381
REUSE_FLOAT_IMPL2(and_)
382
REUSE_FLOAT_IMPL2(or_)
383
REUSE_FLOAT_IMPL2(xor_)
384
REUSE_FLOAT_IMPL2(notMaskedToZero)
385
REUSE_FLOAT_IMPL2(add)
386
REUSE_FLOAT_IMPL2(sub)
387
REUSE_FLOAT_IMPL2(mul)
388
REUSE_FLOAT_IMPL2(cmple)
389
REUSE_FLOAT_IMPL2(cmpnle)
390
REUSE_FLOAT_IMPL2(cmplt)
391
REUSE_FLOAT_IMPL2(cmpnlt)
392
REUSE_FLOAT_IMPL2(cmpeq)
393
REUSE_FLOAT_IMPL2(cmpneq)
394
REUSE_FLOAT_IMPL2(min)
395
REUSE_FLOAT_IMPL2(max)
397
// Horizontal reductions: combine the two halves 4-wide, then reduce.
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorTypeArg a) {
398
return VectorHelper<float>::min(VectorHelper<float>::min(a[0], a[1]));
400
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorTypeArg a) {
401
return VectorHelper<float>::max(VectorHelper<float>::max(a[0], a[1]));
403
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorTypeArg a) {
404
return VectorHelper<float>::mul(VectorHelper<float>::mul(a[0], a[1]));
406
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorTypeArg a) {
407
return VectorHelper<float>::add(VectorHelper<float>::add(a[0], a[1]));
410
// In-place fma on each half: a = a * b + c.
static inline void fma(VectorType &a, VectorTypeArg b, VectorTypeArg c) {
411
VectorHelper<float>::fma(a[0], b[0], c[0]);
412
VectorHelper<float>::fma(a[1], b[1], c[1]);
414
#undef REUSE_FLOAT_IMPL2
415
#undef REUSE_FLOAT_IMPL1
418
// Helper for 4 x signed 32-bit int. NOTE(review): SUFFIX definition,
// #else/#endif lines and the closing }; are missing from this chunk.
template<> struct VectorHelper<int> {
419
typedef int EntryType;
420
typedef _M128I VectorType;
423
OP_(or_) OP_(and_) OP_(xor_)
424
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); }
425
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }
428
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }
430
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return CAT(_mm_set1_, SUFFIX)(a); }
431
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return CAT(_mm_set_, SUFFIX)(a, b, c, d); }
433
// No integer fma instruction: multiply then add.
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
435
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
436
return CAT(_mm_slli_, SUFFIX)(a, shift);
438
// Arithmetic right shift (sign-extending) for the signed type.
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
439
return CAT(_mm_srai_, SUFFIX)(a, shift);
444
// Horizontal reductions over all four lanes.
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
445
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
446
// using lo_epi16 for speed here
447
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
448
return _mm_cvtsi128_si32(a);
450
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
451
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
452
// using lo_epi16 for speed here
453
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
454
return _mm_cvtsi128_si32(a);
456
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
457
a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
458
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
459
return _mm_cvtsi128_si32(a);
461
#ifdef VC_IMPL_SSE4_1
462
// SSE4.1 has a native 32-bit low multiply.
static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); }
463
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
464
a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
465
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
466
return _mm_cvtsi128_si32(a);
469
// SSE2 fallback: build the 4 x 32-bit product from two widening
// _mm_mul_epu32 (even-lane) multiplies, then reassemble lane order.
static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
470
const VectorType aShift = _mm_srli_si128(a, 4);
471
const VectorType ab02 = _mm_mul_epu32(a, b); // [a0 * b0, a2 * b2]
472
const VectorType bShift = _mm_srli_si128(b, 4);
473
const VectorType ab13 = _mm_mul_epu32(aShift, bShift); // [a1 * b1, a3 * b3]
474
return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
482
// Derived comparisons: negate the available eq/lt/gt masks.
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { _M128I x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
483
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { _M128I x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
484
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { _M128I x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
485
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); }
487
// Integers are already integral: rounding is the identity.
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
490
// Helper for 4 x unsigned 32-bit int. NOTE(review): SUFFIX definition,
// #else/#endif lines and the closing }; are missing from this chunk.
template<> struct VectorHelper<unsigned int> {
491
typedef unsigned int EntryType;
492
typedef _M128I VectorType;
494
OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)
495
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); }
496
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }
500
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }
503
// Horizontal reductions over all four lanes.
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
504
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
505
// using lo_epi16 for speed here
506
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
507
return _mm_cvtsi128_si32(a);
509
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
510
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
511
// using lo_epi16 for speed here
512
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
513
return _mm_cvtsi128_si32(a);
515
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
516
a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
517
// using lo_epi16 for speed here
518
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
519
return _mm_cvtsi128_si32(a);
521
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
522
a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
523
// using lo_epi16 for speed here
524
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
525
return _mm_cvtsi128_si32(a);
528
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
530
// Low 32-bit multiply is bit-identical for signed and unsigned: reuse int.
static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
531
return VectorHelper<int>::mul(a, b);
533
//X template<unsigned int b> static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) {
535
//X case 0: return zero();
536
//X case 1: return a;
537
//X case 2: return _mm_slli_epi32(a, 1);
538
//X case 4: return _mm_slli_epi32(a, 2);
539
//X case 8: return _mm_slli_epi32(a, 3);
540
//X case 16: return _mm_slli_epi32(a, 4);
541
//X case 32: return _mm_slli_epi32(a, 5);
542
//X case 64: return _mm_slli_epi32(a, 6);
543
//X case 128: return _mm_slli_epi32(a, 7);
544
//X case 256: return _mm_slli_epi32(a, 8);
545
//X case 512: return _mm_slli_epi32(a, 9);
546
//X case 1024: return _mm_slli_epi32(a, 10);
547
//X case 2048: return _mm_slli_epi32(a, 11);
549
//X return mul(a, set(b));
554
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
555
return CAT(_mm_slli_, SUFFIX)(a, shift);
557
// Logical right shift (zero-filling) for the unsigned type.
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
558
return CAT(_mm_srli_, SUFFIX)(a, shift);
560
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a) { return CAT(_mm_set1_, SUFFIX)(a); }
561
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) { return CAT(_mm_set_, SUFFIX)(a, b, c, d); }
565
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }
567
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
568
// Correct unsigned ordering via the epu32 compare helpers.
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmplt(const VectorType a, const VectorType b) {
569
return _mm_cmplt_epu32(a, b);
571
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpgt(const VectorType a, const VectorType b) {
572
return _mm_cmpgt_epu32(a, b);
578
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
579
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
580
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); }
583
// Integers are already integral: rounding is the identity.
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
586
// Helper for 8 x signed 16-bit int. NOTE(review): SUFFIX definition and the
// closing }; are missing from this chunk.
template<> struct VectorHelper<signed short> {
587
typedef _M128I VectorType;
588
typedef signed short EntryType;
591
OP_(or_) OP_(and_) OP_(xor_)
592
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); }
593
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }
594
// concat: saturating pack of two int32 vectors into one int16 vector;
// expand0/expand1: sign-extend low/high halves back to int32.
static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packs_epi32(a, b); }
595
static Vc_ALWAYS_INLINE Vc_CONST _M128I expand0(_M128I x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); }
596
static Vc_ALWAYS_INLINE Vc_CONST _M128I expand1(_M128I x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); }
600
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }
602
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
603
return CAT(_mm_slli_, SUFFIX)(a, shift);
605
// Arithmetic right shift (sign-extending) for the signed type.
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
606
return CAT(_mm_srai_, SUFFIX)(a, shift);
608
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
609
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
610
const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
611
return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
614
// No integer fma instruction: multiply then add.
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
615
v1 = add(mul(v1, v2), v3); }
621
// Horizontal reductions over all eight lanes (three shuffle+op rounds).
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
622
// reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
623
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
624
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
625
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
626
return _mm_cvtsi128_si32(a); // & 0xffff is implicit
628
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
629
// reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
630
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
631
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
632
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
633
return _mm_cvtsi128_si32(a); // & 0xffff is implicit
635
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
636
a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
637
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
638
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
639
return _mm_cvtsi128_si32(a); // & 0xffff is implicit
641
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
642
a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
643
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
644
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
645
return _mm_cvtsi128_si32(a); // & 0xffff is implicit
652
// Derived comparisons: negate the available eq/lt/gt masks.
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { _M128I x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
653
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { _M128I x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
654
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { _M128I x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
655
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); }
657
// Integers are already integral: rounding is the identity.
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
660
// Helper for 8 x unsigned 16-bit int. NOTE(review): SUFFIX definition,
// several #else/#endif lines and the closing }; are missing from this chunk.
template<> struct VectorHelper<unsigned short> {
661
typedef _M128I VectorType;
662
typedef unsigned short EntryType;
664
OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)
665
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); }
666
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }
667
#ifdef VC_IMPL_SSE4_1
668
// SSE4.1: unsigned saturating pack of two int32 vectors into uint16 lanes.
static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packus_epi32(a, b); }
670
// XXX too bad, but this is broken without SSE 4.1
671
static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packs_epi32(a, b); }
673
// expand0/expand1: zero-extend low/high uint16 halves back to uint32.
static Vc_ALWAYS_INLINE Vc_CONST _M128I expand0(_M128I x) { return _mm_srli_epi32(_mm_unpacklo_epi16(x, x), 16); }
674
static Vc_ALWAYS_INLINE Vc_CONST _M128I expand1(_M128I x) { return _mm_srli_epi32(_mm_unpackhi_epi16(x, x), 16); }
678
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }
680
//X template<unsigned int b> static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) {
682
//X case 0: return zero();
683
//X case 1: return a;
684
//X case 2: return _mm_slli_epi16(a, 1);
685
//X case 4: return _mm_slli_epi16(a, 2);
686
//X case 8: return _mm_slli_epi16(a, 3);
687
//X case 16: return _mm_slli_epi16(a, 4);
688
//X case 32: return _mm_slli_epi16(a, 5);
689
//X case 64: return _mm_slli_epi16(a, 6);
690
//X case 128: return _mm_slli_epi16(a, 7);
691
//X case 256: return _mm_slli_epi16(a, 8);
692
//X case 512: return _mm_slli_epi16(a, 9);
693
//X case 1024: return _mm_slli_epi16(a, 10);
694
//X case 2048: return _mm_slli_epi16(a, 11);
696
//X return mul(a, set(b));
698
#if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || VC_IMPL_SSE4_1
703
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
704
return CAT(_mm_slli_, SUFFIX)(a, shift);
706
// Logical right shift (zero-filling) for the unsigned type.
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
707
return CAT(_mm_srli_, SUFFIX)(a, shift);
710
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
712
OPx(mul, mullo) // should work correctly for all values
713
#if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(VC_IMPL_SSE4_1)
714
OP(min) OP(max) // XXX breaks for values with MSB set
716
// Horizontal reductions over all eight lanes (three shuffle+op rounds).
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
717
// reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
718
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
719
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
720
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
721
return _mm_cvtsi128_si32(a); // & 0xffff is implicit
723
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
724
// reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
725
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
726
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
727
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
728
return _mm_cvtsi128_si32(a); // & 0xffff is implicit
730
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
731
// reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
732
a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
733
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
734
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
735
return _mm_cvtsi128_si32(a); // & 0xffff is implicit
737
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
738
// reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
739
a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
740
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
741
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
742
return _mm_cvtsi128_si32(a); // & 0xffff is implicit
744
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
745
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c,
746
const EntryType d, const EntryType e, const EntryType f,
747
const EntryType g, const EntryType h) {
748
return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
753
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }
755
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
756
// Correct unsigned ordering via the epu16 compare helpers.
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmplt(const VectorType a, const VectorType b) {
757
return _mm_cmplt_epu16(a, b);
759
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpgt(const VectorType a, const VectorType b) {
760
return _mm_cmpgt_epu16(a, b);
766
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
767
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
768
static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); }
770
// Integers are already integral: rounding is the identity.
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
781
#include "vectorhelper.tcc"
782
#include "undomacros.h"
784
#endif // SSE_VECTORHELPER_H