1
// Copyright 2004 Regents of the University of California
3
// libSIMD++ is free software; you can redistribute it and/or modify it under
4
// the terms of the GNU General Public License as published by the Free
5
// Software Foundation; either version 2, or (at your option) any later
8
// libSIMD++ is distributed in the hope that it will be useful, but WITHOUT
9
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13
// You should have received a copy of the GNU General Public License along
14
// with libSIMD++; see the file COPYING. If not, write to the Free Software
15
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17
// $Id: simd.h,v 1.10.2.1 2007/03/22 00:03:59 korpela Exp $
19
// Original revision: 28-Jul-2004, Eric J. Korpela
30
#define ALIGN_ATTRIBUTE(x) __attribute__ ((aligned(x)))
31
// Taken when any of the MMX, SSE, SSE2 or 3DNow! back ends is selected.
#if defined(USE_MMX) || defined(USE_SSE) || \
    defined(USE_SSE2) || defined(USE_3DNOW)
37
typedef float __m128 __attribute__ ((mode(V4SF)));
38
typedef int __m64 __attribute__ ((mode(V4HI)));
39
typedef unsigned long __m32;
40
typedef unsigned short __m16;
41
typedef unsigned char __m8;
42
#define PURE_FUNCTION __attribute__ ((pure))
45
#elif defined(_MSC_VER)
47
#define ALIGN_DEF(x) __declspec(align( x ))
48
#define ALIGN_ATTRIBUTE(x)
53
#define ALIGN_ATTRIBUTE(x)
61
#elif (SIZEOF__LONG_DOUBLE)==16
63
#elif defined(HAVE_UINT64_T)
65
#elif defined(HAVE_LONG_LONG)
66
unsigned long long m128[2];
67
#elif defined(HAVE___INT64)
70
} ALIGN_ATTRIBUTE(16);
72
typedef unsigned char pbool;
76
__m8 all ALIGN_ATTRIBUTE(1);
77
unsigned char c[1] ALIGN_ATTRIBUTE(1);
81
__m16 all ALIGN_ATTRIBUTE(2);
82
unsigned short s[1] ALIGN_ATTRIBUTE(2);
83
unsigned char c[2] ALIGN_ATTRIBUTE(2);
84
vec_m8 v8[2] ALIGN_ATTRIBUTE(2);
88
__m32 all ALIGN_ATTRIBUTE(4);
89
float f[1] ALIGN_ATTRIBUTE(4);
90
unsigned long l[4/sizeof(long)] ALIGN_ATTRIBUTE(4);
91
unsigned int i[4/sizeof(int)] ALIGN_ATTRIBUTE(4);
92
// 4 bytes of shorts, the same footprint as c[4] and v16[2] in this group.
unsigned short s[4/sizeof(short)] ALIGN_ATTRIBUTE(4);
93
unsigned char c[4] ALIGN_ATTRIBUTE(4);
94
vec_m8 v8[4] ALIGN_ATTRIBUTE(4);
95
vec_m16 v16[2] ALIGN_ATTRIBUTE(4);
98
// Select the unsigned 64-bit integer type that backs v_i64.
#if defined(HAVE_UINT64_T)
99
typedef uint64_t v_i64;
100
#elif defined(HAVE_LONG_LONG)
101
typedef unsigned long long v_i64;
102
#elif defined(HAVE__INT64)
103
typedef unsigned _int64 v_i64;
105
typedef unsigned long v_i64[2];
109
__m64 all ALIGN_ATTRIBUTE(8);
110
v_i64 ll[1] ALIGN_ATTRIBUTE(8);
111
double d[8/sizeof(double)] ALIGN_ATTRIBUTE(8);
112
float f[8/sizeof(float)] ALIGN_ATTRIBUTE(8);
113
unsigned long l[8/sizeof(long)] ALIGN_ATTRIBUTE(8);
114
unsigned int i[8/sizeof(int)] ALIGN_ATTRIBUTE(8);
115
unsigned short s[8/sizeof(short)] ALIGN_ATTRIBUTE(8);
116
unsigned char c[8] ALIGN_ATTRIBUTE(8);
117
vec_m8 v8[8] ALIGN_ATTRIBUTE(8);
118
vec_m16 v16[4] ALIGN_ATTRIBUTE(8);
119
vec_m32 v32[2] ALIGN_ATTRIBUTE(8);
123
__m128 all ALIGN_ATTRIBUTE(16);
125
__m128 m128 ALIGN_ATTRIBUTE(16);
128
__m128d m128d ALIGN_ATTRIBUTE(16);
131
__m128i m128i ALIGN_ATTRIBUTE(16);
133
#if defined(HAVE_UINT128_T)
134
uint128_t u128 ALIGN_ATTRIBUTE(16);
135
#elif defined(HAVE___INT128)
136
__int128 u128 ALIGN_ATTRIBUTE(16);
138
#if defined(HAVE_UINT64_T)
139
// Read this object's storage as an __m128 value. The cast keeps const on the
// pointee because the operator itself is const.
inline operator __m128() const { return *(reinterpret_cast<const __m128 *>(this)); }
140
uint64_t ll[2] ALIGN_ATTRIBUTE(16);
141
#elif defined(HAVE_LONG_LONG)
142
unsigned long long ll[2] ALIGN_ATTRIBUTE(16);
143
#elif defined(HAVE___INT64)
144
__int64 ll[2] ALIGN_ATTRIBUTE(16);
146
#if defined(HAVE_LONG_DOUBLE)
147
#if (SIZEOF_LONG_DOUBLE>8)
148
long double ld[1] ALIGN_ATTRIBUTE(16);
149
#elif (SIZEOF_LONG_DOUBLE == 8)
150
long double ld[2] ALIGN_ATTRIBUTE(16);
153
double d[16/sizeof(double)] ALIGN_ATTRIBUTE(16);
154
float f[16/sizeof(float)] ALIGN_ATTRIBUTE(16);
155
unsigned long l[16/sizeof(long)] ALIGN_ATTRIBUTE(16);
156
unsigned int i[16/sizeof(int)] ALIGN_ATTRIBUTE(16);
157
unsigned short s[16/sizeof(short)] ALIGN_ATTRIBUTE(16);
158
unsigned char c[16] ALIGN_ATTRIBUTE(16);
159
vec_m8 v8[16] ALIGN_ATTRIBUTE(16);
160
vec_m16 v16[8] ALIGN_ATTRIBUTE(16);
161
vec_m32 v32[4] ALIGN_ATTRIBUTE(16);
162
vec_m64 v64[2] ALIGN_ATTRIBUTE(16);
168
// Generic definitions
170
// Reinterpret the object `a` as an lvalue of type t1 without copying it.
// The returned reference aliases the storage of `a`; no size or alignment
// check is performed.
template <typename t1, typename t0>
inline t1 &interpret_as(t0 &a) { return reinterpret_cast<t1 &>(a); }
173
template <typename T, const size_t N>
180
const size_t size(T*N);
181
typedef T array[N] ALIGN_ATTRIBUTE(size);
190
arrtype(vec_m128 &b) : m128(b) {};
191
arrtype(vec_m64 &b) : m64(b) {};
192
arrtype(vec_m32 &b) : m32(b) {};
193
arrtype(vec_m16 &b) : m16(b) {};
194
arrtype(vec_m8 &b) : m8(b) {};
195
arrtype(__m128 &b) : m128(reinterpret_cast<vec_m128 &>(b)) {};
196
arrtype(const typename simd<pbool,N>::arrtype &b);
199
void prefetch() const PURE_FUNCTION;
200
void prefetchw() const PURE_FUNCTION;
202
inline T &operator [](int i) {return v[i];}
204
// binop(op, t): declares two const overloads of binary operator `op`,
// one taking another arrtype and one taking a `const t &`.
#define binop(__op,t) \
  arrtype REF operator __op(const arrtype &b) const PURE_FUNCTION; \
  arrtype operator __op(const t &n) const PURE_FUNCTION;
218
// bool_binop(op, t): declares two const overloads of operator `op` that
// return simd<pbool,N>::arrtype, one taking a simd<t,N>::arrtype and one
// taking a `const T &`.
#define bool_binop(__op,t) \
  typename simd<pbool,N>::arrtype REF operator __op(const typename \
    simd<t,N>::arrtype &b) const PURE_FUNCTION; \
  typename simd<pbool,N>::arrtype operator __op(const T &n) const PURE_FUNCTION;
231
arrtype REF operator +() const { return *this; };
232
arrtype REF operator -() const PURE_FUNCTION;
233
arrtype REF operator ~() const PURE_FUNCTION;
234
typename simd<pbool,N>::arrtype REF operator !() const PURE_FUNCTION;
235
//arrtype operator *() const;
236
// Address-of is overloaded to return a ptr built from this object's address.
ptr operator &() { return ptr(static_cast<void *>(this)); };
237
// assignment operators
238
arrtype &operator =(const arrtype &b);
239
arrtype &operator =(const T &b);
240
arrtype &operator +=(const arrtype &b);
241
arrtype &operator +=(const T &b);
242
arrtype &operator -=(const arrtype &b);
243
arrtype &operator -=(const T &b);
244
arrtype &operator *=(const arrtype &b);
245
arrtype &operator *=(const T &b);
246
arrtype &operator /=(const arrtype &b);
247
arrtype &operator /=(const T &b);
248
arrtype &operator &=(const arrtype &b);
249
arrtype &operator &=(const T &b);
250
arrtype &operator ^=(const arrtype &b);
251
arrtype &operator ^=(const T &b);
252
arrtype &operator |=(const arrtype &b);
253
arrtype &operator |=(const T &b);
254
arrtype &operator %=(const arrtype &b);
255
arrtype &operator %=(const T &b);
256
arrtype &operator <<=(const arrtype &b);
257
arrtype &operator <<=(const int &b);
258
arrtype &operator >>=(const arrtype &b);
259
arrtype &operator >>=(const int &b);
261
arrtype REF sqrt() const PURE_FUNCTION;
262
arrtype REF aprx_sqrt() const PURE_FUNCTION;
263
arrtype REF rsqrt() const PURE_FUNCTION;
264
arrtype REF aprx_rsqrt() const PURE_FUNCTION;
265
arrtype REF recip() const PURE_FUNCTION;
266
arrtype REF aprx_recip() const PURE_FUNCTION;
267
arrtype REF max(const arrtype &b) const PURE_FUNCTION;
268
arrtype REF min(const arrtype &b) const PURE_FUNCTION;
269
arrtype REF avg(const arrtype &b) const PURE_FUNCTION;
270
arrtype REF splat(const int &b) const PURE_FUNCTION;
271
// conversion operators
272
operator typename simd<pbool,N>::arrtype() const;
273
operator __m128() { return *(reinterpret_cast<__m128 *>(this)); } ;
276
static const size_t size;
277
static const size_t nbits;
278
static volatile void simd_mode_start();
279
static volatile void simd_mode_finish();
287
// Implicit conversion: yields the held address p as an untyped pointer.
inline operator void *() const { return (void *)p; };
288
// Named accessor: returns the held address p as an untyped pointer.
inline void *addr() const { return (void *)p; };
289
// Build a ptr from a raw address; p is initialised from it through an
// unsigned char * cast.
inline ptr(void *a) : p(static_cast<unsigned char *>(a)) {};
290
inline ptr operator +(int i) const
291
{ return ptr(p+i*(N*sizeof(T))); };
292
inline ptr operator -(int i) const
293
{ return ptr(p-i*(N*sizeof(T))); };
294
// Difference between two ptr values, in units of N*sizeof(T) bytes.
// The divisor is converted to off_t first: dividing the signed byte
// difference by an unsigned size_t would convert a negative difference to a
// huge unsigned value before the division.
inline off_t operator -(simd<T,N>::ptr &pp) const
{ return (p-pp.p)/static_cast<off_t>(N*sizeof(T)); };
296
inline ptr &operator ++()
297
{ p+=(N*sizeof(T)); return *this; };
298
// Post-increment: advance p by N*sizeof(T) bytes and return a ptr built from
// the address held before the step. The `register` storage class is not used
// because it was removed from the language in C++17.
inline ptr operator ++(int i)
{ void *z(p); p+=(N*sizeof(T)); return z; };
300
inline ptr &operator --()
301
{ p-=(N*sizeof(T)); return *this; };
302
// Post-decrement: move p back by N*sizeof(T) bytes and return a ptr built
// from the address held before the step. The `register` storage class is not
// used because it was removed from the language in C++17.
inline ptr operator --(int i)
{ void *z(p); p-=(N*sizeof(T)); return z; };
304
inline ptr operator +=(int i)
305
{ return p+=(i*N*sizeof(T)); };
306
inline ptr operator -=(int i)
307
{ return p-=(i*N*sizeof(T)); };
308
inline arrtype &operator *();
309
inline const arrtype &operator [](const int n) const {
310
// Cast to the declared return type (const arrtype &), not to const T &.
return static_cast<const arrtype &>(*(const arrtype *)(p+n*N*sizeof(T)));
312
inline arrtype &operator [](const int n) {
313
return static_cast<arrtype &>(*(arrtype *)(p+n*N*sizeof(T)));
318
template <typename t, const size_t n>
319
const size_t simd<t,n>::size=n*sizeof(t);
321
template <typename t, const size_t n>
322
const size_t simd<t,n>::nbits=n*sizeof(t)*CHAR_BIT;
324
template <const int order, typename T, size_t N>
325
typename simd<T,N>::arrtype &shuffle(const typename simd<T,N>::arrtype &v)
329
// generics to fill in the missing stuff
330
#include "generics.h"
331
// stuff for specific processors
334
#elif defined(USE_SSE)
336
#elif defined(USE_SSE2)
338
#elif defined(USE_3DNOW)
340
#elif defined(USE_ALTIVEC)
342
#elif defined(USE_VIS)
344
#elif defined(USE_VIS2)
347
#include "more_generics.h"
350
typedef simd<double,2>::arrtype double2;
351
typedef simd<long long,2>::arrtype longlong2;
352
typedef simd<unsigned long long,2>::arrtype ulonglong2;
353
typedef simd<float,4>::arrtype float4;
354
typedef simd<long,4>::arrtype long4;
355
typedef simd<unsigned long,4>::arrtype ulong4;
356
typedef simd<short,8>::arrtype short8;
357
typedef simd<unsigned short,8>::arrtype ushort8;
358
typedef simd<char,16>::arrtype char16;
359
typedef simd<unsigned char,16>::arrtype uchar16;
360
typedef simd<signed char,16>::arrtype schar16;
362
typedef simd<float,2>::arrtype float2;
363
typedef simd<long,2>::arrtype long2;
364
typedef simd<unsigned long,2>::arrtype ulong2;
365
typedef simd<short,4>::arrtype short4;
366
typedef simd<unsigned short,4>::arrtype ushort4;
367
typedef simd<char,8>::arrtype char8;
368
typedef simd<unsigned char,8>::arrtype uchar8;
369
typedef simd<signed char,8>::arrtype schar8;
371
typedef simd<short,2>::arrtype short2;
372
typedef simd<unsigned short,2>::arrtype ushort2;
373
typedef simd<char,4>::arrtype char4;
374
typedef simd<unsigned char,4>::arrtype uchar4;
375
typedef simd<signed char,4>::arrtype schar4;