// Source branch: ~ubuntu-branches/ubuntu/raring/vc/raring-proposed

template<typename T> static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;

137

template<typename T> static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;

138

139

OP0(allone, _mm_setallone_si128())

140

OP0(zero, _mm_setzero_si128())

141

OP2(or_, _mm_or_si128(a, b))

142

OP2(xor_, _mm_xor_si128(a, b))

143

OP2(and_, _mm_and_si128(a, b))

144

OP2(andnot_, _mm_andnot_si128(a, b))

145

OP3(blend, _mm_blendv_epi8(a, b, c))

146

};

147

148

// Release the generator macros used by the struct above so they cannot clash
// with the differently-shaped OP* macros defined next. OP0 is used by that
// struct as well (allone, zero), so it is released together with the others.
#undef OP0
#undef OP1
#undef OP2
#undef OP3

151

152

// Code generators for the VectorHelper specializations that follow. Every macro
// expands to static member function(s) and expects SUFFIX to be #defined to the
// intrinsic suffix of the element type at the point of use.

// op(x): forwards to the unary intrinsic _mm_<op>_<SUFFIX>.
#define OP1(op) \
    static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType arg) \
    { \
        return CAT(_mm_##op##_, SUFFIX)(arg); \
    }

// op(x, y): forwards to the binary intrinsic _mm_<op>_<SUFFIX>.
#define OP(op) \
    static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType lhs, const VectorType rhs) \
    { \
        return CAT(_mm_##op##_, SUFFIX)(lhs, rhs); \
    }

// Like OP, for names that already end in an underscore (or_, and_, xor_):
// op(x, y) forwards to _mm_<op><SUFFIX>.
#define OP_(op) \
    static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType lhs, const VectorType rhs) \
    { \
        return CAT(_mm_##op, SUFFIX)(lhs, rhs); \
    }

// Function named op that forwards to a differently named intrinsic
// _mm_<op2>_<SUFFIX>.
#define OPx(op, op2) \
    static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType lhs, const VectorType rhs) \
    { \
        return CAT(_mm_##op2##_, SUFFIX)(lhs, rhs); \
    }

// cmp<op>(x, y): forwards to the compare intrinsic _mm_cmp<op>_<SUFFIX>.
#define OPcmp(op) \
    static Vc_ALWAYS_INLINE Vc_CONST VectorType cmp##op(const VectorType lhs, const VectorType rhs) \
    { \
        return CAT(_mm_cmp##op##_, SUFFIX)(lhs, rhs); \
    }

// op(x, y) evaluated in the float domain: both operands are cast from SUFFIX
// to ps, combined with _mm_<op>ps and the result is cast back to SUFFIX.
#define OP_CAST_(op) \
    static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType lhs, const VectorType rhs) \
    { \
        return CAT(_mm_castps_, SUFFIX)( \
            _mm_##op##ps( \
                CAT(CAT(_mm_cast, SUFFIX), _ps)(lhs), \
                CAT(CAT(_mm_cast, SUFFIX), _ps)(rhs))); \
    }

// Entry-wise min(x, y) and max(x, y) via _mm_min_<SUFFIX> / _mm_max_<SUFFIX>.
#define MINMAX \
    static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType lhs, VectorType rhs) \
    { \
        return CAT(_mm_min_, SUFFIX)(lhs, rhs); \
    } \
    static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType lhs, VectorType rhs) \
    { \
        return CAT(_mm_max_, SUFFIX)(lhs, rhs); \
    }

170

171

template<> struct VectorHelper<double> {

172

typedef _M128D VectorType;

173

typedef double EntryType;

174

#define SUFFIX pd

175

176

OP_(or_) OP_(and_) OP_(xor_)

177

static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_pd(mask), a); }

178

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return CAT(_mm_set1_, SUFFIX)(a); }

179

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return CAT(_mm_set_, SUFFIX)(a, b); }

180

static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); }

181

static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }// set(1.); }

182

183

#ifdef VC_IMPL_FMA4

184

static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {

185

v1 = _mm_macc_pd(v1, v2, v3);

186

}

187

#else

188

static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {

189

VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));

190

VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));

191

#if defined(VC_GCC) && VC_GCC < 0x40703

192

// GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot

193

// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703

194

asm("":"+x"(h1), "+x"(h2));

195

#endif

196

const VectorType l1 = _mm_sub_pd(v1, h1);

197

const VectorType l2 = _mm_sub_pd(v2, h2);

198

const VectorType ll = mul(l1, l2);

199

const VectorType lh = add(mul(l1, h2), mul(h1, l2));

200

const VectorType hh = mul(h1, h2);

201

// ll < lh < hh for all entries is certain

202

const VectorType lh_lt_v3 = cmplt(abs(lh), abs(v3)); // |lh| < |v3|

203

const VectorType b = _mm_blendv_pd(v3, lh, lh_lt_v3);

204

const VectorType c = _mm_blendv_pd(lh, v3, lh_lt_v3);

205

v1 = add(add(ll, b), add(c, hh));

206

}

207

#endif

208

209

OP(add) OP(sub) OP(mul)

210

OPcmp(eq) OPcmp(neq)

211

OPcmp(lt) OPcmp(nlt)

212

OPcmp(le) OPcmp(nle)

213

214

OP1(sqrt)

215

static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) {

216

return _mm_div_pd(one(), sqrt(x));

217

}

218

static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {

219

return _mm_div_pd(one(), x);

220

}

221

static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {

222

return _mm_cmpunord_pd(x, x);

223

}

224

static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {

225

return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x));

226

}

227

static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {

228

return CAT(_mm_and_, SUFFIX)(a, _mm_setabsmask_pd());

229

}

230

231

MINMAX

232

static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {

233

a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));

234

return _mm_cvtsd_f64(a);

235

}

236

static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {

237

a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));

238

return _mm_cvtsd_f64(a);

239

}

240

static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {

241

a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));

242

return _mm_cvtsd_f64(a);

243

}

244

static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {

245

a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));

246

return _mm_cvtsd_f64(a);

247

}

248

#undef SUFFIX

249

static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {

250

#ifdef VC_IMPL_SSE4_1

251

return _mm_round_pd(a, _MM_FROUND_NINT);

252

#else

253

//XXX: slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);

254

return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));

255

#endif

256

}

257

};

258

259

template<> struct VectorHelper<float> {

260

typedef float EntryType;

261

typedef _M128 VectorType;

262

#define SUFFIX ps

263

264

OP_(or_) OP_(and_) OP_(xor_)

265

static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(mask, a); }

266

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return CAT(_mm_set1_, SUFFIX)(a); }

267

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { return CAT(_mm_set_, SUFFIX)(a, b, c, d); }

268

static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); }

269

static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }// set(1.f); }

270

static Vc_ALWAYS_INLINE Vc_CONST _M128 concat(_M128D a, _M128D b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); }

271

272

#ifdef VC_IMPL_FMA4

273

static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {

274

v1 = _mm_macc_ps(v1, v2, v3);

275

}

276

#else

277

static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {

278

__m128d v1_0 = _mm_cvtps_pd(v1);

279

__m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1));

280

__m128d v2_0 = _mm_cvtps_pd(v2);

281

__m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2));

282

__m128d v3_0 = _mm_cvtps_pd(v3);

283

__m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3));

284

v1 = _mm_movelh_ps(

285

_mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)),

286

_mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1)));

287

}

288

#endif

289

290

OP(add) OP(sub) OP(mul)

291

OPcmp(eq) OPcmp(neq)

292

OPcmp(lt) OPcmp(nlt)

293

OPcmp(le) OPcmp(nle)

294

295

OP1(sqrt) OP1(rsqrt)

296

static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {

297

return _mm_cmpunord_ps(x, x);

298

}

299

static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {

300

return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x));

301

}

302

static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {

303

return _mm_rcp_ps(x);

304

}

305

static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {

306

return CAT(_mm_and_, SUFFIX)(a, _mm_setabsmask_ps());

307

}

308

309

MINMAX

310

static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {

311

a = _mm_min_ps(a, _mm_movehl_ps(a, a)); // a = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)

312

a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = min(a0, a1), a1, a2, a3

313

return _mm_cvtss_f32(a);

314

}

315

static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {

316

a = _mm_max_ps(a, _mm_movehl_ps(a, a)); // a = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)

317

a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = max(a0, a1), a1, a2, a3

318

return _mm_cvtss_f32(a);

319

}

320

static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {

321

a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));

322

a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));

323

return _mm_cvtss_f32(a);

324

}

325

static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {

326

a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));

327

a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));

328

return _mm_cvtss_f32(a);

329

}

330

#undef SUFFIX

331

static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {

332

#ifdef VC_IMPL_SSE4_1

333

return _mm_round_ps(a, _MM_FROUND_NINT);

334

#else

335

//XXX slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);

336

return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));

337

#endif

338

}

339

};

340

341

template<> struct VectorHelper<float8> {

342

typedef float EntryType;

343

typedef M256 VectorType;

344

#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN

345

typedef const VectorType &VectorTypeArg;

346

#else

347

typedef const VectorType VectorTypeArg;

348

#endif

349

350

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) {

351

const _M128 x = _mm_set1_ps(a);

352

return VectorType::create(x, x);

353

}

354

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) {

355

const _M128 x = _mm_set_ps(a, b, c, d);

356

return VectorType::create(x, x);

357

}

358

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,

359

const float e, const float f, const float g, const float h) {

360

return VectorType::create(_mm_set_ps(a, b, c, d), _mm_set_ps(e, f, g, h));

361

}

362

static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return VectorType::create(_mm_setzero_ps(), _mm_setzero_ps()); }

363

static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return set(1.f); }

364

365

#define REUSE_FLOAT_IMPL1(fun) \

366

static Vc_ALWAYS_INLINE Vc_CONST VectorType fun(VectorTypeArg x) { \

367

return VectorType::create(VectorHelper<float>::fun(x[0]), VectorHelper<float>::fun(x[1])); \

368

}

369

#define REUSE_FLOAT_IMPL2(fun) \

370

static Vc_ALWAYS_INLINE Vc_CONST VectorType fun(VectorTypeArg x, VectorTypeArg y) { \

371

return VectorType::create(VectorHelper<float>::fun(x[0], y[0]), VectorHelper<float>::fun(x[1], y[1])); \

372

}

373

REUSE_FLOAT_IMPL1(reciprocal)

374

REUSE_FLOAT_IMPL1(sqrt)

375

REUSE_FLOAT_IMPL1(rsqrt)

376

REUSE_FLOAT_IMPL1(isNaN)

377

REUSE_FLOAT_IMPL1(isFinite)

378

REUSE_FLOAT_IMPL1(abs)

379

REUSE_FLOAT_IMPL1(round)

380

381

REUSE_FLOAT_IMPL2(and_)

382

REUSE_FLOAT_IMPL2(or_)

383

REUSE_FLOAT_IMPL2(xor_)

384

REUSE_FLOAT_IMPL2(notMaskedToZero)

385

REUSE_FLOAT_IMPL2(add)

386

REUSE_FLOAT_IMPL2(sub)

387

REUSE_FLOAT_IMPL2(mul)

388

REUSE_FLOAT_IMPL2(cmple)

389

REUSE_FLOAT_IMPL2(cmpnle)

390

REUSE_FLOAT_IMPL2(cmplt)

391

REUSE_FLOAT_IMPL2(cmpnlt)

392

REUSE_FLOAT_IMPL2(cmpeq)

393

REUSE_FLOAT_IMPL2(cmpneq)

394

REUSE_FLOAT_IMPL2(min)

395

REUSE_FLOAT_IMPL2(max)

396

397

static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorTypeArg a) {

398

return VectorHelper<float>::min(VectorHelper<float>::min(a[0], a[1]));

399

}

400

static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorTypeArg a) {

401

return VectorHelper<float>::max(VectorHelper<float>::max(a[0], a[1]));

402

}

403

static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorTypeArg a) {

404

return VectorHelper<float>::mul(VectorHelper<float>::mul(a[0], a[1]));

405

}

406

static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorTypeArg a) {

407

return VectorHelper<float>::add(VectorHelper<float>::add(a[0], a[1]));

408

}

409

410

static inline void fma(VectorType &a, VectorTypeArg b, VectorTypeArg c) {

411

VectorHelper<float>::fma(a[0], b[0], c[0]);

412

VectorHelper<float>::fma(a[1], b[1], c[1]);

413

}

414

#undef REUSE_FLOAT_IMPL2

415

#undef REUSE_FLOAT_IMPL1

416

};

417

418

template<> struct VectorHelper<int> {

419

typedef int EntryType;

420

typedef _M128I VectorType;

421

#define SUFFIX si128

422

423

OP_(or_) OP_(and_) OP_(xor_)

424

static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); }

425

static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }

426

#undef SUFFIX

427

#define SUFFIX epi32

428

static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }

429

430

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return CAT(_mm_set1_, SUFFIX)(a); }

431

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return CAT(_mm_set_, SUFFIX)(a, b, c, d); }

432

433

static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

434

435

static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {

436

return CAT(_mm_slli_, SUFFIX)(a, shift);

437

}

438

static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {

439

return CAT(_mm_srai_, SUFFIX)(a, shift);

440

}

441

OP1(abs)

442

443

MINMAX

444

static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {

445

a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

446

// using lo_epi16 for speed here

447

a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

448

return _mm_cvtsi128_si32(a);

449

}

450

static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {

451

a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

452

// using lo_epi16 for speed here

453

a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

454

return _mm_cvtsi128_si32(a);

455

}

456

static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {

457

a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

458

a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

459

return _mm_cvtsi128_si32(a);

460

}

461

#ifdef VC_IMPL_SSE4_1

462

static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); }

463

static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {

464

a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

465

a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

466

return _mm_cvtsi128_si32(a);

467

}

468

#else

469

static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) {

470

const VectorType aShift = _mm_srli_si128(a, 4);

471

const VectorType ab02 = _mm_mul_epu32(a, b); // [a0 * b0, a2 * b2]

472

const VectorType bShift = _mm_srli_si128(b, 4);

473

const VectorType ab13 = _mm_mul_epu32(aShift, bShift); // [a1 * b1, a3 * b3]

474

return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));

475

}

476

#endif

477

478

OP(add) OP(sub)

479

OPcmp(eq)

480

OPcmp(lt)

481

OPcmp(gt)

482

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { _M128I x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }

483

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { _M128I x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }

484

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { _M128I x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }

485

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); }

486

#undef SUFFIX

487

static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }

488

};

489

490

template<> struct VectorHelper<unsigned int> {

491

typedef unsigned int EntryType;

492

typedef _M128I VectorType;

493

#define SUFFIX si128

494

OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)

495

static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); }

496

static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }

497

498

#undef SUFFIX

499

#define SUFFIX epu32

500

static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }

501

502

MINMAX

503

static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {

504

a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

505

// using lo_epi16 for speed here

506

a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

507

return _mm_cvtsi128_si32(a);

508

}

509

static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {

510

a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

511

// using lo_epi16 for speed here

512

a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

513

return _mm_cvtsi128_si32(a);

514

}

515

static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {

516

a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

517

// using lo_epi16 for speed here

518

a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

519

return _mm_cvtsi128_si32(a);

520

}

521

static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {

522

a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

523

// using lo_epi16 for speed here

524

a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

525

return _mm_cvtsi128_si32(a);

526

}

527

528

static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

529

530

static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b) {

531

return VectorHelper<int>::mul(a, b);

532

}

533

//X template<unsigned int b> static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) {

534

//X switch (b) {

535

//X case 0: return zero();

536

//X case 1: return a;

537

//X case 2: return _mm_slli_epi32(a, 1);

538

//X case 4: return _mm_slli_epi32(a, 2);

539

//X case 8: return _mm_slli_epi32(a, 3);

540

//X case 16: return _mm_slli_epi32(a, 4);

541

//X case 32: return _mm_slli_epi32(a, 5);

542

//X case 64: return _mm_slli_epi32(a, 6);

543

//X case 128: return _mm_slli_epi32(a, 7);

544

//X case 256: return _mm_slli_epi32(a, 8);

545

//X case 512: return _mm_slli_epi32(a, 9);

546

//X case 1024: return _mm_slli_epi32(a, 10);

547

//X case 2048: return _mm_slli_epi32(a, 11);

548

//X }

549

//X return mul(a, set(b));

550

//X }

551

552

#undef SUFFIX

553

#define SUFFIX epi32

554

static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {

555

return CAT(_mm_slli_, SUFFIX)(a, shift);

556

}

557

static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {

558

return CAT(_mm_srli_, SUFFIX)(a, shift);

559

}

560

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a) { return CAT(_mm_set1_, SUFFIX)(a); }

561

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) { return CAT(_mm_set_, SUFFIX)(a, b, c, d); }

562

563

OP(add) OP(sub)

564

OPcmp(eq)

565

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }

566

567

#ifndef USE_INCORRECT_UNSIGNED_COMPARE

568

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmplt(const VectorType a, const VectorType b) {

569

return _mm_cmplt_epu32(a, b);

570

}

571

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpgt(const VectorType a, const VectorType b) {

572

return _mm_cmpgt_epu32(a, b);

573

}

574

#else

575

OPcmp(lt)

576

OPcmp(gt)

577

#endif

578

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }

579

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }

580

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); }

581

582

#undef SUFFIX

583

static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }

584

};

585

586

template<> struct VectorHelper<signed short> {

587

typedef _M128I VectorType;

588

typedef signed short EntryType;

589

#define SUFFIX si128

590

591

OP_(or_) OP_(and_) OP_(xor_)

592

static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); }

593

static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }

594

static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packs_epi32(a, b); }

595

static Vc_ALWAYS_INLINE Vc_CONST _M128I expand0(_M128I x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); }

596

static Vc_ALWAYS_INLINE Vc_CONST _M128I expand1(_M128I x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); }

597

598

#undef SUFFIX

599

#define SUFFIX epi16

600

static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }

601

602

static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {

603

return CAT(_mm_slli_, SUFFIX)(a, shift);

604

}

605

static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {

606

return CAT(_mm_srai_, SUFFIX)(a, shift);

607

}

608

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }

609

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,

610

const EntryType e, const EntryType f, const EntryType g, const EntryType h) {

611

return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);

612

}

613

614

static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {

615

v1 = add(mul(v1, v2), v3); }

616

617

OP1(abs)

618

619

OPx(mul, mullo)

620

OP(min) OP(max)

621

static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {

622

// reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"

623

a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

624

a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

625

a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));

626

return _mm_cvtsi128_si32(a); // & 0xffff is implicit

627

}

628

static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {

629

// reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"

630

a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

631

a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

632

a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));

633

return _mm_cvtsi128_si32(a); // & 0xffff is implicit

634

}

635

static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {

636

a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

637

a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

638

a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));

639

return _mm_cvtsi128_si32(a); // & 0xffff is implicit

640

}

641

static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {

642

a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

643

a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

644

a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));

645

return _mm_cvtsi128_si32(a); // & 0xffff is implicit

646

}

647

648

OP(add) OP(sub)

649

OPcmp(eq)

650

OPcmp(lt)

651

OPcmp(gt)

652

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { _M128I x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }

653

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { _M128I x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }

654

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { _M128I x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }

655

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); }

656

#undef SUFFIX

657

static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }

658

};

659

660

template<> struct VectorHelper<unsigned short> {

661

typedef _M128I VectorType;

662

typedef unsigned short EntryType;

663

#define SUFFIX si128

664

OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)

665

static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); }

666

static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }

667

#ifdef VC_IMPL_SSE4_1

668

static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packus_epi32(a, b); }

669

#else

670

// XXX too bad, but this is broken without SSE 4.1

671

static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packs_epi32(a, b); }

672

#endif

673

static Vc_ALWAYS_INLINE Vc_CONST _M128I expand0(_M128I x) { return _mm_srli_epi32(_mm_unpacklo_epi16(x, x), 16); }

674

static Vc_ALWAYS_INLINE Vc_CONST _M128I expand1(_M128I x) { return _mm_srli_epi32(_mm_unpackhi_epi16(x, x), 16); }

675

676

#undef SUFFIX

677

#define SUFFIX epu16

678

static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }

679

680

//X template<unsigned int b> static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) {

681

//X switch (b) {

682

//X case 0: return zero();

683

//X case 1: return a;

684

//X case 2: return _mm_slli_epi16(a, 1);

685

//X case 4: return _mm_slli_epi16(a, 2);

686

//X case 8: return _mm_slli_epi16(a, 3);

687

//X case 16: return _mm_slli_epi16(a, 4);

688

//X case 32: return _mm_slli_epi16(a, 5);

689

//X case 64: return _mm_slli_epi16(a, 6);

690

//X case 128: return _mm_slli_epi16(a, 7);

691

//X case 256: return _mm_slli_epi16(a, 8);

692

//X case 512: return _mm_slli_epi16(a, 9);

693

//X case 1024: return _mm_slli_epi16(a, 10);

694

//X case 2048: return _mm_slli_epi16(a, 11);

695

//X }

696

//X return mul(a, set(b));

697

//X }

698

#if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || VC_IMPL_SSE4_1

699

OP(min) OP(max)

700

#endif

701

#undef SUFFIX

702

#define SUFFIX epi16

703

static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {

704

return CAT(_mm_slli_, SUFFIX)(a, shift);

705

}

706

static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {

707

return CAT(_mm_srli_, SUFFIX)(a, shift);

708

}

709

710

static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

711

712

OPx(mul, mullo) // should work correctly for all values

713

#if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(VC_IMPL_SSE4_1)

714

OP(min) OP(max) // XXX breaks for values with MSB set

715

#endif

716

static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {

717

// reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"

718

a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

719

a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

720

a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));

721

return _mm_cvtsi128_si32(a); // & 0xffff is implicit

722

}

723

static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {

724

// reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"

725

a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

726

a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

727

a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));

728

return _mm_cvtsi128_si32(a); // & 0xffff is implicit

729

}

730

static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {

731

// reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"

732

a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

733

a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

734

a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));

735

return _mm_cvtsi128_si32(a); // & 0xffff is implicit

736

}

737

static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {

738

// reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"

739

a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

740

a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

741

a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));

742

return _mm_cvtsi128_si32(a); // & 0xffff is implicit

743

}

744

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }

745

static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c,

746

const EntryType d, const EntryType e, const EntryType f,

747

const EntryType g, const EntryType h) {

748

return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);

749

}

750

751

OP(add) OP(sub)

752

OPcmp(eq)

753

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }

754

755

#ifndef USE_INCORRECT_UNSIGNED_COMPARE

756

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmplt(const VectorType a, const VectorType b) {

757

return _mm_cmplt_epu16(a, b);

758

}

759

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpgt(const VectorType a, const VectorType b) {

760

return _mm_cmpgt_epu16(a, b);

761

}

762

#else

763

OPcmp(lt)

764

OPcmp(gt)

765

#endif

766

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }

767

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }

768

static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); }

769

#undef SUFFIX

770

static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }

771

};

772

// Release every generator macro defined for the VectorHelper specializations so
// that none of them leaks into code including this header. OP_CAST_ and MINMAX
// are defined alongside the others and are therefore released here as well.
#undef OP1
#undef OP
#undef OP_
#undef OPx
#undef OPcmp
#undef OP_CAST_
#undef MINMAX

777

778

} // namespace SSE

779

} // namespace Vc

780

781

#include "vectorhelper.tcc"

782

#include "undomacros.h"

783

784

#endif // SSE_VECTORHELPER_H

// (web view pagination: "Older »")