~choreonoid/choreonoid/debian

// Aligned load of four packed 32-bit ints; _mm_load_si128 requires `from`
// to be 16-byte aligned.
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  const Packet4i* src = reinterpret_cast<const Packet4i*>(from);
  return _mm_load_si128(src);
}

209

210

#if defined(_MSC_VER)

211

// Unaligned load of four floats (MSVC build).
// For MSVC10 (_MSC_VER==1600) the packet is assembled from two 64-bit half
// loads instead of a single _mm_loadu_ps; every other MSVC version uses
// _mm_loadu_ps directly.
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
#if (_MSC_VER==1600)
  // NOTE Some versions of MSVC10 generate bad code when using _mm_loadu_ps
  // (i.e., they do not generate an unaligned load!)
  // TODO On most architectures this version should also be faster than a single _mm_loadu_ps,
  // so we could also enable it for MSVC08, but first we have to make sure the latter
  // does not generate bad code when doing so...
  __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from)); // low two floats
  res = _mm_loadh_pi(res, (const __m64*)(from+2));                     // high two floats
  return res;
#else
  return _mm_loadu_ps(from);
#endif
}

225

// Unaligned load of two doubles (MSVC build): plain _mm_loadu_pd.
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  const Packet2d loaded = _mm_loadu_pd(from);
  return loaded;
}

226

// Unaligned load of four 32-bit ints (MSVC build): plain _mm_loadu_si128.
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  const Packet4i* src = reinterpret_cast<const Packet4i*>(from);
  return _mm_loadu_si128(src);
}

227

#else

228

// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
// require pointer casting to incompatible pointer types and lead to invalid code
// because of the strict aliasing rule. The "dummy" stuff is required to enforce
// a correct instruction dependency.
// TODO: do the same for MSVC (ICC is compatible)
// NOTE: with the code below, MSVC's compiler crashes!

234

235

// EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS is 1 when the ploadu specializations
// below must use the plain _mm_loadu_* intrinsics, and 0 when they may use
// the _mm_load_sd/_mm_loadh_pd two-half load sequence.
#if defined(__GNUC__) && defined(__i386__)
  // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#elif defined(__clang__)
  // bug 201: Segfaults in __mm_loadh_pd with clang 2.8
  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#else
  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
#endif

244

245

// Unaligned load of four floats (non-MSVC build).
// Uses _mm_loadu_ps when EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS is set; otherwise
// loads the two 64-bit halves as doubles and reinterprets the result as floats.
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
  return _mm_loadu_ps(from);
#else
  __m128d res;
  res = _mm_load_sd((const double*)(from)) ;          // floats 0..1 into the low half
  res = _mm_loadh_pd(res, (const double*)(from+2)) ;  // floats 2..3 into the high half
  return _mm_castpd_ps(res);
#endif
}

257

// Unaligned load of two doubles (non-MSVC build).
// Uses _mm_loadu_pd when EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS is set; otherwise
// loads the low and high double separately.
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
  return _mm_loadu_pd(from);
#else
  __m128d res;
  res = _mm_load_sd(from) ;        // from[0] into the low half
  res = _mm_loadh_pd(res,from+1);  // from[1] into the high half
  return res;
#endif
}

269

// Unaligned load of four 32-bit ints (non-MSVC build).
// Uses _mm_loadu_si128 when EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS is set;
// otherwise loads the two 64-bit halves as doubles and reinterprets the
// result as an integer packet.
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
  return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
#else
  __m128d res;
  res = _mm_load_sd((const double*)(from)) ;          // ints 0..1 into the low half
  res = _mm_loadh_pd(res, (const double*)(from+2)) ;  // ints 2..3 into the high half
  return _mm_castpd_si128(res);
#endif
}

281

#endif

282

283

// Loads two floats with a single 64-bit _mm_load_sd and passes the result to
// vec4f_swizzle1 with indices (0,0,1,1), i.e. each loaded float is duplicated.
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd((const double*)from)), 0, 0, 1, 1);
}

287

// Duplicating load for doubles: only from[0] is read and it is broadcast
// with pset1.
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{ return pset1<Packet2d>(from[0]); }

289

// Loads two ints with a single 64-bit _mm_loadl_epi64 and passes the result
// to vec4i_swizzle1 with indices (0,0,1,1), i.e. each loaded int is duplicated.
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
  Packet4i tmp;
  tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
}

295

296

// Aligned store of four floats; _mm_store_ps requires `to` to be 16-byte aligned.
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  _mm_store_ps(to, from);
}

297

// Aligned store of two doubles; _mm_store_pd requires `to` to be 16-byte aligned.
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  _mm_store_pd(to, from);
}

298

// Aligned store of four 32-bit ints; _mm_store_si128 requires `to` to be
// 16-byte aligned.
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  Packet4i* dst = reinterpret_cast<Packet4i*>(to);
  _mm_store_si128(dst, from);
}

299

300

// Unaligned store of two doubles, written as two separate 64-bit stores
// (low element to to[0], high element to to[1]).
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE
  _mm_storel_pd((to), from);
  _mm_storeh_pd((to+1), from);
}

305

// Unaligned store of four floats: the packet is reinterpreted as two doubles
// and forwarded to the double overload of pstoreu.
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  double* dst = reinterpret_cast<double*>(to);
  pstoreu(dst, _mm_castps_pd(from));
}

306

// Unaligned store of four ints: the packet is reinterpreted as two doubles
// and forwarded to the double overload of pstoreu.
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  double* dst = reinterpret_cast<double*>(to);
  pstoreu(dst, _mm_castsi128_pd(from));
}

307

308

// some compilers might be tempted to perform multiple moves instead of using a vector path.

309

// Stores the scalar `a` replicated into a whole packet at `to` (aligned store
// via pstore). The scalar is first placed in the low lane with _mm_set_ss and
// then passed to vec4f_swizzle1 with all indices 0 to fill every lane.
template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
{
  Packet4f pa = _mm_set_ss(a);
  pstore(to, vec4f_swizzle1(pa,0,0,0,0));
}

314

// some compilers might be tempted to perform multiple moves instead of using a vector path.

315

// Stores the scalar `a` replicated into a whole packet at `to` (aligned store
// via pstore). The scalar is first placed in the low lane with _mm_set_sd and
// then passed to vec2d_swizzle1 with both indices 0 to fill both lanes.
template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
{
  Packet2d pa = _mm_set_sd(a);
  pstore(to, vec2d_swizzle1(pa,0,0));
}

320

321

// Issues a prefetch with the _MM_HINT_T0 hint for the address `addr`.
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)
{
  const char* rawAddr = reinterpret_cast<const char*>(addr);
  _mm_prefetch(rawAddr, _MM_HINT_T0);
}

322

// Issues a prefetch with the _MM_HINT_T0 hint for the address `addr`.
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr)
{
  const char* rawAddr = reinterpret_cast<const char*>(addr);
  _mm_prefetch(rawAddr, _MM_HINT_T0);
}

323

// Issues a prefetch with the _MM_HINT_T0 hint for the address `addr`.
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr)
{
  const char* rawAddr = reinterpret_cast<const char*>(addr);
  _mm_prefetch(rawAddr, _MM_HINT_T0);
}

324

325

#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)

326

// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010

327

// Direct of the struct members fixed bug #62.

328

template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }

329

template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }

330

template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }

331

#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)

332

// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010

333

template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }

334

template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }

335

template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }

336

#else

337

template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }

338

template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }

339

template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }

340

#endif

341

342

// Reverses the four floats of `a`. Shuffle immediate 0x1B (binary 00 01 10 11)
// selects source elements 3,2,1,0 for result elements 0,1,2,3.
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{ return _mm_shuffle_ps(a,a,0x1B); }

344

// Swaps the two doubles of `a`. Shuffle immediate 0x1 takes the high element
// for result lane 0 and the low element for result lane 1.
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{ return _mm_shuffle_pd(a,a,0x1); }

346

// Reverses the four ints of `a`. Shuffle immediate 0x1B (binary 00 01 10 11)
// selects source elements 3,2,1,0 for result elements 0,1,2,3.
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{ return _mm_shuffle_epi32(a,0x1B); }

348

349

350

// Element-wise absolute value of four floats: ANDs every lane with
// 0x7FFFFFFF, which clears the IEEE-754 sign bit.
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
{
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm_and_ps(a,mask);
}

355

// Element-wise absolute value of two doubles. Each 64-bit lane is masked with
// (low word 0xFFFFFFFF, high word 0x7FFFFFFF), which clears only the sign bit.
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
{
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm_and_pd(a,mask);
}

360

// Element-wise absolute value of four 32-bit ints.
// With SSSE3 this is a single _mm_abs_epi32; otherwise it is computed as
// (a ^ s) - s, where s is the sign of each lane smeared over all 32 bits.
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSSE3
  return _mm_abs_epi32(a);
#else
  Packet4i aux = _mm_srai_epi32(a,31); // 0 for non-negative lanes, -1 for negative lanes
  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
#endif
}

369

370

// Expands the four floats of vecs[0] into four packets: on return vecs[i]
// holds element i of the original vecs[0] replicated in all four lanes.
// `vecs` must point to at least four packets.
EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
{
  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
  // vecs[0] is overwritten last because the three lines above still read it.
  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
}

377

378

#ifdef EIGEN_VECTORIZE_SSE3

379

// TODO implement SSE2 versions as well as integer versions

380

// SSE3 version: returns a packet whose element i is the sum of the four
// floats of vecs[i], using two levels of horizontal adds.
// `vecs` must point to four packets.
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
}

384

// SSE3 version: returns a packet whose element i is the sum of the two
// doubles of vecs[i]. `vecs` must point to two packets.
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  return _mm_hadd_pd(vecs[0], vecs[1]);
}

388

// SSSE3 version:

389

// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)

390

// {

391

// return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));

392

// }

393

394

// SSE3 version: sum of the four floats of `a`, via two horizontal adds.
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  Packet4f tmp0 = _mm_hadd_ps(a,a);        // pairwise sums
  return pfirst(_mm_hadd_ps(tmp0, tmp0));  // sum of the pairwise sums
}

399

400

// SSE3 version: sum of the two doubles of `a`, via one horizontal add.
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
  const Packet2d pairSum = _mm_hadd_pd(a, a);
  return pfirst(pairSum);
}

401

402

// SSSE3 version:

403

// EIGEN_STRONG_INLINE float predux(const Packet4i& a)

404

// {

405

// Packet4i tmp0 = _mm_hadd_epi32(a,a);

406

// return pfirst(_mm_hadd_epi32(tmp0, tmp0));

407

// }

408

#else

409

// SSE2 versions

410

// SSE2 version: sum of the four floats of `a`.
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  // Low two lanes become a0+a2 and a1+a3.
  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
  // Add lane 1 onto lane 0 and extract it.
  return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}

415

// SSE2 version: sum of the two doubles of `a` (low element plus high element).
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
  return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
}

419

420

// SSE2 version: returns a packet whose element i is the sum of the four
// floats of vecs[i]. `vecs` must point to four packets.
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  Packet4f tmp0, tmp1, tmp2;
  // Interleave vecs[0]/vecs[1] and add: lanes hold partial sums of 0 and 1.
  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
  tmp0 = _mm_add_ps(tmp0, tmp1);
  // Same for vecs[2]/vecs[3].
  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
  tmp1 = _mm_add_ps(tmp1, tmp2);
  // Gather the two halves of the partial sums and add them together.
  tmp2 = _mm_movehl_ps(tmp1, tmp0);
  tmp0 = _mm_movelh_ps(tmp0, tmp1);
  return _mm_add_ps(tmp0, tmp2);
}

433

434

// SSE2 version: returns a packet whose element i is the sum of the two
// doubles of vecs[i]. `vecs` must point to two packets.
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
}

438

#endif // SSE3

439

440

// Sum of the four ints of `a`.
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  // Low two lanes become a0+a2 and a1+a3.
  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
  // Extract lane 0 and lane 1 and add them as scalars.
  return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
}

445

446

// Returns a packet whose element i is the sum of the four ints of vecs[i].
// `vecs` must point to four packets.
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
  Packet4i tmp0, tmp1, tmp2;
  // Interleave vecs[0]/vecs[1] and add: lanes hold partial sums of 0 and 1.
  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
  tmp0 = _mm_add_epi32(tmp0, tmp1);
  // Same for vecs[2]/vecs[3].
  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  tmp1 = _mm_add_epi32(tmp1, tmp2);
  // Gather the two halves of the partial sums and add them together.
  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
  return _mm_add_epi32(tmp0, tmp2);
}

459

460

// Other reduction functions:

461

462

// mul

463

// Product of the four floats of `a`.
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
  // Low two lanes become a0*a2 and a1*a3.
  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
  // Multiply lane 1 into lane 0 and extract it.
  return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}

468

// Product of the two doubles of `a` (low element times high element).
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
  return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
}

472

// Product of the four ints of `a`, computed with scalar multiplies after
// spilling the packet to an aligned temporary array.
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (eg., reusing pmul is very slow !)
  // TODO try to call _mm_mul_epu32 directly
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}

481

482

// min

483

// Minimum of the four floats of `a`.
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
  // Low two lanes become min(a0,a2) and min(a1,a3).
  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
  // Combine lane 1 into lane 0 and extract it.
  return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}

488

// Minimum of the two doubles of `a`.
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
  return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
}

492

// Minimum of the four ints of `a`, computed with scalar comparisons after
// spilling the packet to an aligned temporary array.
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (eg., it does not like using std::min after the pstore !!)
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  // Pairwise minima; these two declarations were missing, leaving aux0/aux2 undefined.
  const int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
  const int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
  return aux0<aux2 ? aux0 : aux2;
}

502

503

// max

504

// Maximum of the four floats of `a`.
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
  // Low two lanes become max(a0,a2) and max(a1,a3).
  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
  // Combine lane 1 into lane 0 and extract it.
  return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}

509

// Maximum of the two doubles of `a`.
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
  return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
}

513

// Maximum of the four ints of `a`, computed with scalar comparisons after
// spilling the packet to an aligned temporary array.
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (eg., it does not like using std::max after the pstore !!)
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  // Pairwise maxima; these two declarations were missing, leaving aux0/aux2 undefined.
  const int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
  const int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
  return aux0>aux2 ? aux0 : aux2;
}

523

524

#if (defined __GNUC__)

525

// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)

526

// {

527

// Packet4f res = b;

528

// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));

529

// return res;

530

// }

531

// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i)

532

// {

533

// Packet4i res = a;

534

// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));

535

// return res;

536

// }

537

#endif

538

539

#ifdef EIGEN_VECTORIZE_SSSE3

540

// SSSE3 versions

541

template<int Offset>

542

struct palign_impl<Offset,Packet4f>

543

{

544

EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second)

545

{

546

if (Offset!=0)

547

first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));

548

}

549

};

550

551

template<int Offset>

552

struct palign_impl<Offset,Packet4i>

553

{

554

EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second)

555

{

556

if (Offset!=0)

557

first = _mm_alignr_epi8(second,first, Offset*4);

558

}

559

};

560

561

template<int Offset>

562

struct palign_impl<Offset,Packet2d>

563

{

564

EIGEN_STRONG_INLINE static void run(Packet2d& first, const Packet2d& second)

565

{

566

if (Offset==1)

567

first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));

568

}

569

};

570

#else

571

// SSE2 versions

572

template<int Offset>

573

struct palign_impl<Offset,Packet4f>

574

{

575

EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second)

576

{

577

if (Offset==1)

578

{

579

first = _mm_move_ss(first,second);

580

first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));

581

}

582

else if (Offset==2)

583

{

584

first = _mm_movehl_ps(first,first);

585

first = _mm_movelh_ps(first,second);

586

}

587

else if (Offset==3)

588

{

589

first = _mm_move_ss(first,second);

590

first = _mm_shuffle_ps(first,second,0x93);

591

}

592

}

593

};

594

595

template<int Offset>

596

struct palign_impl<Offset,Packet4i>

597

{

598

EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second)

599

{

600

if (Offset==1)

601

{

602

first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));

603

first = _mm_shuffle_epi32(first,0x39);

604

}

605

else if (Offset==2)

606

{

607

first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));

608

first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));

609

}

610

else if (Offset==3)

611

{

612

first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));

613

first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));

614

}

615

}

616

};

617

618

template<int Offset>

619

struct palign_impl<Offset,Packet2d>

620

{

621

EIGEN_STRONG_INLINE static void run(Packet2d& first, const Packet2d& second)

622

{

623

if (Offset==1)

624

{

625

first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));

626

first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));

627

}

628

}

629

};

630

#endif

631

632

} // end namespace internal

633

634

#endif // EIGEN_PACKET_MATH_SSE_H

Older »