~ubuntu-branches/ubuntu/wily/boinc-app-seti/wily-proposed

« back to all changes in this revision

Viewing changes to client/vector/analyzeFuncs_sse.cpp

Committer: Package Import Robot
Author(s): Gianfranco Costamagna
Date: 2014-04-14 00:10:11 UTC
mfrom: (1.1.11)
Revision ID: package-import@ubuntu.com-20140414001011-uos97gr5k8imsx8e

Tags: 7.28~svn2203-1

* New upstream release, patch refresh.
* Drop fix-armel.patch and fix-ftbfs-arm64.patch, addressed
upstream.

files added:
client/working_collect2_line_for_android_armv6-neon

client/working_collect2_line_for_android_armv6-vfp

files removed:
.pc/fix-armel.patch

.pc/fix-armel.patch/configure.ac

.pc/fix-ftbfs-arm64.patch

.pc/fix-ftbfs-arm64.patch/client

.pc/fix-ftbfs-arm64.patch/client/vector

.pc/fix-ftbfs-arm64.patch/client/vector/fp_arm.h

debian/patches/addressing-autosetup-warnings.patch

debian/patches/fix-armel.patch

debian/patches/fix-ftbfs-arm64.patch

files modified:
.pc/003_dont_use_own_jpeglib_and_glut.patch/Makefile.am

.pc/003_dont_use_own_jpeglib_and_glut.patch/configure.ac

.pc/004_disable_altivec_on_ppc_linux.patch/client/Makefile.am

.pc/007_worker_comments.patch/client/worker.cpp

.pc/210_fix_fgets_warning.patch/client/seti_header.cpp

.pc/211_give_stderr_some_output.patch/client/main.cpp

.pc/212_increase_buffers.patch/client/seti.cpp

.pc/213_const_warning_reduction.patch/client/main.cpp

.pc/213_const_warning_reduction.patch/client/s_util.cpp

.pc/213_const_warning_reduction.patch/client/seti_header.cpp

.pc/applied-patches

.pc/disable_avx_in_configure.patch/configure.ac

.pc/disable_avx_in_configure.patch/m4/sah_avx.m4

Makefile.am

Makefile.incl

build_android_client.sh

client/Makefile.am

client/amd64fft8g.cpp

client/analyzeFuncs.cpp

client/analyzeFuncs.h

client/analyzePoT.cpp

client/analyzeReport.cpp

client/autocorr.cpp

client/chirpfft.cpp

client/fft8g.cpp

client/fft8g.h

client/gaussfit.cpp

client/gdata.cpp

client/lcgamm.cpp

client/main.cpp

client/malloc_a.cpp

client/malloc_a.h

client/progress.cpp

client/pulsefind.cpp

client/pulsefind.h

client/s_util.cpp

client/sah_gfx.cpp

client/seti.cpp

client/seti.h

client/seti_header.cpp

client/spike.cpp

client/timecvt.cpp

client/vector/analyzeFuncs_altivec.cpp

client/vector/analyzeFuncs_avx.cpp

client/vector/analyzeFuncs_fpu.cpp

client/vector/analyzeFuncs_mmx.cpp

client/vector/analyzeFuncs_sse.cpp

client/vector/analyzeFuncs_sse2.cpp

client/vector/analyzeFuncs_sse3.cpp

client/vector/analyzeFuncs_vector.cpp

client/vector/analyzeFuncs_vector.h

client/vector/analyzeFuncs_vfp_aux.cpp

client/vector/analyzeFuncs_x86_64.cpp

client/vector/fp_arm.h

client/vector/sighandler.h

client/vector/x86_ops.h

client/worker.cpp

configure.ac

db/schema_master.cpp

db/schema_master.h

db/tools/analysis_configs.xml

db/tools/insert_splitter_config.cpp

db/tools/settings.sql

db/tools/splitter_configs.xml

db/xml_util.h

debian/changelog

debian/patches/003_dont_use_own_jpeglib_and_glut.patch

debian/patches/007_worker_comments.patch

debian/patches/210_fix_fgets_warning.patch

debian/patches/211_give_stderr_some_output.patch

debian/patches/212_increase_buffers.patch

debian/patches/213_const_warning_reduction.patch

debian/patches/disable_avx_in_configure.patch

debian/patches/series

m4/sah_avx.m4

Show diffs side-by-side

added added

removed removed

client/vector/analyzeFuncs_sse.cpp

// appropriate part of a YLINE by XLINE matrix. "IN" points to the first

// (lowest address) element of the input submatrix. "OUT" points to the

// first (lowest address) element of the output submatrix.

#ifdef USE_MANUAL_CALLSTACK

static char name[64];

if (name[0]==0) sprintf(name,"v_pfsubTranspose<%d>()",x);

call_stack.enter(name);

#endif

int i,j;

float *p;

}

prefetcht0(in+j*xline+x);

}

#ifdef USE_MANUAL_CALLSTACK

call_stack.exit();

#endif

}

int v_pfTranspose2(int x, int y, float *in, float *out) {

100

// Attempts to improve cache hit ratio by transposing 4 elements at a time.

101

#ifdef USE_MANUAL_CALLSTACK

102

call_stack.enter("v_pfTranspose2()");

103

#endif

104

int i,j;

105

for (j=0;j<y-1;j+=2) {

106

for (i=0;i<x-1;i+=2) {

104

115

out[i*y+j]=in[j*x+i];

105

116

}

106

117

}

118

#ifdef USE_MANUAL_CALLSTACK

119

call_stack.exit();

120

#endif

107

121

return 0;

108

122

}

109

123

110

124

int v_pfTranspose4(int x, int y, float *in, float *out) {

111

125

// Attempts to improve cache hit ratio by transposing 16 elements at a time.

126

#ifdef USE_MANUAL_CALLSTACK

127

call_stack.enter("v_pfTranspose4()");

128

#endif

112

129

int i,j;

113

130

for (j=0;j<y-3;j+=4) {

114

131

for (i=0;i<x-3;i+=4) {

134

151

out[i*y+j]=in[j*x+i];

135

152

}

136

153

}

154

#ifdef USE_MANUAL_CALLSTACK

155

call_stack.exit();

156

#endif

137

157

return 0;

138

158

}

139

159

140

160

int v_pfTranspose8(int x, int y, float *in, float *out) {

141

161

// Attempts to improve cache hit ratio by transposing 64 elements at a time.

162

#ifdef USE_MANUAL_CALLSTACK

163

call_stack.enter("v_pfTranspose8()");

164

#endif

142

165

int i,j;

143

166

for (j=0;j<y-7;j+=8) {

144

167

for (i=0;i<x-7;i+=8) {

178

201

out[i*y+j]=in[j*x+i];

179

202

}

180

203

}

204

#ifdef USE_MANUAL_CALLSTACK

205

call_stack.exit();

206

#endif

181

207

return 0;

182

208

}

183

209

185

211

inline void v_vsubTranspose4(float *in, float *out, int xline, int yline) {

186

212

// do a 4x4 transpose in the SSE registers.

187

213

// This could probably be optimized a bit further.

214

#ifdef USE_MANUAL_CALLSTACK

215

call_stack.enter("v_vsubTranspose4()");

216

#endif

188

217

prefetcht0(out);

189

218

prefetcht0(out+yline);

190

219

prefetcht0(out+2*yline);

191

220

prefetcht0(out+3*yline);

192

221

// TODO: figure out why the intrinsic version crashes for MinGW build

193

222

// not critical, but shuffle-only _MM_TRANSPOSE4_PS is optimal on some

194

#if defined(USE_INTRINSICS) && defined(_MM_TRANSPOSE4_PS) && !defined(__GNUC__)

223

#if defined(USE_INTRINSICS) && (defined(_MM_TRANSPOSE4_PS) && !defined(__GNUC__) || defined(__clang__))

195

224

196

225

197

226

236

265

prefetcht0(in+1*xline+4);

237

266

prefetcht0(in+2*xline+4);

238

267

prefetcht0(in+3*xline+4);

268

#ifdef USE_MANUAL_CALLSTACK

269

call_stack.exit();

270

#endif

239

271

}

240

272

241

273

int v_vTranspose4(int x, int y, float *in, float *out) {

274

#ifdef USE_MANUAL_CALLSTACK

275

call_stack.enter("v_vTranspose4()");

276

#endif

242

277

int i,j;

243

278

for (j=0;j<y-3;j+=4) {

244

279

for (i=0;i<x-3;i+=4) {

264

299

out[i*y+j]=in[j*x+i];

265

300

}

266

301

}

302

#ifdef USE_MANUAL_CALLSTACK

303

call_stack.exit();

304

#endif

267

305

return 0;

268

306

}

269

307

inline void v_vsubTranspose4np(float *in, float *out, int xline, int yline) {

308

#ifdef USE_MANUAL_CALLSTACK

309

call_stack.enter("v_vsubTranspose4np()");

310

#endif

270

311

// do a 4x4 transpose in the SSE registers.

271

312

// This could probably be optimized a bit further.

272

313

// JWS: No prefetches in this version, faster on some systems.

273

314

274

315

// TODO: figure out why the intrinsic version crashes for MinGW build

275

316

// not critical, but the shuffle-only _MM_TRANSPOSE4_PS is optimal on some

276

#if defined(USE_INTRINSICS) && defined(_MM_TRANSPOSE4_PS) && !defined(__GNUC__)

317

#if defined(USE_INTRINSICS) && (defined(_MM_TRANSPOSE4_PS) && !defined(__GNUC__) || defined(__clang__))

277

318

278

319

279

320

314

355

// no intrinsics, no GCC, just do something which should work

315

356

v_pfsubTranspose<4>(in, out, xline, yline);

316

357

#endif

358

#ifdef USE_MANUAL_CALLSTACK

359

call_stack.exit();

360

#endif

317

361

}

318

362

319

363

int v_vTranspose4np(int x, int y, float *in, float *out) {

364

#ifdef USE_MANUAL_CALLSTACK

365

call_stack.enter("v_vTranspose4np()");

366

#endif

320

367

int i,j;

321

368

for (j=0;j<y-3;j+=4) {

322

369

for (i=0;i<x-3;i+=4) {

331

378

out[i*y+j]=in[j*x+i];

332

379

}

333

380

}

381

#ifdef USE_MANUAL_CALLSTACK

382

call_stack.exit();

383

#endif

334

384

return 0;

335

385

}

336

386

337

387

// Following section disappears without intrinsics

338

388

#ifdef USE_INTRINSICS

339

389

inline void v_vsubTranspose4ntw(float *in, float *out, int xline, int yline) {

390

#ifdef USE_MANUAL_CALLSTACK

391

call_stack.enter("v_vsubTranspose4ntw()");

392

#endif

340

393

// Do a 4x4 transpose in the SSE registers, non-temporal writes.

341

394

// An sfence is needed after using this sub to ensure global visibility of the writes.

342

395

__m128 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;

361

414

tmp6 = _mm_movehl_ps(tmp6,tmp2); // d3 c3 b3 a3

362

415

_mm_stream_ps(out+2*yline, tmp5);

363

416

_mm_stream_ps(out+3*yline, tmp6);

417

#ifdef USE_MANUAL_CALLSTACK

418

call_stack.exit();

419

#endif

364

420

}

365

421

366

422

int v_vTranspose4ntw(int x, int y, float *in, float *out) {

423

#ifdef USE_MANUAL_CALLSTACK

424

call_stack.enter("v_vTranspose4ntw()");

425

#endif

367

426

int i,j;

368

427

for (j=0;j<y-3;j+=4) {

369

428

for (i=0;i<x-3;i+=4) {

379

438

}

380

439

}

381

440

_mm_sfence();

441

#ifdef USE_MANUAL_CALLSTACK

442

call_stack.exit();

443

#endif

382

444

return 0;

383

445

}

384

446

385

447

int v_vTranspose4x8ntw(int x, int y, float *in, float *out) {

448

#ifdef USE_MANUAL_CALLSTACK

449

call_stack.enter("v_vTranspose4x8ntw()");

450

#endif

386

451

int i,j;

387

452

for (j=0;j<y-7;j+=8) {

388

453

for (i=0;i<x-3;i+=4) {

399

464

}

400

465

}

401

466

_mm_sfence();

467

#ifdef USE_MANUAL_CALLSTACK

468

call_stack.exit();

469

#endif

402

470

return 0;

403

471

}

404

472

405

473

int v_vTranspose4x16ntw(int x, int y, float *in, float *out) {

474

#ifdef USE_MANUAL_CALLSTACK

475

call_stack.enter("v_vTranspose4x16ntw()");

476

#endif

406

477

int i,j;

407

478

for (j=0;j<y-15;j+=16) {

408

479

for (i=0;i<x-3;i+=4) {

421

492

}

422

493

}

423

494

_mm_sfence();

495

#ifdef USE_MANUAL_CALLSTACK

496

call_stack.exit();

497

#endif

424

498

return 0;

425

499

}

426

500

int v_vpfTranspose8x4ntw(int x, int y, float *in, float *out) {

501

#ifdef USE_MANUAL_CALLSTACK

502

call_stack.enter("v_vpfTranspose8x4ntw()");

503

#endif

427

504

int i,j;

428

505

for (j=0;j<y-3;j+=4) {

429

506

for (i=0;i<x-7;i+=8) {

445

522

}

446

523

}

447

524

_mm_sfence();

525

#ifdef USE_MANUAL_CALLSTACK

526

call_stack.exit();

527

#endif

448

528

return 0;

449

529

}

450

530

#endif // USE_INTRINSICS

461

541

#endif

462

542

463

543

if (!boinc_has_sse()) return UNSUPPORTED_FUNCTION;

544

#ifdef USE_MANUAL_CALLSTACK

545

call_stack.enter("v_vGetPowerSpectrum()");

546

#endif

464

547

465

548

analysis_state.FLOP_counter+=3.0*NumDataPoints;

466

549

506

589

PowerSpectrum[i] = FreqData[i][0] * FreqData[i][0]

507

590

+ FreqData[i][1] * FreqData[i][1];

508

591

}

592

#ifdef USE_MANUAL_CALLSTACK

593

call_stack.exit();

594

#endif

509

595

return 0;

510

596

}

511

597

520

606

521

607

#endif

522

608

if (!boinc_has_sse()) return UNSUPPORTED_FUNCTION;

609

#ifdef USE_MANUAL_CALLSTACK

610

call_stack.enter("v_vGetPowerSpectrumUnrolled()");

611

#endif

523

612

524

613

analysis_state.FLOP_counter+=3.0*NumDataPoints;

525

614

591

680

PowerSpectrum[i] = FreqData[i][0] * FreqData[i][0]

592

681

+ FreqData[i][1] * FreqData[i][1];

593

682

}

683

#ifdef USE_MANUAL_CALLSTACK

684

call_stack.exit();

685

#endif

594

686

return 0;

595

687

}

596

688

606

698

607

699

#endif

608

700

if (!boinc_has_sse()) return UNSUPPORTED_FUNCTION;

701

#ifdef USE_MANUAL_CALLSTACK

702

call_stack.enter("v_vGetPowerSpectrum2()");

703

#endif

609

704

610

705

analysis_state.FLOP_counter+=3.0*NumDataPoints;

611

706

649

744

PowerSpectrum[i] = FreqData[i][0] * FreqData[i][0]

650

745

+ FreqData[i][1] * FreqData[i][1];

651

746

}

747

#ifdef USE_MANUAL_CALLSTACK

748

call_stack.exit();

749

#endif

652

750

return 0;

653

751

}

654

752

665

763

#endif

666

764

667

765

if (!boinc_has_sse()) return UNSUPPORTED_FUNCTION;

766

#ifdef USE_MANUAL_CALLSTACK

767

call_stack.enter("v_vGetPowerSpectrumUnrolled2()");

768

#endif

668

769

669

770

analysis_state.FLOP_counter+=3.0*NumDataPoints;

670

771

734

835

PowerSpectrum[i] = FreqData[i][0] * FreqData[i][0]

735

836

+ FreqData[i][1] * FreqData[i][1];

736

837

}

838

#ifdef USE_MANUAL_CALLSTACK

839

call_stack.exit();

840

#endif

737

841

return 0;

738

842

}

739

843

770

874

// This routine should work as long as x86_64 supports x87 instructions.

771

875

// After that the illegal instruction trap should take care of it.

772

876

inline double fastfrac(double val, double roundVal) {

877

#ifdef USE_MANUAL_CALLSTACK

878

call_stack.enter("fastfrac()");

879

#endif

773

880

// reduce val to the range (-0.5, 0.5) using "val - round(val)"

774

881

#if defined(_MSC_VER) && !defined(_WIN64)

775

882

__asm {

793

900

#else

794

901

val -= std::floor(val + 0.5);

795

902

#endif

903

#ifdef USE_MANUAL_CALLSTACK

904

call_stack.exit();

905

#endif

796

906

return val;

797

907

}

798

908

799

909

static unsigned short fpucw1;

800

910

801

911

inline void set_extended_precision() {

912

#ifdef USE_MANUAL_CALLSTACK

913

call_stack.enter("set_extended_precision()");

914

#endif

802

915

// Windows and *BSD operate the X87 FPU so it rounds at mantissa bit 53, the

803

916

// quick rounding algorithm needs rounding at the last bit.

804

917

unsigned short fpucw2;

813

926

#else

814

927

// Nothing necessary for linux, osx, _WIN64 VC++ and most everything else.

815

928

#endif

929

#ifdef USE_MANUAL_CALLSTACK

930

call_stack.exit();

931

#endif

816

932

}

817

933

818

934

inline void restore_fpucw() {

935

#ifdef USE_MANUAL_CALLSTACK

936

call_stack.enter("restore_fpucw()");

937

#endif

819

938

#if defined(_MSC_VER) && !defined(_WIN64)

820

939

__asm fldcw fpucw1;

821

940

#elif defined(__GNUC__) && (defined(_WIN32) || defined(_WIN64) || defined(_BSD))

823

942

#else

824

943

// NADA

825

944

#endif

945

#ifdef USE_MANUAL_CALLSTACK

946

call_stack.exit();

947

#endif

826

948

}

827

949

828

950

829

951

__m128 vec_recip1(__m128 v) {

952

#ifdef USE_MANUAL_CALLSTACK

953

call_stack.enter("vec_recip1()");

954

#endif

830

955

// obtain estimate

831

956

__m128 estimate = _mm_rcp_ps( v );

832

957

// one round of Newton-Raphson

833

return _mm_add_ps(_mm_mul_ps(_mm_sub_ps(ONE, _mm_mul_ps(estimate, v)), estimate), estimate);

958

__m128 rv=_mm_add_ps(_mm_mul_ps(_mm_sub_ps(ONE, _mm_mul_ps(estimate, v)), estimate), estimate);

959

#ifdef USE_MANUAL_CALLSTACK

960

call_stack.exit();

961

#endif

962

return rv;

834

963

}

835

964

836

965

int sse1_ChirpData_ak(

841

970

int ul_NumDataPoints,

842

971

double sample_rate

843

972

) {

973

#ifdef USE_MANUAL_CALLSTACK

974

call_stack.enter("sse1_ChirpData_ak()");

975

#endif

844

976

845

977

if (ChirpRateInd == 0) {

846

978

memcpy(fp_ChirpDataArray, fp_DataArray, (int)ul_NumDataPoints * sizeof(sah_complex) );

979

#ifdef USE_MANUAL_CALLSTACK

980

call_stack.exit();

981

#endif

847

982

return 0;

848

983

}

849

984

1064

1199

}

1065

1200

analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;

1066

1201

1202

#ifdef USE_MANUAL_CALLSTACK

1203

call_stack.exit();

1204

#endif

1067

1205

return 0;

1068

1206

}

1069

1207

1080

1218

int ul_NumDataPoints,

1081

1219

double sample_rate

1082

1220

) {

1083

1221

#ifdef USE_MANUAL_CALLSTACK

1222

call_stack.enter("sse1_ChirpData_ak8e()");

1223

#endif

1084

1224

if (ChirpRateInd == 0) {

1085

1225

memcpy(fp_ChirpDataArray, fp_DataArray, (int)ul_NumDataPoints * sizeof(sah_complex) );

1226

#ifdef USE_MANUAL_CALLSTACK

1227

call_stack.exit();

1228

#endif

1086

1229

return 0;

1087

1230

}

1088

1231

1305

1448

}

1306

1449

analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;

1307

1450

1451

#ifdef USE_MANUAL_CALLSTACK

1452

call_stack.exit();

1453

#endif

1308

1454

return 0;

1309

1455

}

1310

1456

1322

1468

double sample_rate

1323

1469

) {

1324

1470

1471

#ifdef USE_MANUAL_CALLSTACK

1472

call_stack.enter("sse1_ChirpData_ak8h()");

1473

#endif

1325

1474

if (ChirpRateInd == 0) {

1326

1475

memcpy(fp_ChirpDataArray, fp_DataArray, (int)ul_NumDataPoints * sizeof(sah_complex) );

1476

#ifdef USE_MANUAL_CALLSTACK

1477

call_stack.exit();

1478

#endif

1327

1479

return 0;

1328

1480

}

1329

1481

1541

1693

}

1542

1694

analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;

1543

1695

1696

#ifdef USE_MANUAL_CALLSTACK

1697

call_stack.exit();

1698

#endif

1544

1699

return 0;

1545

1700

}

1546

1701

1580

1735

// concept: Ben Herndon

1581

1736

1582

1737

inline float s_maxp2f( __m128 max1 ) {

1738

#ifdef USE_MANUAL_CALLSTACK

1739

call_stack.enter("s_maxp2f()");

1740

#endif

1583

1741

float tMax;

1584

1742

__m128 maxReg = max1;

1585

1743

1589

1747

maxReg = _mm_max_ss( maxReg, max1 ); // [1] vs [0]

1590

1748

1591

1749

_mm_store_ss( &tMax, maxReg );

1750

#ifdef USE_MANUAL_CALLSTACK

1751

call_stack.exit();

1752

#endif

1592

1753

return ( tMax );

1593

1754

}

1594

1755

1756

#if defined(__clang__)

1757

#define s_getU( aaaa, ptr ) \

1758

aaaa = _mm_loadl_pi(aaaa, (__m64*)(ptr)); \

1759

aaaa = _mm_loadh_pi(aaaa, ((__m64 *)(ptr))+1)

1760

#else

1595

1761

#define s_getU( aaaa, ptr ) \

1596

1762

aaaa = _mm_loadh_pi( _mm_loadl_pi(aaaa, (__m64 *)ptr), ((__m64 *)(ptr))+1 )

1763

#endif

1597

1764

1598

1765

#define s_putU( ptr, aaaa ) \

1599

1766

_mm_storel_pi((__m64 *)ptr, aaaa), _mm_storeh_pi( ((__m64 *)ptr)+1 , aaaa)

1614

1781

// fold by 2

1615

1782

1616

1783

float sse_sum2(float *ss[], struct PoTPlan *P) {

1784

#ifdef USE_MANUAL_CALLSTACK

1785

call_stack.enter("sse_sum2()");

1786

#endif

1617

1787

__m128 sum1, sum2, tmp1, tmp2, max1, max2;

1618

1788

int i, length = P->di;

1619

1789

float *ptr1 = ss[1]+P->offset;

1654

1824

s_putU( &sums[i + 0], sum1 );

1655

1825

s_putU( &sums[i +4], sum2 );

1656

1826

1657

return s_maxp2f( _mm_max_ps( max1, max2 ) );

1827

float rv= s_maxp2f( _mm_max_ps( max1, max2 ) );

1828

#ifdef USE_MANUAL_CALLSTACK

1829

call_stack.exit();

1830

#endif

1831

return rv;

1658

1832

}

1659

1833

1660

1834

1661

1835

// fold by 3

1662

1836

1663

1837

float sse_sum3(float *ss[], struct PoTPlan *P) {

1838

#ifdef USE_MANUAL_CALLSTACK

1839

call_stack.enter("sse_sum3()");

1840

#endif

1664

1841

__m128 sum1, sum2, tmp1, tmp2, max1, max2;

1665

1842

int i, length = P->di;

1666

1843

float *ptr1 = ss[0];

1709

1886

s_putU( &sums[i + 0], sum1 );

1710

1887

s_putU( &sums[i +4], sum2 );

1711

1888

1712

return s_maxp2f( _mm_max_ps( max1, max2 ) );

1889

float rv= s_maxp2f( _mm_max_ps( max1, max2 ) );

1890

#ifdef USE_MANUAL_CALLSTACK

1891

call_stack.exit();

1892

#endif

1893

return rv;

1713

1894

}

1714

1895

1715

1896

1716

1897

// fold by 4

1717

1898

1718

1899

float sse_sum4(float *ss[], struct PoTPlan *P) {

1900

#ifdef USE_MANUAL_CALLSTACK

1901

call_stack.enter("sse_sum4()");

1902

#endif

1719

1903

__m128 sum1, sum2, tmp1, tmp2, max1, max2;

1720

1904

int i, length = P->di;

1721

1905

float *ptr1 = ss[0];

1773

1957

s_putU( &sums[i + 0], sum1 );

1774

1958

s_putU( &sums[i +4], sum2 );

1775

1959

1776

return s_maxp2f( _mm_max_ps( max1, max2 ) );

1960

float rv= s_maxp2f( _mm_max_ps( max1, max2 ) );

1961

#ifdef USE_MANUAL_CALLSTACK

1962

call_stack.exit();

1963

#endif

1964

return rv;

1777

1965

}

1778

1966

1779

1967

1780

1968

// fold by 5

1781

1969

1782

1970

float sse_sum5(float *ss[], struct PoTPlan *P) {

1971

#ifdef USE_MANUAL_CALLSTACK

1972

call_stack.enter("sse_sum5()");

1973

#endif

1783

1974

__m128 sum1, sum2, tmp1, tmp2, max1, max2;

1784

1975

int i, length = P->di;

1785

1976

float *ptr1 = ss[0];

1846

2037

s_putU( &sums[i + 0], sum1 );

1847

2038

s_putU( &sums[i +4], sum2 );

1848

2039

1849

return s_maxp2f( _mm_max_ps( max1, max2 ) );

2040

float rv= s_maxp2f( _mm_max_ps( max1, max2 ) );

2041

#ifdef USE_MANUAL_CALLSTACK

2042

call_stack.exit();

2043

#endif

2044

return rv;

1850

2045

}

1851

2046

1852

2047

sum_func sse_list3[FOLDTBLEN] = {

1934

2129

// Fold by 3 versions

1935

2130

1936

2131

float sse_pulPoTf3u(float *ss[], struct PoTPlan *P) {

2132

#ifdef USE_MANUAL_CALLSTACK

2133

call_stack.enter("sse_pulPoTf2u()");

2134

#endif

1937

2135

__m128 sum1, sum2, max1, max2;

1938

2136

__m128 tmp1, tmp2;

1939

2137

int i;

2004

2202

sum1 = _mm_sub_ps(sum1, SSE_LIM( mask1 ) );

2005

2203

max1 = _mm_max_ps( max1, sum1 );

2006

2204

}

2007

return ( s_maxp2f( max1 ) );

2205

float rv= ( s_maxp2f( max1 ) );

2206

#ifdef USE_MANUAL_CALLSTACK

2207

call_stack.exit();

2208

#endif

2209

return rv;

2008

2210

}

2009

2211

2010

2212

float sse_pulPoTf3(float *ss[], struct PoTPlan *P) {

2213

#ifdef USE_MANUAL_CALLSTACK

2214

call_stack.enter("sse_pulPoTf3()");

2215

#endif

2011

2216

__m128 sum1, sum2, max1, max2;

2012

2217

__m128 tmp1, tmp2;

2013

2218

int i;

2048

2253

sum1 = _mm_sub_ps(sum1, SSE_LIM( mask1 ) );

2049

2254

max1 = _mm_max_ps( max1, sum1 );

2050

2255

}

2051

return ( s_maxp2f( max1 ) );

2256

float rv= ( s_maxp2f( max1 ) );

2257

#ifdef USE_MANUAL_CALLSTACK

2258

call_stack.exit();

2259

#endif

2260

return rv;

2052

2261

}

2053

2262

2054

2263

float sse_pulPoTf3L8(float *ss[], struct PoTPlan *P) {

2264

#ifdef USE_MANUAL_CALLSTACK

2265

call_stack.enter("sse_pulPoTf3L8()");

2266

#endif

2055

2267

__m128 sum1, sum2, max1, max2;

2056

2268

__m128 tmp1, tmp2;

2057

2269

2081

2293

max1 = _mm_max_ps( max1, sum1 );

2082

2294

max2 = _mm_max_ps( max2, sum2 );

2083

2295

2084

return ( s_maxp2f( _mm_max_ps( max1, max2 ) ) );

2296

float rv= ( s_maxp2f( _mm_max_ps( max1, max2 ) ) );

2297

#ifdef USE_MANUAL_CALLSTACK

2298

call_stack.exit();

2299

#endif

2300

return rv;

2085

2301

}

2086

2302

2087

2303

sum_func BHSSETB3[FOLDTBLEN] =

2099

2315

// Fold by 4 versions

2100

2316

2101

2317

float sse_pulPoTf4u(float *ss[], struct PoTPlan *P) {

2318

#ifdef USE_MANUAL_CALLSTACK

2319

call_stack.enter("sse_pulPoTf4u()");

2320

#endif

2102

2321

__m128 sum1, sum2, max1, max2;

2103

2322

__m128 tmp1, tmp2;

2104

2323

int i;

2184

2403

sum1 = _mm_sub_ps(sum1, SSE_LIM( mask1 ) );

2185

2404

max1 = _mm_max_ps( max1, sum1 );

2186

2405

}

2187

return ( s_maxp2f( max1 ) );

2406

float rv= ( s_maxp2f( max1 ) );

2407

#ifdef USE_MANUAL_CALLSTACK

2408

call_stack.exit();

2409

#endif

2410

return rv;

2188

2411

}

2189

2412

2190

2413

float sse_pulPoTf4(float *ss[], struct PoTPlan *P) {

2414

#ifdef USE_MANUAL_CALLSTACK

2415

call_stack.enter("sse_pulPoTf4()");

2416

#endif

2191

2417

__m128 sum1, sum2, max1, max2;

2192

2418

__m128 tmp1, tmp2;

2193

2419

int i;

2235

2461

sum1 = _mm_sub_ps(sum1, SSE_LIM( mask1 ) );

2236

2462

max1 = _mm_max_ps( max1, sum1 );

2237

2463

}

2238

return ( s_maxp2f( max1 ) );

2464

float rv=( s_maxp2f( max1 ) );

2465

#ifdef USE_MANUAL_CALLSTACK

2466

call_stack.exit();

2467

#endif

2468

return rv;

2239

2469

}

2240

2470

2241

2471

2242

2472

float sse_pulPoTf4L8(float *ss[], struct PoTPlan *P) {

2473

#ifdef USE_MANUAL_CALLSTACK

2474

call_stack.enter("sse_pulPoTf4L8()");

2475

#endif

2243

2476

__m128 sum1, sum2, max1, max2;

2244

2477

__m128 tmp1, tmp2;

2245

2478

int i;

2275

2508

max1 = _mm_max_ps( max1, sum1 );

2276

2509

max2 = _mm_max_ps( max2, sum2 );

2277

2510

2278

return ( s_maxp2f( _mm_max_ps( max1, max2 ) ) );

2511

float rv= ( s_maxp2f( _mm_max_ps( max1, max2 ) ) );

2512

#ifdef USE_MANUAL_CALLSTACK

2513

call_stack.exit();

2514

#endif

2515

return rv;

2279

2516

}

2280

2517

2281

2518

sum_func BHSSETB4[FOLDTBLEN] = {

2292

2529

// Fold by 5 versions

2293

2530

2294

2531

float sse_pulPoTf5u(float *ss[], struct PoTPlan *P) {

2532

#ifdef USE_MANUAL_CALLSTACK

2533

call_stack.enter("sse_pulPoTf5u()");

2534

#endif

2295

2535

__m128 sum1, sum2, max1, max2;

2296

2536

__m128 tmp1, tmp2;

2297

2537

int i;

2392

2632

sum1 = _mm_sub_ps(sum1, SSE_LIM( mask1 ) );

2393

2633

max1 = _mm_max_ps( max1, sum1 );

2394

2634

}

2395

return ( s_maxp2f( max1 ) );

2635

float rv=( s_maxp2f( max1 ) );

2636

#ifdef USE_MANUAL_CALLSTACK

2637

call_stack.exit();

2638

#endif

2639

return rv;

2396

2640

}

2397

2641

2398

2642

float sse_pulPoTf5(float *ss[], struct PoTPlan *P) {

2643

#ifdef USE_MANUAL_CALLSTACK

2644

call_stack.enter("sse_pulPoTf5()");

2645

#endif

2399

2646

__m128 sum1, sum2, max1, max2;

2400

2647

__m128 tmp1, tmp2;

2401

2648

int i;

2450

2697

sum1 = _mm_sub_ps(sum1, SSE_LIM( mask1 ) );

2451

2698

max1 = _mm_max_ps( max1, sum1 );

2452

2699

}

2453

return ( s_maxp2f( max1 ) );

2700

float rv=( s_maxp2f( max1 ) );

2701

#ifdef USE_MANUAL_CALLSTACK

2702

call_stack.exit();

2703

#endif

2704

return rv;

2454

2705

}

2455

2706

2456

2707

float sse_pulPoTf5L8(float *ss[], struct PoTPlan *P) {

2708

#ifdef USE_MANUAL_CALLSTACK

2709

call_stack.enter("sse_pulPoTf5L8()");

2710

#endif

2457

2711

__m128 sum1, sum2, max1, max2;

2458

2712

__m128 tmp1, tmp2;

2459

2713

int i;

2494

2748

max1 = _mm_max_ps( max1, sum1 );

2495

2749

max2 = _mm_max_ps( max2, sum2 );

2496

2750

2497

return ( s_maxp2f( _mm_max_ps( max1, max2 ) ) );

2751

float rv= ( s_maxp2f( _mm_max_ps( max1, max2 ) ) );

2752

#ifdef USE_MANUAL_CALLSTACK

2753

call_stack.exit();

2754

#endif

2755

return rv;

2498

2756

}

2499

2757

2500

2758

float sse_pulPoTf5L4(float *ss[], struct PoTPlan *P) {

2759

#ifdef USE_MANUAL_CALLSTACK

2760

call_stack.enter("sse_pulPoTf5L4()");

2761

#endif

2501

2762

__m128 sum1, sum2, max1, max2;

2502

2763

__m128 tmp1, tmp2;

2503

2764

int i;

2523

2784

sum1 = _mm_sub_ps(sum1, SSE_LIM( mask1 ) );

2524

2785

max1 = _mm_max_ps( max1, sum1 );

2525

2786

2526

return ( s_maxp2f( max1 ) );

2787

float rv= ( s_maxp2f( max1 ) );

2788

#ifdef USE_MANUAL_CALLSTACK

2789

call_stack.exit();

2790

#endif

2791

return rv;

2527

2792

}

2528

2793

2529

2794

sum_func BHSSETB5[FOLDTBLEN] = {

2540

2805

// Fold by 2 versions

2541

2806

2542

2807

float sse_pulPoTf2u(float *ss[], struct PoTPlan *P) {

2808

#ifdef USE_MANUAL_CALLSTACK

2809

call_stack.enter("sse_pulPoTf2u()");

2810

#endif

2543

2811

__m128 sum1, sum2, tmp1, tmp2, max1, max2;

2544

2812

int i;

2545

2813

2595

2863

sum1 = _mm_sub_ps(sum1, SSE_LIM( mask1 ) );

2596

2864

max1 = _mm_max_ps( max1, sum1 );

2597

2865

}

2598

return ( s_maxp2f( max1 ) );

2866

float rv= ( s_maxp2f( max1 ) );

2867

#ifdef USE_MANUAL_CALLSTACK

2868

call_stack.exit();

2869

#endif

2870

return rv;

2599

2871

}

2600

2872

2601

2873

float sse_pulPoTf2(float *ss[], struct PoTPlan *P) {

2874

#ifdef USE_MANUAL_CALLSTACK

2875

call_stack.enter("sse_pulPoTf2()");

2876

#endif

2602

2877

__m128 sum1, sum2, tmp1, tmp2, max1, max2;

2603

2878

int i;

2604

2879

2632

2907

sum1 = _mm_sub_ps(sum1, SSE_LIM( mask1 ) );

2633

2908

max1 = _mm_max_ps( max1, sum1 );

2634

2909

}

2635

return ( s_maxp2f( max1 ) );

2910

float rv= ( s_maxp2f( max1 ) );

2911

#ifdef USE_MANUAL_CALLSTACK

2912

call_stack.exit();

2913

#endif

2914

return rv;

2636

2915

}

2637

2916

2638

2917

float sse_pulPoTf2L8(float *ss[], struct PoTPlan *P) {

2918

#ifdef USE_MANUAL_CALLSTACK

2919

call_stack.enter("sse_pulPoTf2L8()");

2920

#endif

2639

2921

__m128 sum1, sum2, tmp1, tmp2, max1, max2;

2640

2922

int i;

2641

2923

2660

2942

max1 = _mm_max_ps( max1, sum1 );

2661

2943

max2 = _mm_max_ps( max2, sum2 );

2662

2944

2663

return ( s_maxp2f( _mm_max_ps( max1, max2 ) ) );

2945

float rv= ( s_maxp2f( _mm_max_ps( max1, max2 ) ) );

2946

#ifdef USE_MANUAL_CALLSTACK

2947

call_stack.exit();

2948

#endif

2949

return rv;

2664

2950

}

2665

2951

2666

2952

float sse_pulPoTf2L4(float *ss[], struct PoTPlan *P) {

2953

#ifdef USE_MANUAL_CALLSTACK

2954

call_stack.enter("sse_pulPoTf2L4()");

2955

#endif

2667

2956

__m128 sum1, tmp1, max1;

2668

2957

int i;

2669

2958

2680

2969

sum1 = _mm_sub_ps(sum1, SSE_LIM( mask1 ) );

2681

2970

max1 = _mm_max_ps( max1, sum1 );

2682

2971

2683

return ( s_maxp2f( max1 ) );

2972

float rv= ( s_maxp2f( max1 ) );

2973

#ifdef USE_MANUAL_CALLSTACK

2974

call_stack.exit();

2975

#endif

2976

return rv;

2684

2977

}

2685

2978

2686

2979

sum_func BHSSETB2[FOLDTBLEN] = {

2698

2991

// versions for tmp0 aligned

2699

2992

2700

2993

float sse_pulPoTf2ALu(float *ss[], struct PoTPlan *P) {

2994

#ifdef USE_MANUAL_CALLSTACK

2995

call_stack.enter("sse_pulPoTf2ALu()");

2996

#endif

2701

2997

__m128 sum1, sum2, tmp1, tmp2, max1, max2;

2702

2998

int i;

2703

2999

2753

3049

sum1 = _mm_sub_ps(sum1, SSE_LIM( mask1 ) );

2754

3050

max1 = _mm_max_ps( max1, sum1 );

2755

3051

}

2756

return ( s_maxp2f( max1 ) );

3052

float rv= ( s_maxp2f( max1 ) );

3053

#ifdef USE_MANUAL_CALLSTACK

3054

call_stack.exit();

3055

#endif

3056

return rv;

2757

3057

}

2758

3058

2759

3059

float sse_pulPoTf2AL(float *ss[], struct PoTPlan *P) {

3060

#ifdef USE_MANUAL_CALLSTACK

3061

call_stack.enter("sse_pulPoTf2AL()");

3062

#endif

2760

3063

__m128 sum1, sum2, tmp1, tmp2, max1, max2;

2761

3064

int i;

2762

3065

2790

3093

sum1 = _mm_sub_ps(sum1, SSE_LIM( mask1 ) );

2791

3094

max1 = _mm_max_ps( max1, sum1 );

2792

3095

}

2793

return ( s_maxp2f( max1 ) );

3096

float rv= ( s_maxp2f( max1 ) );

3097

#ifdef USE_MANUAL_CALLSTACK

3098

call_stack.exit();

3099

#endif

3100

return rv;

2794

3101

}

2795

3102

2796

3103

2797

3104

float sse_pulPoTf2AL8(float *ss[], struct PoTPlan *P) {

3105

#ifdef USE_MANUAL_CALLSTACK

3106

call_stack.enter("sse_pulPoTf2AL8()");

3107

#endif

2798

3108

__m128 sum1, sum2, tmp1, tmp2, max1, max2;

2799

3109

int i;

2800

3110

2819

3129

max1 = _mm_max_ps( max1, sum1 );

2820

3130

max2 = _mm_max_ps( max2, sum2 );

2821

3131

2822

return ( s_maxp2f( _mm_max_ps( max1, max2 ) ) );

3132

float rv= ( s_maxp2f( _mm_max_ps( max1, max2 ) ) );

3133

#ifdef USE_MANUAL_CALLSTACK

3134

call_stack.exit();

3135

#endif

3136

return rv;

2823

3137

}

2824

3138

2825

3139

float sse_pulPoTf2AL4(float *ss[], struct PoTPlan *P) {

3140

#ifdef USE_MANUAL_CALLSTACK

3141

call_stack.enter("sse_pulPoTf2AL4()");

3142

#endif

2826

3143

__m128 sum1, tmp1, max1;

2827

3144

int i;

2828

3145

2839

3156

sum1 = _mm_sub_ps(sum1, SSE_LIM( mask1 ) );

2840

3157

max1 = _mm_max_ps( max1, sum1 );

2841

3158

2842

return ( s_maxp2f( max1 ) );

3159

float rv= ( s_maxp2f( max1 ) );

3160

#ifdef USE_MANUAL_CALLSTACK

3161

call_stack.exit();

3162

#endif

3163

return rv;

2843

3164

}

2844

3165

2845

3166

sum_func BHSSETB2AL[FOLDTBLEN] = {

2877

3198

2878

3199

2879

3200

float foldArrayBy3(float *ss[], struct PoTPlan *P) {

3201

#ifdef USE_MANUAL_CALLSTACK

3202

call_stack.enter("foldArrayBy3()");

3203

#endif

2880

3204

float max;

2881

3205

__m128 maxV = ZERO;

2882

3206

int i = 0;

2909

3233

maxV = _mm_max_ps(maxV, _mm_shuffle_ps(maxV, maxV, 0x39));

2910

3234

_mm_store_ss(&max, maxV);

2911

3235

3236

#ifdef USE_MANUAL_CALLSTACK

3237

call_stack.exit();

3238

#endif

2912

3239

return max;

2913

3240

}

2914

3241

2915

3242

2916

3243

float foldArrayBy3LO(float *ss[], struct PoTPlan *P) {

3244

#ifdef USE_MANUAL_CALLSTACK

3245

call_stack.enter("foldArrayBy3LO()");

3246

#endif

2917

3247

2918

3248

const float *p1 = ss[0], *p2 = ss[0]+P->tmp0, *p3 = ss[0]+P->tmp1;

2919

3249

float *pst = P->dest;

2925

3255

if (pst[i] > max) max = pst[i];

2926

3256

i += 1;

2927

3257

}

3258

#ifdef USE_MANUAL_CALLSTACK

3259

call_stack.exit();

3260

#endif

2928

3261

return max;

2929

3262

}

2930

3263

sum_func AKSSETB3[FOLDTBLEN] = {

2935

3268

};

2936

3269

2937

3270

float foldArrayBy4(float *ss[], struct PoTPlan *P) {

3271

#ifdef USE_MANUAL_CALLSTACK

3272

call_stack.enter("foldArrayBy4()");

3273

#endif

2938

3274

float max;

2939

3275

__m128 maxV = ZERO;

2940

3276

int i = 0;

2971

3307

maxV = _mm_max_ps(maxV, _mm_shuffle_ps(maxV, maxV, 0x39));

2972

3308

_mm_store_ss(&max, maxV);

2973

3309

3310

#ifdef USE_MANUAL_CALLSTACK

3311

call_stack.exit();

3312

#endif

2974

3313

return max;

2975

3314

}

2976

3315

2977

3316

float foldArrayBy4LO(float *ss[], struct PoTPlan *P) {

3317

#ifdef USE_MANUAL_CALLSTACK

3318

call_stack.enter("foldArrayBy4LO()");

3319

#endif

2978

3320

2979

3321

const float *p1 = ss[0], *p2 = ss[0]+P->tmp0, *p3 = ss[0]+P->tmp1, *p4 = ss[0]+P->tmp2;

2980

3322

float *pst = P->dest;

2986

3328

if (pst[i] > max) max = pst[i];

2987

3329

i += 1;

2988

3330

}

3331

#ifdef USE_MANUAL_CALLSTACK

3332

call_stack.exit();

3333

#endif

2989

3334

return max;

2990

3335

}

2991

3336

sum_func AKSSETB4[FOLDTBLEN] = {

2996

3341

};

2997

3342

2998

3343

float foldArrayBy5(float *ss[], struct PoTPlan *P) {

3344

#ifdef USE_MANUAL_CALLSTACK

3345

call_stack.enter("foldArrayBy5()");

3346

#endif

2999

3347

float max;

3000

3348

__m128 maxV = ZERO;

3001

3349

int i = 0;

3036

3384

maxV = _mm_max_ps(maxV, _mm_shuffle_ps(maxV, maxV, 0x39));

3037

3385

_mm_store_ss(&max, maxV);

3038

3386

3387

#ifdef USE_MANUAL_CALLSTACK

3388

call_stack.exit();

3389

#endif

3039

3390

return max;

3040

3391

}

3041

3392

3042

3393

float foldArrayBy5LO(float *ss[], struct PoTPlan *P) {

3394

#ifdef USE_MANUAL_CALLSTACK

3395

call_stack.enter("foldArrayBy5LO()");

3396

#endif

3043

3397

3044

3398

const float *p1 = ss[0], *p2 = ss[0]+P->tmp0, *p3 = ss[0]+P->tmp1, *p4 = ss[0]+P->tmp2, *p5 = ss[0]+P->tmp3;

3045

3399

float *pst = P->dest;

3051

3405

if (pst[i] > max) max = pst[i];

3052

3406

i += 1;

3053

3407

}

3408

#ifdef USE_MANUAL_CALLSTACK

3409

call_stack.exit();

3410

#endif

3054

3411

return max;

3055

3412

}

3056

3413

3064

3421

// 2A version allows non-aligned tmp0

3065

3422

3066

3423

float foldArrayBy2A(float *ss[], struct PoTPlan *P) {

3424

#ifdef USE_MANUAL_CALLSTACK

3425

call_stack.enter("FoldArrayBy2A()");

3426

#endif

3067

3427

float max;

3068

3428

__m128 maxV = ZERO;

3069

3429

int i = 0;

3092

3452

maxV = _mm_max_ps(maxV, _mm_shuffle_ps(maxV, maxV, 0x39));

3093

3453

_mm_store_ss(&max, maxV);

3094

3454

3455

#ifdef USE_MANUAL_CALLSTACK

3456

call_stack.exit();

3457

#endif

3095

3458

return max;

3096

3459

}

3097

3460

3099

3462

3100

3463

3101

3464

float foldArrayBy2AL(float *ss[], struct PoTPlan *P) {

3465

#ifdef USE_MANUAL_CALLSTACK

3466

call_stack.enter("foldArrayBy2AL()");

3467

#endif

3102

3468

float max;

3103

3469

__m128 maxV = ZERO;

3104

3470

int i = 0;

3127

3493

maxV = _mm_max_ps(maxV, _mm_shuffle_ps(maxV, maxV, 0x39));

3128

3494

_mm_store_ss(&max, maxV);

3129

3495

3496

#ifdef USE_MANUAL_CALLSTACK

3497

call_stack.exit();

3498

#endif

3130

3499

return max;

3131

3500

}

3132

3501

3133

3502

float foldArrayBy2LO(float *ss[], struct PoTPlan *P) {

3503

#ifdef USE_MANUAL_CALLSTACK

3504

call_stack.enter("foldArrayBy2LO()");

3505

#endif

3134

3506

3135

3507

const float *p1 = ss[1]+P->offset, *p2 = ss[1]+P->tmp0;

3136

3508

float *pst = P->dest;

3142

3514

if (pst[i] > max) max = pst[i];

3143

3515

i += 1;

3144

3516

}

3517

#ifdef USE_MANUAL_CALLSTACK

3518

call_stack.exit();

3519

#endif

3145

3520

return max;

3146

3521

}

3147

3522

sum_func AKSSETB2[FOLDTBLEN] = {

Older »