~ubuntu-branches/ubuntu/utopic/openssl/utopic-security

« back to all changes in this revision

Viewing changes to .pc/power8-optimisations.patch/crypto/bn/asm/ppc64-mont.pl

Committer: Package Import Robot
Author(s): Colin Watson
Date: 2014-09-26 11:32:32 UTC
Revision ID: package-import@ubuntu.com-20140926113232-ds6gavd9wl43wft5

Tags: 1.0.1f-1ubuntu8

https://launchpad.net/bugs/1290579

Backport collected POWER8 optimisations from upstream (LP: #1290579).

files added:
.pc/power8-optimisations.patch

.pc/power8-optimisations.patch/Configure

.pc/power8-optimisations.patch/config

.pc/power8-optimisations.patch/crypto

.pc/power8-optimisations.patch/crypto/aes

.pc/power8-optimisations.patch/crypto/aes/Makefile

.pc/power8-optimisations.patch/crypto/aes/asm

.pc/power8-optimisations.patch/crypto/aes/asm/aes-ppc.pl

.pc/power8-optimisations.patch/crypto/aes/asm/aesp8-ppc.pl

.pc/power8-optimisations.patch/crypto/aes/asm/vpaes-ppc.pl

.pc/power8-optimisations.patch/crypto/aes/asm/vpaes-x86_64.pl

.pc/power8-optimisations.patch/crypto/bn

.pc/power8-optimisations.patch/crypto/bn/asm

.pc/power8-optimisations.patch/crypto/bn/asm/ppc-mont.pl

.pc/power8-optimisations.patch/crypto/bn/asm/ppc.pl

.pc/power8-optimisations.patch/crypto/bn/asm/ppc64-mont.pl

.pc/power8-optimisations.patch/crypto/evp

.pc/power8-optimisations.patch/crypto/evp/e_aes.c

.pc/power8-optimisations.patch/crypto/modes

.pc/power8-optimisations.patch/crypto/modes/Makefile

.pc/power8-optimisations.patch/crypto/modes/asm

.pc/power8-optimisations.patch/crypto/modes/asm/ghashp8-ppc.pl

.pc/power8-optimisations.patch/crypto/modes/gcm128.c

.pc/power8-optimisations.patch/crypto/perlasm

.pc/power8-optimisations.patch/crypto/perlasm/ppc-xlate.pl

.pc/power8-optimisations.patch/crypto/ppc_arch.h

.pc/power8-optimisations.patch/crypto/ppccap.c

.pc/power8-optimisations.patch/crypto/ppccpuid.pl

.pc/power8-optimisations.patch/crypto/sha

.pc/power8-optimisations.patch/crypto/sha/Makefile

.pc/power8-optimisations.patch/crypto/sha/asm

.pc/power8-optimisations.patch/crypto/sha/asm/sha1-ppc.pl

.pc/power8-optimisations.patch/crypto/sha/asm/sha512-ppc.pl

.pc/power8-optimisations.patch/crypto/sha/asm/sha512p8-ppc.pl

crypto/aes/asm/aesp8-ppc.pl

crypto/aes/asm/vpaes-ppc.pl

crypto/modes/asm/ghashp8-ppc.pl

crypto/ppc_arch.h

crypto/sha/asm/sha512p8-ppc.pl

debian/patches/power8-optimisations.patch

files modified:
.pc/applied-patches

Configure

config

crypto/aes/Makefile

crypto/aes/asm/aes-ppc.pl

crypto/aes/asm/vpaes-x86_64.pl

crypto/bn/asm/ppc-mont.pl

crypto/bn/asm/ppc.pl

crypto/bn/asm/ppc64-mont.pl

crypto/evp/e_aes.c

crypto/modes/Makefile

crypto/modes/gcm128.c

crypto/perlasm/ppc-xlate.pl

crypto/ppccap.c

crypto/ppccpuid.pl

crypto/sha/Makefile

crypto/sha/asm/sha1-ppc.pl

crypto/sha/asm/sha512-ppc.pl

debian/changelog

debian/patches/series

Show diffs side-by-side

added added

removed removed

.pc/power8-optimisations.patch/crypto/bn/asm/ppc64-mont.pl

#!/usr/bin/env perl

# ====================================================================

# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL

# project. The module is, however, dual licensed under OpenSSL and

# CRYPTOGAMS licenses depending on where you obtain it. For further

# details see http://www.openssl.org/~appro/cryptogams/.

# ====================================================================

# December 2007

# The reason for the undertaken effort is basically the following. Even though

# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI

# performance was observed to be less than impressive, essentially as

# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.

# Well, it's not surprising that IBM had to make some sacrifices to

# boost the clock frequency that much, but no overall improvement?

# Having observed how much difference did switching to FPU make on

# UltraSPARC, playing same stunt on Power 6 appeared appropriate...

# Unfortunately the resulting performance improvement is not as

# impressive, ~30%, and in absolute terms is still very far from what

# one would expect from 4.7GHz CPU. There is a chance that I'm doing

# something wrong, but in the lack of assembler level micro-profiling

# data or at least decent platform guide I can't tell... Or better

# results might be achieved with VMX... Anyway, this module provides

# *worse* performance on other PowerPC implementations, ~40-15% slower

# on PPC970 depending on key length and ~40% slower on Power 5 for all

# key lengths. As it's obviously inappropriate as "best all-round"

# alternative, it has to be complemented with run-time CPU family

# detection. Oh! It should also be noted that unlike other PowerPC

# implementation IALU ppc-mont.pl module performs *suboptimally* on

# >=1024-bit key lengths on Power 6. It should also be noted that

# *everything* said so far applies to 64-bit builds! As far as 32-bit

# application executed on 64-bit CPU goes, this module is likely to

# become preferred choice, because it's easy to adapt it for such

# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.

# February 2008

# Micro-profiling assisted optimization results in ~15% improvement

# over original ppc64-mont.pl version, or overall ~50% improvement

# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same

# Power 6 CPU, this module is 5-150% faster depending on key length,

# [hereafter] more for longer keys. But if compared to ppc-mont.pl

# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive

# in absolute terms, but it's apparently the way Power 6 is...

# December 2009

# Adapted for 32-bit build this module delivers 25-120%, yes, more

# than *twice* for longer keys, performance improvement over 32-bit

# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes

# even 64-bit integer operations and the trouble is that most PPC

# operating systems don't preserve upper halves of general purpose

# registers upon 32-bit signal delivery. They do preserve them upon

# context switch, but not signalling:-( This means that asynchronous

# signals have to be blocked upon entry to this subroutine. Signal

# masking (and of course complementary unmasking) has quite an impact

# on performance, naturally larger for shorter keys. It's so severe

# that 512-bit key performance can be as low as 1/3 of expected one.

# This is why this routine can be engaged for longer key operations

# only on these OSes, see crypto/ppccap.c for further details. MacOS X

# is an exception from this and doesn't require signal masking, and

# that's where above improvement coefficients were collected. For

# others alternative would be to break dependence on upper halves of

# GPRs by sticking to 32-bit integer operations...

# The first command-line argument is the target "flavour". Only the word
# size is extracted from it: "32" is tested before "64", and a flavour that
# mentions neither is rejected.
$flavour = shift;

my $is32 = ($flavour =~ /32/);
die "nonsense $flavour" unless ($is32 or $flavour =~ /64/);

# Word-size dependent parameters, in order:
#   $SIZE_T - sizeof(BN_LONG) in bytes
#   $RZONE  - extra bytes added to the stack allocation in the prologue
#             (presumably the ABI red zone - TODO confirm)
#   $STUX   - store indexed and update
#   $PUSH   - store one register-sized word
#   $POP    - load one register-sized word
($SIZE_T, $RZONE, $STUX, $PUSH, $POP) = $is32
	? (4, 224, "stwux", "stw", "lwz")
	: (8, 288, "stdux", "std", "ld");	# same, but 64-bit mnemonics

# The generated entry point carries the same name for both word sizes.
$fname = "bn_mul_mont_fpu64";

# Locate the ppc-xlate.pl translator: first next to this script, then in
# ../../perlasm/ relative to it. $dir keeps its trailing path separator and
# is explicitly empty when $0 carries no directory component (previously it
# relied on a stale $1 in that case).
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The next command-line argument, if any, is forwarded to it after the
# flavour.
#
# Low-precedence "or" is required: with "||" the die was attached to the
# shift operand rather than to open(), so a failed open() went undetected
# (and a merely missing second argument was reported as "can't call").
$output = shift;
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

$FRAME=64; # padded frame header

# size of the gpr<->fpr transfer zone that follows the frame header:
# 16 slots of 8 bytes each
$TRANSFER=16*8;

# General-purpose register names used by the generated code. r3 is copied
# into r9 by the prologue ("mr $rp,r3"), which is why $rp is assigned twice
# below; r3 itself is then reused as $ovf.
$carry="r0";

100

$sp="r1";

101

$toc="r2";

102

$rp="r3"; $ovf="r3";

103

$ap="r4";

104

$bp="r5";

105

$np="r6";

106

$n0="r7";

107

$num="r8";

108

$rp="r9"; # $rp is reassigned

109

$tp="r10";

110

$j="r11";

111

$i="r12";

112

# non-volatile registers

113

$nap_d="r22"; # interleaved ap and np in double format

114

$a0="r23"; # ap[0]

115

$t0="r24"; # temporary registers

116

$t1="r25";

117

$t2="r26";

118

$t3="r27";

119

$t4="r28";

120

$t5="r29";

121

$t6="r30";

122

$t7="r31";

123

124

# PPC offers enough register bank capacity to unroll inner loops twice

125

126

# ..A3A2A1A0

127

# dcba

128

# -----------

129

# A0a

130

# A0b

131

# A0c

132

# A0d

133

# A1a

134

# A1b

135

# A1c

136

# A1d

137

# A2a

138

# A2b

139

# A2c

140

# A2d

141

# A3a

142

# A3b

143

# A3c

144

# A3d

145

# ..a

146

# ..b

147

148

# Floating-point register map:
#   $ba..$bd     - bp[i] split into four 16-bit pieces (converted with fcfid)
#   $na..$nd     - (ap[0]*bp[i]+tp[0])*n0 split the same way
#   $dota,$dotb  - products of $A3/$N3 with the c/d pieces, fed into the
#                  $T0a/$T0b sums of the following loop iteration
#   $A0..$A3     - words of ap[] loaded 32 bits at a time
#   $N0..$N3     - words of np[] loaded 32 bits at a time
#   $T0a..$T3b   - partial-sum accumulators, converted back with fctid
$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3";

149

$na="f4"; $nb="f5"; $nc="f6"; $nd="f7";

150

$dota="f8"; $dotb="f9";

151

$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";

152

$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23";

153

$T0a="f24"; $T0b="f25";

154

$T1a="f26"; $T1b="f27";

155

$T2a="f28"; $T2b="f29";

156

$T3a="f30"; $T3b="f31";

157

158

# sp----------->+-------------------------------+

159

# | saved sp |

160

# +-------------------------------+

161

# . .

162

# +64 +-------------------------------+

163

# | 16 gpr<->fpr transfer zone |

164

# . .

165

# . .

166

# +16*8 +-------------------------------+

167

# | __int64 tmp[-1] |

168

# +-------------------------------+

169

# | __int64 tmp[num] |

170

# . .

171

# . .

172

# . .

173

# +(num+1)*8 +-------------------------------+

174

# | padding to 64 byte boundary |

175

# . .

176

# +X +-------------------------------+

177

# | double nap_d[4*num] |

178

# . .

179

# . .

180

# . .

181

# +-------------------------------+

182

# . .

183

# -12*size_t +-------------------------------+

184

# | 10 saved gpr, r22-r31 |

185

# . .

186

# . .

187

# -12*8 +-------------------------------+

188

# | 12 saved fpr, f20-f31 |

189

# . .

190

# . .

191

# +-------------------------------+

192

193

$code=<<___;

194

.machine "any"

195

.text

196

197

.globl .$fname

198

.align 5

199

.$fname:

200

cmpwi $num,`3*8/$SIZE_T`

201

mr $rp,r3 ; $rp is reassigned

202

li r3,0 ; possible "not handled" return code

203

bltlr-

204

andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"

205

bnelr-

206

207

slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)

208

li $i,-4096

209

slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num

210

add $tp,$tp,$num ; place for tp[num+1]

211

addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`

212

subf $tp,$tp,$sp ; $sp-$tp

213

and $tp,$tp,$i ; minimize TLB usage

214

subf $tp,$sp,$tp ; $tp-$sp

215

mr $i,$sp

216

$STUX $sp,$sp,$tp ; alloca

217

218

$PUSH r22,`-12*8-10*$SIZE_T`($i)

219

$PUSH r23,`-12*8-9*$SIZE_T`($i)

220

$PUSH r24,`-12*8-8*$SIZE_T`($i)

221

$PUSH r25,`-12*8-7*$SIZE_T`($i)

222

$PUSH r26,`-12*8-6*$SIZE_T`($i)

223

$PUSH r27,`-12*8-5*$SIZE_T`($i)

224

$PUSH r28,`-12*8-4*$SIZE_T`($i)

225

$PUSH r29,`-12*8-3*$SIZE_T`($i)

226

$PUSH r30,`-12*8-2*$SIZE_T`($i)

227

$PUSH r31,`-12*8-1*$SIZE_T`($i)

228

stfd f20,`-12*8`($i)

229

stfd f21,`-11*8`($i)

230

stfd f22,`-10*8`($i)

231

stfd f23,`-9*8`($i)

232

stfd f24,`-8*8`($i)

233

stfd f25,`-7*8`($i)

234

stfd f26,`-6*8`($i)

235

stfd f27,`-5*8`($i)

236

stfd f28,`-4*8`($i)

237

stfd f29,`-3*8`($i)

238

stfd f30,`-2*8`($i)

239

stfd f31,`-1*8`($i)

240

___

241

$code.=<<___ if ($SIZE_T==8);

242

ld $a0,0($ap) ; pull ap[0] value

243

ld $n0,0($n0) ; pull n0[0] value

244

ld $t3,0($bp) ; bp[0]

245

___

246

$code.=<<___ if ($SIZE_T==4);

247

mr $t1,$n0

248

lwz $a0,0($ap) ; pull ap[0,1] value

249

lwz $t0,4($ap)

250

lwz $n0,0($t1) ; pull n0[0,1] value

251

lwz $t1,4($t1)

252

lwz $t3,0($bp) ; bp[0,1]

253

lwz $t2,4($bp)

254

insrdi $a0,$t0,32,0

255

insrdi $n0,$t1,32,0

256

insrdi $t3,$t2,32,0

257

___

258

$code.=<<___;

259

addi $tp,$sp,`$FRAME+$TRANSFER+8+64`

260

li $i,-64

261

add $nap_d,$tp,$num

262

and $nap_d,$nap_d,$i ; align to 64 bytes

263

264

mulld $t7,$a0,$t3 ; ap[0]*bp[0]

265

; nap_d is off by 1, because it's used with stfdu/lfdu

266

addi $nap_d,$nap_d,-8

267

srwi $j,$num,`3+1` ; counter register, num/2

268

mulld $t7,$t7,$n0 ; tp[0]*n0

269

addi $j,$j,-1

270

addi $tp,$sp,`$FRAME+$TRANSFER-8`

271

li $carry,0

272

mtctr $j

273

274

; transfer bp[0] to FPU as 4x16-bit values

275

extrdi $t0,$t3,16,48

276

extrdi $t1,$t3,16,32

277

extrdi $t2,$t3,16,16

278

extrdi $t3,$t3,16,0

279

std $t0,`$FRAME+0`($sp)

280

std $t1,`$FRAME+8`($sp)

281

std $t2,`$FRAME+16`($sp)

282

std $t3,`$FRAME+24`($sp)

283

; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values

284

extrdi $t4,$t7,16,48

285

extrdi $t5,$t7,16,32

286

extrdi $t6,$t7,16,16

287

extrdi $t7,$t7,16,0

288

std $t4,`$FRAME+32`($sp)

289

std $t5,`$FRAME+40`($sp)

290

std $t6,`$FRAME+48`($sp)

291

std $t7,`$FRAME+56`($sp)

292

___

293

$code.=<<___ if ($SIZE_T==8);

294

lwz $t0,4($ap) ; load a[j] as 32-bit word pair

295

lwz $t1,0($ap)

296

lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair

297

lwz $t3,8($ap)

298

lwz $t4,4($np) ; load n[j] as 32-bit word pair

299

lwz $t5,0($np)

300

lwz $t6,12($np) ; load n[j+1] as 32-bit word pair

301

lwz $t7,8($np)

302

___

303

$code.=<<___ if ($SIZE_T==4);

304

lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs

305

lwz $t1,4($ap)

306

lwz $t2,8($ap)

307

lwz $t3,12($ap)

308

lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs

309

lwz $t5,4($np)

310

lwz $t6,8($np)

311

lwz $t7,12($np)

312

___

313

$code.=<<___;

314

lfd $ba,`$FRAME+0`($sp)

315

lfd $bb,`$FRAME+8`($sp)

316

lfd $bc,`$FRAME+16`($sp)

317

lfd $bd,`$FRAME+24`($sp)

318

lfd $na,`$FRAME+32`($sp)

319

lfd $nb,`$FRAME+40`($sp)

320

lfd $nc,`$FRAME+48`($sp)

321

lfd $nd,`$FRAME+56`($sp)

322

std $t0,`$FRAME+64`($sp)

323

std $t1,`$FRAME+72`($sp)

324

std $t2,`$FRAME+80`($sp)

325

std $t3,`$FRAME+88`($sp)

326

std $t4,`$FRAME+96`($sp)

327

std $t5,`$FRAME+104`($sp)

328

std $t6,`$FRAME+112`($sp)

329

std $t7,`$FRAME+120`($sp)

330

fcfid $ba,$ba

331

fcfid $bb,$bb

332

fcfid $bc,$bc

333

fcfid $bd,$bd

334

fcfid $na,$na

335

fcfid $nb,$nb

336

fcfid $nc,$nc

337

fcfid $nd,$nd

338

339

lfd $A0,`$FRAME+64`($sp)

340

lfd $A1,`$FRAME+72`($sp)

341

lfd $A2,`$FRAME+80`($sp)

342

lfd $A3,`$FRAME+88`($sp)

343

lfd $N0,`$FRAME+96`($sp)

344

lfd $N1,`$FRAME+104`($sp)

345

lfd $N2,`$FRAME+112`($sp)

346

lfd $N3,`$FRAME+120`($sp)

347

fcfid $A0,$A0

348

fcfid $A1,$A1

349

fcfid $A2,$A2

350

fcfid $A3,$A3

351

fcfid $N0,$N0

352

fcfid $N1,$N1

353

fcfid $N2,$N2

354

fcfid $N3,$N3

355

addi $ap,$ap,16

356

addi $np,$np,16

357

358

fmul $T1a,$A1,$ba

359

fmul $T1b,$A1,$bb

360

stfd $A0,8($nap_d) ; save a[j] in double format

361

stfd $A1,16($nap_d)

362

fmul $T2a,$A2,$ba

363

fmul $T2b,$A2,$bb

364

stfd $A2,24($nap_d) ; save a[j+1] in double format

365

stfd $A3,32($nap_d)

366

fmul $T3a,$A3,$ba

367

fmul $T3b,$A3,$bb

368

stfd $N0,40($nap_d) ; save n[j] in double format

369

stfd $N1,48($nap_d)

370

fmul $T0a,$A0,$ba

371

fmul $T0b,$A0,$bb

372

stfd $N2,56($nap_d) ; save n[j+1] in double format

373

stfdu $N3,64($nap_d)

374

375

fmadd $T1a,$A0,$bc,$T1a

376

fmadd $T1b,$A0,$bd,$T1b

377

fmadd $T2a,$A1,$bc,$T2a

378

fmadd $T2b,$A1,$bd,$T2b

379

fmadd $T3a,$A2,$bc,$T3a

380

fmadd $T3b,$A2,$bd,$T3b

381

fmul $dota,$A3,$bc

382

fmul $dotb,$A3,$bd

383

384

fmadd $T1a,$N1,$na,$T1a

385

fmadd $T1b,$N1,$nb,$T1b

386

fmadd $T2a,$N2,$na,$T2a

387

fmadd $T2b,$N2,$nb,$T2b

388

fmadd $T3a,$N3,$na,$T3a

389

fmadd $T3b,$N3,$nb,$T3b

390

fmadd $T0a,$N0,$na,$T0a

391

fmadd $T0b,$N0,$nb,$T0b

392

393

fmadd $T1a,$N0,$nc,$T1a

394

fmadd $T1b,$N0,$nd,$T1b

395

fmadd $T2a,$N1,$nc,$T2a

396

fmadd $T2b,$N1,$nd,$T2b

397

fmadd $T3a,$N2,$nc,$T3a

398

fmadd $T3b,$N2,$nd,$T3b

399

fmadd $dota,$N3,$nc,$dota

400

fmadd $dotb,$N3,$nd,$dotb

401

402

fctid $T0a,$T0a

403

fctid $T0b,$T0b

404

fctid $T1a,$T1a

405

fctid $T1b,$T1b

406

fctid $T2a,$T2a

407

fctid $T2b,$T2b

408

fctid $T3a,$T3a

409

fctid $T3b,$T3b

410

411

stfd $T0a,`$FRAME+0`($sp)

412

stfd $T0b,`$FRAME+8`($sp)

413

stfd $T1a,`$FRAME+16`($sp)

414

stfd $T1b,`$FRAME+24`($sp)

415

stfd $T2a,`$FRAME+32`($sp)

416

stfd $T2b,`$FRAME+40`($sp)

417

stfd $T3a,`$FRAME+48`($sp)

418

stfd $T3b,`$FRAME+56`($sp)

419

420

.align 5

421

L1st:

422

___

423

$code.=<<___ if ($SIZE_T==8);

424

lwz $t0,4($ap) ; load a[j] as 32-bit word pair

425

lwz $t1,0($ap)

426

lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair

427

lwz $t3,8($ap)

428

lwz $t4,4($np) ; load n[j] as 32-bit word pair

429

lwz $t5,0($np)

430

lwz $t6,12($np) ; load n[j+1] as 32-bit word pair

431

lwz $t7,8($np)

432

___

433

$code.=<<___ if ($SIZE_T==4);

434

lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs

435

lwz $t1,4($ap)

436

lwz $t2,8($ap)

437

lwz $t3,12($ap)

438

lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs

439

lwz $t5,4($np)

440

lwz $t6,8($np)

441

lwz $t7,12($np)

442

___

443

$code.=<<___;

444

std $t0,`$FRAME+64`($sp)

445

std $t1,`$FRAME+72`($sp)

446

std $t2,`$FRAME+80`($sp)

447

std $t3,`$FRAME+88`($sp)

448

std $t4,`$FRAME+96`($sp)

449

std $t5,`$FRAME+104`($sp)

450

std $t6,`$FRAME+112`($sp)

451

std $t7,`$FRAME+120`($sp)

452

ld $t0,`$FRAME+0`($sp)

453

ld $t1,`$FRAME+8`($sp)

454

ld $t2,`$FRAME+16`($sp)

455

ld $t3,`$FRAME+24`($sp)

456

ld $t4,`$FRAME+32`($sp)

457

ld $t5,`$FRAME+40`($sp)

458

ld $t6,`$FRAME+48`($sp)

459

ld $t7,`$FRAME+56`($sp)

460

lfd $A0,`$FRAME+64`($sp)

461

lfd $A1,`$FRAME+72`($sp)

462

lfd $A2,`$FRAME+80`($sp)

463

lfd $A3,`$FRAME+88`($sp)

464

lfd $N0,`$FRAME+96`($sp)

465

lfd $N1,`$FRAME+104`($sp)

466

lfd $N2,`$FRAME+112`($sp)

467

lfd $N3,`$FRAME+120`($sp)

468

fcfid $A0,$A0

469

fcfid $A1,$A1

470

fcfid $A2,$A2

471

fcfid $A3,$A3

472

fcfid $N0,$N0

473

fcfid $N1,$N1

474

fcfid $N2,$N2

475

fcfid $N3,$N3

476

addi $ap,$ap,16

477

addi $np,$np,16

478

479

fmul $T1a,$A1,$ba

480

fmul $T1b,$A1,$bb

481

fmul $T2a,$A2,$ba

482

fmul $T2b,$A2,$bb

483

stfd $A0,8($nap_d) ; save a[j] in double format

484

stfd $A1,16($nap_d)

485

fmul $T3a,$A3,$ba

486

fmul $T3b,$A3,$bb

487

fmadd $T0a,$A0,$ba,$dota

488

fmadd $T0b,$A0,$bb,$dotb

489

stfd $A2,24($nap_d) ; save a[j+1] in double format

490

stfd $A3,32($nap_d)

491

492

fmadd $T1a,$A0,$bc,$T1a

493

fmadd $T1b,$A0,$bd,$T1b

494

fmadd $T2a,$A1,$bc,$T2a

495

fmadd $T2b,$A1,$bd,$T2b

496

stfd $N0,40($nap_d) ; save n[j] in double format

497

stfd $N1,48($nap_d)

498

fmadd $T3a,$A2,$bc,$T3a

499

fmadd $T3b,$A2,$bd,$T3b

500

add $t0,$t0,$carry ; can not overflow

501

fmul $dota,$A3,$bc

502

fmul $dotb,$A3,$bd

503

stfd $N2,56($nap_d) ; save n[j+1] in double format

504

stfdu $N3,64($nap_d)

505

srdi $carry,$t0,16

506

add $t1,$t1,$carry

507

srdi $carry,$t1,16

508

509

fmadd $T1a,$N1,$na,$T1a

510

fmadd $T1b,$N1,$nb,$T1b

511

insrdi $t0,$t1,16,32

512

fmadd $T2a,$N2,$na,$T2a

513

fmadd $T2b,$N2,$nb,$T2b

514

add $t2,$t2,$carry

515

fmadd $T3a,$N3,$na,$T3a

516

fmadd $T3b,$N3,$nb,$T3b

517

srdi $carry,$t2,16

518

fmadd $T0a,$N0,$na,$T0a

519

fmadd $T0b,$N0,$nb,$T0b

520

insrdi $t0,$t2,16,16

521

add $t3,$t3,$carry

522

srdi $carry,$t3,16

523

524

fmadd $T1a,$N0,$nc,$T1a

525

fmadd $T1b,$N0,$nd,$T1b

526

insrdi $t0,$t3,16,0 ; 0..63 bits

527

fmadd $T2a,$N1,$nc,$T2a

528

fmadd $T2b,$N1,$nd,$T2b

529

add $t4,$t4,$carry

530

fmadd $T3a,$N2,$nc,$T3a

531

fmadd $T3b,$N2,$nd,$T3b

532

srdi $carry,$t4,16

533

fmadd $dota,$N3,$nc,$dota

534

fmadd $dotb,$N3,$nd,$dotb

535

add $t5,$t5,$carry

536

srdi $carry,$t5,16

537

insrdi $t4,$t5,16,32

538

539

fctid $T0a,$T0a

540

fctid $T0b,$T0b

541

add $t6,$t6,$carry

542

fctid $T1a,$T1a

543

fctid $T1b,$T1b

544

srdi $carry,$t6,16

545

fctid $T2a,$T2a

546

fctid $T2b,$T2b

547

insrdi $t4,$t6,16,16

548

fctid $T3a,$T3a

549

fctid $T3b,$T3b

550

add $t7,$t7,$carry

551

insrdi $t4,$t7,16,0 ; 64..127 bits

552

srdi $carry,$t7,16 ; upper 33 bits

553

554

stfd $T0a,`$FRAME+0`($sp)

555

stfd $T0b,`$FRAME+8`($sp)

556

stfd $T1a,`$FRAME+16`($sp)

557

stfd $T1b,`$FRAME+24`($sp)

558

stfd $T2a,`$FRAME+32`($sp)

559

stfd $T2b,`$FRAME+40`($sp)

560

stfd $T3a,`$FRAME+48`($sp)

561

stfd $T3b,`$FRAME+56`($sp)

562

std $t0,8($tp) ; tp[j-1]

563

stdu $t4,16($tp) ; tp[j]

564

bdnz- L1st

565

566

fctid $dota,$dota

567

fctid $dotb,$dotb

568

569

ld $t0,`$FRAME+0`($sp)

570

ld $t1,`$FRAME+8`($sp)

571

ld $t2,`$FRAME+16`($sp)

572

ld $t3,`$FRAME+24`($sp)

573

ld $t4,`$FRAME+32`($sp)

574

ld $t5,`$FRAME+40`($sp)

575

ld $t6,`$FRAME+48`($sp)

576

ld $t7,`$FRAME+56`($sp)

577

stfd $dota,`$FRAME+64`($sp)

578

stfd $dotb,`$FRAME+72`($sp)

579

580

add $t0,$t0,$carry ; can not overflow

581

srdi $carry,$t0,16

582

add $t1,$t1,$carry

583

srdi $carry,$t1,16

584

insrdi $t0,$t1,16,32

585

add $t2,$t2,$carry

586

srdi $carry,$t2,16

587

insrdi $t0,$t2,16,16

588

add $t3,$t3,$carry

589

srdi $carry,$t3,16

590

insrdi $t0,$t3,16,0 ; 0..63 bits

591

add $t4,$t4,$carry

592

srdi $carry,$t4,16

593

add $t5,$t5,$carry

594

srdi $carry,$t5,16

595

insrdi $t4,$t5,16,32

596

add $t6,$t6,$carry

597

srdi $carry,$t6,16

598

insrdi $t4,$t6,16,16

599

add $t7,$t7,$carry

600

insrdi $t4,$t7,16,0 ; 64..127 bits

601

srdi $carry,$t7,16 ; upper 33 bits

602

ld $t6,`$FRAME+64`($sp)

603

ld $t7,`$FRAME+72`($sp)

604

605

std $t0,8($tp) ; tp[j-1]

606

stdu $t4,16($tp) ; tp[j]

607

608

add $t6,$t6,$carry ; can not overflow

609

srdi $carry,$t6,16

610

add $t7,$t7,$carry

611

insrdi $t6,$t7,48,0

612

srdi $ovf,$t7,48

613

std $t6,8($tp) ; tp[num-1]

614

615

slwi $t7,$num,2

616

subf $nap_d,$t7,$nap_d ; rewind pointer

617

618

li $i,8 ; i=1

619

.align 5

620

Louter:

621

___

622

$code.=<<___ if ($SIZE_T==8);

623

ldx $t3,$bp,$i ; bp[i]

624

___

625

$code.=<<___ if ($SIZE_T==4);

626

add $t0,$bp,$i

627

lwz $t3,0($t0) ; bp[i,i+1]

628

lwz $t0,4($t0)

629

insrdi $t3,$t0,32,0

630

___

631

$code.=<<___;

632

ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]

633

mulld $t7,$a0,$t3 ; ap[0]*bp[i]

634

635

addi $tp,$sp,`$FRAME+$TRANSFER`

636

add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]

637

li $carry,0

638

mulld $t7,$t7,$n0 ; tp[0]*n0

639

mtctr $j

640

641

; transfer bp[i] to FPU as 4x16-bit values

642

extrdi $t0,$t3,16,48

643

extrdi $t1,$t3,16,32

644

extrdi $t2,$t3,16,16

645

extrdi $t3,$t3,16,0

646

std $t0,`$FRAME+0`($sp)

647

std $t1,`$FRAME+8`($sp)

648

std $t2,`$FRAME+16`($sp)

649

std $t3,`$FRAME+24`($sp)

650

; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values

651

extrdi $t4,$t7,16,48

652

extrdi $t5,$t7,16,32

653

extrdi $t6,$t7,16,16

654

extrdi $t7,$t7,16,0

655

std $t4,`$FRAME+32`($sp)

656

std $t5,`$FRAME+40`($sp)

657

std $t6,`$FRAME+48`($sp)

658

std $t7,`$FRAME+56`($sp)

659

660

lfd $A0,8($nap_d) ; load a[j] in double format

661

lfd $A1,16($nap_d)

662

lfd $A2,24($nap_d) ; load a[j+1] in double format

663

lfd $A3,32($nap_d)

664

lfd $N0,40($nap_d) ; load n[j] in double format

665

lfd $N1,48($nap_d)

666

lfd $N2,56($nap_d) ; load n[j+1] in double format

667

lfdu $N3,64($nap_d)

668

669

lfd $ba,`$FRAME+0`($sp)

670

lfd $bb,`$FRAME+8`($sp)

671

lfd $bc,`$FRAME+16`($sp)

672

lfd $bd,`$FRAME+24`($sp)

673

lfd $na,`$FRAME+32`($sp)

674

lfd $nb,`$FRAME+40`($sp)

675

lfd $nc,`$FRAME+48`($sp)

676

lfd $nd,`$FRAME+56`($sp)

677

678

fcfid $ba,$ba

679

fcfid $bb,$bb

680

fcfid $bc,$bc

681

fcfid $bd,$bd

682

fcfid $na,$na

683

fcfid $nb,$nb

684

fcfid $nc,$nc

685

fcfid $nd,$nd

686

687

fmul $T1a,$A1,$ba

688

fmul $T1b,$A1,$bb

689

fmul $T2a,$A2,$ba

690

fmul $T2b,$A2,$bb

691

fmul $T3a,$A3,$ba

692

fmul $T3b,$A3,$bb

693

fmul $T0a,$A0,$ba

694

fmul $T0b,$A0,$bb

695

696

fmadd $T1a,$A0,$bc,$T1a

697

fmadd $T1b,$A0,$bd,$T1b

698

fmadd $T2a,$A1,$bc,$T2a

699

fmadd $T2b,$A1,$bd,$T2b

700

fmadd $T3a,$A2,$bc,$T3a

701

fmadd $T3b,$A2,$bd,$T3b

702

fmul $dota,$A3,$bc

703

fmul $dotb,$A3,$bd

704

705

fmadd $T1a,$N1,$na,$T1a

706

fmadd $T1b,$N1,$nb,$T1b

707

lfd $A0,8($nap_d) ; load a[j] in double format

708

lfd $A1,16($nap_d)

709

fmadd $T2a,$N2,$na,$T2a

710

fmadd $T2b,$N2,$nb,$T2b

711

lfd $A2,24($nap_d) ; load a[j+1] in double format

712

lfd $A3,32($nap_d)

713

fmadd $T3a,$N3,$na,$T3a

714

fmadd $T3b,$N3,$nb,$T3b

715

fmadd $T0a,$N0,$na,$T0a

716

fmadd $T0b,$N0,$nb,$T0b

717

718

fmadd $T1a,$N0,$nc,$T1a

719

fmadd $T1b,$N0,$nd,$T1b

720

fmadd $T2a,$N1,$nc,$T2a

721

fmadd $T2b,$N1,$nd,$T2b

722

fmadd $T3a,$N2,$nc,$T3a

723

fmadd $T3b,$N2,$nd,$T3b

724

fmadd $dota,$N3,$nc,$dota

725

fmadd $dotb,$N3,$nd,$dotb

726

727

fctid $T0a,$T0a

728

fctid $T0b,$T0b

729

fctid $T1a,$T1a

730

fctid $T1b,$T1b

731

fctid $T2a,$T2a

732

fctid $T2b,$T2b

733

fctid $T3a,$T3a

734

fctid $T3b,$T3b

735

736

stfd $T0a,`$FRAME+0`($sp)

737

stfd $T0b,`$FRAME+8`($sp)

738

stfd $T1a,`$FRAME+16`($sp)

739

stfd $T1b,`$FRAME+24`($sp)

740

stfd $T2a,`$FRAME+32`($sp)

741

stfd $T2b,`$FRAME+40`($sp)

742

stfd $T3a,`$FRAME+48`($sp)

743

stfd $T3b,`$FRAME+56`($sp)

744

745

.align 5

746

Linner:

747

fmul $T1a,$A1,$ba

748

fmul $T1b,$A1,$bb

749

fmul $T2a,$A2,$ba

750

fmul $T2b,$A2,$bb

751

lfd $N0,40($nap_d) ; load n[j] in double format

752

lfd $N1,48($nap_d)

753

fmul $T3a,$A3,$ba

754

fmul $T3b,$A3,$bb

755

fmadd $T0a,$A0,$ba,$dota

756

fmadd $T0b,$A0,$bb,$dotb

757

lfd $N2,56($nap_d) ; load n[j+1] in double format

758

lfdu $N3,64($nap_d)

759

760

fmadd $T1a,$A0,$bc,$T1a

761

fmadd $T1b,$A0,$bd,$T1b

762

fmadd $T2a,$A1,$bc,$T2a

763

fmadd $T2b,$A1,$bd,$T2b

764

lfd $A0,8($nap_d) ; load a[j] in double format

765

lfd $A1,16($nap_d)

766

fmadd $T3a,$A2,$bc,$T3a

767

fmadd $T3b,$A2,$bd,$T3b

768

fmul $dota,$A3,$bc

769

fmul $dotb,$A3,$bd

770

lfd $A2,24($nap_d) ; load a[j+1] in double format

771

lfd $A3,32($nap_d)

772

773

fmadd $T1a,$N1,$na,$T1a

774

fmadd $T1b,$N1,$nb,$T1b

775

ld $t0,`$FRAME+0`($sp)

776

ld $t1,`$FRAME+8`($sp)

777

fmadd $T2a,$N2,$na,$T2a

778

fmadd $T2b,$N2,$nb,$T2b

779

ld $t2,`$FRAME+16`($sp)

780

ld $t3,`$FRAME+24`($sp)

781

fmadd $T3a,$N3,$na,$T3a

782

fmadd $T3b,$N3,$nb,$T3b

783

add $t0,$t0,$carry ; can not overflow

784

ld $t4,`$FRAME+32`($sp)

785

ld $t5,`$FRAME+40`($sp)

786

fmadd $T0a,$N0,$na,$T0a

787

fmadd $T0b,$N0,$nb,$T0b

788

srdi $carry,$t0,16

789

add $t1,$t1,$carry

790

srdi $carry,$t1,16

791

ld $t6,`$FRAME+48`($sp)

792

ld $t7,`$FRAME+56`($sp)

793

794

fmadd $T1a,$N0,$nc,$T1a

795

fmadd $T1b,$N0,$nd,$T1b

796

insrdi $t0,$t1,16,32

797

ld $t1,8($tp) ; tp[j]

798

fmadd $T2a,$N1,$nc,$T2a

799

fmadd $T2b,$N1,$nd,$T2b

800

add $t2,$t2,$carry

801

fmadd $T3a,$N2,$nc,$T3a

802

fmadd $T3b,$N2,$nd,$T3b

803

srdi $carry,$t2,16

804

insrdi $t0,$t2,16,16

805

fmadd $dota,$N3,$nc,$dota

806

fmadd $dotb,$N3,$nd,$dotb

807

add $t3,$t3,$carry

808

ldu $t2,16($tp) ; tp[j+1]

809

srdi $carry,$t3,16

810

insrdi $t0,$t3,16,0 ; 0..63 bits

811

add $t4,$t4,$carry

812

813

fctid $T0a,$T0a

814

fctid $T0b,$T0b

815

srdi $carry,$t4,16

816

fctid $T1a,$T1a

817

fctid $T1b,$T1b

818

add $t5,$t5,$carry

819

fctid $T2a,$T2a

820

fctid $T2b,$T2b

821

srdi $carry,$t5,16

822

insrdi $t4,$t5,16,32

823

fctid $T3a,$T3a

824

fctid $T3b,$T3b

825

add $t6,$t6,$carry

826

srdi $carry,$t6,16

827

insrdi $t4,$t6,16,16

828

829

stfd $T0a,`$FRAME+0`($sp)

830

stfd $T0b,`$FRAME+8`($sp)

831

add $t7,$t7,$carry

832

addc $t3,$t0,$t1

833

___

834

$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]

835

extrdi $t0,$t0,32,0

836

extrdi $t1,$t1,32,0

837

adde $t0,$t0,$t1

838

___

839

$code.=<<___;

840

stfd $T1a,`$FRAME+16`($sp)

841

stfd $T1b,`$FRAME+24`($sp)

842

insrdi $t4,$t7,16,0 ; 64..127 bits

843

srdi $carry,$t7,16 ; upper 33 bits

844

stfd $T2a,`$FRAME+32`($sp)

845

stfd $T2b,`$FRAME+40`($sp)

846

adde $t5,$t4,$t2

847

___

848

$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]

849

extrdi $t4,$t4,32,0

850

extrdi $t2,$t2,32,0

851

adde $t4,$t4,$t2

852

___

853

$code.=<<___;

854

stfd $T3a,`$FRAME+48`($sp)

855

stfd $T3b,`$FRAME+56`($sp)

856

addze $carry,$carry

857

std $t3,-16($tp) ; tp[j-1]

858

std $t5,-8($tp) ; tp[j]

859

bdnz- Linner

860

861

fctid $dota,$dota

862

fctid $dotb,$dotb

863

ld $t0,`$FRAME+0`($sp)

864

ld $t1,`$FRAME+8`($sp)

865

ld $t2,`$FRAME+16`($sp)

866

ld $t3,`$FRAME+24`($sp)

867

ld $t4,`$FRAME+32`($sp)

868

ld $t5,`$FRAME+40`($sp)

869

ld $t6,`$FRAME+48`($sp)

870

ld $t7,`$FRAME+56`($sp)

871

stfd $dota,`$FRAME+64`($sp)

872

stfd $dotb,`$FRAME+72`($sp)

873

874

add $t0,$t0,$carry ; can not overflow

875

srdi $carry,$t0,16

876

add $t1,$t1,$carry

877

srdi $carry,$t1,16

878

insrdi $t0,$t1,16,32

879

add $t2,$t2,$carry

880

ld $t1,8($tp) ; tp[j]

881

srdi $carry,$t2,16

882

insrdi $t0,$t2,16,16

883

add $t3,$t3,$carry

884

ldu $t2,16($tp) ; tp[j+1]

885

srdi $carry,$t3,16

886

insrdi $t0,$t3,16,0 ; 0..63 bits

887

add $t4,$t4,$carry

888

srdi $carry,$t4,16

889

add $t5,$t5,$carry

890

srdi $carry,$t5,16

891

insrdi $t4,$t5,16,32

892

add $t6,$t6,$carry

893

srdi $carry,$t6,16

894

insrdi $t4,$t6,16,16

895

add $t7,$t7,$carry

896

insrdi $t4,$t7,16,0 ; 64..127 bits

897

srdi $carry,$t7,16 ; upper 33 bits

898

ld $t6,`$FRAME+64`($sp)

899

ld $t7,`$FRAME+72`($sp)

900

901

addc $t3,$t0,$t1

902

___

903

$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]

904

extrdi $t0,$t0,32,0

905

extrdi $t1,$t1,32,0

906

adde $t0,$t0,$t1

907

___

908

$code.=<<___;

909

adde $t5,$t4,$t2

910

___

911

$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]

912

extrdi $t4,$t4,32,0

913

extrdi $t2,$t2,32,0

914

adde $t4,$t4,$t2

915

___

916

$code.=<<___;

917

addze $carry,$carry

918

919

std $t3,-16($tp) ; tp[j-1]

920

std $t5,-8($tp) ; tp[j]

921

922

add $carry,$carry,$ovf ; comsume upmost overflow

923

add $t6,$t6,$carry ; can not overflow

924

srdi $carry,$t6,16

925

add $t7,$t7,$carry

926

insrdi $t6,$t7,48,0

927

srdi $ovf,$t7,48

928

std $t6,0($tp) ; tp[num-1]

929

930

slwi $t7,$num,2

931

addi $i,$i,8

932

subf $nap_d,$t7,$nap_d ; rewind pointer

933

cmpw $i,$num

934

blt- Louter

935

___

936

937

$code.=<<___ if ($SIZE_T==8);

938

subf $np,$num,$np ; rewind np

939

addi $j,$j,1 ; restore counter

940

subfc $i,$i,$i ; j=0 and "clear" XER[CA]

941

addi $tp,$sp,`$FRAME+$TRANSFER+8`

942

addi $t4,$sp,`$FRAME+$TRANSFER+16`

943

addi $t5,$np,8

944

addi $t6,$rp,8

945

mtctr $j

946

947

.align 4

948

Lsub: ldx $t0,$tp,$i

949

ldx $t1,$np,$i

950

ldx $t2,$t4,$i

951

ldx $t3,$t5,$i

952

subfe $t0,$t1,$t0 ; tp[j]-np[j]

953

subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]

954

stdx $t0,$rp,$i

955

stdx $t2,$t6,$i

956

addi $i,$i,16

957

bdnz- Lsub

958

959

li $i,0

960

subfe $ovf,$i,$ovf ; handle upmost overflow bit

961

and $ap,$tp,$ovf

962

andc $np,$rp,$ovf

963

or $ap,$ap,$np ; ap=borrow?tp:rp

964

addi $t7,$ap,8

965

mtctr $j

966

967

.align 4

968

Lcopy: ; copy or in-place refresh

969

ldx $t0,$ap,$i

970

ldx $t1,$t7,$i

971

std $i,8($nap_d) ; zap nap_d

972

std $i,16($nap_d)

973

std $i,24($nap_d)

974

std $i,32($nap_d)

975

std $i,40($nap_d)

976

std $i,48($nap_d)

977

std $i,56($nap_d)

978

stdu $i,64($nap_d)

979

stdx $t0,$rp,$i

980

stdx $t1,$t6,$i

981

stdx $i,$tp,$i ; zap tp at once

982

stdx $i,$t4,$i

983

addi $i,$i,16

984

bdnz- Lcopy

985

___

986

$code.=<<___ if ($SIZE_T==4);

987

subf $np,$num,$np ; rewind np

988

addi $j,$j,1 ; restore counter

989

subfc $i,$i,$i ; j=0 and "clear" XER[CA]

990

addi $tp,$sp,`$FRAME+$TRANSFER`

991

addi $np,$np,-4

992

addi $rp,$rp,-4

993

addi $ap,$sp,`$FRAME+$TRANSFER+4`

994

mtctr $j

995

996

.align 4

997

Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order

998

ldu $t2,16($tp)

999

lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order

1000

lwz $t5,8($np)

1001

lwz $t6,12($np)

1002

lwzu $t7,16($np)

1003

extrdi $t1,$t0,32,0

1004

extrdi $t3,$t2,32,0

1005

subfe $t4,$t4,$t0 ; tp[j]-np[j]

1006

stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order

1007

subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]

1008

stw $t1,8($ap)

1009

subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]

1010

stw $t2,12($ap)

1011

subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]

1012

stwu $t3,16($ap)

1013

stw $t4,4($rp)

1014

stw $t5,8($rp)

1015

stw $t6,12($rp)

1016

stwu $t7,16($rp)

1017

bdnz- Lsub

1018

1019

li $i,0

1020

subfe $ovf,$i,$ovf ; handle upmost overflow bit

1021

addi $tp,$sp,`$FRAME+$TRANSFER+4`

1022

subf $rp,$num,$rp ; rewind rp

1023

and $ap,$tp,$ovf

1024

andc $np,$rp,$ovf

1025

or $ap,$ap,$np ; ap=borrow?tp:rp

1026

addi $tp,$sp,`$FRAME+$TRANSFER`

1027

mtctr $j

1028

1029

.align 4

1030

Lcopy: ; copy or in-place refresh

1031

lwz $t0,4($ap)

1032

lwz $t1,8($ap)

1033

lwz $t2,12($ap)

1034

lwzu $t3,16($ap)

1035

std $i,8($nap_d) ; zap nap_d

1036

std $i,16($nap_d)

1037

std $i,24($nap_d)

1038

std $i,32($nap_d)

1039

std $i,40($nap_d)

1040

std $i,48($nap_d)

1041

std $i,56($nap_d)

1042

stdu $i,64($nap_d)

1043

stw $t0,4($rp)

1044

stw $t1,8($rp)

1045

stw $t2,12($rp)

1046

stwu $t3,16($rp)

1047

std $i,8($tp) ; zap tp at once

1048

stdu $i,16($tp)

1049

bdnz- Lcopy

1050

___

1051

1052

$code.=<<___;

1053

$POP $i,0($sp)

1054

li r3,1 ; signal "handled"

1055

$POP r22,`-12*8-10*$SIZE_T`($i)

1056

$POP r23,`-12*8-9*$SIZE_T`($i)

1057

$POP r24,`-12*8-8*$SIZE_T`($i)

1058

$POP r25,`-12*8-7*$SIZE_T`($i)

1059

$POP r26,`-12*8-6*$SIZE_T`($i)

1060

$POP r27,`-12*8-5*$SIZE_T`($i)

1061

$POP r28,`-12*8-4*$SIZE_T`($i)

1062

$POP r29,`-12*8-3*$SIZE_T`($i)

1063

$POP r30,`-12*8-2*$SIZE_T`($i)

1064

$POP r31,`-12*8-1*$SIZE_T`($i)

1065

lfd f20,`-12*8`($i)

1066

lfd f21,`-11*8`($i)

1067

lfd f22,`-10*8`($i)

1068

lfd f23,`-9*8`($i)

1069

lfd f24,`-8*8`($i)

1070

lfd f25,`-7*8`($i)

1071

lfd f26,`-6*8`($i)

1072

lfd f27,`-5*8`($i)

1073

lfd f28,`-4*8`($i)

1074

lfd f29,`-3*8`($i)

1075

lfd f30,`-2*8`($i)

1076

lfd f31,`-1*8`($i)

1077

mr $sp,$i

1078

blr

1079

.long 0

1080

.byte 0,12,4,0,0x8c,10,6,0

1081

.long 0

1082

1083

.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"

1084

___

1085

1086

$code =~ s/\`([^\`]*)\`/eval $1/gem;

1087

print $code;

1088

# STDOUT is a pipe to the translator script. close() waits for that child
# and returns false if flushing fails or the child exits with non-zero
# status; in the latter case $! is 0 and the status is in $?. Without this
# check a failed translation would silently leave truncated output behind.
close STDOUT or die "error closing STDOUT: $! (child status $?)";

Older »