3
# ====================================================================
4
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5
# project. The module is, however, dual licensed under OpenSSL and
6
# CRYPTOGAMS licenses depending on where you obtain it. For further
7
# details see http://www.openssl.org/~appro/cryptogams/.
8
# ====================================================================
10
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
11
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
12
# instructed to '-tune host' code with in-line assembler. Other
13
# benchmarks improve by 15-20%. To anchor it to something else, the
14
# code provides approximately the same performance per GHz as AMD64.
15
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.
19
# Register names ("a0".."a5") assigned to the six arguments, in the order of
# the C-style parameter list spelled out in the trailing comments.
# NOTE(review): these strings are presumably interpolated wherever $rp, $ap,
# ... appear in the assembly text further down -- confirm against the heredoc.
$rp="a0"; # BN_ULONG *rp,
20
$ap="a1"; # const BN_ULONG *ap,
21
$bp="a2"; # const BN_ULONG *bp,
22
$np="a3"; # const BN_ULONG *np,
23
$n0="a4"; # const BN_ULONG *n0,
24
$num="a5"; # int num);
45
#include <asm/regdef.h>
78
ldq $hi0,0($ap) # ap[0]: 64-bit load from offset 0 of $ap
82
ldq $bi,0($bp) # bp[0]: 64-bit load from offset 0 of $bp
83
lda AT,-4096(zero) # mov -4096,AT (lda only adds the displacement to the zero register, no memory access)
88
ldq $hi1,0($np) # np[0]: 64-bit load from offset 0 of $np
203
s8addq $j,$np,$nj #U0 # $nj = $np + 8*$j; U0/U1/L0/L1 tags presumably name 21264 issue slots (confirm)
207
addq $alo,$hi0,$lo0 #L1 # $lo0 = $alo + $hi0
210
mulq $aj,$bi,$alo #U1 # $alo = low 64 bits of $aj*$bi
211
cmpult $lo0,$hi0,AT #L0 # AT = 1 if the sum wrapped (carry out of $alo+$hi0)
212
addq $nlo,$hi1,$lo1 #L1 # $lo1 = $nlo + $hi1
215
mulq $nj,$m1,$nlo #U1 # $nlo = low 64 bits of $nj*$m1
216
addq $ahi,AT,$hi0 #L0 # $hi0 = $ahi + carry in AT
217
addq $lo0,$tj,$lo0 #L1 # $lo0 += $tj
218
cmpult $lo1,$hi1,v0 #U0 # v0 = 1 if the sum wrapped (carry out of $nlo+$hi1)
220
umulh $aj,$bi,$ahi #U1 # $ahi = high 64 bits of unsigned $aj*$bi
221
cmpult $lo0,$tj,AT #L0 # AT = carry out of $lo0+$tj
222
addq $lo1,$lo0,$lo1 #L1 # $lo1 += $lo0
223
addq $nhi,v0,$hi1 #U0 # $hi1 = $nhi + carry in v0
225
umulh $nj,$m1,$nhi #U1 # $nhi = high 64 bits of unsigned $nj*$m1
226
s8addq $j,$ap,$aj #L0 # $aj = $ap + 8*$j
227
cmpult $lo1,$lo0,v0 #L1 # v0 = carry out of $lo1+$lo0
228
cmplt $j,$num,$tj #U0 # borrow $tj as a flag: 1 if $j < $num (signed), else 0
230
addq $hi0,AT,$hi0 #L0 # fold the second carry into $hi0
231
addq $hi1,v0,$hi1 #U1 # fold the second carry into $hi1
255
cmpult $lo1,$hi0,$hi1 # $hi1 = 1 if $lo1 < $hi0 (unsigned), else 0
261
cmplt $i,$num,$tj # borrow $tj as a flag: 1 if $i < $num (signed), else 0
265
s8addq $num,sp,$tj # &tp[num] = sp + 8*$num
266
mov $rp,$bp # put rp aside ($bp is reused as the holder)
269
mov 0,$hi0 # clear borrow bit
272
.Lsub: ldq $lo0,0($tp) # 64-bit load of the word $tp points at
276
subq $lo0,$lo1,$lo1 # tp[i]-np[i]: $lo1 = $lo0 - $lo1
279
cmpult $lo1,$lo0,$hi0 # $hi0 = 1 if $lo1 < $lo0 (unsigned), else 0
286
subq $hi1,$hi0,$hi0 # handle upmost overflow bit: $hi0 = $hi1 - $hi0
288
mov $bp,$rp # restore rp
292
bis $bp,$ap,$ap # ap=borrow?tp:rp (bis is a bitwise OR; presumably one operand was masked to zero beforehand - confirm)
295
.Lcopy: ldq $aj,0($ap) # copy or in-place refresh: load the word at the selected source
299
stq zero,-8($tp) # zap tp: store 0 into the word 8 bytes below $tp
316
.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>" # identification text emitted as raw bytes; the backslash before the at-sign presumably guards against Perl interpolation (confirm)