3
# ====================================================================
4
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5
# project. The module is, however, dual licensed under OpenSSL and
6
# CRYPTOGAMS licenses depending on where you obtain it. For further
7
# details see http://www.openssl.org/~appro/cryptogams/.
8
# ====================================================================
12
# Performance improvement over vanilla C code varies from 85% to 45%
13
# depending on key length and benchmark. Unfortunately in this context
14
# these are not very impressive results [for code that utilizes "wide"
15
# 64x64=128-bit multiplication, which is not commonly available to C
16
# programmers]; at least a hand-coded bn_asm.c replacement is known to
17
# provide 30-40% better results for the longest keys. Well, on second
18
# thought it's not very surprising, because z-CPUs are single-issue
19
# and _strictly_ in-order execution, while bn_mul_mont is more or less
20
# dependent on the CPU's ability to pipeline instructions and have several
21
# of them "in-flight" at the same time. I mean while other methods,
22
# for example Karatsuba, aim to minimize the number of multiplications at
23
# the cost of more of the other operations, bn_mul_mont aims to neatly
24
# "overlap" multiplications and the other operations [and on most
25
# platforms even minimize the amount of the other operations, in
26
# particular references to memory]. But it's possible to improve this
27
# module's performance by implementing a dedicated squaring code-path and
28
# possibly by unrolling loops...
32
# Reschedule to minimize/avoid Address Generation Interlock hazard,
33
# make inner loops counter-based.
39
# Register aliases for the leading bn_mul_mont arguments. Each trailing
# comment spells out the matching C parameter declaration, so the first
# five parameters map onto %r2 through %r6 in order.
$rp="%r2"; # BN_ULONG *rp,
40
$ap="%r3"; # const BN_ULONG *ap,
41
$bp="%r4"; # const BN_ULONG *bp,
42
$np="%r5"; # const BN_ULONG *np,
43
$n0="%r6"; # const BN_ULONG *n0,
44
# The last parameter gets no register alias here: the assignment below is
# commented out and only records a stack location, 160(%r15).
#$num="160(%r15)" # int num);
61
# bn_mul_mont: Montgomery multiplication for s390x (see the .string at the
# end of this block). Names written with a dollar sign are register aliases
# that are substituted before the text reaches the assembler.
# Going by the comments on the two early-exit branches, the routine returns
# 0 when the byte-scaled length is below 16 or above 128; the full path ends
# by loading 1 into %r2 to signal that the operands were processed.
# tp[] denotes the temporary vector addressed at offset 160 from the stack
# pointer, indexed by the byte offset kept in the j register.
# NOTE(review): only annotated instructions are visible in this listing and
# the bare numbers between them look like line numbers of the file it was
# extracted from. Comments below that depend on instructions not shown here
# are marked as assumptions to confirm against the complete module.
.type bn_mul_mont,\@function
63
# lgf loads a 32-bit value and sign-extends it to 64 bits
lgf $num,164($sp) # pull $num
64
# shifting left by 3 multiplies by 8, turning a word count into a byte count
sla $num,3 # $num to enumerate bytes
71
# conditional return through %r14 when the preceding compare reported low
# (the compare and the load of the return value are not visible here)
blr %r14 # if($num<16) return 0;
73
# conditional return through %r14 when the preceding compare reported high
bhr %r14 # if($num>128) return 0;
77
# the rp register serves as scratch from here on and is reloaded from
# memory before the final subtraction; 160 matches the tp[] base offset
lghi $rp,-160-8 # leave room for carry bit
81
# new stack pointer = j + rp; presumably j holds the negated byte count and
# rp the lowered old stack pointer at this point -- TODO confirm
la $sp,0($j,$rp) # alloca
82
# NOTE(review): %r0 presumably still holds the previous stack pointer here
stg %r0,0($sp) # back chain
84
# shifting right by 3 turns the byte count back into a word count
sra $num,3 # restore $num
85
# adds j to bp; presumably this undoes an earlier advance of bp -- confirm
la $bp,0($j,$bp) # restore $bp
86
ahi $num,-1 # adjust $num for inner loop
87
# replaces the pointer held in the n0 register with the word it points to
lg $n0,0($n0) # pull n0
91
# ---- first pass: products with bp[0] ----
# mlgr is the unsigned 64x64-bit multiply that yields a 128-bit product
mlgr $ahi,$bi # ap[0]*bp[0]
94
# copy of the low product word; it becomes the multiplier m1 used below
lgr $mn0,$alo # "tp[0]"*n0
98
mlgr $nhi,$mn0 # np[0]*m1
99
algr $nlo,$alo # +="tp[0]"
109
mlgr $ahi,$bi # ap[j]*bp[0]
115
mlgr $nhi,$mn0 # np[j]*m1
118
# alcgr adds with the carry left by the preceding addition
alcgr $nhi,$NHI # +="tp[j]"
122
# the -8 displacement stores the result one word below index j
stg $nlo,160-8($j,$sp) # tp[j-1]=
128
# NOTE(review): presumably AHI was cleared just before, so this captures
# only the carry -- confirm
alcgr $AHI,$AHI # upmost overflow bit
129
stg $NHI,160-8($j,$sp)
134
# ---- subsequent passes: products with bp[i], accumulated into tp[] ----
lg $bi,0($bp) # bp[i]
136
mlgr $ahi,$bi # ap[0]*bp[i]
137
alg $alo,160($sp) # +=tp[0]
142
# msgr keeps only the low 64 bits of the product
msgr $mn0,$n0 # tp[0]*n0
144
lg $nlo,0($np) # np[0]
145
mlgr $nhi,$mn0 # np[0]*m1
146
algr $nlo,$alo # +="tp[0]"
156
mlgr $ahi,$bi # ap[j]*bp[i]
160
alg $alo,160($j,$sp)# +=tp[j]
164
mlgr $nhi,$mn0 # np[j]*m1
168
algr $nlo,$alo # +="tp[j]"
171
stg $nlo,160-8($j,$sp) # tp[j-1]=
178
alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit
180
alcgr $AHI,$ahi # new upmost overflow bit
181
stg $NHI,160-8($j,$sp)
185
# unsigned compare of bp with a value kept in memory above tp[];
# presumably a copy saved on entry -- confirm
clg $bp,160+8+32($j,$sp) # compare to &bp[num]
188
# ---- final subtraction and result selection ----
# rp was overwritten during frame setup, so it is fetched back from memory
lg $rp,160+8+16($j,$sp) # reincarnate rp
190
ahi $num,1 # restore $num, incidentally clears "borrow"
194
# loads the word at byte offset j from the vector addressed by ap
.Lsub: lg $alo,0($j,$ap)
200
# slbgr subtracts with borrow, folding the last borrow into AHI
slbgr $AHI,$ahi # handle upmost carry
206
# merges two pointer values by OR; presumably both were masked beforehand
# so that exactly one of them survives -- confirm
ogr $ap,$np # ap=borrow?tp:rp
210
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
211
# overwrites tp[j] with the current value of j, discarding the intermediate
stg $j,160($j,$sp) # zap tp
216
# address computed above tp[]; presumably the base of the saved registers
# that are restored before returning -- confirm
la %r1,160+8+48($j,$sp)
218
lghi %r2,1 # signal "processed"
220
.size bn_mul_mont,.-bn_mul_mont
221
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"