# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling the outer loop; a dedicated
# squaring procedure should then give a further 20%, and the code can
# be adapted for 32-bit applications running on a 64-bit CPU. As for
# the latter, it won't be able to achieve "native" 64-bit performance,
# because in a 32-bit application context every addc instruction will
# have to be expanded as addc, twice right shift by 32 and finally
# adde, etc.
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
29
# Select the instruction mnemonics that the assembly template
# interpolates, according to the requested flavour: 32-bit word forms
# when $flavour contains "32", otherwise the 64-bit doubleword forms
# when it contains "64".  Any other flavour is fatal.
#
# NOTE(review): the template also interpolates names such as $LD, $ST,
# $PUSH, $POP, $BNSZ and $SIZE_T that are not assigned in the lines
# visible here -- confirm they are set alongside these.
if ($flavour =~ /32/) {
	$LDU=	"lwzu";		# load and update
	$LDX=	"lwzx";		# load indexed
	$STU=	"stwu";		# store and update
	$STX=	"stwx";		# store indexed
	$STUX=	"stwux";	# store indexed and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UCMP=	"cmplw";	# unsigned compare
	$SHRI=	"srwi";		# unsigned shift right by immediate
} elsif ($flavour =~ /64/) {
	# same as above, but 64-bit mnemonics...
	$LDU=	"ldu";		# load and update
	$LDX=	"ldx";		# load indexed
	$STU=	"stdu";		# store and update
	$STX=	"stdx";		# store indexed
	$STUX=	"stdux";	# store indexed and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UCMP=	"cmpld";	# unsigned compare
	$SHRI=	"srdi";		# unsigned shift right by immediate
} else { die "nonsense $flavour"; }
72
# Locate the ppc-xlate.pl translator relative to this script's own
# location and redirect STDOUT into it, so that everything printed
# afterwards is piped through the translator.
#
# $dir is the directory prefix of $0 (including the trailing path
# separator), or the empty string when $0 has no directory component.
# The match result is tested explicitly so a stale $1 left over from
# an earlier successful match can never leak into the path.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Try the script's own directory first, then the shared perlasm
# directory two levels up; give up if neither holds the translator.
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# The next command-line argument is appended to the translator command.
# The low-precedence "or" makes die guard open() itself; with "||" the
# test applied only to the (always true) command string, so a failed
# open went unreported.
open STDOUT,"| $^X $xlate $flavour ".shift() or die "can't call $xlate: $!";
87
$rp="r9"; # $rp is reassigned: the generated code copies r3 into it (mr $rp,r3)
91
# non-volatile registers
115
mr $rp,r3 ; $rp is reassigned
119
slwi $num,$num,`log($BNSZ)/log(2)`
121
addi $ovf,$num,`$FRAME+$RZONE`
122
subf $ovf,$ovf,$sp ; $sp-$ovf
123
and $ovf,$ovf,$tj ; minimize TLB usage
124
subf $ovf,$sp,$ovf ; $ovf-$sp
125
srwi $num,$num,`log($BNSZ)/log(2)`
128
$PUSH r14,`4*$SIZE_T`($sp)
129
$PUSH r15,`5*$SIZE_T`($sp)
130
$PUSH r16,`6*$SIZE_T`($sp)
131
$PUSH r17,`7*$SIZE_T`($sp)
132
$PUSH r18,`8*$SIZE_T`($sp)
133
$PUSH r19,`9*$SIZE_T`($sp)
134
$PUSH r20,`10*$SIZE_T`($sp)
135
$PUSH r21,`11*$SIZE_T`($sp)
136
$PUSH r22,`12*$SIZE_T`($sp)
137
$PUSH r23,`13*$SIZE_T`($sp)
138
$PUSH r24,`14*$SIZE_T`($sp)
139
$PUSH r25,`15*$SIZE_T`($sp)
141
$LD $n0,0($n0) ; pull n0[0] value
142
addi $num,$num,-2 ; adjust $num for counter register
144
$LD $m0,0($bp) ; m0=bp[0]
145
$LD $aj,0($ap) ; ap[0]
147
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
150
$LD $aj,$BNSZ($ap) ; ap[1]
151
$LD $nj,0($np) ; np[0]
153
$UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
155
$UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
158
$UMULL $lo1,$nj,$m1 ; np[0]*m1
160
$LD $nj,$BNSZ($np) ; np[1]
164
$UMULL $nlo,$nj,$m1 ; np[1]*m1
171
$LDX $aj,$ap,$j ; ap[j]
173
$LDX $nj,$np,$j ; np[j]
175
$UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
179
$UMULL $nlo,$nj,$m1 ; np[j]*m1
180
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
183
$ST $lo1,0($tp) ; tp[j-1]
185
addi $j,$j,$BNSZ ; j++
186
addi $tp,$tp,$BNSZ ; tp++
194
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
196
$ST $lo1,0($tp) ; tp[j-1]
200
addze $ovf,$ovf ; upmost overflow bit
206
$LDX $m0,$bp,$i ; m0=bp[i]
207
$LD $aj,0($ap) ; ap[0]
209
$LD $tj,$FRAME($sp) ; tp[0]
210
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
212
$LD $aj,$BNSZ($ap) ; ap[1]
213
$LD $nj,0($np) ; np[0]
214
addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
215
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
217
$UMULL $m1,$lo0,$n0 ; tp[0]*n0
219
$UMULL $lo1,$nj,$m1 ; np[0]*m1
221
$LD $nj,$BNSZ($np) ; np[1]
223
$UMULL $nlo,$nj,$m1 ; np[1]*m1
231
$LDX $aj,$ap,$j ; ap[j]
233
$LD $tj,$BNSZ($tp) ; tp[j]
235
$LDX $nj,$np,$j ; np[j]
237
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
240
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
241
$UMULL $nlo,$nj,$m1 ; np[j]*m1
244
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
245
addi $j,$j,$BNSZ ; j++
247
$ST $lo1,0($tp) ; tp[j-1]
248
addi $tp,$tp,$BNSZ ; tp++
251
$LD $tj,$BNSZ($tp) ; tp[j]
254
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
259
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
261
$ST $lo1,0($tp) ; tp[j-1]
263
addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
269
slwi $tj,$num,`log($BNSZ)/log(2)`
274
addi $num,$num,2 ; restore $num
275
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
280
Lsub: $LDX $tj,$tp,$j
282
subfe $aj,$nj,$tj ; tp[j]-np[j]
289
subfe $ovf,$j,$ovf ; handle upmost overflow bit
292
or $ap,$ap,$np ; ap=borrow?tp:rp
295
Lcopy: ; copy or in-place refresh
298
$STX $j,$tp,$j ; zap at once
302
$POP r14,`4*$SIZE_T`($sp)
303
$POP r15,`5*$SIZE_T`($sp)
304
$POP r16,`6*$SIZE_T`($sp)
305
$POP r17,`7*$SIZE_T`($sp)
306
$POP r18,`8*$SIZE_T`($sp)
307
$POP r19,`9*$SIZE_T`($sp)
308
$POP r20,`10*$SIZE_T`($sp)
309
$POP r21,`11*$SIZE_T`($sp)
310
$POP r22,`12*$SIZE_T`($sp)
311
$POP r23,`13*$SIZE_T`($sp)
312
$POP r24,`14*$SIZE_T`($sp)
313
$POP r25,`15*$SIZE_T`($sp)
318
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
321
# Expand every `...` placeholder in $code: the text between a pair of
# backticks is evaluated as a Perl expression (e.g. `4*$SIZE_T`) and the
# placeholder is replaced by the result.  /g handles all occurrences,
# /e evaluates the replacement as code.
$code =~ s/\`([^\`]*)\`/eval $1/gem;