3
# ====================================================================
4
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5
# project. The module is, however, dual licensed under OpenSSL and
6
# CRYPTOGAMS licenses depending on where you obtain it. For further
7
# details see http://www.openssl.org/~appro/cryptogams/.
8
# ====================================================================
12
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13
# for undertaken effort are multiple. First of all, UltraSPARC is not
14
# the whole SPARCv9 universe and other VIS-free implementations deserve
15
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
16
# a.k.a. Niagara, has a shared FPU and concurrent FPU-intensive paths,
17
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
18
# several integrated RSA/DSA accelerator circuits accessible through
19
# kernel driver [only(*)], but having decent user-land software
20
# implementation is important too. Finally, reasons like desire to
21
# experiment with dedicated squaring procedure. Yes, this module
22
# implements one, because it was easiest to draft it in SPARCv9
25
# (*) Engine accessing the driver in question is on my TODO list.
26
# For reference, the accelerator is estimated to give 6 to 10 times
27
# improvement on single-threaded RSA sign. It should be noted
28
# that 6-10x improvement coefficient does not actually mean
29
# something extraordinary in terms of absolute [single-threaded]
30
# performance, as SPARCv9 instruction set is by all means least
31
# suitable for high performance crypto among other 64 bit
32
# platforms. 6-10x factor simply places T1 in same performance
33
# domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
34
# appear impressive at all, but it's the sign operation which is
35
# far more critical/interesting.
37
# You might notice that inner loops are modulo-scheduled:-) This has
38
# essentially negligible impact on UltraSPARC performance, it's
39
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
40
# the advantage... Currently this module surpasses sparcv9a-mont.pl
41
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42
# module still has hidden potential [see TODO list there], which is
43
# estimated to be larger than 20%...
46
# Register aliases for the six C-level arguments:
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
# After the register-window "save" the caller's %o0-%o5 appear as %i0-%i5.
$rp="%i0"; # BN_ULONG *rp,
47
$ap="%i1"; # const BN_ULONG *ap,
48
$bp="%i2"; # const BN_ULONG *bp,
49
$np="%i3"; # const BN_ULONG *np,
50
$n0="%i4"; # const BN_ULONG *n0,
51
$num="%i5"; # int num);
54
# Choose ABI parameters from compiler flags on the command line:
# -m64 / -xarch=v9 selects the 64-bit ABI (2047-byte stack bias,
# 192-byte minimal frame); otherwise the 32-bit ABI (no bias, 128-byte
# frame) is assumed.
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
55
if ($bits==64) { $bias=2047; $frame=192; }
56
else { $bias=0; $frame=128; }
63
# Limbs are 32-bit even in 64-bit registers; $mask extracts the low word.
$mask="%g1"; # 32 bits, what a waste...
76
# Exported symbol name for this integer-only (IALU) implementation.
$fname="bn_mul_mont_int";
79
.section ".text",#alloc,#execinstr
84
cmp %o5,4 ! 128 bits minimum
86
sethi %hi(0xffffffff),$mask
92
sll $num,2,$num ! num*=4
93
or $mask,%lo(0xffffffff),$mask
97
ld [$bp],$mul0 ! bp[0]
100
add %sp,$bias,%o7 ! real top of stack
101
ld [$ap],$car0 ! ap[0] ! redundant in squaring context
103
ld [$ap+4],$apj ! ap[1]
105
ld [$np],$car1 ! np[0]
106
sub %o7,$bias,%sp ! alloca
107
ld [$np+4],$npj ! np[1]
108
be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
111
mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
112
mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
113
and $car0,$mask,$acc0
114
add %sp,$bias+$frame,$tp
115
ld [$ap+8],$apj !prologue!
117
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
118
and $mul1,$mask,$mul1
120
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
121
mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
123
add $acc0,$car1,$car1
124
ld [$np+8],$npj !prologue!
126
mov $tmp0,$acc0 !prologue!
129
mulx $apj,$mul0,$tmp0
130
mulx $npj,$mul1,$tmp1
131
add $acc0,$car0,$car0
132
ld [$ap+$j],$apj ! ap[j]
133
and $car0,$mask,$acc0
134
add $acc1,$car1,$car1
135
ld [$np+$j],$npj ! np[j]
137
add $acc0,$car1,$car1
148
mulx $apj,$mul0,$tmp0 !epilogue!
149
mulx $npj,$mul1,$tmp1
150
add $acc0,$car0,$car0
151
and $car0,$mask,$acc0
152
add $acc1,$car1,$car1
154
add $acc0,$car1,$car1
158
add $tmp0,$car0,$car0
159
and $car0,$mask,$acc0
160
add $tmp1,$car1,$car1
162
add $acc0,$car1,$car1
166
add $car0,$car1,$car1
171
ld [$bp+4],$mul0 ! bp[1]
173
add %sp,$bias+$frame,$tp
174
ld [$ap],$car0 ! ap[0]
175
ld [$ap+4],$apj ! ap[1]
176
ld [$np],$car1 ! np[0]
177
ld [$np+4],$npj ! np[1]
178
ld [$tp],$tmp1 ! tp[0]
179
ld [$tp+4],$tpj ! tp[1]
182
mulx $car0,$mul0,$car0
183
mulx $apj,$mul0,$tmp0 !prologue!
184
add $tmp1,$car0,$car0
185
ld [$ap+8],$apj !prologue!
186
and $car0,$mask,$acc0
189
and $mul1,$mask,$mul1
191
mulx $car1,$mul1,$car1
192
mulx $npj,$mul1,$acc1 !prologue!
194
add $acc0,$car1,$car1
195
ld [$np+8],$npj !prologue!
197
mov $tmp0,$acc0 !prologue!
200
mulx $apj,$mul0,$tmp0
201
mulx $npj,$mul1,$tmp1
203
ld [$ap+$j],$apj ! ap[j]
204
add $acc0,$car0,$car0
205
add $acc1,$car1,$car1
206
ld [$np+$j],$npj ! np[j]
207
and $car0,$mask,$acc0
208
ld [$tp+8],$tpj ! tp[j]
210
add $acc0,$car1,$car1
213
st $car1,[$tp] ! tp[j-1]
221
mulx $apj,$mul0,$tmp0 !epilogue!
222
mulx $npj,$mul1,$tmp1
224
add $acc0,$car0,$car0
225
ld [$tp+8],$tpj ! tp[j]
226
and $car0,$mask,$acc0
227
add $acc1,$car1,$car1
229
add $acc0,$car1,$car1
230
st $car1,[$tp] ! tp[j-1]
234
add $tmp0,$car0,$car0
235
and $car0,$mask,$acc0
236
add $tmp1,$car1,$car1
237
add $acc0,$car1,$car1
238
st $car1,[$tp+4] ! tp[j-1]
243
add $car0,$car1,$car1
245
add $car2,$car1,$car1
250
ld [$bp+$i],$mul0 ! bp[i]
259
sub %g0,$num,%o7 ! k=-num
261
subcc %g0,%g0,%g0 ! clear %icc.c
266
subccc %o0,%o1,%o1 ! tp[j]-np[j]
271
subc $car2,0,$car2 ! handle upmost overflow bit
278
ld [$ap+%o7],%o0 ! copy or in-place refresh
279
st %g0,[$tp+%o7] ! zap tp
290
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
291
######## code without following dedicated squaring procedure.
293
# Squaring-path scratch register: presumably holds the bit shifted out
# while doubling the cross products; reusing $bp's register looks safe
# because bp==ap in the squaring branch — NOTE(review): confirm against
# the full .Lbn_sqr_mont code, which is not entirely visible here.
$sbit="%i2"; # re-use $bp!
298
mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
299
mulx $apj,$mul0,$tmp0 !prologue!
300
and $car0,$mask,$acc0
301
add %sp,$bias+$frame,$tp
302
ld [$ap+8],$apj !prologue!
304
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
306
and $mul1,$mask,$mul1
308
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
309
mulx $npj,$mul1,$acc1 !prologue!
311
ld [$np+8],$npj !prologue!
313
add $acc0,$car1,$car1
315
mov $tmp0,$acc0 !prologue!
318
mulx $apj,$mul0,$tmp0
319
mulx $npj,$mul1,$tmp1
320
add $acc0,$car0,$car0 ! ap[j]*a0+c0
321
add $acc1,$car1,$car1
322
ld [$ap+$j],$apj ! ap[j]
323
and $car0,$mask,$acc0
324
ld [$np+$j],$npj ! np[j]
326
add $acc0,$acc0,$acc0
331
and $acc0,$mask,$acc0
333
add $acc0,$car1,$car1
341
mulx $apj,$mul0,$tmp0 ! epilogue
342
mulx $npj,$mul1,$tmp1
343
add $acc0,$car0,$car0 ! ap[j]*a0+c0
344
add $acc1,$car1,$car1
345
and $car0,$mask,$acc0
347
add $acc0,$acc0,$acc0
350
and $acc0,$mask,$acc0
351
add $acc0,$car1,$car1
355
add $tmp0,$car0,$car0 ! ap[j]*a0+c0
356
add $tmp1,$car1,$car1
357
and $car0,$mask,$acc0
359
add $acc0,$acc0,$acc0
362
and $acc0,$mask,$acc0
363
add $acc0,$car1,$car1
367
add $car0,$car0,$car0
369
add $car0,$car1,$car1
373
ld [%sp+$bias+$frame],$tmp0 ! tp[0]
374
ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
375
ld [%sp+$bias+$frame+8],$tpj ! tp[2]
376
ld [$ap+4],$mul0 ! ap[1]
377
ld [$ap+8],$apj ! ap[2]
378
ld [$np],$car1 ! np[0]
379
ld [$np+4],$npj ! np[1]
382
mulx $mul0,$mul0,$car0
383
and $mul1,$mask,$mul1
385
mulx $car1,$mul1,$car1
386
mulx $npj,$mul1,$acc1
387
add $tmp0,$car1,$car1
388
and $car0,$mask,$acc0
389
ld [$np+8],$npj ! np[2]
391
add $tmp1,$car1,$car1
393
add $acc0,$car1,$car1
395
add $acc1,$car1,$car1
398
st $car1,[%sp+$bias+$frame] ! tp[0]=
400
add %sp,$bias+$frame+4,$tp
403
mulx $apj,$mul0,$acc0
404
mulx $npj,$mul1,$acc1
405
add $acc0,$car0,$car0
407
ld [$ap+$j],$apj ! ap[j]
408
and $car0,$mask,$acc0
409
ld [$np+$j],$npj ! np[j]
411
add $acc1,$car1,$car1
412
ld [$tp+8],$tpj ! tp[j]
413
add $acc0,$acc0,$acc0
417
and $acc0,$mask,$acc0
419
add $acc0,$car1,$car1
420
st $car1,[$tp] ! tp[j-1]
426
mulx $apj,$mul0,$acc0
427
mulx $npj,$mul1,$acc1
428
add $acc0,$car0,$car0
430
and $car0,$mask,$acc0
432
add $acc1,$car1,$car1
433
add $acc0,$acc0,$acc0
436
and $acc0,$mask,$acc0
437
add $acc0,$car1,$car1
438
st $car1,[$tp] ! tp[j-1]
441
add $car0,$car0,$car0
443
add $car0,$car1,$car1
444
add $car2,$car1,$car1
448
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
449
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
450
ld [$ap+8],$mul0 ! ap[2]
451
ld [$np],$car1 ! np[0]
452
ld [$np+4],$npj ! np[1]
454
and $mul1,$mask,$mul1
457
mulx $mul0,$mul0,$car0
458
mulx $car1,$mul1,$car1
459
and $car0,$mask,$acc0
460
add $tmp1,$car1,$car1
462
add %sp,$bias+$frame,$tp
470
mulx $npj,$mul1,$acc1
475
add $acc1,$car1,$car1
484
ld [$ap+$j],$apj ! ap[j]
485
mulx $npj,$mul1,$acc1
487
ld [$np+$j],$npj ! np[j]
488
add $acc0,$car1,$car1
489
ld [$tp+8],$tpj ! tp[j]
490
add $acc1,$car1,$car1
496
be,pn %icc,.Lsqr_no_inner2
500
mulx $apj,$mul0,$acc0
501
mulx $npj,$mul1,$acc1
503
add $acc0,$car0,$car0
504
ld [$ap+$j],$apj ! ap[j]
505
and $car0,$mask,$acc0
506
ld [$np+$j],$npj ! np[j]
508
add $acc0,$acc0,$acc0
509
ld [$tp+8],$tpj ! tp[j]
513
and $acc0,$mask,$acc0
515
add $acc0,$car1,$car1
516
add $acc1,$car1,$car1
517
st $car1,[$tp] ! tp[j-1]
523
mulx $apj,$mul0,$acc0
524
mulx $npj,$mul1,$acc1
526
add $acc0,$car0,$car0
527
and $car0,$mask,$acc0
529
add $acc0,$acc0,$acc0
532
and $acc0,$mask,$acc0
533
add $acc0,$car1,$car1
534
add $acc1,$car1,$car1
535
st $car1,[$tp] ! tp[j-1]
538
add $car0,$car0,$car0
540
add $car0,$car1,$car1
541
add $car2,$car1,$car1
546
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
547
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
548
ld [$ap+$i],$mul0 ! ap[j]
549
ld [$np],$car1 ! np[0]
550
ld [$np+4],$npj ! np[1]
552
and $mul1,$mask,$mul1
555
mulx $mul0,$mul0,$car0
556
mulx $car1,$mul1,$car1
557
and $car0,$mask,$acc0
558
add $tmp1,$car1,$car1
560
add %sp,$bias+$frame,$tp
565
cmp $tmp0,$num ! i<num-1
570
mulx $npj,$mul1,$acc1
575
add $acc1,$car1,$car1
583
mulx $npj,$mul1,$acc1
585
add $acc0,$car1,$car1
586
add $acc1,$car1,$car1
590
add $car0,$car0,$car0 ! recover $car0
592
add $car0,$car1,$car1
593
add $car2,$car1,$car1
599
! ELF metadata and module identification string.
.type	$fname,#function
600
.size	$fname,(.-$fname)
601
.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
604
# Post-process the generated assembly: s///gem evaluates every
# backtick-quoted Perl expression embedded in $code (e.g. the
# `$bits==32?"%icc":"%xcc"` condition-code selections) and splices
# the result into the emitted text.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;