3
# ====================================================================
4
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5
# project. The module is, however, dual licensed under OpenSSL and
6
# CRYPTOGAMS licenses depending on where you obtain it. For further
7
# details see http://www.openssl.org/~appro/cryptogams/.
8
# ====================================================================
12
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
13
# Because unlike integer multiplier, which simply stalls whole CPU,
14
# FPU is fully pipelined and can effectively emit 48 bit partial
15
# product every cycle. Why not blended SPARC v9? One can argue that
16
# making this module dependent on UltraSPARC VIS extension limits its
17
# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
18
# implementations from compatibility matrix. But the rest, whole Sun
19
# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
20
# VIS extension instructions used in this module. This is considered
21
# good enough to not care about HAL SPARC64 users [if any] who have
22
# integer-only pure SPARCv9 module to "fall down" to.
24
# USI&II cores currently exhibit uniform 2x improvement [over pre-
25
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
26
# performance improves by a few percent for shorter keys and worsens by a few
27
# percent for longer keys. This is because USIII integer multiplier
28
# is >3x faster than USI&II one, which is harder to match [but see
29
# TODO list below]. It should also be noted that SPARC64 V features
30
# out-of-order execution, which *might* mean that integer multiplier
31
# is pipelined, which in turn *might* be impossible to match... On
32
# additional note, SPARC64 V implements FP Multiply-Add instruction,
33
# which is perfectly usable in this context... In other words, as far
34
# as Fujitsu SPARC64 V goes, talk to the author:-)
36
# The implementation implies following "non-natural" limitations on
38
# - num may not be less than 4;
39
# - num has to be even;
40
# Failure to meet either condition has no fatal effects, simply
41
# doesn't give any performance gain.
44
# - modulo-schedule inner loop for better performance (on in-order
45
# execution core such as UltraSPARC this shall result in further
46
# noticeable(!) improvement);
47
# - dedicated squaring procedure[?];
49
######################################################################
52
# Modulo-scheduled inner loops allow to interleave floating point and
53
# integer instructions and minimize Read-After-Write penalties. This
54
# results in *further* 20-50% performance improvement [depending on
55
# key length, more for longer keys] on USI&II cores and 30-80% - on
58
$fname="bn_mul_mont_fpu";
60
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
67
$frame=128; # 96 rounded up to largest known cache-line
71
# In order to provide for 32-/64-bit ABI duality, I keep integers wider
72
# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
73
# exclusively for pointers, indexes and other small values...
75
$rp="%i0"; # BN_ULONG *rp,
76
$ap="%i1"; # const BN_ULONG *ap,
77
$bp="%i2"; # const BN_ULONG *bp,
78
$np="%i3"; # const BN_ULONG *np,
79
$n0="%i4"; # const BN_ULONG *n0,
80
$num="%i5"; # int num);
83
$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
84
$ap_h="%l2"; # to these four vectors as double-precision FP values.
85
$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
86
$np_h="%l4"; # loop and L1-cache aliasing is minimized...
89
$mask="%l7"; # 16-bit mask, 0xffff
91
$n0="%g4"; # reassigned(!) to "64-bit" register
92
$carry="%i4"; # %i4 reused(!) for a carry bit
94
# FP register naming chart
109
$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
110
$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
111
$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
112
$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
114
$dota="%f24"; $dotb="%f26";
116
$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
117
$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
118
$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
119
$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
121
$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
124
.section ".text",#alloc,#execinstr
129
save %sp,-$frame-$locals,%sp
134
andcc $num,1,%g0 ! $num has to be even...
136
clr %i0 ! signal "unsupported input value"
139
sethi %hi(0xffff),$mask
140
ld [%i4+0],$n0 ! $n0 reassigned, remember?
141
or $mask,%lo(0xffff),$mask
144
or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
146
sll $num,3,$num ! num*=8
148
add %sp,$bias,%o0 ! real top of stack
150
add %o1,$num,%o1 ! %o1=num*5
152
and %o0,-2048,%o0 ! optimize TLB utilization
153
sub %o0,$bias,%sp ! alloca(5*num*8)
155
rd %asi,%o7 ! save %asi
156
add %sp,$bias+$frame+$locals,$tp
158
add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
163
wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
165
add $rp,$num,$rp ! readjust input pointers to point
166
add $ap,$num,$ap ! at the ends too...
170
stx %o7,[%sp+$bias+$frame+48] ! save %asi
172
sub %g0,$num,$i ! i=-num
173
sub %g0,$num,$j ! j=-num
178
ld [%o3+4],%g1 ! bp[0]
180
ld [%o4+4],%g5 ! ap[0]
189
mulx %o1,%o0,%o0 ! ap[0]*bp[0]
190
mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
191
stx %o0,[%sp+$bias+$frame+0]
193
ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
197
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
202
! transfer b[i] to FPU as 4x16-bit values
212
! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
213
ldda [%sp+$bias+$frame+6]%asi,$na
215
ldda [%sp+$bias+$frame+4]%asi,$nb
217
ldda [%sp+$bias+$frame+2]%asi,$nc
219
ldda [%sp+$bias+$frame+0]%asi,$nd
222
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
226
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
236
faddd $aloa,$nloa,$nloa
239
faddd $alob,$nlob,$nlob
242
faddd $aloc,$nloc,$nloc
245
faddd $alod,$nlod,$nlod
248
faddd $ahia,$nhia,$nhia
251
faddd $ahib,$nhib,$nhib
254
faddd $ahic,$nhic,$dota ! $nhic
255
faddd $ahid,$nhid,$dotb ! $nhid
257
faddd $nloc,$nhia,$nloc
258
faddd $nlod,$nhib,$nlod
265
std $nloa,[%sp+$bias+$frame+0]
267
std $nlob,[%sp+$bias+$frame+8]
269
std $nloc,[%sp+$bias+$frame+16]
271
std $nlod,[%sp+$bias+$frame+24]
273
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
277
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
287
ldx [%sp+$bias+$frame+0],%o0
289
ldx [%sp+$bias+$frame+8],%o1
291
ldx [%sp+$bias+$frame+16],%o2
293
ldx [%sp+$bias+$frame+24],%o3
297
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
301
faddd $aloa,$nloa,$nloa
304
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
308
faddd $alob,$nlob,$nlob
312
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
313
faddd $aloc,$nloc,$nloc
323
!or %o7,%o0,%o0 ! 64-bit result
324
srlx %o3,16,%g1 ! 34-bit carry
327
faddd $alod,$nlod,$nlod
330
faddd $ahia,$nhia,$nhia
333
faddd $ahib,$nhib,$nhib
336
faddd $dota,$nloa,$nloa
337
faddd $dotb,$nlob,$nlob
338
faddd $ahic,$nhic,$dota ! $nhic
339
faddd $ahid,$nhid,$dotb ! $nhid
341
faddd $nloc,$nhia,$nloc
342
faddd $nlod,$nhib,$nlod
349
std $nloa,[%sp+$bias+$frame+0]
350
std $nlob,[%sp+$bias+$frame+8]
352
std $nloc,[%sp+$bias+$frame+16]
354
std $nlod,[%sp+$bias+$frame+24]
356
.align 32 ! incidentally already aligned !
360
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
364
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
374
ldx [%sp+$bias+$frame+0],%o0
376
ldx [%sp+$bias+$frame+8],%o1
378
ldx [%sp+$bias+$frame+16],%o2
380
ldx [%sp+$bias+$frame+24],%o3
384
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
388
faddd $aloa,$nloa,$nloa
391
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
395
faddd $alob,$nlob,$nlob
399
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
401
faddd $aloc,$nloc,$nloc
407
faddd $alod,$nlod,$nlod
413
faddd $ahia,$nhia,$nhia
417
or %o7,%o0,%o0 ! 64-bit result
418
faddd $ahib,$nhib,$nhib
421
faddd $dota,$nloa,$nloa
422
srlx %o3,16,%g1 ! 34-bit carry
423
faddd $dotb,$nlob,$nlob
427
stx %o0,[$tp] ! tp[j-1]=
429
faddd $ahic,$nhic,$dota ! $nhic
430
faddd $ahid,$nhid,$dotb ! $nhid
432
faddd $nloc,$nhia,$nloc
433
faddd $nlod,$nhib,$nlod
440
std $nloa,[%sp+$bias+$frame+0]
441
std $nlob,[%sp+$bias+$frame+8]
442
std $nloc,[%sp+$bias+$frame+16]
443
std $nlod,[%sp+$bias+$frame+24]
453
ldx [%sp+$bias+$frame+0],%o0
454
ldx [%sp+$bias+$frame+8],%o1
455
ldx [%sp+$bias+$frame+16],%o2
456
ldx [%sp+$bias+$frame+24],%o3
459
std $dota,[%sp+$bias+$frame+32]
461
std $dotb,[%sp+$bias+$frame+40]
465
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
474
or %o7,%o0,%o0 ! 64-bit result
475
ldx [%sp+$bias+$frame+32],%o4
477
ldx [%sp+$bias+$frame+40],%o5
478
srlx %o3,16,%g1 ! 34-bit carry
482
stx %o0,[$tp] ! tp[j-1]=
496
stx %o4,[$tp] ! tp[num-1]=
502
sub %g0,$num,$j ! j=-num
503
add %sp,$bias+$frame+$locals,$tp
508
ld [%o3+4],%g1 ! bp[i]
510
ld [%o4+4],%g5 ! ap[0]
517
ldx [$tp],%o2 ! tp[0]
520
mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
521
stx %o0,[%sp+$bias+$frame+0]
523
! transfer b[i] to FPU as 4x16-bit values
529
! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
530
ldda [%sp+$bias+$frame+6]%asi,$na
532
ldda [%sp+$bias+$frame+4]%asi,$nb
534
ldda [%sp+$bias+$frame+2]%asi,$nc
536
ldda [%sp+$bias+$frame+0]%asi,$nd
538
ldd [$ap_l+$j],$alo ! load a[j] in double format
542
ldd [$np_l+$j],$nlo ! load n[j] in double format
552
faddd $aloa,$nloa,$nloa
555
faddd $alob,$nlob,$nlob
558
faddd $aloc,$nloc,$nloc
561
faddd $alod,$nlod,$nlod
564
faddd $ahia,$nhia,$nhia
567
faddd $ahib,$nhib,$nhib
570
faddd $ahic,$nhic,$dota ! $nhic
571
faddd $ahid,$nhid,$dotb ! $nhid
573
faddd $nloc,$nhia,$nloc
574
faddd $nlod,$nhib,$nlod
581
std $nloa,[%sp+$bias+$frame+0]
582
std $nlob,[%sp+$bias+$frame+8]
583
std $nloc,[%sp+$bias+$frame+16]
585
std $nlod,[%sp+$bias+$frame+24]
587
ldd [$ap_l+$j],$alo ! load a[j] in double format
589
ldd [$np_l+$j],$nlo ! load n[j] in double format
597
ldx [%sp+$bias+$frame+0],%o0
598
faddd $aloa,$nloa,$nloa
600
ldx [%sp+$bias+$frame+8],%o1
602
ldx [%sp+$bias+$frame+16],%o2
603
faddd $alob,$nlob,$nlob
605
ldx [%sp+$bias+$frame+24],%o3
609
faddd $aloc,$nloc,$nloc
614
faddd $alod,$nlod,$nlod
619
faddd $ahia,$nhia,$nhia
621
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
627
faddd $ahib,$nhib,$nhib
630
faddd $dota,$nloa,$nloa
632
faddd $dotb,$nlob,$nlob
635
faddd $ahic,$nhic,$dota ! $nhic
637
faddd $ahid,$nhid,$dotb ! $nhid
638
or %o7,%o0,%o0 ! 64-bit result
640
faddd $nloc,$nhia,$nloc
643
faddd $nlod,$nhib,$nlod
644
srlx %o3,16,%g1 ! 34-bit carry
653
std $nloa,[%sp+$bias+$frame+0]
654
std $nlob,[%sp+$bias+$frame+8]
656
std $nloc,[%sp+$bias+$frame+16]
657
bz,pn %icc,.Linnerskip
658
std $nlod,[%sp+$bias+$frame+24]
664
ldd [$ap_l+$j],$alo ! load a[j] in double format
666
ldd [$np_l+$j],$nlo ! load n[j] in double format
674
ldx [%sp+$bias+$frame+0],%o0
675
faddd $aloa,$nloa,$nloa
677
ldx [%sp+$bias+$frame+8],%o1
679
ldx [%sp+$bias+$frame+16],%o2
680
faddd $alob,$nlob,$nlob
682
ldx [%sp+$bias+$frame+24],%o3
686
faddd $aloc,$nloc,$nloc
691
faddd $alod,$nlod,$nlod
696
faddd $ahia,$nhia,$nhia
698
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
703
faddd $ahib,$nhib,$nhib
706
faddd $dota,$nloa,$nloa
708
faddd $dotb,$nlob,$nlob
711
faddd $ahic,$nhic,$dota ! $nhic
713
faddd $ahid,$nhid,$dotb ! $nhid
714
or %o7,%o0,%o0 ! 64-bit result
715
faddd $nloc,$nhia,$nloc
717
ldx [$tp+8],%o7 ! tp[j]
718
faddd $nlod,$nhib,$nlod
719
srlx %o3,16,%g1 ! 34-bit carry
729
stx %o0,[$tp] ! tp[j-1]
732
std $nloa,[%sp+$bias+$frame+0]
733
std $nlob,[%sp+$bias+$frame+8]
734
std $nloc,[%sp+$bias+$frame+16]
736
std $nlod,[%sp+$bias+$frame+24]
744
ldx [%sp+$bias+$frame+0],%o0
745
ldx [%sp+$bias+$frame+8],%o1
746
ldx [%sp+$bias+$frame+16],%o2
747
ldx [%sp+$bias+$frame+24],%o3
750
std $dota,[%sp+$bias+$frame+32]
752
std $dotb,[%sp+$bias+$frame+40]
756
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
765
ldx [%sp+$bias+$frame+32],%o4
766
or %o7,%o0,%o0 ! 64-bit result
767
ldx [%sp+$bias+$frame+40],%o5
769
ldx [$tp+8],%o7 ! tp[j]
770
srlx %o3,16,%g1 ! 34-bit carry
778
stx %o0,[$tp] ! tp[j-1]
792
stx %o4,[$tp] ! tp[num-1]
801
add $tp,8,$tp ! adjust tp to point at the end
803
sub %g0,$num,%o7 ! n=-num
805
subcc %g0,%g0,%g0 ! clear %icc.c
822
sub %g0,$num,%o7 ! n=-num
843
sub %g0,$num,%o7 ! n=-num
854
ldx [%sp+$bias+$frame+48],%o7
855
wr %g0,%o7,%asi ! restore %asi
861
.type $fname,#function
862
.size $fname,(.-$fname)
863
.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
867
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
869
# Below substitution makes it possible to compile without demanding
870
# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
871
# dare to do this, because VIS capability is detected at run-time now
872
# and this routine is not called on a CPU not capable of executing it. Do
873
# note that fzeros is not the only VIS dependency! Another dependency
874
# is implicit and is just _a_ numerical value loaded to %asi register,
875
# which assembler can't recognize as VIS specific...
876
$code =~ s/fzeros\s+%f([0-9]+)/
877
sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)