1
.ident "sparcv8.s, Version 1.4"
2
.ident "SPARC v8 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5
* ====================================================================
6
* Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9
* Rights for redistribution and usage in source and binary forms are
10
* granted according to the OpenSSL license. Warranty of any kind is
12
* ====================================================================
16
* This is my modest contribution to the OpenSSL project (see
17
* http://www.openssl.org/ for more information about it) and is
18
* a drop-in SuperSPARC ISA replacement for crypto/bn/bn_asm.c
19
* module. For updates see http://fy.chalmers.se/~appro/hpe/.
21
* See bn_asm.sparc.v8plus.S for more details.
27
* 1.1 - new loop unrolling model(*);
28
* 1.2 - made gas friendly;
29
* 1.3 - fixed problem with /usr/ccs/lib/cpp;
32
* (*) see bn_asm.sparc.v8plus.S for details
35
.section ".text",#alloc,#execinstr
36
.file "bn_asm.sparc.v8.S"
40
/*
 * bn_mul_add_words -- exported entry point; the prototype is given in the
 * comment line below.
 *
 * Only the directives, labels and branches are visible in this excerpt.
 * The instructions that set the condition codes, occupy the branch delay
 * slots and perform the arithmetic are not shown here.
 *
 * Label roles, as far as the visible branches show:
 *   _proceed : target of the entry guard (bg,a)
 *   _loop    : main loop; its back edge is the first bnz,a
 *   _tail    : entered by bz from _proceed, or by bnz,a after the loop;
 *              both bz instructions inside it leave to _return
 *   _return  : common exit label
 * Presumably _loop is the unrolled body and _tail handles leftover
 * words -- confirm against the full source.
 */
.global bn_mul_add_words
42
* BN_ULONG bn_mul_add_words(rp,ap,num,w)
49
/* Entry guard. With ",a" (annul) the delay-slot instruction executes only when the branch is taken. */
bg,a .L_bn_mul_add_words_proceed
54
.L_bn_mul_add_words_proceed:
56
/* Bypass the main loop and go straight to the tail. */
bz .L_bn_mul_add_words_tail
59
.L_bn_mul_add_words_loop:
102
/* Back edge of the main loop. */
bnz,a .L_bn_mul_add_words_loop
106
/* After the loop: enter the tail when Z is clear, otherwise continue to _return, the next label. */
bnz,a .L_bn_mul_add_words_tail
108
.L_bn_mul_add_words_return:
113
.L_bn_mul_add_words_tail:
122
/* Two exits from the tail back to the common return label. */
bz .L_bn_mul_add_words_return
134
bz .L_bn_mul_add_words_return
148
.type bn_mul_add_words,#function
149
.size bn_mul_add_words,(.-bn_mul_add_words)
155
* BN_ULONG bn_mul_words(rp,ap,num,w)
162
/*
 * bn_mul_words -- prototype as given in the comment line above.
 *
 * Only labels and branches are visible in this excerpt; the instructions
 * that set the condition codes, fill the delay slots and do the
 * arithmetic are not shown here.
 *
 * Label roles, as far as the visible branches show:
 *   _proceed : target of the entry guard (bg,a)
 *   _loop    : main loop; its back edge is the first bnz,a
 *   _tail    : entered by bz from _proceed, or by bnz,a after the loop;
 *              both bz instructions inside it leave to _return
 *   _return  : common exit label
 *
 * The local label is spelled "_proceed", the same spelling used by
 * bn_mul_add_words, bn_add_words and bn_sub_words.
 */
/* Entry guard. With ",a" (annul) the delay-slot instruction executes only when the branch is taken. */
bg,a .L_bn_mul_words_proceed
167
.L_bn_mul_words_proceed:
169
/* Bypass the main loop and go straight to the tail. */
bz .L_bn_mul_words_tail
172
.L_bn_mul_words_loop:
204
/* Back edge of the main loop. */
bnz,a .L_bn_mul_words_loop
208
/* After the loop: enter the tail when Z is clear, otherwise continue to _return, the next label. */
bnz,a .L_bn_mul_words_tail
210
.L_bn_mul_words_return:
215
.L_bn_mul_words_tail:
221
/* Two exits from the tail back to the common return label. */
bz .L_bn_mul_words_return
231
bz .L_bn_mul_words_return
242
.type bn_mul_words,#function
243
.size bn_mul_words,(.-bn_mul_words)
248
* void bn_sqr_words(r,a,n)
254
/*
 * bn_sqr_words -- prototype as given in the comment line above.
 *
 * Only labels and branches are visible in this excerpt; the instructions
 * that set the condition codes, fill the delay slots and do the
 * arithmetic are not shown here.
 *
 * Label roles, as far as the visible branches show:
 *   _proceed : target of the entry guard (bg,a)
 *   _loop    : main loop; its back edge is the first bnz,a
 *   _tail    : entered by bz from _proceed, or by bnz,a after the loop;
 *              both bz instructions inside it leave to _return
 *   _return  : common exit label
 *
 * The local label is spelled "_proceed", the same spelling used by
 * bn_mul_add_words, bn_add_words and bn_sub_words.
 */
/* Entry guard. With ",a" (annul) the delay-slot instruction executes only when the branch is taken. */
bg,a .L_bn_sqr_words_proceed
259
.L_bn_sqr_words_proceed:
261
/* Bypass the main loop and go straight to the tail. */
bz .L_bn_sqr_words_tail
264
.L_bn_sqr_words_loop:
292
/* Back edge of the main loop. */
bnz,a .L_bn_sqr_words_loop
297
/* After the loop: enter the tail when Z is clear, otherwise continue to _return, the next label. */
bnz,a .L_bn_sqr_words_tail
299
.L_bn_sqr_words_return:
303
.L_bn_sqr_words_tail:
308
/* Two exits from the tail back to the common return label. */
bz .L_bn_sqr_words_return
317
bz .L_bn_sqr_words_return
328
.type bn_sqr_words,#function
329
.size bn_sqr_words,(.-bn_sqr_words)
335
* BN_ULONG bn_div_words(h,l,d)
344
/*
 * Symbol bookkeeping for bn_div_words: mark it as a function and record
 * its size as the distance from its entry label to this point. The body
 * of the routine is not visible in this excerpt.
 */
.type bn_div_words,#function
345
.size bn_div_words,(.-bn_div_words)
351
* BN_ULONG bn_add_words(rp,ap,bp,n)
352
* BN_ULONG *rp,*ap,*bp;
357
/*
 * bn_add_words -- prototype as given in the comment lines above.
 *
 * Only labels, branches and one delay-slot instruction are visible in
 * this excerpt; the loads, adds and stores are not shown here.
 *
 * Label roles, as far as the visible branches show:
 *   _proceed   : target of the entry guard (bg,a)
 *   _loop      : top of the main loop, target of the back edge (bnz,a)
 *   _warm_loop : first-pass entry into the loop, placed after _loop so
 *                that whatever sits between the two labels is skipped on
 *                the first pass (that code is not visible here)
 *   _tail      : entered by bz from _proceed, or by bnz,a after the loop;
 *                both bz instructions inside it leave to _return
 *   _return    : common exit label
 *
 * The first-pass label is spelled "_warm_loop", matching the equivalent
 * label in bn_sub_words.
 */
/* Entry guard. With ",a" (annul) the delay-slot instruction executes only when the branch is taken. */
bg,a .L_bn_add_words_proceed
362
.L_bn_add_words_proceed:
364
/* Bypass the main loop and go straight to the tail. */
bz .L_bn_add_words_tail
366
/* Unconditional jump to the first-pass entry; the addcc below is its delay slot. */
ba .L_bn_add_words_warm_loop
367
addcc %g0,0,%g0 ! clear carry flag
369
.L_bn_add_words_loop:
371
.L_bn_add_words_warm_loop:
396
/* Back edge of the main loop. */
bnz,a .L_bn_add_words_loop
400
/* After the loop: enter the tail when Z is clear, otherwise continue to _return, the next label. */
bnz,a .L_bn_add_words_tail
402
.L_bn_add_words_return:
406
.L_bn_add_words_tail:
412
/* Two exits from the tail back to the common return label. */
bz .L_bn_add_words_return
421
bz .L_bn_add_words_return
432
.type bn_add_words,#function
433
.size bn_add_words,(.-bn_add_words)
439
* BN_ULONG bn_sub_words(rp,ap,bp,n)
440
* BN_ULONG *rp,*ap,*bp;
445
/*
 * bn_sub_words -- prototype as given in the comment lines above.
 *
 * Only labels, branches and one delay-slot instruction are visible in
 * this excerpt; the loads, subtracts and stores are not shown here.
 *
 * Label roles, as far as the visible branches show:
 *   _proceed   : target of the entry guard (bg,a)
 *   _loop      : top of the main loop, target of the back edge (bnz,a)
 *   _warm_loop : first-pass entry into the loop, placed after _loop so
 *                that whatever sits between the two labels is skipped on
 *                the first pass (that code is not visible here)
 *   _tail      : entered by bz from _proceed, or by bnz,a after the loop;
 *                both bz instructions inside it leave to _return
 *   _return    : common exit label
 */
/* Entry guard. With ",a" (annul) the delay-slot instruction executes only when the branch is taken. */
bg,a .L_bn_sub_words_proceed
450
.L_bn_sub_words_proceed:
452
/* Bypass the main loop and go straight to the tail. */
bz .L_bn_sub_words_tail
454
/* Unconditional jump to the first-pass entry; the addcc below is its delay slot. */
ba .L_bn_sub_words_warm_loop
455
addcc %g0,0,%g0 ! clear carry flag
457
.L_bn_sub_words_loop:
459
.L_bn_sub_words_warm_loop:
484
/* Back edge of the main loop. */
bnz,a .L_bn_sub_words_loop
489
/* After the loop: enter the tail when Z is clear, otherwise continue to _return, the next label. */
bnz,a .L_bn_sub_words_tail
491
.L_bn_sub_words_return:
495
.L_bn_sub_words_tail:
501
/* Two exits from the tail back to the common return label. */
bz .L_bn_sub_words_return
511
bz .L_bn_sub_words_return
522
.type bn_sub_words,#function
523
.size bn_sub_words,(.-bn_sub_words)
525
/*
 * Immediate used by the `save %sp,FRAME_SIZE,%sp` prologue of the comba
 * routines below. It is negative because `save` adds it to %sp, so each
 * routine gets a 96-byte frame (presumably the minimum SPARC V8 ABI
 * frame -- confirm).
 */
#define FRAME_SIZE -96
528
* Here is register usage map for *all* routines below.
536
/*
 * Word-indexed memory operands: element I (4 bytes each) of the array
 * whose base address is in %i1 (ap), %i2 (bp) or %i0 (rp). I is not
 * parenthesised in the expansion, so pass a plain literal index.
 */
#define ap(I) [%i1+4*I]
537
#define bp(I) [%i2+4*I]
538
#define rp(I) [%i0+4*I]
559
/*
 * bn_mul_comba8 -- 8x8-word multiplication: r[0..15] = a[0..7] * b[0..7],
 * as spelled out by the C-equivalent trailers (mul_add_c / r[k]=...) on
 * the lines below.
 *
 * This excerpt shows the symbol directives, the `save`, every umul, every
 * result store and a subset of the carry-propagating adds. The remaining
 * instructions (for example whatever loads the operands and produces t_2)
 * are not visible here.
 *
 * Reading guide:
 *  - rp(k) expands to [%i0+4*k], so each `st c_N,rp(k)` writes r[k].
 *  - The umuls that precede `st ...,rp(k)` are the products a[i]*b[j]
 *    with i+j == k. c_1/c_2/c_3 rotate roles as the three-word column
 *    accumulator, as the (cX,cY,cZ) order in each trailer shows.
 *  - umul leaves the low product word in its destination (t_1 here);
 *    t_2 is folded into the next accumulator word with addxcc.
 *  - a_N, b_N, c_N, t_1 and t_2 are defined outside this excerpt
 *    (presumably register aliases from the register usage map -- confirm).
 *  - A bare "!=" trailer carries no C equivalent; presumably an
 *    instruction-grouping marker -- confirm against bn_asm.sparc.v8plus.S.
 */
.global bn_mul_comba8
561
* void bn_mul_comba8(r,a,b)
565
/* Open a new register window and stack frame; r, a and b are then addressed through %i0, %i1 and %i2. */
save %sp,FRAME_SIZE,%sp
568
umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
571
st c_1,rp(0) !r[0]=c1;
573
umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
577
addxcc %g0,t_2,c_3 !=
580
umul a_1,b_0,t_1 !mul_add_c(a[1],b[0],c2,c3,c1);
584
st c_2,rp(1) !r[1]=c2;
587
umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
590
addxcc c_1,t_2,c_1 !=
593
umul a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
599
umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
602
addxcc c_1,t_2,c_1 !=
604
st c_3,rp(2) !r[2]=c3;
606
umul a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
611
umul a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
617
umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
623
umul a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
628
st c_1,rp(3) !r[3]=c1;
630
umul a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
635
umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
640
umul a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
646
umul a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
652
umul a_0,b_4,t_1 !=!mul_add_c(a[0],b[4],c2,c3,c1);
657
st c_2,rp(4) !r[4]=c2;
659
umul a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
664
umul a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
669
umul a_2,b_3,t_1 !=!mul_add_c(a[2],b[3],c3,c1,c2);
674
umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
677
addxcc c_1,t_2,c_1 !=
680
umul a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
686
umul a_5,b_0,t_1 !mul_add_c(a[5],b[0],c3,c1,c2);
689
addxcc c_1,t_2,c_1 !=
691
st c_3,rp(5) !r[5]=c3;
693
umul a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
698
umul a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
703
umul a_4,b_2,t_1 !mul_add_c(a[4],b[2],c1,c2,c3);
706
addxcc c_2,t_2,c_2 !=
708
umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
713
umul a_2,b_4,t_1 !mul_add_c(a[2],b[4],c1,c2,c3);
719
umul a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
722
addxcc c_2,t_2,c_2 !=
725
umul a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
729
st c_1,rp(6) !r[6]=c1;
732
umul a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
735
addxcc c_3,t_2,c_3 !=
737
umul a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
742
umul a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
747
umul a_3,b_4,t_1 !=!mul_add_c(a[3],b[4],c2,c3,c1);
752
umul a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
755
addxcc c_3,t_2,c_3 !=
757
umul a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
763
umul a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
768
umul a_7,b_0,t_1 !mul_add_c(a[7],b[0],c2,c3,c1);
771
addxcc c_3,t_2,c_3 !=
773
st c_2,rp(7) !r[7]=c2;
775
umul a_7,b_1,t_1 !mul_add_c(a[7],b[1],c3,c1,c2);
780
umul a_6,b_2,t_1 !=!mul_add_c(a[6],b[2],c3,c1,c2);
785
umul a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
788
addxcc c_1,t_2,c_1 !=
790
umul a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
795
umul a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
800
umul a_2,b_6,t_1 !=!mul_add_c(a[2],b[6],c3,c1,c2);
805
umul a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
810
st c_3,rp(8) !r[8]=c3;
812
umul a_2,b_7,t_1 !mul_add_c(a[2],b[7],c1,c2,c3);
817
umul a_3,b_6,t_1 !=!mul_add_c(a[3],b[6],c1,c2,c3);
822
umul a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
825
addxcc c_2,t_2,c_2 !=
827
umul a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
832
umul a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
837
umul a_7,b_2,t_1 !=!mul_add_c(a[7],b[2],c1,c2,c3);
842
st c_1,rp(9) !r[9]=c1;
844
umul a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
849
umul a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
854
umul a_5,b_5,t_1 !=!mul_add_c(a[5],b[5],c2,c3,c1);
859
umul a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
862
addxcc c_3,t_2,c_3 !=
864
umul a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
869
st c_2,rp(10) !r[10]=c2;
871
umul a_4,b_7,t_1 !=!mul_add_c(a[4],b[7],c3,c1,c2);
876
umul a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
879
addxcc c_1,t_2,c_1 !=
881
umul a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
886
umul a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
890
st c_3,rp(11) !r[11]=c3;
893
umul a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
896
addxcc c_2,t_2,c_2 !=
898
umul a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
903
umul a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
907
st c_1,rp(12) !r[12]=c1;
910
umul a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
913
addxcc c_3,t_2,c_3 !=
915
umul a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
920
st c_2,rp(13) !r[13]=c2;
922
umul a_7,b_7,t_1 !=!mul_add_c(a[7],b[7],c3,c1,c2);
927
/* Last column: the two remaining accumulator words become r[14] and r[15]. */
st c_3,rp(14) !r[14]=c3;
928
st c_1,rp(15) !r[15]=c1;
933
.type bn_mul_comba8,#function
934
.size bn_mul_comba8,(.-bn_mul_comba8)
938
/*
 * bn_mul_comba4 -- 4x4-word multiplication: r[0..7] = a[0..3] * b[0..3],
 * as spelled out by the C-equivalent trailers (mul_add_c / r[k]=...) on
 * the lines below.
 *
 * This excerpt shows the symbol directives, the `save`, every umul, every
 * result store and a subset of the carry-propagating adds. The remaining
 * instructions (for example whatever loads the operands and produces t_2)
 * are not visible here.
 *
 * Reading guide:
 *  - rp(k) expands to [%i0+4*k], so each `st c_N,rp(k)` writes r[k].
 *  - The umuls that precede `st ...,rp(k)` are the products a[i]*b[j]
 *    with i+j == k. c_1/c_2/c_3 rotate roles as the three-word column
 *    accumulator, as the (cX,cY,cZ) order in each trailer shows.
 *  - umul leaves the low product word in t_1, which addcc adds into the
 *    current accumulator word; t_2 is folded into the next word with
 *    addxcc.
 *  - a_N, b_N, c_N, t_1 and t_2 are defined outside this excerpt
 *    (presumably register aliases from the register usage map -- confirm).
 */
.global bn_mul_comba4
940
* void bn_mul_comba4(r,a,b)
944
/* Open a new register window and stack frame; r, a and b are then addressed through %i0, %i1 and %i2. */
save %sp,FRAME_SIZE,%sp
947
umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
950
st c_1,rp(0) !r[0]=c1;
952
umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
959
umul a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
964
st c_2,rp(1) !r[1]=c2;
966
umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
972
umul a_1,b_1,t_1 !=!mul_add_c(a[1],b[1],c3,c1,c2);
978
umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
983
st c_3,rp(2) !r[2]=c3;
985
umul a_0,b_3,t_1 !=!mul_add_c(a[0],b[3],c1,c2,c3);
990
umul a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
993
addxcc c_2,t_2,c_2 !=
996
umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
1001
umul a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);
1006
st c_1,rp(3) !r[3]=c1;
1008
umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
1013
umul a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
1014
addcc c_2,t_1,c_2 !=
1018
umul a_1,b_3,t_1 !=!mul_add_c(a[1],b[3],c2,c3,c1);
1023
st c_2,rp(4) !r[4]=c2;
1025
umul a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
1030
umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
1031
addcc c_3,t_1,c_3 !=
1034
st c_3,rp(5) !r[5]=c3;
1037
umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
1040
addxcc c_2,t_2,c_2 !=
1041
/* Last column: the two remaining accumulator words become r[6] and r[7]. */
st c_1,rp(6) !r[6]=c1;
1042
st c_2,rp(7) !r[7]=c2;
1047
.type bn_mul_comba4,#function
1048
.size bn_mul_comba4,(.-bn_mul_comba4)
1052
/*
 * bn_sqr_comba8 -- 8-word squaring: r[0..15] = a[0..7]^2, as spelled out
 * by the C-equivalent trailers (sqr_add_c / sqr_add_c2 / r[k]=...) on the
 * lines below. No prototype comment is visible in this excerpt;
 * presumably void bn_sqr_comba8(r,a) by analogy with bn_sqr_comba4 --
 * confirm.
 *
 * This excerpt shows the symbol directives, the `save`, every umul, every
 * result store and a subset of the carry-propagating adds. The remaining
 * instructions (for example whatever loads the operands and produces t_2)
 * are not visible here.
 *
 * Reading guide:
 *  - rp(k) expands to [%i0+4*k], so each `st c_N,rp(k)` writes r[k].
 *  - sqr_add_c(a,i,...) marks a diagonal term a[i]*a[i];
 *    sqr_add_c2(a,i,j,...) marks a cross term a[i]*a[j], which occurs
 *    twice in the square. Where both adds are visible, t_1 is
 *    accumulated with two identical addcc instructions.
 *  - The umuls that precede `st ...,rp(k)` have i+j == k (2*i == k for
 *    diagonal terms). c_1/c_2/c_3 rotate roles as the three-word column
 *    accumulator, as the (cX,cY,cZ) order in each trailer shows.
 *  - a_N, c_N, t_1 and t_2 are defined outside this excerpt (presumably
 *    register aliases from the register usage map -- confirm).
 */
.global bn_sqr_comba8
1054
/* Open a new register window and stack frame; the result array is then addressed through %i0 (see rp). */
save %sp,FRAME_SIZE,%sp
1057
umul a_0,a_0,c_1 !=!sqr_add_c(a,0,c1,c2,c3);
1059
st c_1,rp(0) !r[0]=c1;
1062
umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1069
st c_2,rp(1) !r[1]=c2;
1072
umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1075
addxcc c_1,t_2,c_1 !=
1081
umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1086
st c_3,rp(2) !r[2]=c3;
1088
umul a_0,a_3,t_1 !=!sqr_add_c2(a,3,0,c1,c2,c3);
1097
umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1100
addxcc c_2,t_2,c_2 !=
1105
st c_1,rp(3) !r[3]=c1;
1107
umul a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
1113
addxcc c_3,t_2,c_3 !=
1115
umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1121
addxcc c_3,t_2,c_3 !=
1124
umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1125
addcc c_2,t_1,c_2 !=
1128
st c_2,rp(4) !r[4]=c2;
1131
umul a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
1134
addxcc c_1,t_2,c_1 !=
1139
umul a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
1142
addxcc c_1,t_2,c_1 !=
1148
umul a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1154
addxcc c_1,t_2,c_1 !=
1156
st c_3,rp(5) !r[5]=c3;
1158
umul a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
1159
/* Cross term: t_1 is added here and again by the identical addcc below. */
addcc c_1,t_1,c_1 !=
1163
addcc c_1,t_1,c_1 !=
1166
umul a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
1167
addcc c_1,t_1,c_1 !=
1171
addcc c_1,t_1,c_1 !=
1174
umul a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
1175
addcc c_1,t_1,c_1 !=
1179
addcc c_1,t_1,c_1 !=
1183
umul a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
1188
st c_1,rp(6) !r[6]=c1;
1190
umul a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
1196
addxcc c_3,t_2,c_3 !=
1198
umul a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
1204
addxcc c_3,t_2,c_3 !=
1206
umul a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
1212
addxcc c_3,t_2,c_3 !=
1214
umul a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
1220
addxcc c_3,t_2,c_3 !=
1222
st c_2,rp(7) !r[7]=c2;
1224
umul a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
1225
addcc c_3,t_1,c_3 !=
1229
addcc c_3,t_1,c_3 !=
1232
umul a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
1233
addcc c_3,t_1,c_3 !=
1237
addcc c_3,t_1,c_3 !=
1240
umul a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
1241
addcc c_3,t_1,c_3 !=
1245
addcc c_3,t_1,c_3 !=
1248
umul a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
1249
addcc c_3,t_1,c_3 !=
1252
st c_3,rp(8) !r[8]=c3;
1255
umul a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
1258
addxcc c_2,t_2,c_2 !=
1263
umul a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
1266
addxcc c_2,t_2,c_2 !=
1271
umul a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
1274
addxcc c_2,t_2,c_2 !=
1279
st c_1,rp(9) !r[9]=c1;
1281
umul a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
1287
addxcc c_3,t_2,c_3 !=
1289
umul a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
1295
addxcc c_3,t_2,c_3 !=
1297
umul a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
1302
st c_2,rp(10) !r[10]=c2;
1304
umul a_4,a_7,t_1 !=!sqr_add_c2(a,7,4,c3,c1,c2);
1312
umul a_5,a_6,t_1 !=!sqr_add_c2(a,6,5,c3,c1,c2);
1319
st c_3,rp(11) !r[11]=c3;
1322
umul a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
1325
addxcc c_2,t_2,c_2 !=
1330
umul a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
1333
addxcc c_2,t_2,c_2 !=
1335
st c_1,rp(12) !r[12]=c1;
1337
umul a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
1338
addcc c_2,t_1,c_2 !=
1342
addcc c_2,t_1,c_2 !=
1344
st c_2,rp(13) !r[13]=c2;
1347
umul a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
1350
addxcc c_1,t_2,c_1 !=
1351
/* Last column: the two remaining accumulator words become r[14] and r[15]. */
st c_3,rp(14) !r[14]=c3;
1352
st c_1,rp(15) !r[15]=c1;
1357
.type bn_sqr_comba8,#function
1358
.size bn_sqr_comba8,(.-bn_sqr_comba8)
1362
/*
 * bn_sqr_comba4 -- 4-word squaring: r[0..7] = a[0..3]^2, as spelled out
 * by the C-equivalent trailers (sqr_add_c / sqr_add_c2 / r[k]=...) on the
 * lines below.
 *
 * This excerpt shows the symbol directives, the `save`, every umul, every
 * result store and a subset of the carry-propagating adds. The remaining
 * instructions (for example whatever loads the operands and produces t_2)
 * are not visible here.
 *
 * Reading guide:
 *  - rp(k) expands to [%i0+4*k], so each `st c_N,rp(k)` writes r[k].
 *  - sqr_add_c(a,i,...) marks a diagonal term a[i]*a[i];
 *    sqr_add_c2(a,i,j,...) marks a cross term a[i]*a[j], which occurs
 *    twice in the square.
 *  - The umuls that precede `st ...,rp(k)` have i+j == k (2*i == k for
 *    diagonal terms). c_1/c_2/c_3 rotate roles as the three-word column
 *    accumulator, as the (cX,cY,cZ) order in each trailer shows.
 *  - a_N, c_N, t_1 and t_2 are defined outside this excerpt (presumably
 *    register aliases from the register usage map -- confirm).
 */
.global bn_sqr_comba4
1364
* void bn_sqr_comba4(r,a)
1368
/* Open a new register window and stack frame; r is then addressed through %i0 (see rp). */
save %sp,FRAME_SIZE,%sp
1370
umul a_0,a_0,c_1 !sqr_add_c(a,0,c1,c2,c3);
1373
st c_1,rp(0) !r[0]=c1;
1376
umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1384
st c_2,rp(1) !r[1]=c2;
1386
umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1392
addxcc c_1,t_2,c_1 !=
1395
umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1396
addcc c_3,t_1,c_3 !=
1399
st c_3,rp(2) !r[2]=c3;
1402
umul a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1405
addxcc c_2,t_2,c_2 !=
1410
umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1413
addxcc c_2,t_2,c_2 !=
1418
st c_1,rp(3) !r[3]=c1;
1420
umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1426
addxcc c_3,t_2,c_3 !=
1428
umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1433
st c_2,rp(4) !r[4]=c2;
1435
umul a_2,a_3,t_1 !=!sqr_add_c2(a,3,2,c3,c1,c2);
1442
st c_3,rp(5) !r[5]=c3;
1445
umul a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
1448
addxcc c_2,t_2,c_2 !=
1449
/* Last column: the two remaining accumulator words become r[6] and r[7]. */
st c_1,rp(6) !r[6]=c1;
1450
st c_2,rp(7) !r[7]=c2;
1455
.type bn_sqr_comba4,#function
1456
.size bn_sqr_comba4,(.-bn_sqr_comba4)