1
/* crypto/bn/bn_asm.c */
2
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
5
* This package is an SSL implementation written
6
* by Eric Young (eay@cryptsoft.com).
7
* The implementation was written so as to conform with Netscapes SSL.
9
* This library is free for commercial and non-commercial use as long as
10
* the following conditions are aheared to. The following conditions
11
* apply to all code found in this distribution, be it the RC4, RSA,
12
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
13
* included with this distribution is covered by the same copyright terms
14
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
16
* Copyright remains Eric Young's, and as such any Copyright notices in
17
* the code are not to be removed.
18
* If this package is used in a product, Eric Young should be given attribution
19
* as the author of the parts of the library used.
20
* This can be in the form of a textual message at program startup or
21
* in documentation (online or textual) provided with the package.
23
* Redistribution and use in source and binary forms, with or without
24
* modification, are permitted provided that the following conditions
26
* 1. Redistributions of source code must retain the copyright
27
* notice, this list of conditions and the following disclaimer.
28
* 2. Redistributions in binary form must reproduce the above copyright
29
* notice, this list of conditions and the following disclaimer in the
30
* documentation and/or other materials provided with the distribution.
31
* 3. All advertising materials mentioning features or use of this software
32
* must display the following acknowledgement:
33
* "This product includes cryptographic software written by
34
* Eric Young (eay@cryptsoft.com)"
35
* The word 'cryptographic' can be left out if the rouines from the library
36
* being used are not cryptographic related :-).
37
* 4. If you include any Windows specific code (or a derivative thereof) from
38
* the apps directory (application code) you must include an acknowledgement:
39
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
41
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53
* The licence and distribution terms for any publically available version or
54
* derivative of this code cannot be changed. i.e. this code cannot simply be
55
* copied and put under another distribution licence
56
* [including the GNU Public Licence.]
60
# undef NDEBUG /* avoid conflicting definitions */
69
#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
71
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
76
if (num <= 0) return(c1);
80
mul_add(rp[0],ap[0],w,c1);
81
mul_add(rp[1],ap[1],w,c1);
82
mul_add(rp[2],ap[2],w,c1);
83
mul_add(rp[3],ap[3],w,c1);
88
mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
89
mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
90
mul_add(rp[2],ap[2],w,c1); return c1;
96
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
101
if (num <= 0) return(c1);
105
mul(rp[0],ap[0],w,c1);
106
mul(rp[1],ap[1],w,c1);
107
mul(rp[2],ap[2],w,c1);
108
mul(rp[3],ap[3],w,c1);
109
ap+=4; rp+=4; num-=4;
113
mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
114
mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
115
mul(rp[2],ap[2],w,c1);
120
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
134
sqr(r[0],r[1],a[0]); if (--n == 0) return;
135
sqr(r[2],r[3],a[1]); if (--n == 0) return;
140
#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
142
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
148
if (num <= 0) return((BN_ULONG)0);
155
mul_add(rp[0],ap[0],bl,bh,c);
156
if (--num == 0) break;
157
mul_add(rp[1],ap[1],bl,bh,c);
158
if (--num == 0) break;
159
mul_add(rp[2],ap[2],bl,bh,c);
160
if (--num == 0) break;
161
mul_add(rp[3],ap[3],bl,bh,c);
162
if (--num == 0) break;
169
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
175
if (num <= 0) return((BN_ULONG)0);
182
mul(rp[0],ap[0],bl,bh,carry);
183
if (--num == 0) break;
184
mul(rp[1],ap[1],bl,bh,carry);
185
if (--num == 0) break;
186
mul(rp[2],ap[2],bl,bh,carry);
187
if (--num == 0) break;
188
mul(rp[3],ap[3],bl,bh,carry);
189
if (--num == 0) break;
196
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
202
sqr64(r[0],r[1],a[0]);
205
sqr64(r[2],r[3],a[1]);
208
sqr64(r[4],r[5],a[2]);
211
sqr64(r[6],r[7],a[3]);
219
#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
221
#if defined(BN_LLONG) && defined(BN_DIV2W)
223
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
225
return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d));
230
/* Divide h,l by d and return the result. */
231
/* I need to test this some more :-( */
232
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
234
BN_ULONG dh,dl,q,ret=0,th,tl,t;
237
if (d == 0) return(BN_MASK2);
239
i=BN_num_bits_word(d);
240
assert((i == BN_BITS2) || (h > (BN_ULONG)1<<i));
248
h=(h<<i)|(l>>(BN_BITS2-i));
251
dh=(d&BN_MASK2h)>>BN_BITS4;
255
if ((h>>BN_BITS4) == dh)
268
((l&BN_MASK2h)>>BN_BITS4))))
275
tl=(tl<<BN_BITS4)&BN_MASK2h;
287
if (--count == 0) break;
290
h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
291
l=(l&BN_MASK2l)<<BN_BITS4;
296
#endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
299
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
304
if (n <= 0) return((BN_ULONG)0);
308
ll+=(BN_ULLONG)a[0]+b[0];
309
r[0]=(BN_ULONG)ll&BN_MASK2;
313
ll+=(BN_ULLONG)a[1]+b[1];
314
r[1]=(BN_ULONG)ll&BN_MASK2;
318
ll+=(BN_ULLONG)a[2]+b[2];
319
r[2]=(BN_ULONG)ll&BN_MASK2;
323
ll+=(BN_ULLONG)a[3]+b[3];
324
r[3]=(BN_ULONG)ll&BN_MASK2;
332
return((BN_ULONG)ll);
334
#else /* !BN_LLONG */
335
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
340
if (n <= 0) return((BN_ULONG)0);
383
#endif /* !BN_LLONG */
385
BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
391
if (n <= 0) return((BN_ULONG)0);
396
r[0]=(t1-t2-c)&BN_MASK2;
397
if (t1 != t2) c=(t1 < t2);
401
r[1]=(t1-t2-c)&BN_MASK2;
402
if (t1 != t2) c=(t1 < t2);
406
r[2]=(t1-t2-c)&BN_MASK2;
407
if (t1 != t2) c=(t1 < t2);
411
r[3]=(t1-t2-c)&BN_MASK2;
412
if (t1 != t2) c=(t1 < t2);
429
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
430
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
431
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
432
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
435
#define mul_add_c(a,b,c0,c1,c2) \
437
t1=(BN_ULONG)Lw(t); \
438
t2=(BN_ULONG)Hw(t); \
439
c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
440
c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
442
#define mul_add_c2(a,b,c0,c1,c2) \
446
t1=(BN_ULONG)Lw(tt); \
447
t2=(BN_ULONG)Hw(tt); \
448
c0=(c0+t1)&BN_MASK2; \
449
if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
450
c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
452
#define sqr_add_c(a,i,c0,c1,c2) \
453
t=(BN_ULLONG)a[i]*a[i]; \
454
t1=(BN_ULONG)Lw(t); \
455
t2=(BN_ULONG)Hw(t); \
456
c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
457
c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
459
#define sqr_add_c2(a,i,j,c0,c1,c2) \
460
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
462
#elif defined(BN_UMULT_HIGH)
464
#define mul_add_c(a,b,c0,c1,c2) { \
465
BN_ULONG ta=(a),tb=(b); \
467
t2 = BN_UMULT_HIGH(ta,tb); \
468
c0 += t1; t2 += (c0<t1)?1:0; \
469
c1 += t2; c2 += (c1<t2)?1:0; \
472
#define mul_add_c2(a,b,c0,c1,c2) { \
473
BN_ULONG ta=(a),tb=(b),t0; \
474
t1 = BN_UMULT_HIGH(ta,tb); \
476
t2 = t1+t1; c2 += (t2<t1)?1:0; \
477
t1 = t0+t0; t2 += (t1<t0)?1:0; \
478
c0 += t1; t2 += (c0<t1)?1:0; \
479
c1 += t2; c2 += (c1<t2)?1:0; \
482
#define sqr_add_c(a,i,c0,c1,c2) { \
483
BN_ULONG ta=(a)[i]; \
485
t2 = BN_UMULT_HIGH(ta,ta); \
486
c0 += t1; t2 += (c0<t1)?1:0; \
487
c1 += t2; c2 += (c1<t2)?1:0; \
490
#define sqr_add_c2(a,i,j,c0,c1,c2) \
491
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
493
#else /* !BN_LLONG && !BN_UMULT_HIGH */
494
#define mul_add_c(a,b,c0,c1,c2) \
495
t1=LBITS(a); t2=HBITS(a); \
496
bl=LBITS(b); bh=HBITS(b); \
497
mul64(t1,t2,bl,bh); \
498
c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
499
c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
501
#define mul_add_c2(a,b,c0,c1,c2) \
502
t1=LBITS(a); t2=HBITS(a); \
503
bl=LBITS(b); bh=HBITS(b); \
504
mul64(t1,t2,bl,bh); \
505
if (t2 & BN_TBIT) c2++; \
506
t2=(t2+t2)&BN_MASK2; \
507
if (t1 & BN_TBIT) t2++; \
508
t1=(t1+t1)&BN_MASK2; \
509
c0=(c0+t1)&BN_MASK2; \
510
if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
511
c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
513
#define sqr_add_c(a,i,c0,c1,c2) \
514
sqr64(t1,t2,(a)[i]); \
515
c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
516
c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
518
#define sqr_add_c2(a,i,j,c0,c1,c2) \
519
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
520
#endif /* !BN_LLONG */
522
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
535
mul_add_c(a[0],b[0],c1,c2,c3);
538
mul_add_c(a[0],b[1],c2,c3,c1);
539
mul_add_c(a[1],b[0],c2,c3,c1);
542
mul_add_c(a[2],b[0],c3,c1,c2);
543
mul_add_c(a[1],b[1],c3,c1,c2);
544
mul_add_c(a[0],b[2],c3,c1,c2);
547
mul_add_c(a[0],b[3],c1,c2,c3);
548
mul_add_c(a[1],b[2],c1,c2,c3);
549
mul_add_c(a[2],b[1],c1,c2,c3);
550
mul_add_c(a[3],b[0],c1,c2,c3);
553
mul_add_c(a[4],b[0],c2,c3,c1);
554
mul_add_c(a[3],b[1],c2,c3,c1);
555
mul_add_c(a[2],b[2],c2,c3,c1);
556
mul_add_c(a[1],b[3],c2,c3,c1);
557
mul_add_c(a[0],b[4],c2,c3,c1);
560
mul_add_c(a[0],b[5],c3,c1,c2);
561
mul_add_c(a[1],b[4],c3,c1,c2);
562
mul_add_c(a[2],b[3],c3,c1,c2);
563
mul_add_c(a[3],b[2],c3,c1,c2);
564
mul_add_c(a[4],b[1],c3,c1,c2);
565
mul_add_c(a[5],b[0],c3,c1,c2);
568
mul_add_c(a[6],b[0],c1,c2,c3);
569
mul_add_c(a[5],b[1],c1,c2,c3);
570
mul_add_c(a[4],b[2],c1,c2,c3);
571
mul_add_c(a[3],b[3],c1,c2,c3);
572
mul_add_c(a[2],b[4],c1,c2,c3);
573
mul_add_c(a[1],b[5],c1,c2,c3);
574
mul_add_c(a[0],b[6],c1,c2,c3);
577
mul_add_c(a[0],b[7],c2,c3,c1);
578
mul_add_c(a[1],b[6],c2,c3,c1);
579
mul_add_c(a[2],b[5],c2,c3,c1);
580
mul_add_c(a[3],b[4],c2,c3,c1);
581
mul_add_c(a[4],b[3],c2,c3,c1);
582
mul_add_c(a[5],b[2],c2,c3,c1);
583
mul_add_c(a[6],b[1],c2,c3,c1);
584
mul_add_c(a[7],b[0],c2,c3,c1);
587
mul_add_c(a[7],b[1],c3,c1,c2);
588
mul_add_c(a[6],b[2],c3,c1,c2);
589
mul_add_c(a[5],b[3],c3,c1,c2);
590
mul_add_c(a[4],b[4],c3,c1,c2);
591
mul_add_c(a[3],b[5],c3,c1,c2);
592
mul_add_c(a[2],b[6],c3,c1,c2);
593
mul_add_c(a[1],b[7],c3,c1,c2);
596
mul_add_c(a[2],b[7],c1,c2,c3);
597
mul_add_c(a[3],b[6],c1,c2,c3);
598
mul_add_c(a[4],b[5],c1,c2,c3);
599
mul_add_c(a[5],b[4],c1,c2,c3);
600
mul_add_c(a[6],b[3],c1,c2,c3);
601
mul_add_c(a[7],b[2],c1,c2,c3);
604
mul_add_c(a[7],b[3],c2,c3,c1);
605
mul_add_c(a[6],b[4],c2,c3,c1);
606
mul_add_c(a[5],b[5],c2,c3,c1);
607
mul_add_c(a[4],b[6],c2,c3,c1);
608
mul_add_c(a[3],b[7],c2,c3,c1);
611
mul_add_c(a[4],b[7],c3,c1,c2);
612
mul_add_c(a[5],b[6],c3,c1,c2);
613
mul_add_c(a[6],b[5],c3,c1,c2);
614
mul_add_c(a[7],b[4],c3,c1,c2);
617
mul_add_c(a[7],b[5],c1,c2,c3);
618
mul_add_c(a[6],b[6],c1,c2,c3);
619
mul_add_c(a[5],b[7],c1,c2,c3);
622
mul_add_c(a[6],b[7],c2,c3,c1);
623
mul_add_c(a[7],b[6],c2,c3,c1);
626
mul_add_c(a[7],b[7],c3,c1,c2);
631
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
644
mul_add_c(a[0],b[0],c1,c2,c3);
647
mul_add_c(a[0],b[1],c2,c3,c1);
648
mul_add_c(a[1],b[0],c2,c3,c1);
651
mul_add_c(a[2],b[0],c3,c1,c2);
652
mul_add_c(a[1],b[1],c3,c1,c2);
653
mul_add_c(a[0],b[2],c3,c1,c2);
656
mul_add_c(a[0],b[3],c1,c2,c3);
657
mul_add_c(a[1],b[2],c1,c2,c3);
658
mul_add_c(a[2],b[1],c1,c2,c3);
659
mul_add_c(a[3],b[0],c1,c2,c3);
662
mul_add_c(a[3],b[1],c2,c3,c1);
663
mul_add_c(a[2],b[2],c2,c3,c1);
664
mul_add_c(a[1],b[3],c2,c3,c1);
667
mul_add_c(a[2],b[3],c3,c1,c2);
668
mul_add_c(a[3],b[2],c3,c1,c2);
671
mul_add_c(a[3],b[3],c1,c2,c3);
676
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
689
sqr_add_c(a,0,c1,c2,c3);
692
sqr_add_c2(a,1,0,c2,c3,c1);
695
sqr_add_c(a,1,c3,c1,c2);
696
sqr_add_c2(a,2,0,c3,c1,c2);
699
sqr_add_c2(a,3,0,c1,c2,c3);
700
sqr_add_c2(a,2,1,c1,c2,c3);
703
sqr_add_c(a,2,c2,c3,c1);
704
sqr_add_c2(a,3,1,c2,c3,c1);
705
sqr_add_c2(a,4,0,c2,c3,c1);
708
sqr_add_c2(a,5,0,c3,c1,c2);
709
sqr_add_c2(a,4,1,c3,c1,c2);
710
sqr_add_c2(a,3,2,c3,c1,c2);
713
sqr_add_c(a,3,c1,c2,c3);
714
sqr_add_c2(a,4,2,c1,c2,c3);
715
sqr_add_c2(a,5,1,c1,c2,c3);
716
sqr_add_c2(a,6,0,c1,c2,c3);
719
sqr_add_c2(a,7,0,c2,c3,c1);
720
sqr_add_c2(a,6,1,c2,c3,c1);
721
sqr_add_c2(a,5,2,c2,c3,c1);
722
sqr_add_c2(a,4,3,c2,c3,c1);
725
sqr_add_c(a,4,c3,c1,c2);
726
sqr_add_c2(a,5,3,c3,c1,c2);
727
sqr_add_c2(a,6,2,c3,c1,c2);
728
sqr_add_c2(a,7,1,c3,c1,c2);
731
sqr_add_c2(a,7,2,c1,c2,c3);
732
sqr_add_c2(a,6,3,c1,c2,c3);
733
sqr_add_c2(a,5,4,c1,c2,c3);
736
sqr_add_c(a,5,c2,c3,c1);
737
sqr_add_c2(a,6,4,c2,c3,c1);
738
sqr_add_c2(a,7,3,c2,c3,c1);
741
sqr_add_c2(a,7,4,c3,c1,c2);
742
sqr_add_c2(a,6,5,c3,c1,c2);
745
sqr_add_c(a,6,c1,c2,c3);
746
sqr_add_c2(a,7,5,c1,c2,c3);
749
sqr_add_c2(a,7,6,c2,c3,c1);
752
sqr_add_c(a,7,c3,c1,c2);
757
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
770
sqr_add_c(a,0,c1,c2,c3);
773
sqr_add_c2(a,1,0,c2,c3,c1);
776
sqr_add_c(a,1,c3,c1,c2);
777
sqr_add_c2(a,2,0,c3,c1,c2);
780
sqr_add_c2(a,3,0,c1,c2,c3);
781
sqr_add_c2(a,2,1,c1,c2,c3);
784
sqr_add_c(a,2,c2,c3,c1);
785
sqr_add_c2(a,3,1,c2,c3,c1);
788
sqr_add_c2(a,3,2,c3,c1,c2);
791
sqr_add_c(a,3,c1,c2,c3);
795
#else /* !BN_MUL_COMBA */
797
/* hmm... is it faster just to do a multiply? */
799
void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
802
bn_sqr_normal(r,a,4,t);
806
void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
809
bn_sqr_normal(r,a,8,t);
812
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
814
r[4]=bn_mul_words( &(r[0]),a,4,b[0]);
815
r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]);
816
r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]);
817
r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]);
820
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
822
r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]);
823
r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
824
r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
825
r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
826
r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
827
r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
828
r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
829
r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
832
#endif /* !BN_MUL_COMBA */