// ********************************************************

	static int Add(word *C, const word *A, const word *B, size_t N);
	static int Subtract(word *C, const word *A, const word *B, size_t N);

	static inline void Multiply2(word *C, const word *A, const word *B);
	static inline word Multiply2Add(word *C, const word *A, const word *B);
	static void Multiply4(word *C, const word *A, const word *B);
	static void Multiply8(word *C, const word *A, const word *B);
	static inline unsigned int MultiplyRecursionLimit() {return 8;}

	static inline void Multiply2Bottom(word *C, const word *A, const word *B);
	static void Multiply4Bottom(word *C, const word *A, const word *B);
	static void Multiply8Bottom(word *C, const word *A, const word *B);
	static inline unsigned int MultiplyBottomRecursionLimit() {return 8;}

	static void Square2(word *R, const word *A);
	static void Square4(word *R, const word *A);
	static void Square8(word *R, const word *A) {assert(false);}
	static inline unsigned int SquareRecursionLimit() {return 4;}
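// The fixed-size kernels declared above serve as base cases for the recursive
// (Karatsuba-style) multiply routines later in this file; the *RecursionLimit()
// functions give the operand size, in words, at which recursion stops and a
// kernel is called directly.  Illustrative sketch only -- the helper name and
// the dispatch shape below are my assumptions, not the original code:
//
//	static void MultiplyBaseCaseSketch(word *R, const word *A, const word *B, size_t N)
//	{
//		assert(N <= Portable::MultiplyRecursionLimit());
//		switch (N)
//		{
//		case 2: Portable::Multiply2(R, A, B); break;
//		case 4: Portable::Multiply4(R, A, B); break;
//		case 8: Portable::Multiply8(R, A, B); break;
//		default: assert(false);		// larger sizes are handled by the recursive layer
//		}
//	}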
int Portable::Add(word *C, const word *A, const word *B, size_t N)
{
	// ...
	for (unsigned int i = 0; i < N; i+=2)
	{
		u = DWord(A[i]) + B[i] + u.GetHighHalf();
		C[i] = u.GetLowHalf();
		u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf();
		C[i+1] = u.GetLowHalf();
	}
	return int(u.GetHighHalf());
}
int Portable::Subtract(word *C, const word *A, const word *B, size_t N)
{
	// ...
	for (unsigned int i = 0; i < N; i+=2)
	{
		u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow();
		C[i] = u.GetLowHalf();
		u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow();
		C[i+1] = u.GetLowHalf();
	}
	return int(0-u.GetHighHalf());
}
void Portable::Multiply2(word *C, const word *A, const word *B)
{
	/* branch-based version this replaces (excerpt), one case per sign combination:
		d = (dword)(A1-A0)*(B0-B1);
		d = (dword)s*(word)(B0-B1);
		d = (word)(A1-A0)*(dword)s;
		d = (dword)(A0-A1)*(B1-B0);
	*/
	// this segment is the branchless equivalent of above
	word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
	unsigned int ai = A[1] < A[0];
	unsigned int bi = B[0] < B[1];
	unsigned int di = ai & bi;
	DWord d = DWord::Multiply(D[di], D[di+2]);
	D[1] = D[3] = 0;
	unsigned int si = ai + !bi;
	word s = D[si];

	DWord A0B0 = DWord::Multiply(A[0], B[0]);
	C[0] = A0B0.GetLowHalf();

	DWord A1B1 = DWord::Multiply(A[1], B[1]);
	DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf();
	C[1] = t.GetLowHalf();

	t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s;
	C[2] = t.GetLowHalf();
	C[3] = t.GetHighHalf();
}
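// Illustrative, self-contained check of the identity the branchless segment
// above relies on (a sketch, not part of the original source).  It uses
// 16-bit "words" so the whole 2x2-word product fits in uint64_t:
//   A*B = A1*B1*2^(2w) + (A0*B0 + A1*B1 + (A1-A0)*(B0-B1))*2^w + A0*B0
// The middle product may be negative; computing it with wrapped unsigned
// differences overshoots by s*2^w, and s is selected branchlessly exactly as
// in Multiply2 above.  The function and test names are mine.
#include <cstdint>

static void Multiply2Sketch(uint16_t *C, const uint16_t *A, const uint16_t *B)
{
	uint16_t D[4] = {uint16_t(A[1]-A[0]), uint16_t(A[0]-A[1]),
	                 uint16_t(B[0]-B[1]), uint16_t(B[1]-B[0])};
	unsigned int ai = A[1] < A[0];
	unsigned int bi = B[0] < B[1];
	unsigned int di = ai & bi;
	uint32_t d = uint32_t(D[di]) * D[di+2];	// |A1-A0| * |B0-B1| (wrapped when negative)
	D[1] = D[3] = 0;
	uint16_t s = D[ai + !bi];				// overshoot to subtract; 0 when the signs agree

	uint32_t A0B0 = uint32_t(A[0]) * B[0];
	C[0] = uint16_t(A0B0);
	uint32_t A1B1 = uint32_t(A[1]) * B[1];
	uint32_t t = (A0B0 >> 16) + uint16_t(A0B0) + uint16_t(d) + uint16_t(A1B1);
	C[1] = uint16_t(t);
	uint64_t u = uint64_t(A1B1) + (t >> 16) + (A0B0 >> 16) + (d >> 16) + (A1B1 >> 16) - s;
	C[2] = uint16_t(u);
	C[3] = uint16_t(u >> 16);
}

static bool Multiply2SketchTest()
{
	const uint16_t A[2] = {0x1234, 0xFFFF}, B[2] = {0xCAFE, 0x0007};
	uint16_t C[4];
	Multiply2Sketch(C, A, B);
	uint64_t a = (uint64_t(A[1]) << 16) | A[0], b = (uint64_t(B[1]) << 16) | B[0];
	uint64_t p = a * b;		// 32x32 -> 64-bit reference product
	return C[0] == uint16_t(p) && C[1] == uint16_t(p >> 16) &&
	       C[2] == uint16_t(p >> 32) && C[3] == uint16_t(p >> 48);
}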
inline void Portable::Multiply2Bottom(word *C, const word *A, const word *B)
{
	DWord t = DWord::Multiply(A[0], B[0]);
	C[0] = t.GetLowHalf();
	C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0];
}
word Portable::Multiply2Add(word *C, const word *A, const word *B)
{
	word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
	unsigned int ai = A[1] < A[0];
	unsigned int bi = B[0] < B[1];
	unsigned int di = ai & bi;
	DWord d = DWord::Multiply(D[di], D[di+2]);
	D[1] = D[3] = 0;
	unsigned int si = ai + !bi;
	word s = D[si];

	DWord A0B0 = DWord::Multiply(A[0], B[0]);
	DWord t = A0B0 + C[0];
	C[0] = t.GetLowHalf();

	DWord A1B1 = DWord::Multiply(A[1], B[1]);
	t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf() + C[1];
	C[1] = t.GetLowHalf();

	t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2];
	C[2] = t.GetLowHalf();

	t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3];
	C[3] = t.GetLowHalf();
	return t.GetHighHalf();
}
#define MulAcc(x, y) \
	p = DWord::MultiplyAndAdd(A[x], B[y], c); \
	c = p.GetLowHalf(); \
	p = (DWord) d + p.GetHighHalf(); \
	d = p.GetLowHalf(); \
	e += p.GetHighHalf();

#define SaveMulAcc(s, x, y) \
	R[s] = c; \
	p = DWord::MultiplyAndAdd(A[x], B[y], d); \
	c = p.GetLowHalf(); \
	p = (DWord) e + p.GetHighHalf(); \
	d = p.GetLowHalf(); \
	e = p.GetHighHalf();
#define SquAcc(x, y) \
	q = DWord::Multiply(A[x], A[y]); \
	p = q + c; \
	c = p.GetLowHalf(); \
	p = (DWord) d + p.GetHighHalf(); \
	d = p.GetLowHalf(); \
	e += p.GetHighHalf(); \
	p = q + c; \
	c = p.GetLowHalf(); \
	p = (DWord) d + p.GetHighHalf(); \
	d = p.GetLowHalf(); \
	e += p.GetHighHalf();

#define SaveSquAcc(s, x, y) \
	R[s] = c; \
	q = DWord::Multiply(A[x], A[y]); \
	p = q + d; \
	c = p.GetLowHalf(); \
	p = (DWord) e + p.GetHighHalf(); \
	d = p.GetLowHalf(); \
	e = p.GetHighHalf(); \
	p = q + c; \
	c = p.GetLowHalf(); \
	p = (DWord) d + p.GetHighHalf(); \
	d = p.GetLowHalf(); \
	e += p.GetHighHalf();
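// Illustrative scalar model of the MulAcc/SaveMulAcc column accumulation above
// (a sketch, not part of the original source).  One column of the schoolbook
// product is summed into three word-sized registers: c holds the current
// column, d the next one up, and e the carries beyond that; SaveMulAcc stores
// a finished column and shifts d/e down.  To keep the check self-contained,
// word = uint8_t and the double word p = uint16_t, so a 4-word operand is
// 32 bits and the full product fits in uint64_t.  All names here are mine.
#include <cstdint>

static void Comba4x4Sketch(uint8_t *R, const uint8_t *A, const uint8_t *B)
{
	uint8_t c, d, e;
	uint16_t p;

	// mirrors MulAcc(x, y)
	auto mulAcc = [&](unsigned x, unsigned y) {
		p = uint16_t(A[x] * B[y] + c);
		c = uint8_t(p);
		p = uint16_t(d + (p >> 8));
		d = uint8_t(p);
		e = uint8_t(e + (p >> 8));
	};
	// mirrors SaveMulAcc(s, x, y)
	auto saveMulAcc = [&](unsigned s, unsigned x, unsigned y) {
		R[s] = c;
		p = uint16_t(A[x] * B[y] + d);
		c = uint8_t(p);
		p = uint16_t(e + (p >> 8));
		d = uint8_t(p);
		e = uint8_t(p >> 8);
	};

	p = uint16_t(A[0] * B[0]);
	R[0] = uint8_t(p);
	c = uint8_t(p >> 8); d = 0; e = 0;

	mulAcc(0, 1); mulAcc(1, 0);                                   // column 1
	saveMulAcc(1, 2, 0); mulAcc(1, 1); mulAcc(0, 2);              // column 2
	saveMulAcc(2, 0, 3); mulAcc(1, 2); mulAcc(2, 1); mulAcc(3, 0);// column 3
	saveMulAcc(3, 3, 1); mulAcc(2, 2); mulAcc(1, 3);              // column 4
	saveMulAcc(4, 2, 3); mulAcc(3, 2);                            // column 5
	R[5] = c;
	p = uint16_t(A[3] * B[3] + d);                                // column 6 and final carry
	R[6] = uint8_t(p);
	R[7] = uint8_t(e + (p >> 8));
}

static bool Comba4x4SketchTest()
{
	const uint8_t A[4] = {0x12, 0x34, 0x56, 0x78}, B[4] = {0x9A, 0xBC, 0xDE, 0xF0};
	uint8_t R[8];
	Comba4x4Sketch(R, A, B);
	uint64_t a = 0, b = 0, r = 0;
	for (int i = 3; i >= 0; i--) { a = (a << 8) | A[i]; b = (b << 8) | B[i]; }
	for (int i = 7; i >= 0; i--) r = (r << 8) | R[i];
	return r == a * b;
}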
void Portable::Multiply4(word *R, const word *A, const word *B)
{
	// ...
	p = DWord::Multiply(A[0], B[0]);
	R[0] = p.GetLowHalf();
	// ... (middle columns elided)
	p = DWord::MultiplyAndAdd(A[3], B[3], d);
	R[6] = p.GetLowHalf();
	R[7] = e + p.GetHighHalf();
}

void Portable::Square2(word *R, const word *A)
{
	// ...
	p = DWord::Multiply(A[0], A[0]);
	R[0] = p.GetLowHalf();
	// ...
	p = DWord::MultiplyAndAdd(A[1], A[1], d);
	R[2] = p.GetLowHalf();
	R[3] = e + p.GetHighHalf();
}
void Portable::Square4(word *R, const word *A)
{
	// VC60 workaround: MSVC 6.0 has an optimization bug that makes
	// (dword)A*B where either A or B has been cast to a dword before
	// very expensive. Revisit this function when this bug is fixed.
	// ...
	p = DWord::Multiply(A[0], A[0]);
	R[0] = p.GetLowHalf();
	// ... (middle columns elided)
	p = DWord::MultiplyAndAdd(A[3], A[3], d);
	R[6] = p.GetLowHalf();
	R[7] = e + p.GetHighHalf();
}
void Portable::Multiply8(word *R, const word *A, const word *B)
{
	// ...
	p = DWord::Multiply(A[0], B[0]);
	R[0] = p.GetLowHalf();
	// ... (middle columns elided)
	SaveMulAcc(10, 4, 7);
	// ...
	SaveMulAcc(11, 5, 7);
	// ...
	SaveMulAcc(12, 6, 7);
	// ...
	p = DWord::MultiplyAndAdd(A[7], B[7], d);
	R[14] = p.GetLowHalf();
	R[15] = e + p.GetHighHalf();
}
void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
{
	// ...
	p = DWord::Multiply(A[0], B[0]);
	R[0] = p.GetLowHalf();
	// ... (middle columns elided)
	R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
}

void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
{
	// ...
	p = DWord::Multiply(A[0], B[0]);
	R[0] = p.GetLowHalf();
	// ... (middle columns elided)
	R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
		A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
}
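// Illustrative check that a "Bottom" multiply needs only the partial products
// A[i]*B[j] with i+j < n: it reproduces the low n words of the full product,
// i.e. A*B mod 2^(n*w).  A sketch with word = uint8_t and n = 4 so everything
// fits in uint64_t; not part of the original source.
#include <cstdint>

static bool MultiplyBottomSketchTest()
{
	const uint8_t A[4] = {0xFF, 0x01, 0xAB, 0x7C}, B[4] = {0x10, 0xEE, 0x35, 0x92};
	uint32_t low = 0;
	for (unsigned i = 0; i < 4; i++)
		for (unsigned j = 0; i + j < 4; j++)
			low += uint32_t(A[i] * B[j]) << (8 * (i + j));	// terms with i+j >= 4 are 0 mod 2^32
	uint64_t a = 0, b = 0;
	for (int i = 3; i >= 0; i--) { a = (a << 8) | A[i]; b = (b << 8) | B[i]; }
	return low == uint32_t(a * b);	// matches the full product reduced mod 2^32
}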
#ifdef CRYPTOPP_X86ASM_AVAILABLE

// ************** x86 feature detection ***************

static bool s_sse2Enabled = true;

static void CpuId(word32 input, word32 *output)
	// ...
	// save ebx in case -fPIC is being used
	"push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
	: "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
	// ...

#ifdef SSE2_INTRINSICS_AVAILABLE
// ...
static jmp_buf s_env;
static void SigIllHandler(int)
// ...

static bool HasSSE2()
	// ...
	if ((cpuid[3] & (1 << 26)) == 0)
	// ...
	__asm xorpd xmm0, xmm0 // executing SSE2 instruction
	// ...
	typedef void (*SigHandler)(int);
	SigHandler oldHandler = signal(SIGILL, SigIllHandler);
	if (oldHandler == SIG_ERR)
	// ...
	__asm __volatile ("xorps %xmm0, %xmm0");
	// ...
	signal(SIGILL, oldHandler);
	// ...

// ... (CPU vendor and family check)
	std::swap(cpuid[2], cpuid[3]);
	if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
	// ...
	return ((cpuid[0] >> 8) & 0xf) == 0xf;
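// A modern equivalent of the detection above, shown only for illustration
// (not part of the original source): GCC/Clang expose CPUID through
// <cpuid.h>, and the SSE2 flag is bit 26 of EDX for leaf 1 -- the same bit
// tested a few lines up.
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
#include <cpuid.h>
static bool HasSSE2ViaCpuidSketch()
{
	unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return false;				// CPUID leaf 1 not supported
	return (edx & (1u << 26)) != 0;	// EDX bit 26 = SSE2
}
#endif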
// ************** Pentium/P4 optimizations ***************

class PentiumOptimized : public Portable
	// ...
	static int Add(word *C, const word *A, const word *B, size_t N);
	static int Subtract(word *C, const word *A, const word *B, size_t N);
	static void Multiply4(word *C, const word *A, const word *B);
	static void Multiply8(word *C, const word *A, const word *B);
	static void Multiply8Bottom(word *C, const word *A, const word *B);
	// ...

// ... (P4Optimized class declaration)
	static int Add(word *C, const word *A, const word *B, size_t N);
	static int Subtract(word *C, const word *A, const word *B, size_t N);
#ifdef SSE2_INTRINSICS_AVAILABLE
	static void Multiply4(word *C, const word *A, const word *B);
	static void Multiply8(word *C, const word *A, const word *B);
	static void Multiply8Bottom(word *C, const word *A, const word *B);
	// ...

typedef int (* PAddSub)(word *C, const word *A, const word *B, size_t N);
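// PAddSub (and the similar typedefs further down) exist so the best
// Add/Subtract implementation can be chosen once at runtime and then called
// through a function pointer.  Illustrative sketch only -- the names and the
// shape below are my assumptions, not the original code:
//
//	static PAddSub s_pAddSketch = &Portable::Add;		// portable default
//
//	static void SelectAddSketch(bool pentiumAsmUsable)
//	{
//		if (pentiumAsmUsable)
//			s_pAddSketch = &PentiumOptimized::Add;	// same signature, drop-in replacement
//	}
//
//	// callers then simply do: carry = s_pAddSketch(C, A, B, N);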
// use some tricks to share assembly code between MSVC and GCC
#if defined(__GNUC__)
	#define AddPrologue \
		__asm__ __volatile__ \
		".intel_syntax noprefix;"
	#define AddEpilogue \
		".att_syntax prefix;" \
		: "d" (C), "a" (A), "D" (B), "c" (N) \
		: "%esi", "memory", "cc" \

	#define MulPrologue \
		__asm__ __volatile__ \
		".intel_syntax noprefix;" \

	#define MulEpilogue \
		".att_syntax prefix;" \
		: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \
		: "%esi", "memory", "cc" \

	#define SquPrologue MulPrologue
	#define SquEpilogue \
		".att_syntax prefix;" \
		: "d" (s_maskLow16), "c" (C), "a" (A) \
		: "%esi", "%edi", "memory", "cc" \

	#define TopPrologue MulPrologue
	#define TopEpilogue \
		".att_syntax prefix;" \
		: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \

#else
	#define AddPrologue \
		__asm mov eax, [esp+12] \
		__asm mov edi, [esp+16]
	#define AddEpilogue \

	#define SaveEBX __asm push ebx
	#define RestoreEBX __asm pop ebx

	#define SquPrologue \
		AS2( lea ebx, s_maskLow16)
	#define MulPrologue \
		AS2( lea ebx, s_maskLow16)
	#define TopPrologue \
		AS2( lea ebx, s_maskLow16)

	#define SquEpilogue RestoreEBX
	#define MulEpilogue RestoreEBX
	#define TopEpilogue RestoreEBX
#endif
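// What the GCC prologue/epilogue macros above expand to, in miniature: one
// asm statement that switches to Intel syntax, runs the instructions,
// switches back, and then lists outputs, inputs and clobbers.  Illustrative
// only; this helper is not part of the original source.
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static inline unsigned int AsmAddSketch(unsigned int a, unsigned int b)
{
	__asm__ __volatile__ (
		".intel_syntax noprefix;"
		"add eax, edx;"			// a += b
		".att_syntax prefix;"
		: "+a" (a)				// eax is both input and output
		: "d" (b)				// b arrives in edx
		: "cc");				// flags are clobbered
	return a;
}
#endif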
515
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
517
word Baseline_Add(size_t N, word *C, const word *A, const word *B);
518
word Baseline_Sub(size_t N, word *C, const word *A, const word *B);
520
#elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__)
521
word Baseline_Add(size_t N, word *C, const word *A, const word *B)
529
AS2( mov %0,[%3+8*%1])
530
AS2( add %0,[%4+8*%1])
531
AS2( mov [%2+8*%1],%0)
533
AS2( mov %0,[%3+8*%1+8])
534
AS2( adc %0,[%4+8*%1+8])
535
AS2( mov [%2+8*%1+8],%0)
538
AS2( mov %0,[%3+8*%1])
539
AS2( adc %0,[%4+8*%1])
540
AS2( mov [%2+8*%1],%0)
547
: "c" (N), "r" (C+N), "r" (A+N), "r" (B+N)
553
word Baseline_Sub(size_t N, word *C, const word *A, const word *B)
561
AS2( mov %0,[%3+8*%1])
562
AS2( sub %0,[%4+8*%1])
563
AS2( mov [%2+8*%1],%0)
565
AS2( mov %0,[%3+8*%1+8])
566
AS2( sbb %0,[%4+8*%1+8])
567
AS2( mov [%2+8*%1+8],%0)
570
AS2( mov %0,[%3+8*%1])
571
AS2( sbb %0,[%4+8*%1])
572
AS2( mov [%2+8*%1],%0)
579
: "c" (N), "r" (C+N), "r" (A+N), "r" (B+N)
584
#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
585
CRYPTOPP_NAKED word CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
589
// now: eax = A, edi = B, edx = C, ecx = N
590
AS2( lea eax, [eax+4*ecx])
591
AS2( lea edi, [edi+4*ecx])
592
AS2( lea edx, [edx+4*ecx])
594
AS1( neg ecx) // ecx is negative index
595
AS2( test ecx, 2) // this clears carry flag
601
ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero
602
AS2( mov esi,[eax+4*ecx])
603
AS2( adc esi,[edi+4*ecx])
604
AS2( mov [edx+4*ecx],esi)
605
AS2( mov esi,[eax+4*ecx+4])
606
AS2( adc esi,[edi+4*ecx+4])
607
AS2( mov [edx+4*ecx+4],esi)
609
AS2( mov esi,[eax+4*ecx+8])
610
AS2( adc esi,[edi+4*ecx+8])
611
AS2( mov [edx+4*ecx+8],esi)
612
AS2( mov esi,[eax+4*ecx+12])
613
AS2( adc esi,[edi+4*ecx+12])
614
AS2( mov [edx+4*ecx+12],esi)
616
AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2
621
AS1( setc al) // store carry into eax (return result register)
626
CRYPTOPP_NAKED word CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
630
// now: eax = A, edi = B, edx = C, ecx = N
631
AS2( lea eax, [eax+4*ecx])
632
AS2( lea edi, [edi+4*ecx])
633
AS2( lea edx, [edx+4*ecx])
635
AS1( neg ecx) // ecx is negative index
636
AS2( test ecx, 2) // this clears carry flag
642
ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero
643
AS2( mov esi,[eax+4*ecx])
644
AS2( sbb esi,[edi+4*ecx])
645
AS2( mov [edx+4*ecx],esi)
646
AS2( mov esi,[eax+4*ecx+4])
647
AS2( sbb esi,[edi+4*ecx+4])
648
AS2( mov [edx+4*ecx+4],esi)
650
AS2( mov esi,[eax+4*ecx+8])
651
AS2( sbb esi,[edi+4*ecx+8])
652
AS2( mov [edx+4*ecx+8],esi)
653
AS2( mov esi,[eax+4*ecx+12])
654
AS2( sbb esi,[edi+4*ecx+12])
655
AS2( mov [edx+4*ecx+12],esi)
657
AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2
662
AS1( setc al) // store carry into eax (return result register)
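// Scalar model of the loop structure used by the assembly above (a sketch,
// not part of the original source): the pointers are advanced to the end of
// the arrays and a negative index counts up toward zero, four words per
// iteration, while the carry is threaded through by hand.
#include <cstddef>
#include <cstdint>

static uint32_t BaselineAddSketch(size_t N, uint32_t *C, const uint32_t *A, const uint32_t *B)
{
	// N is assumed to be a multiple of 4 here, matching the 4-way unrolled loop
	C += N; A += N; B += N;
	uint32_t carry = 0;
	for (ptrdiff_t i = -(ptrdiff_t)N; i != 0; i += 4)	// "lea ecx, [ecx+4]" analogue
	{
		for (int k = 0; k < 4; k++)
		{
			uint32_t s = A[i + k] + B[i + k] + carry;
			// carry out of a word-sized add, recovered without an adc instruction
			carry = (s < A[i + k]) || (carry && s == A[i + k]);
			C[i + k] = s;
		}
	}
	return carry;		// analogue of "setc al"
}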
667
#if CRYPTOPP_INTEGER_SSE2
668
CRYPTOPP_NAKED word CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
672
// now: eax = A, edi = B, edx = C, ecx = N
673
AS2( lea eax, [eax+4*ecx])
674
AS2( lea edi, [edi+4*ecx])
675
AS2( lea edx, [edx+4*ecx])
677
AS1( neg ecx) // ecx is negative index
680
AS2( test ecx, 2) // this clears carry flag
686
AS2( movd mm0, DWORD PTR [eax+4*ecx])
687
AS2( movd mm1, DWORD PTR [edi+4*ecx])
690
AS2( movd DWORD PTR [edx+4*ecx], mm2)
693
AS2( movd mm0, DWORD PTR [eax+4*ecx+4])
694
AS2( movd mm1, DWORD PTR [edi+4*ecx+4])
697
AS2( movd DWORD PTR [edx+4*ecx+4], mm2)
701
AS2( movd mm0, DWORD PTR [eax+4*ecx+8])
702
AS2( movd mm1, DWORD PTR [edi+4*ecx+8])
705
AS2( movd DWORD PTR [edx+4*ecx+8], mm2)
708
AS2( movd mm0, DWORD PTR [eax+4*ecx+12])
709
AS2( movd mm1, DWORD PTR [edi+4*ecx+12])
712
AS2( movd DWORD PTR [edx+4*ecx+12], mm2)
724
CRYPTOPP_NAKED word CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
728
// now: eax = A, edi = B, edx = C, ecx = N
729
AS2( lea eax, [eax+4*ecx])
730
AS2( lea edi, [edi+4*ecx])
731
AS2( lea edx, [edx+4*ecx])
733
AS1( neg ecx) // ecx is negative index
736
AS2( test ecx, 2) // this clears carry flag
742
AS2( movd mm0, DWORD PTR [eax+4*ecx])
743
AS2( movd mm1, DWORD PTR [edi+4*ecx])
746
AS2( movd DWORD PTR [edx+4*ecx], mm0)
749
AS2( movd mm2, DWORD PTR [eax+4*ecx+4])
750
AS2( movd mm1, DWORD PTR [edi+4*ecx+4])
753
AS2( movd DWORD PTR [edx+4*ecx+4], mm2)
757
AS2( movd mm0, DWORD PTR [eax+4*ecx+8])
758
AS2( movd mm1, DWORD PTR [edi+4*ecx+8])
761
AS2( movd DWORD PTR [edx+4*ecx+8], mm0)
764
AS2( movd mm2, DWORD PTR [eax+4*ecx+12])
765
AS2( movd mm1, DWORD PTR [edi+4*ecx+12])
768
AS2( movd DWORD PTR [edx+4*ecx+12], mm2)
780
#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
word CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
{
	// ...
	for (size_t i=0; i<N; i+=2)
	{
		AddWithCarry(u, A[i], B[i]);
		// ...
		AddWithCarry(u, A[i+1], B[i+1]);
		// ...
	}
	return int(GetCarry(u));
}

word CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
{
	// ...
	for (size_t i=0; i<N; i+=2)
	{
		SubtractWithBorrow(u, A[i], B[i]);
		// ...
		SubtractWithBorrow(u, A[i+1], B[i+1]);
		// ...
	}
	return int(GetBorrow(u));
}

static word LinearMultiply(word *C, const word *A, word B, size_t N)
{
	// ...
	for(unsigned i=0; i<N; i++)
	{
		MultiplyWords(p, A[i], B);
		Acc2WordsBy1(p, carry);
		// ...
	}
	// ...
}
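// Self-contained scalar version of LinearMultiply above (a sketch, not part
// of the original source): multiply an N-word integer A by a single word B,
// propagating the high half of each 32x32 -> 64-bit product as the carry.
#include <cstddef>
#include <cstdint>

static uint32_t LinearMultiplySketch(uint32_t *C, const uint32_t *A, uint32_t B, size_t N)
{
	uint32_t carry = 0;
	for (size_t i = 0; i < N; i++)
	{
		uint64_t p = uint64_t(A[i]) * B + carry;	// cannot overflow: (2^32-1)^2 + (2^32-1) < 2^64
		C[i] = uint32_t(p);
		carry = uint32_t(p >> 32);
	}
	return carry;	// the word that would go into C[N]
}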
831
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
836
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
837
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
838
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
839
Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
840
Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
845
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
846
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
847
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
848
Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
849
Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
850
Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
851
Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
852
Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
853
Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
854
Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
855
Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
856
Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
857
Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
862
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
863
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
864
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
865
Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
866
Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
867
Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
868
Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
869
Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
870
Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
871
Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
872
Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
873
Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
874
Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
875
Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
876
Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
877
Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
878
Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
879
Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
880
Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
881
Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
882
Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
883
Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
884
Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
885
Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
886
Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
887
Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
888
Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
889
Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
890
Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
899
Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
900
Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
901
Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \
902
Squ_SaveAcc(4, 2, 3) Squ_NonDiag \
907
Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
908
Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
909
Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
910
Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
911
Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
912
Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
913
Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
914
Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
915
Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
916
Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \
917
Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \
918
Squ_SaveAcc(12, 6, 7) Squ_NonDiag \
923
Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
924
Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
925
Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
926
Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
927
Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
928
Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
929
Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
930
Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
931
Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
932
Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \
933
Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \
934
Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \
935
Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \
936
Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \
937
Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \
938
Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \
939
Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \
940
Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \
941
Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \
942
Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \
943
Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \
944
Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \
945
Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \
946
Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \
947
Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \
948
Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \
949
Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \
950
Squ_SaveAcc(28, 14, 15) Squ_NonDiag \
955
Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \
960
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
961
Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2) \
962
Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0) \
967
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
968
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
969
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
970
Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
971
Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
972
Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
973
Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \
978
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
979
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
980
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
981
Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
982
Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
983
Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
984
Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
985
Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
986
Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
987
Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
988
Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
989
Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
990
Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
991
Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
992
Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
996
#define Mul_Begin(n) \
1000
MultiplyWords(p, A[0], B[0]) \
1001
AssignWord(c, LowWord(p)) \
1002
AssignWord(d, HighWord(p))
1004
#define Mul_Acc(i, j) \
1005
MultiplyWords(p, A[i], B[j]) \
1006
Acc2WordsBy1(c, LowWord(p)) \
1007
Acc2WordsBy1(d, HighWord(p))
1009
#define Mul_SaveAcc(k, i, j) \
1010
R[k] = LowWord(c); \
1011
Add2WordsBy1(c, d, HighWord(c)) \
1012
MultiplyWords(p, A[i], B[j]) \
1013
AssignWord(d, HighWord(p)) \
1014
Acc2WordsBy1(c, LowWord(p))
1016
#define Mul_End(n) \
1017
R[2*n-3] = LowWord(c); \
1018
Acc2WordsBy1(d, HighWord(c)) \
1019
MultiplyWords(p, A[n-1], B[n-1])\
1020
Acc2WordsBy2(d, p) \
1021
R[2*n-2] = LowWord(d); \
1022
R[2*n-1] = HighWord(d);
1024
#define Bot_SaveAcc(k, i, j) \
1025
R[k] = LowWord(c); \
1026
word e = LowWord(d) + HighWord(c); \
1029
#define Bot_Acc(i, j) \
1032
#define Bot_End(n) \
1035
#define Mul_Begin(n) \
1039
MultiplyWords(p, A[0], B[0]) \
1041
AssignWord(d, HighWord(p))
1043
#define Mul_Acc(i, j) \
1044
MulAcc(c, d, A[i], B[j])
1046
#define Mul_SaveAcc(k, i, j) \
1049
AssignWord(d, HighWord(d)) \
1050
MulAcc(c, d, A[i], B[j])
1052
#define Mul_End(k, i) \
1054
MultiplyWords(p, A[i], B[i]) \
1055
Acc2WordsBy2(p, d) \
1056
R[k+1] = LowWord(p); \
1057
R[k+2] = HighWord(p);
1059
#define Bot_SaveAcc(k, i, j) \
1064
#define Bot_Acc(i, j) \
1067
#define Bot_End(n) \
1071
#define Squ_Begin(n) \
1076
MultiplyWords(p, A[0], A[0]) \
1077
R[0] = LowWord(p); \
1078
AssignWord(e, HighWord(p)) \
1079
MultiplyWords(p, A[0], A[1]) \
1081
AssignWord(d, HighWord(p)) \
1084
#define Squ_NonDiag \
1087
#define Squ_SaveAcc(k, i, j) \
1088
Acc3WordsBy2(c, d, e) \
1090
MultiplyWords(p, A[i], A[j]) \
1092
AssignWord(d, HighWord(p)) \
1094
#define Squ_Acc(i, j) \
1095
MulAcc(c, d, A[i], A[j])
1097
#define Squ_Diag(i) \
1099
MulAcc(c, d, A[i], A[i])
1101
#define Squ_End(n) \
1102
Acc3WordsBy2(c, d, e) \
1104
MultiplyWords(p, A[n-1], A[n-1])\
1105
Acc2WordsBy2(p, e) \
1106
R[2*n-2] = LowWord(p); \
1107
R[2*n-1] = HighWord(p);
void Baseline_Multiply2(word *R, const word *A, const word *B)
void Baseline_Multiply4(word *R, const word *A, const word *B)
void Baseline_Multiply8(word *R, const word *A, const word *B)

void Baseline_Square2(word *R, const word *A)
void Baseline_Square4(word *R, const word *A)
void Baseline_Square8(word *R, const word *A)

void Baseline_MultiplyBottom2(word *R, const word *A, const word *B)
void Baseline_MultiplyBottom4(word *R, const word *A, const word *B)
void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)
1154
#define Top_Begin(n) \
1158
MultiplyWords(p, A[0], B[n-2]);\
1159
AssignWord(d, HighWord(p));
1161
#define Top_Acc(i, j) \
1162
MultiplyWords(p, A[i], B[j]);\
1163
Acc2WordsBy1(d, HighWord(p));
1165
#define Top_SaveAcc0(i, j) \
1167
AssignWord(d, HighWord(d)) \
1168
MulAcc(c, d, A[i], B[j])
1170
#define Top_SaveAcc1(i, j) \
1172
Acc2WordsBy1(d, c); \
1174
AssignWord(d, HighWord(d)) \
1175
MulAcc(c, d, A[i], B[j])
1177
void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)
1180
Baseline_Multiply2(T, A, B);
1185
void Baseline_MultiplyTop4(word *R, const word *A, const word *B, word L)
1188
Top_Acc(1, 1) Top_Acc(2, 0) \
1189
Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
1190
Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
1191
Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
1195
void Baseline_MultiplyTop8(word *R, const word *A, const word *B, word L)
1198
Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
1199
Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1200
Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
1201
Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
1202
Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
1203
Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
1204
Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
1205
Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
#if !CRYPTOPP_INTEGER_SSE2 // save memory by not compiling these functions when SSE2 is available

void Baseline_Multiply16(word *R, const word *A, const word *B)
void Baseline_Square16(word *R, const word *A)
void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
void Baseline_MultiplyTop16(word *R, const word *A, const word *B, word L)
1228
Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
1229
Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
1230
Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
1231
Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
1232
Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
1233
Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
1234
Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
1235
Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
1236
Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
1237
Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
1238
Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
1239
Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
1240
Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
1241
Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
1242
Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
1243
Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
// ********************************************************

#if CRYPTOPP_INTEGER_SSE2

CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
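// Why s_maskLow16 exists (illustrative sketch, not part of the original
// source): the SSE2 kernels below accumulate many partial products per output
// position.  To postpone carry propagation, each (at most 32-bit) partial
// product is split into its low and high 16-bit halves -- pand with
// s_maskLow16 and psrld by 16 -- and the halves are summed in separate
// accumulators, being recombined with a shift only when a column is saved.
// The same bookkeeping in scalar form, with one lane:
#include <cstdint>

static bool SplitAccumulateSketchTest()
{
	const uint32_t products[4] = {0xFFFFFFFFu, 0x12345678u, 0x9ABCDEF0u, 0x0F0F0F0Fu};
	uint64_t reference = 0;
	uint32_t lowHalves = 0, highHalves = 0;		// roles of the low/high accumulators above
	for (int i = 0; i < 4; i++)
	{
		reference  += products[i];
		lowHalves  += products[i] & 0xFFFF;		// "pand" with the 0xffff mask
		highHalves += products[i] >> 16;		// "psrld 16"
	}
	// recombine: low-half sums plus high-half sums shifted back up by 16 bits
	uint64_t recombined = uint64_t(lowHalves) + (uint64_t(highHalves) << 16);
	return recombined == reference;
}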
1270
#define SSE2_FinalSave(k) \
1271
AS2( psllq xmm5, 16) \
1272
AS2( paddq xmm4, xmm5) \
1273
AS2( movq QWORD PTR [ecx+8*(k)], xmm4)
1275
#define SSE2_SaveShift(k) \
1276
AS2( movq xmm0, xmm6) \
1277
AS2( punpckhqdq xmm6, xmm0) \
1278
AS2( movq xmm1, xmm7) \
1279
AS2( punpckhqdq xmm7, xmm1) \
1280
AS2( paddd xmm6, xmm0) \
1281
AS2( pslldq xmm6, 4) \
1282
AS2( paddd xmm7, xmm1) \
1283
AS2( paddd xmm4, xmm6) \
1284
AS2( pslldq xmm7, 4) \
1285
AS2( movq xmm6, xmm4) \
1286
AS2( paddd xmm5, xmm7) \
1287
AS2( movq xmm7, xmm5) \
1288
AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \
1289
AS2( psrlq xmm6, 16) \
1290
AS2( paddq xmm6, xmm7) \
1291
AS2( punpckhqdq xmm4, xmm0) \
1292
AS2( punpckhqdq xmm5, xmm0) \
1293
AS2( movq QWORD PTR [ecx+8*(k)+2], xmm6) \
1294
AS2( psrlq xmm6, 3*16) \
1295
AS2( paddd xmm4, xmm6) \
1297
#define Squ_SSE2_SaveShift(k) \
1298
AS2( movq xmm0, xmm6) \
1299
AS2( punpckhqdq xmm6, xmm0) \
1300
AS2( movq xmm1, xmm7) \
1301
AS2( punpckhqdq xmm7, xmm1) \
1302
AS2( paddd xmm6, xmm0) \
1303
AS2( pslldq xmm6, 4) \
1304
AS2( paddd xmm7, xmm1) \
1305
AS2( paddd xmm4, xmm6) \
1306
AS2( pslldq xmm7, 4) \
1307
AS2( movhlps xmm6, xmm4) \
1308
AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \
1309
AS2( paddd xmm5, xmm7) \
1310
AS2( movhps QWORD PTR [esp+12], xmm5)\
1311
AS2( psrlq xmm4, 16) \
1312
AS2( paddq xmm4, xmm5) \
1313
AS2( movq QWORD PTR [ecx+8*(k)+2], xmm4) \
1314
AS2( psrlq xmm4, 3*16) \
1315
AS2( paddd xmm4, xmm6) \
1316
AS2( movq QWORD PTR [esp+4], xmm4)\
1318
#define SSE2_FirstMultiply(i) \
1319
AS2( movdqa xmm7, [esi+(i)*16])\
1320
AS2( movdqa xmm5, [edi-(i)*16])\
1321
AS2( pmuludq xmm5, xmm7) \
1322
AS2( movdqa xmm4, [ebx])\
1323
AS2( movdqa xmm6, xmm4) \
1324
AS2( pand xmm4, xmm5) \
1325
AS2( psrld xmm5, 16) \
1326
AS2( pmuludq xmm7, [edx-(i)*16])\
1327
AS2( pand xmm6, xmm7) \
1328
AS2( psrld xmm7, 16)
1330
#define Squ_Begin(n) \
1333
AS2( and esp, 0xfffffff0)\
1334
AS2( lea edi, [esp-32*n])\
1335
AS2( sub esp, 32*n+16)\
1337
AS2( mov esi, edi) \
1338
AS2( xor edx, edx) \
1340
ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
1341
ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
1342
AS2( movdqa [edi+2*edx], xmm0) \
1343
AS2( psrlq xmm0, 32) \
1344
AS2( movdqa [edi+2*edx+16], xmm0) \
1345
AS2( movdqa [edi+16*n+2*edx], xmm1) \
1346
AS2( psrlq xmm1, 32) \
1347
AS2( movdqa [edi+16*n+2*edx+16], xmm1) \
1349
AS2( cmp edx, 8*(n)) \
1351
AS2( lea edx, [edi+16*n])\
1352
SSE2_FirstMultiply(0) \
1354
#define Squ_Acc(i) \
1356
AS2( movdqa xmm1, [esi+(i)*16]) \
1357
AS2( movdqa xmm0, [edi-(i)*16]) \
1358
AS2( movdqa xmm2, [ebx]) \
1359
AS2( pmuludq xmm0, xmm1) \
1360
AS2( pmuludq xmm1, [edx-(i)*16]) \
1361
AS2( movdqa xmm3, xmm2) \
1362
AS2( pand xmm2, xmm0) \
1363
AS2( psrld xmm0, 16) \
1364
AS2( paddd xmm4, xmm2) \
1365
AS2( paddd xmm5, xmm0) \
1366
AS2( pand xmm3, xmm1) \
1367
AS2( psrld xmm1, 16) \
1368
AS2( paddd xmm6, xmm3) \
1369
AS2( paddd xmm7, xmm1) \
1372
#define Squ_Acc2(i) ASC(call, LSqu##i)
1373
#define Squ_Acc3(i) Squ_Acc2(i)
1374
#define Squ_Acc4(i) Squ_Acc2(i)
1375
#define Squ_Acc5(i) Squ_Acc2(i)
1376
#define Squ_Acc6(i) Squ_Acc2(i)
1377
#define Squ_Acc7(i) Squ_Acc2(i)
1378
#define Squ_Acc8(i) Squ_Acc2(i)
1380
#define SSE2_End(E, n) \
1381
SSE2_SaveShift(2*(n)-3) \
1382
AS2( movdqa xmm7, [esi+16]) \
1383
AS2( movdqa xmm0, [edi]) \
1384
AS2( pmuludq xmm0, xmm7) \
1385
AS2( movdqa xmm2, [ebx]) \
1386
AS2( pmuludq xmm7, [edx]) \
1387
AS2( movdqa xmm6, xmm2) \
1388
AS2( pand xmm2, xmm0) \
1389
AS2( psrld xmm0, 16) \
1390
AS2( paddd xmm4, xmm2) \
1391
AS2( paddd xmm5, xmm0) \
1392
AS2( pand xmm6, xmm7) \
1393
AS2( psrld xmm7, 16) \
1394
SSE2_SaveShift(2*(n)-2) \
1395
SSE2_FinalSave(2*(n)-1) \
1399
#define Squ_End(n) SSE2_End(SquEpilogue, n)
1400
#define Mul_End(n) SSE2_End(MulEpilogue, n)
1401
#define Top_End(n) SSE2_End(TopEpilogue, n)
1403
#define Squ_Column1(k, i) \
1404
Squ_SSE2_SaveShift(k) \
1406
SSE2_FirstMultiply(1)\
1408
AS2( paddd xmm4, xmm4) \
1409
AS2( paddd xmm5, xmm5) \
1410
AS2( movdqa xmm3, [esi]) \
1411
AS2( movq xmm1, QWORD PTR [esi+8]) \
1412
AS2( pmuludq xmm1, xmm3) \
1413
AS2( pmuludq xmm3, xmm3) \
1414
AS2( movdqa xmm0, [ebx])\
1415
AS2( movdqa xmm2, xmm0) \
1416
AS2( pand xmm0, xmm1) \
1417
AS2( psrld xmm1, 16) \
1418
AS2( paddd xmm6, xmm0) \
1419
AS2( paddd xmm7, xmm1) \
1420
AS2( pand xmm2, xmm3) \
1421
AS2( psrld xmm3, 16) \
1422
AS2( paddd xmm6, xmm6) \
1423
AS2( paddd xmm7, xmm7) \
1424
AS2( paddd xmm4, xmm2) \
1425
AS2( paddd xmm5, xmm3) \
1426
AS2( movq xmm0, QWORD PTR [esp+4])\
1427
AS2( movq xmm1, QWORD PTR [esp+12])\
1428
AS2( paddd xmm4, xmm0)\
1429
AS2( paddd xmm5, xmm1)\
1431
#define Squ_Column0(k, i) \
1432
Squ_SSE2_SaveShift(k) \
1435
SSE2_FirstMultiply(1)\
1437
AS2( paddd xmm6, xmm6) \
1438
AS2( paddd xmm7, xmm7) \
1439
AS2( paddd xmm4, xmm4) \
1440
AS2( paddd xmm5, xmm5) \
1441
AS2( movq xmm0, QWORD PTR [esp+4])\
1442
AS2( movq xmm1, QWORD PTR [esp+12])\
1443
AS2( paddd xmm4, xmm0)\
1444
AS2( paddd xmm5, xmm1)\
1446
#define SSE2_MulAdd45 \
1447
AS2( movdqa xmm7, [esi]) \
1448
AS2( movdqa xmm0, [edi]) \
1449
AS2( pmuludq xmm0, xmm7) \
1450
AS2( movdqa xmm2, [ebx]) \
1451
AS2( pmuludq xmm7, [edx]) \
1452
AS2( movdqa xmm6, xmm2) \
1453
AS2( pand xmm2, xmm0) \
1454
AS2( psrld xmm0, 16) \
1455
AS2( paddd xmm4, xmm2) \
1456
AS2( paddd xmm5, xmm0) \
1457
AS2( pand xmm6, xmm7) \
1458
AS2( psrld xmm7, 16)
1460
#define Mul_Begin(n) \
1463
AS2( and esp, 0xfffffff0)\
1464
AS2( sub esp, 48*n+16)\
1466
AS2( xor edx, edx) \
1468
ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
1469
ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
1470
ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
1471
AS2( movdqa [esp+20+2*edx], xmm0) \
1472
AS2( psrlq xmm0, 32) \
1473
AS2( movdqa [esp+20+2*edx+16], xmm0) \
1474
AS2( movdqa [esp+20+16*n+2*edx], xmm1) \
1475
AS2( psrlq xmm1, 32) \
1476
AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \
1477
AS2( movdqa [esp+20+32*n+2*edx], xmm2) \
1478
AS2( psrlq xmm2, 32) \
1479
AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \
1481
AS2( cmp edx, 8*(n)) \
1483
AS2( lea edi, [esp+20])\
1484
AS2( lea edx, [esp+20+16*n])\
1485
AS2( lea esi, [esp+20+32*n])\
1486
SSE2_FirstMultiply(0) \
1488
#define Mul_Acc(i) \
1490
AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \
1491
AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \
1492
AS2( movdqa xmm2, [ebx]) \
1493
AS2( pmuludq xmm0, xmm1) \
1494
AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
1495
AS2( movdqa xmm3, xmm2) \
1496
AS2( pand xmm2, xmm0) \
1497
AS2( psrld xmm0, 16) \
1498
AS2( paddd xmm4, xmm2) \
1499
AS2( paddd xmm5, xmm0) \
1500
AS2( pand xmm3, xmm1) \
1501
AS2( psrld xmm1, 16) \
1502
AS2( paddd xmm6, xmm3) \
1503
AS2( paddd xmm7, xmm1) \
1506
#define Mul_Acc2(i) ASC(call, LMul##i)
1507
#define Mul_Acc3(i) Mul_Acc2(i)
1508
#define Mul_Acc4(i) Mul_Acc2(i)
1509
#define Mul_Acc5(i) Mul_Acc2(i)
1510
#define Mul_Acc6(i) Mul_Acc2(i)
1511
#define Mul_Acc7(i) Mul_Acc2(i)
1512
#define Mul_Acc8(i) Mul_Acc2(i)
1513
#define Mul_Acc9(i) Mul_Acc2(i)
1514
#define Mul_Acc10(i) Mul_Acc2(i)
1515
#define Mul_Acc11(i) Mul_Acc2(i)
1516
#define Mul_Acc12(i) Mul_Acc2(i)
1517
#define Mul_Acc13(i) Mul_Acc2(i)
1518
#define Mul_Acc14(i) Mul_Acc2(i)
1519
#define Mul_Acc15(i) Mul_Acc2(i)
1520
#define Mul_Acc16(i) Mul_Acc2(i)
1522
#define Mul_Column1(k, i) \
1528
#define Mul_Column0(k, i) \
1535
#define Bot_Acc(i) \
1536
AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \
1537
AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \
1538
AS2( pmuludq xmm0, xmm1) \
1539
AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
1540
AS2( paddq xmm4, xmm0) \
1541
AS2( paddd xmm6, xmm1)
1543
#define Bot_SaveAcc(k) \
1547
AS2( movdqa xmm6, [esi]) \
1548
AS2( movdqa xmm0, [edi]) \
1549
AS2( pmuludq xmm0, xmm6) \
1550
AS2( paddq xmm4, xmm0) \
1551
AS2( psllq xmm5, 16) \
1552
AS2( paddq xmm4, xmm5) \
1553
AS2( pmuludq xmm6, [edx])
1555
#define Bot_End(n) \
1556
AS2( movhlps xmm7, xmm6) \
1557
AS2( paddd xmm6, xmm7) \
1558
AS2( psllq xmm6, 32) \
1559
AS2( paddd xmm4, xmm6) \
1560
AS2( movq QWORD PTR [ecx+8*((n)-1)], xmm4) \
1564
#define Top_Begin(n) \
1567
AS2( and esp, 0xfffffff0)\
1568
AS2( sub esp, 48*n+16)\
1570
AS2( xor edx, edx) \
1572
ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
1573
ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
1574
ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
1575
AS2( movdqa [esp+20+2*edx], xmm0) \
1576
AS2( psrlq xmm0, 32) \
1577
AS2( movdqa [esp+20+2*edx+16], xmm0) \
1578
AS2( movdqa [esp+20+16*n+2*edx], xmm1) \
1579
AS2( psrlq xmm1, 32) \
1580
AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \
1581
AS2( movdqa [esp+20+32*n+2*edx], xmm2) \
1582
AS2( psrlq xmm2, 32) \
1583
AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \
1585
AS2( cmp edx, 8*(n)) \
1587
AS2( mov eax, esi) \
1588
AS2( lea edi, [esp+20+00*n+16*(n/2-1)])\
1589
AS2( lea edx, [esp+20+16*n+16*(n/2-1)])\
1590
AS2( lea esi, [esp+20+32*n+16*(n/2-1)])\
1591
AS2( pxor xmm4, xmm4)\
1592
AS2( pxor xmm5, xmm5)
1594
#define Top_Acc(i) \
1595
AS2( movq xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8]) \
1596
AS2( pmuludq xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
1597
AS2( psrlq xmm0, 48) \
1598
AS2( paddd xmm5, xmm0)\
1600
#define Top_Column0(i) \
1601
AS2( psllq xmm5, 32) \
1607
#define Top_Column1(i) \
1613
AS2( movd xmm0, eax)\
1614
AS2( movd xmm1, [ecx+4])\
1615
AS2( psrld xmm1, 16)\
1616
AS2( pcmpgtd xmm1, xmm0)\
1617
AS2( psrld xmm1, 31)\
1618
AS2( paddd xmm4, xmm1)\
1620
void SSE2_Square4(word *C, const word *A)
1627
void SSE2_Square8(word *C, const word *A)
1643
void SSE2_Square16(word *C, const word *A)
1648
Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1667
void SSE2_Square32(word *C, const word *A)
1671
Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1705
void SSE2_Multiply4(word *C, const word *A, const word *B)
1717
void SSE2_Multiply8(word *C, const word *A, const word *B)
1722
Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1733
void SSE2_Multiply16(word *C, const word *A, const word *B)
1738
Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1757
void SSE2_Multiply32(word *C, const word *A, const word *B)
1761
Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1795
void SSE2_MultiplyBottom4(word *C, const word *A, const word *B)
1798
Bot_SaveAcc(0) Bot_Acc(2)
1802
void SSE2_MultiplyBottom8(word *C, const word *A, const word *B)
1807
Mul_Acc(3) Mul_Acc(2)
1812
Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1816
void SSE2_MultiplyBottom16(word *C, const word *A, const word *B)
1821
Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1830
Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1834
void SSE2_MultiplyBottom32(word *C, const word *A, const word *B)
1839
Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1856
Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1860
void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L)
1863
Top_Acc(3) Top_Acc(2) Top_Acc(1)
1866
Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1875
void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L)
1878
Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1881
Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1894
void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
1897
Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1900
Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1921
#endif // #if CRYPTOPP_INTEGER_SSE2
1923
// ********************************************************
1925
typedef word (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
1012
1926
typedef void (* PMul)(word *C, const word *A, const word *B);
1927
typedef void (* PSqu)(word *C, const word *A);
1928
typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
1014
static PAddSub s_pAdd, s_pSub;
1015
#ifdef SSE2_INTRINSICS_AVAILABLE
1016
static PMul s_pMul4, s_pMul8, s_pMul8B;
1930
#if CRYPTOPP_INTEGER_SSE2
1931
static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
1932
static size_t s_recursionLimit = 8;
1934
static const size_t s_recursionLimit = 16;
1019
static void SetPentiumFunctionPointers()
1937
static PMul s_pMul[9], s_pBot[9];
1938
static PSqu s_pSqu[9];
1939
static PMulTop s_pTop[9];
1941
static void SetFunctionPointers()
1023
s_pAdd = &P4Optimized::Add;
1024
s_pSub = &P4Optimized::Subtract;
1028
s_pAdd = &PentiumOptimized::Add;
1029
s_pSub = &PentiumOptimized::Subtract;
1943
s_pMul[0] = &Baseline_Multiply2;
1944
s_pBot[0] = &Baseline_MultiplyBottom2;
1945
s_pSqu[0] = &Baseline_Square2;
1946
s_pTop[0] = &Baseline_MultiplyTop2;
1947
s_pTop[1] = &Baseline_MultiplyTop4;
1032
#ifdef SSE2_INTRINSICS_AVAILABLE
1949
#if CRYPTOPP_INTEGER_SSE2
1035
s_pMul4 = &P4Optimized::Multiply4;
1036
s_pMul8 = &P4Optimized::Multiply8;
1037
s_pMul8B = &P4Optimized::Multiply8Bottom;
1958
s_recursionLimit = 32;
1960
s_pMul[1] = &SSE2_Multiply4;
1961
s_pMul[2] = &SSE2_Multiply8;
1962
s_pMul[4] = &SSE2_Multiply16;
1963
s_pMul[8] = &SSE2_Multiply32;
1965
s_pBot[1] = &SSE2_MultiplyBottom4;
1966
s_pBot[2] = &SSE2_MultiplyBottom8;
1967
s_pBot[4] = &SSE2_MultiplyBottom16;
1968
s_pBot[8] = &SSE2_MultiplyBottom32;
1970
s_pSqu[1] = &SSE2_Square4;
1971
s_pSqu[2] = &SSE2_Square8;
1972
s_pSqu[4] = &SSE2_Square16;
1973
s_pSqu[8] = &SSE2_Square32;
1975
s_pTop[2] = &SSE2_MultiplyTop8;
1976
s_pTop[4] = &SSE2_MultiplyTop16;
1977
s_pTop[8] = &SSE2_MultiplyTop32;
1041
s_pMul4 = &PentiumOptimized::Multiply4;
1042
s_pMul8 = &PentiumOptimized::Multiply8;
1043
s_pMul8B = &PentiumOptimized::Multiply8Bottom;
1982
s_pMul[1] = &Baseline_Multiply4;
1983
s_pMul[2] = &Baseline_Multiply8;
1985
s_pBot[1] = &Baseline_MultiplyBottom4;
1986
s_pBot[2] = &Baseline_MultiplyBottom8;
1988
s_pSqu[1] = &Baseline_Square4;
1989
s_pSqu[2] = &Baseline_Square8;
1991
s_pTop[2] = &Baseline_MultiplyTop8;
1993
#if !CRYPTOPP_INTEGER_SSE2
1994
s_pMul[4] = &Baseline_Multiply16;
1995
s_pBot[4] = &Baseline_MultiplyBottom16;
1996
s_pSqu[4] = &Baseline_Square16;
1997
s_pTop[4] = &Baseline_MultiplyTop16;
1050
s_sse2Enabled = false;
1051
SetPentiumFunctionPointers();
1054
class LowLevel : public PentiumOptimized
1057
inline static int Add(word *C, const word *A, const word *B, size_t N)
1058
{return s_pAdd(C, A, B, N);}
1059
inline static int Subtract(word *C, const word *A, const word *B, size_t N)
1060
{return s_pSub(C, A, B, N);}
1061
inline static void Square4(word *R, const word *A)
1062
{Multiply4(R, A, A);}
1063
#ifdef SSE2_INTRINSICS_AVAILABLE
1064
inline static void Multiply4(word *C, const word *A, const word *B)
1066
inline static void Multiply8(word *C, const word *A, const word *B)
1068
inline static void Multiply8Bottom(word *C, const word *A, const word *B)
1069
{s_pMul8B(C, A, B);}
1073
// use some tricks to share assembly code between MSVC and GCC
1075
#define CRYPTOPP_NAKED __declspec(naked)
1076
#define AS1(x) __asm x
1077
#define AS2(x, y) __asm x, y
1078
#define AddPrologue \
1083
__asm mov ecx, [esp+20] \
1084
__asm mov edx, [esp+24] \
1085
__asm mov ebx, [esp+28] \
1086
__asm mov esi, [esp+32]
1087
#define AddEpilogue \
1093
#define MulPrologue \
1098
__asm mov ecx, [esp+28] \
1099
__asm mov esi, [esp+24] \
1101
#define MulEpilogue \
1109
#define CRYPTOPP_NAKED
1110
#define AS1(x) #x ";"
1111
#define AS2(x, y) #x ", " #y ";"
1112
#define AddPrologue \
1113
__asm__ __volatile__ \
1115
"push %%ebx;" /* save this manually, in case of -fPIC */ \
1117
".intel_syntax noprefix;" \
1119
#define AddEpilogue \
1121
".att_syntax prefix;" \
1124
: "c" (C), "d" (A), "m" (B), "S" (N) \
1125
: "%edi", "memory", "cc" \
1127
#define MulPrologue \
1128
__asm__ __volatile__ \
1130
"push %%ebx;" /* save this manually, in case of -fPIC */ \
1133
".intel_syntax noprefix;"
1134
#define MulEpilogue \
1138
".att_syntax prefix;" \
1140
: "rm" (Z), "S" (X), "c" (Y) \
1141
: "%eax", "%edx", "%edi", "memory", "cc" \
1145
CRYPTOPP_NAKED int PentiumOptimized::Add(word *C, const word *A, const word *B, size_t N)
1149
// now: ebx = B, ecx = C, edx = A, esi = N
1150
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
1151
AS2( xor eax, eax) // clear eax
1153
AS2( sub eax, esi) // eax is a negative index from end of B
1154
AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
1156
AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag
1157
AS1( jz loopendAdd) // if no dwords then nothing to do
1160
AS2( mov esi,[edx]) // load lower word of A
1161
AS2( mov ebp,[edx+4]) // load higher word of A
1163
AS2( mov edi,[ebx+8*eax]) // load lower word of B
1164
AS2( lea edx,[edx+8]) // advance A and C
1166
AS2( adc esi,edi) // add lower words
1167
AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
1169
AS2( adc ebp,edi) // add higher words
1170
AS1( inc eax) // advance B
1172
AS2( mov [edx+ecx-8],esi) // store lower word result
1173
AS2( mov [edx+ecx-4],ebp) // store higher word result
1175
AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero
1178
AS2( adc eax, 0) // store carry into eax (return result register)
1183
CRYPTOPP_NAKED int PentiumOptimized::Subtract(word *C, const word *A, const word *B, size_t N)
1187
// now: ebx = B, ecx = C, edx = A, esi = N
1188
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
1189
AS2( xor eax, eax) // clear eax
1191
AS2( sub eax, esi) // eax is a negative index from end of B
1192
AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
1194
AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag
1195
AS1( jz loopendSub) // if no dwords then nothing to do
1198
AS2( mov esi,[edx]) // load lower word of A
1199
AS2( mov ebp,[edx+4]) // load higher word of A
1201
AS2( mov edi,[ebx+8*eax]) // load lower word of B
1202
AS2( lea edx,[edx+8]) // advance A and C
1204
AS2( sbb esi,edi) // subtract lower words
1205
AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
1207
AS2( sbb ebp,edi) // subtract higher words
1208
AS1( inc eax) // advance B
1210
AS2( mov [edx+ecx-8],esi) // store lower word result
1211
AS2( mov [edx+ecx-4],ebp) // store higher word result
1213
AS1( jnz loopstartSub) // loop until eax overflows and becomes zero
1216
AS2( adc eax, 0) // store carry into eax (return result register)
1221
// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
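// Scalar illustration of the comment above (a sketch, not part of the
// original source): the carry out of a word addition can be recovered from an
// ordinary compare instead of adc, and then folded in separately -- the shape
// used by the cmovc-based loops below.
#include <cstdint>

static uint32_t AddWordsNoAdcSketch(uint32_t a, uint32_t b, uint32_t carry_in, uint32_t *carry_out)
{
	uint32_t s = a + b;			// plain add, no flags consumed
	uint32_t c = (s < a);		// carry from a + b
	s += carry_in;				// fold in the incoming carry (0 or 1)
	c |= (s < carry_in);		// carry from adding carry_in
	*carry_out = c;
	return s;
}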
1223
CRYPTOPP_NAKED int P4Optimized::Add(word *C, const word *A, const word *B, size_t N)
1227
// now: ebx = B, ecx = C, edx = A, esi = N
1230
AS1( jz loopendAddP4) // if no dwords then nothing to do
1232
AS2( mov edi, [edx])
1233
AS2( mov ebp, [ebx])
1234
AS1( jmp carry1AddP4)
1236
AS1(loopstartAddP4:)
1237
AS2( mov edi, [edx+8])
1240
AS2( mov ebp, [ebx])
1242
AS1( jc carry1AddP4)
1248
AS2( mov [ecx], edi)
1249
AS2( mov edi, [edx+4])
1250
AS2( cmovc eax, ebp)
1251
AS2( mov ebp, [ebx+4])
1254
AS1( jc carry2AddP4)
1260
AS2( cmovc eax, ebp)
1261
AS2( mov [ecx+4], edi)
1263
AS1( jnz loopstartAddP4)
1270
CRYPTOPP_NAKED int P4Optimized::Subtract(word *C, const word *A, const word *B, size_t N)
1274
// now: ebx = B, ecx = C, edx = A, esi = N
1277
AS1( jz loopendSubP4) // if no dwords then nothing to do
1279
AS2( mov edi, [edx])
1280
AS2( mov ebp, [ebx])
1281
AS1( jmp carry1SubP4)
1283
AS1(loopstartSubP4:)
1284
AS2( mov edi, [edx+8])
1287
AS2( mov ebp, [ebx])
1289
AS1( jc carry1SubP4)
1295
AS2( mov [ecx], edi)
1296
AS2( mov edi, [edx+4])
1297
AS2( cmovc eax, ebp)
1298
AS2( mov ebp, [ebx+4])
1301
AS1( jc carry2SubP4)
1307
AS2( cmovc eax, ebp)
1308
AS2( mov [ecx+4], edi)
1310
AS1( jnz loopstartSubP4)
1317
// multiply assembly code originally contributed by Leonard Janke
1319
#define MulStartup \
1324
#define MulShiftCarry \
1329
#define MulAccumulateBottom(i,j) \
1330
AS2(mov eax, [ecx+4*j]) \
1331
AS2(imul eax, dword ptr [esi+4*i]) \
1334
#define MulAccumulate(i,j) \
1335
AS2(mov eax, [ecx+4*j]) \
1336
AS1(mul dword ptr [esi+4*i]) \
1341
#define MulStoreDigit(i) \
1343
AS2(mov edi, [esp]) \
1344
AS2(mov [edi+4*i], ebp)
1346
#define MulLastDiagonal(digits) \
1347
AS2(mov eax, [ecx+4*(digits-1)]) \
1348
AS1(mul dword ptr [esi+4*(digits-1)]) \
1351
AS2(mov edi, [esp]) \
1352
AS2(mov [edi+4*(2*digits-2)], ebp) \
1353
AS2(mov [edi+4*(2*digits-1)], edx)
1355
CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
1358
// now: [esp] = Z, esi = X, ecx = Y
1397
CRYPTOPP_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
1400
// now: [esp] = Z, esi = X, ecx = Y
1511
CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y)
1514
// now: [esp] = Z, esi = X, ecx = Y
1565
MulAccumulateBottom(7,0)
1566
MulAccumulateBottom(6,1)
1567
MulAccumulateBottom(5,2)
1568
MulAccumulateBottom(4,3)
1569
MulAccumulateBottom(3,4)
1570
MulAccumulateBottom(2,5)
1571
MulAccumulateBottom(1,6)
1572
MulAccumulateBottom(0,7)
1580
#else // not x86 - no processor specific code at this layer
1582
typedef Portable LowLevel;
1586
#ifdef SSE2_INTRINSICS_AVAILABLE
1589
#define CRYPTOPP_FASTCALL
1591
#define CRYPTOPP_FASTCALL __fastcall
static void CRYPTOPP_FASTCALL P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)
    __m128i a3210 = _mm_load_si128(A);
    __m128i b3210 = _mm_load_si128(B);

    __m128i z = _mm_setzero_si128();
    __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);

    __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
    __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
    __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
    __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
    __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
    C[1] = _mm_add_epi64(a1b0, a0b1);

    __m128i a31 = _mm_srli_epi64(a3210, 32);
    __m128i b31 = _mm_srli_epi64(b3210, 32);
    __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);

    __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
    __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
    __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
    __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
    __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
    sum = _mm_add_epi64(a1b1, a0b2);
    C[2] = _mm_add_epi64(sum, a2b0);

    __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
    __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
    __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
    __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
    __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
    __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
    __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
    __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
    __m128i sum1 = _mm_add_epi64(a3b0, a1b2);
    sum = _mm_add_epi64(a2b1, a0b3);
    C[3] = _mm_add_epi64(sum, sum1);

    __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
    __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
    __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
    __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
    sum = _mm_add_epi64(a2b2, a3b1);
    C[4] = _mm_add_epi64(sum, a1b3);

    __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
    __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
    __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
    __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
    __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
    C[5] = _mm_add_epi64(a3b2, a2b3);
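
// Illustrative sketch (not part of the original file): what the epi64 additions
// in P4_Mul accumulate.  Every 64-bit product a[i]*b[j] is split into its low
// and high 32-bit halves, and for each diagonal k = i + j the low halves and the
// high halves are summed in separate 64-bit lanes; the code above builds these
// split sums for diagonals 1 through 5 (the end products a0*b0 and a3*b3 need no
// summation).  Carry propagation across diagonals is left to the wrappers below.
// Names here are hypothetical.
static void DiagonalLaneSums_Sketch(dword loSum[7], dword hiSum[7], const word a[4], const word b[4])
{
    for (int k = 0; k < 7; k++)
    {
        loSum[k] = hiSum[k] = 0;
        for (int i = 0; i < 4; i++)
        {
            int j = k - i;
            if (j < 0 || j > 3)
                continue;
            dword p = (dword)a[i] * b[j];
            loSum[k] += (word)p;                        // low half of the product
            hiSum[k] += (word)(p >> (8*sizeof(word)));  // high half of the product
        }
    }
}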
void P4Optimized::Multiply4(word *C, const word *A, const word *B)
    // ...
    const word *w = (word *)temp;
    const __m64 *mw = (__m64 *)w;

    P4_Mul(temp, (__m128i *)A, (__m128i *)B);
    // ...

    __m64 w1 = _mm_cvtsi32_si64(w[1]);
    // ...
    __m64 w26 = _mm_cvtsi32_si64(w[26]);

    s1 = _mm_add_si64(w1, w4);
    C[1] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s2 = _mm_add_si64(w6, w8);
    s1 = _mm_add_si64(s1, s2);
    C[2] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s2 = _mm_add_si64(w10, w12);
    s1 = _mm_add_si64(s1, s2);
    C[3] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s2 = _mm_add_si64(w14, w16);
    s1 = _mm_add_si64(s1, s2);
    C[4] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s2 = _mm_add_si64(w18, w20);
    s1 = _mm_add_si64(s1, s2);
    C[5] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s2 = _mm_add_si64(w22, w26);
    s1 = _mm_add_si64(s1, s2);
    C[6] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    C[7] = _mm_cvtsi64_si32(s1) + w[27];
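
// Illustrative sketch (not part of the original file): the MMX sequence above is
// this carry-propagation pattern written with a plain 64-bit accumulator.  Each
// output word is the low word of a running sum of 64-bit column pieces taken
// from temp; the high word becomes the carry into the next output word.  The
// function name and the pieces[] layout are schematic assumptions only.
static void PropagateColumns_Sketch(word *C, const dword *pieces, size_t n)
{
    dword s = 0;
    for (size_t k = 0; k < n; k++)
    {
        s += pieces[k];             // add this word's column piece
        C[k] = (word)s;             // emit the low word
        s >>= 8*sizeof(word);       // high word carries into C[k+1]
    }
}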
void P4Optimized::Multiply8(word *C, const word *A, const word *B)
    // ...
    const word *w = (word *)temp;
    const __m64 *mw = (__m64 *)w;
    const word *x = (word *)temp+7*4;
    const __m64 *mx = (__m64 *)x;
    const word *y = (word *)temp+7*4*2;
    const __m64 *my = (__m64 *)y;
    const word *z = (word *)temp+7*4*3;
    const __m64 *mz = (__m64 *)z;

    P4_Mul(temp, (__m128i *)A, (__m128i *)B);

    P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);

    P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);

    P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
    // ...

    __m64 s1, s2, s3, s4;

    __m64 w1 = _mm_cvtsi32_si64(w[1]);
    // ...
    __m64 w26 = _mm_cvtsi32_si64(w[26]);
    __m64 w27 = _mm_cvtsi32_si64(w[27]);

    __m64 x0 = _mm_cvtsi32_si64(x[0]);
    __m64 x1 = _mm_cvtsi32_si64(x[1]);
    // ...
    __m64 x26 = _mm_cvtsi32_si64(x[26]);
    __m64 x27 = _mm_cvtsi32_si64(x[27]);

    __m64 y0 = _mm_cvtsi32_si64(y[0]);
    __m64 y1 = _mm_cvtsi32_si64(y[1]);
    // ...
    __m64 y26 = _mm_cvtsi32_si64(y[26]);
    __m64 y27 = _mm_cvtsi32_si64(y[27]);

    __m64 z0 = _mm_cvtsi32_si64(z[0]);
    __m64 z1 = _mm_cvtsi32_si64(z[1]);
    // ...
    __m64 z26 = _mm_cvtsi32_si64(z[26]);

    s1 = _mm_add_si64(w1, w4);
    C[1] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s2 = _mm_add_si64(w6, w8);
    s1 = _mm_add_si64(s1, s2);
    C[2] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s2 = _mm_add_si64(w10, w12);
    s1 = _mm_add_si64(s1, s2);
    C[3] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(x0, y0);
    s2 = _mm_add_si64(w14, w16);
    s1 = _mm_add_si64(s1, s3);
    s1 = _mm_add_si64(s1, s2);
    C[4] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(x1, y1);
    s4 = _mm_add_si64(x4, y4);
    s1 = _mm_add_si64(s1, w18);
    s3 = _mm_add_si64(s3, s4);
    s1 = _mm_add_si64(s1, w20);
    s1 = _mm_add_si64(s1, s3);
    C[5] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(x6, y6);
    s4 = _mm_add_si64(x8, y8);
    s1 = _mm_add_si64(s1, w22);
    s3 = _mm_add_si64(s3, s4);
    s1 = _mm_add_si64(s1, w26);
    s1 = _mm_add_si64(s1, s3);
    C[6] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(x10, y10);
    s4 = _mm_add_si64(x12, y12);
    s1 = _mm_add_si64(s1, w27);
    s3 = _mm_add_si64(s3, s4);
    s1 = _mm_add_si64(s1, s3);
    C[7] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(x14, y14);
    s4 = _mm_add_si64(x16, y16);
    s1 = _mm_add_si64(s1, z0);
    s3 = _mm_add_si64(s3, s4);
    s1 = _mm_add_si64(s1, s3);
    C[8] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(x18, y18);
    s4 = _mm_add_si64(x20, y20);
    s1 = _mm_add_si64(s1, z1);
    s3 = _mm_add_si64(s3, s4);
    s1 = _mm_add_si64(s1, z4);
    s1 = _mm_add_si64(s1, s3);
    C[9] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(x22, y22);
    s4 = _mm_add_si64(x26, y26);
    s1 = _mm_add_si64(s1, z6);
    s3 = _mm_add_si64(s3, s4);
    s1 = _mm_add_si64(s1, z8);
    s1 = _mm_add_si64(s1, s3);
    C[10] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(x27, y27);
    s1 = _mm_add_si64(s1, z10);
    s1 = _mm_add_si64(s1, z12);
    s1 = _mm_add_si64(s1, s3);
    C[11] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(z14, z16);
    s1 = _mm_add_si64(s1, s3);
    C[12] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(z18, z20);
    s1 = _mm_add_si64(s1, s3);
    C[13] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(z22, z26);
    s1 = _mm_add_si64(s1, s3);
    C[14] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    C[15] = z[27] + _mm_cvtsi64_si32(s1);
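
// Illustrative sketch (not part of the original file): Multiply8 above forms the
// four 4x4-word partial products A_lo*B_lo, A_hi*B_lo, A_lo*B_hi and A_hi*B_hi
// (the four P4_Mul calls) and adds them into the result at word offsets 0, 4, 4
// and 8 while propagating carries.  The same combination written against a
// generic 4-word multiplier; Mul4 is a placeholder, not a routine from this file.
static void Combine4x4Products_Sketch(word *C, const word *A, const word *B,
    void (*Mul4)(word *, const word *, const word *))
{
    word t0[8], t1[8], t2[8], t3[8];
    Mul4(t0, A,     B);         // A_lo * B_lo -> words 0..7
    Mul4(t1, A + 4, B);         // A_hi * B_lo -> words 4..11
    Mul4(t2, A,     B + 4);     // A_lo * B_hi -> words 4..11
    Mul4(t3, A + 4, B + 4);     // A_hi * B_hi -> words 8..15
    dword carry = 0;
    for (int i = 0; i < 16; i++)
    {
        dword s = carry;
        if (i < 8)
            s += t0[i];
        if (i >= 4 && i < 12)
            s += (dword)t1[i - 4] + t2[i - 4];
        if (i >= 8)
            s += t3[i - 8];
        C[i] = (word)s;
        carry = s >> (8*sizeof(word));
    }
}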
void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
    // ...
    const word *w = (word *)temp;
    const __m64 *mw = (__m64 *)w;
    const word *x = (word *)temp+7*4;
    const __m64 *mx = (__m64 *)x;
    const word *y = (word *)temp+7*4*2;
    const __m64 *my = (__m64 *)y;

    P4_Mul(temp, (__m128i *)A, (__m128i *)B);

    P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);

    P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
    // ...

    __m64 s1, s2, s3, s4;

    __m64 w1 = _mm_cvtsi32_si64(w[1]);
    // ...
    __m64 w26 = _mm_cvtsi32_si64(w[26]);

    __m64 x0 = _mm_cvtsi32_si64(x[0]);
    __m64 x1 = _mm_cvtsi32_si64(x[1]);
    // ...

    __m64 y0 = _mm_cvtsi32_si64(y[0]);
    __m64 y1 = _mm_cvtsi32_si64(y[1]);
    // ...

    s1 = _mm_add_si64(w1, w4);
    C[1] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s2 = _mm_add_si64(w6, w8);
    s1 = _mm_add_si64(s1, s2);
    C[2] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s2 = _mm_add_si64(w10, w12);
    s1 = _mm_add_si64(s1, s2);
    C[3] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(x0, y0);
    s2 = _mm_add_si64(w14, w16);
    s1 = _mm_add_si64(s1, s3);
    s1 = _mm_add_si64(s1, s2);
    C[4] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(x1, y1);
    s4 = _mm_add_si64(x4, y4);
    s1 = _mm_add_si64(s1, w18);
    s3 = _mm_add_si64(s3, s4);
    s1 = _mm_add_si64(s1, w20);
    s1 = _mm_add_si64(s1, s3);
    C[5] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    s3 = _mm_add_si64(x6, y6);
    s4 = _mm_add_si64(x8, y8);
    s1 = _mm_add_si64(s1, w22);
    s3 = _mm_add_si64(s3, s4);
    s1 = _mm_add_si64(s1, w26);
    s1 = _mm_add_si64(s1, s3);
    C[6] = _mm_cvtsi64_si32(s1);
    s1 = _mm_srli_si64(s1, 32);

    C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
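
// Illustrative sketch (not part of the original file): Multiply8Bottom computes
// only the low 8 words of the 16-word product, i.e. the product reduced modulo
// W^8 where W is the word radix.  A_hi*B_hi is never needed and only the low
// halves of the two cross products matter, which is why just three P4_Mul calls
// appear above and C[7] picks up the few remaining x/y pieces directly.  A
// generic version of the same idea, with Mul4 as a placeholder for any full
// 4-word multiplier:
static void MultiplyBottom8_Sketch(word *C, const word *A, const word *B,
    void (*Mul4)(word *, const word *, const word *))
{
    word t0[8], t1[8], t2[8];
    Mul4(t0, A,     B);         // full A_lo * B_lo
    Mul4(t1, A + 4, B);         // A_hi * B_lo; only its low 4 words are used
    Mul4(t2, A,     B + 4);     // A_lo * B_hi; only its low 4 words are used
    dword carry = 0;
    for (int i = 0; i < 8; i++)
    {
        dword s = carry + t0[i];
        if (i >= 4)
            s += (dword)t1[i - 4] + t2[i - 4];
        C[i] = (word)s;
        carry = s >> (8*sizeof(word));   // carries past word 7 are discarded
    }
}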
#endif // #ifdef SSE2_INTRINSICS_AVAILABLE

// dispatch to the CPU-specific routines when SSE2 support is compiled in,
// otherwise to the portable baseline versions
inline word Add(word *C, const word *A, const word *B, size_t N)
{
#if CRYPTOPP_INTEGER_SSE2
    return s_pAdd(N, C, A, B);
#else
    return Baseline_Add(N, C, A, B);
#endif
}

inline word Subtract(word *C, const word *A, const word *B, size_t N)
{
#if CRYPTOPP_INTEGER_SSE2
    return s_pSub(N, C, A, B);
#else
    return Baseline_Sub(N, C, A, B);
#endif
}
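
// Illustrative usage sketch (not part of the original file): both wrappers
// return the outgoing carry or borrow as a word, so a caller can test for
// overflow directly.  The fixed 4-word size here is hypothetical.
static inline bool Example_AddWithoutCarry(word *C, const word *A, const word *B)
{
    return Add(C, A, B, 4) == 0;    // true if the 4-word sum produced no carry out
}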
// ********************************************************

#define A1 (A+N2)