458
440
// ********************************************************
463
static int Add(word *C, const word *A, const word *B, size_t N);
464
static int Subtract(word *C, const word *A, const word *B, size_t N);
466
static inline void Multiply2(word *C, const word *A, const word *B);
467
static inline word Multiply2Add(word *C, const word *A, const word *B);
468
static void Multiply4(word *C, const word *A, const word *B);
469
static void Multiply8(word *C, const word *A, const word *B);
470
static inline unsigned int MultiplyRecursionLimit() {return 8;}
472
static inline void Multiply2Bottom(word *C, const word *A, const word *B);
473
static void Multiply4Bottom(word *C, const word *A, const word *B);
474
static void Multiply8Bottom(word *C, const word *A, const word *B);
475
static inline unsigned int MultiplyBottomRecursionLimit() {return 8;}
477
static void Square2(word *R, const word *A);
478
static void Square4(word *R, const word *A);
479
static void Square8(word *R, const word *A) {assert(false);}
480
static inline unsigned int SquareRecursionLimit() {return 4;}
483
int Portable::Add(word *C, const word *A, const word *B, size_t N)
488
for (unsigned int i = 0; i < N; i+=2)
490
u = DWord(A[i]) + B[i] + u.GetHighHalf();
491
C[i] = u.GetLowHalf();
492
u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf();
493
C[i+1] = u.GetLowHalf();
495
return int(u.GetHighHalf());
498
int Portable::Subtract(word *C, const word *A, const word *B, size_t N)
503
for (unsigned int i = 0; i < N; i+=2)
505
u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow();
506
C[i] = u.GetLowHalf();
507
u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow();
508
C[i+1] = u.GetLowHalf();
510
return int(0-u.GetHighHalf());
513
void Portable::Multiply2(word *C, const word *A, const word *B)
523
d = (dword)(A1-A0)*(B0-B1);
528
d = (dword)s*(word)(B0-B1);
534
d = (word)(A1-A0)*(dword)s;
539
d = (dword)(A0-A1)*(B1-B0);
542
// this segment is the branchless equivalent of above
543
word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
544
unsigned int ai = A[1] < A[0];
545
unsigned int bi = B[0] < B[1];
546
unsigned int di = ai & bi;
547
DWord d = DWord::Multiply(D[di], D[di+2]);
549
unsigned int si = ai + !bi;
552
DWord A0B0 = DWord::Multiply(A[0], B[0]);
553
C[0] = A0B0.GetLowHalf();
555
DWord A1B1 = DWord::Multiply(A[1], B[1]);
556
DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf();
557
C[1] = t.GetLowHalf();
559
t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s;
560
C[2] = t.GetLowHalf();
561
C[3] = t.GetHighHalf();
564
inline void Portable::Multiply2Bottom(word *C, const word *A, const word *B)
566
DWord t = DWord::Multiply(A[0], B[0]);
567
C[0] = t.GetLowHalf();
568
C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0];
571
word Portable::Multiply2Add(word *C, const word *A, const word *B)
573
word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
574
unsigned int ai = A[1] < A[0];
575
unsigned int bi = B[0] < B[1];
576
unsigned int di = ai & bi;
577
DWord d = DWord::Multiply(D[di], D[di+2]);
579
unsigned int si = ai + !bi;
582
DWord A0B0 = DWord::Multiply(A[0], B[0]);
583
DWord t = A0B0 + C[0];
584
C[0] = t.GetLowHalf();
586
DWord A1B1 = DWord::Multiply(A[1], B[1]);
587
t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf() + C[1];
588
C[1] = t.GetLowHalf();
590
t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2];
591
C[2] = t.GetLowHalf();
593
t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3];
594
C[3] = t.GetLowHalf();
595
return t.GetHighHalf();
598
#define MulAcc(x, y) \
599
p = DWord::MultiplyAndAdd(A[x], B[y], c); \
600
c = p.GetLowHalf(); \
601
p = (DWord) d + p.GetHighHalf(); \
602
d = p.GetLowHalf(); \
603
e += p.GetHighHalf();
605
#define SaveMulAcc(s, x, y) \
607
p = DWord::MultiplyAndAdd(A[x], B[y], d); \
608
c = p.GetLowHalf(); \
609
p = (DWord) e + p.GetHighHalf(); \
610
d = p.GetLowHalf(); \
613
#define SquAcc(x, y) \
614
q = DWord::Multiply(A[x], A[y]); \
616
c = p.GetLowHalf(); \
617
p = (DWord) d + p.GetHighHalf(); \
618
d = p.GetLowHalf(); \
619
e += p.GetHighHalf(); \
621
c = p.GetLowHalf(); \
622
p = (DWord) d + p.GetHighHalf(); \
623
d = p.GetLowHalf(); \
624
e += p.GetHighHalf();
626
#define SaveSquAcc(s, x, y) \
628
q = DWord::Multiply(A[x], A[y]); \
630
c = p.GetLowHalf(); \
631
p = (DWord) e + p.GetHighHalf(); \
632
d = p.GetLowHalf(); \
633
e = p.GetHighHalf(); \
635
c = p.GetLowHalf(); \
636
p = (DWord) d + p.GetHighHalf(); \
637
d = p.GetLowHalf(); \
638
e += p.GetHighHalf();
640
void Portable::Multiply4(word *R, const word *A, const word *B)
645
p = DWord::Multiply(A[0], B[0]);
646
R[0] = p.GetLowHalf();
670
p = DWord::MultiplyAndAdd(A[3], B[3], d);
671
R[6] = p.GetLowHalf();
672
R[7] = e + p.GetHighHalf();
675
void Portable::Square2(word *R, const word *A)
680
p = DWord::Multiply(A[0], A[0]);
681
R[0] = p.GetLowHalf();
688
p = DWord::MultiplyAndAdd(A[1], A[1], d);
689
R[2] = p.GetLowHalf();
690
R[3] = e + p.GetHighHalf();
693
void Portable::Square4(word *R, const word *A)
696
// VC60 workaround: MSVC 6.0 has an optimization bug that makes
697
// (dword)A*B where either A or B has been cast to a dword before
698
// very expensive. Revisit this function when this
706
p = DWord::Multiply(A[0], A[0]);
707
R[0] = p.GetLowHalf();
725
p = DWord::MultiplyAndAdd(A[3], A[3], d);
726
R[6] = p.GetLowHalf();
727
R[7] = e + p.GetHighHalf();
731
void Portable::Multiply8(word *R, const word *A, const word *B)
736
p = DWord::Multiply(A[0], B[0]);
737
R[0] = p.GetLowHalf();
804
SaveMulAcc(10, 4, 7);
809
SaveMulAcc(11, 5, 7);
813
SaveMulAcc(12, 6, 7);
817
p = DWord::MultiplyAndAdd(A[7], B[7], d);
818
R[14] = p.GetLowHalf();
819
R[15] = e + p.GetHighHalf();
822
void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
827
p = DWord::Multiply(A[0], B[0]);
828
R[0] = p.GetLowHalf();
840
R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
843
void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
848
p = DWord::Multiply(A[0], B[0]);
849
R[0] = p.GetLowHalf();
887
R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
888
A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
896
#ifdef CRYPTOPP_X86ASM_AVAILABLE
898
// ************** x86 feature detection ***************
900
static bool s_sse2Enabled = true;
902
static void CpuId(word32 input, word32 *output)
907
// save ebx in case -fPIC is being used
908
"push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
909
: "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
926
#ifdef SSE2_INTRINSICS_AVAILABLE
928
static jmp_buf s_env;
929
static void SigIllHandler(int)
935
static bool HasSSE2()
942
if ((cpuid[3] & (1 << 26)) == 0)
948
__asm xorpd xmm0, xmm0 // executing SSE2 instruction
956
typedef void (*SigHandler)(int);
958
SigHandler oldHandler = signal(SIGILL, SigIllHandler);
959
if (oldHandler == SIG_ERR)
966
__asm __volatile ("xorps %xmm0, %xmm0");
968
signal(SIGILL, oldHandler);
979
std::swap(cpuid[2], cpuid[3]);
980
if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
984
return ((cpuid[0] >> 8) & 0xf) == 0xf;
987
// ************** Pentium/P4 optimizations ***************
989
class PentiumOptimized : public Portable
992
static int Add(word *C, const word *A, const word *B, size_t N);
993
static int Subtract(word *C, const word *A, const word *B, size_t N);
994
static void Multiply4(word *C, const word *A, const word *B);
995
static void Multiply8(word *C, const word *A, const word *B);
996
static void Multiply8Bottom(word *C, const word *A, const word *B);
1002
static int Add(word *C, const word *A, const word *B, size_t N);
1003
static int Subtract(word *C, const word *A, const word *B, size_t N);
1004
#ifdef SSE2_INTRINSICS_AVAILABLE
1005
static void Multiply4(word *C, const word *A, const word *B);
1006
static void Multiply8(word *C, const word *A, const word *B);
1007
static void Multiply8Bottom(word *C, const word *A, const word *B);
1011
typedef int (* PAddSub)(word *C, const word *A, const word *B, size_t N);
442
// use some tricks to share assembly code between MSVC and GCC
443
#if defined(__GNUC__)
444
#define AddPrologue \
446
__asm__ __volatile__ \
448
".intel_syntax noprefix;"
449
#define AddEpilogue \
450
".att_syntax prefix;" \
452
: "d" (C), "a" (A), "D" (B), "c" (N) \
453
: "%esi", "memory", "cc" \
456
#define MulPrologue \
457
__asm__ __volatile__ \
459
".intel_syntax noprefix;" \
462
#define MulEpilogue \
464
".att_syntax prefix;" \
466
: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \
467
: "%esi", "memory", "cc" \
469
#define SquPrologue MulPrologue
470
#define SquEpilogue \
472
".att_syntax prefix;" \
474
: "d" (s_maskLow16), "c" (C), "a" (A) \
475
: "%esi", "%edi", "memory", "cc" \
477
#define TopPrologue MulPrologue
478
#define TopEpilogue \
480
".att_syntax prefix;" \
482
: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \
486
#define AddPrologue \
489
__asm mov eax, [esp+12] \
490
__asm mov edi, [esp+16]
491
#define AddEpilogue \
496
#define SaveEBX __asm push ebx
497
#define RestoreEBX __asm pop ebx
502
#define SquPrologue \
506
AS2( lea ebx, s_maskLow16)
507
#define MulPrologue \
512
AS2( lea ebx, s_maskLow16)
513
#define TopPrologue \
519
AS2( lea ebx, s_maskLow16)
520
#define SquEpilogue RestoreEBX
521
#define MulEpilogue RestoreEBX
522
#define TopEpilogue RestoreEBX
525
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
527
int Baseline_Add(size_t N, word *C, const word *A, const word *B);
528
int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
530
#elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__) && defined(CRYPTOPP_WORD128_AVAILABLE)
531
int Baseline_Add(size_t N, word *C, const word *A, const word *B)
539
AS2( mov %0,[%3+8*%1])
540
AS2( add %0,[%4+8*%1])
541
AS2( mov [%2+8*%1],%0)
543
AS2( mov %0,[%3+8*%1+8])
544
AS2( adc %0,[%4+8*%1+8])
545
AS2( mov [%2+8*%1+8],%0)
548
AS2( mov %0,[%3+8*%1])
549
AS2( adc %0,[%4+8*%1])
550
AS2( mov [%2+8*%1],%0)
556
: "=&r" (result), "+c" (N)
557
: "r" (C+N), "r" (A+N), "r" (B+N)
563
int Baseline_Sub(size_t N, word *C, const word *A, const word *B)
571
AS2( mov %0,[%3+8*%1])
572
AS2( sub %0,[%4+8*%1])
573
AS2( mov [%2+8*%1],%0)
575
AS2( mov %0,[%3+8*%1+8])
576
AS2( sbb %0,[%4+8*%1+8])
577
AS2( mov [%2+8*%1+8],%0)
580
AS2( mov %0,[%3+8*%1])
581
AS2( sbb %0,[%4+8*%1])
582
AS2( mov [%2+8*%1],%0)
588
: "=&r" (result), "+c" (N)
589
: "r" (C+N), "r" (A+N), "r" (B+N)
594
#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
595
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
599
// now: eax = A, edi = B, edx = C, ecx = N
600
AS2( lea eax, [eax+4*ecx])
601
AS2( lea edi, [edi+4*ecx])
602
AS2( lea edx, [edx+4*ecx])
604
AS1( neg ecx) // ecx is negative index
605
AS2( test ecx, 2) // this clears carry flag
611
ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero
612
AS2( mov esi,[eax+4*ecx])
613
AS2( adc esi,[edi+4*ecx])
614
AS2( mov [edx+4*ecx],esi)
615
AS2( mov esi,[eax+4*ecx+4])
616
AS2( adc esi,[edi+4*ecx+4])
617
AS2( mov [edx+4*ecx+4],esi)
619
AS2( mov esi,[eax+4*ecx+8])
620
AS2( adc esi,[edi+4*ecx+8])
621
AS2( mov [edx+4*ecx+8],esi)
622
AS2( mov esi,[eax+4*ecx+12])
623
AS2( adc esi,[edi+4*ecx+12])
624
AS2( mov [edx+4*ecx+12],esi)
626
AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2
631
AS1( setc al) // store carry into eax (return result register)
636
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
640
// now: eax = A, edi = B, edx = C, ecx = N
641
AS2( lea eax, [eax+4*ecx])
642
AS2( lea edi, [edi+4*ecx])
643
AS2( lea edx, [edx+4*ecx])
645
AS1( neg ecx) // ecx is negative index
646
AS2( test ecx, 2) // this clears carry flag
652
ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero
653
AS2( mov esi,[eax+4*ecx])
654
AS2( sbb esi,[edi+4*ecx])
655
AS2( mov [edx+4*ecx],esi)
656
AS2( mov esi,[eax+4*ecx+4])
657
AS2( sbb esi,[edi+4*ecx+4])
658
AS2( mov [edx+4*ecx+4],esi)
660
AS2( mov esi,[eax+4*ecx+8])
661
AS2( sbb esi,[edi+4*ecx+8])
662
AS2( mov [edx+4*ecx+8],esi)
663
AS2( mov esi,[eax+4*ecx+12])
664
AS2( sbb esi,[edi+4*ecx+12])
665
AS2( mov [edx+4*ecx+12],esi)
667
AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2
672
AS1( setc al) // store carry into eax (return result register)
677
#if CRYPTOPP_INTEGER_SSE2
678
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
682
// now: eax = A, edi = B, edx = C, ecx = N
683
AS2( lea eax, [eax+4*ecx])
684
AS2( lea edi, [edi+4*ecx])
685
AS2( lea edx, [edx+4*ecx])
687
AS1( neg ecx) // ecx is negative index
690
AS2( test ecx, 2) // this clears carry flag
696
AS2( movd mm0, DWORD PTR [eax+4*ecx])
697
AS2( movd mm1, DWORD PTR [edi+4*ecx])
700
AS2( movd DWORD PTR [edx+4*ecx], mm2)
703
AS2( movd mm0, DWORD PTR [eax+4*ecx+4])
704
AS2( movd mm1, DWORD PTR [edi+4*ecx+4])
707
AS2( movd DWORD PTR [edx+4*ecx+4], mm2)
711
AS2( movd mm0, DWORD PTR [eax+4*ecx+8])
712
AS2( movd mm1, DWORD PTR [edi+4*ecx+8])
715
AS2( movd DWORD PTR [edx+4*ecx+8], mm2)
718
AS2( movd mm0, DWORD PTR [eax+4*ecx+12])
719
AS2( movd mm1, DWORD PTR [edi+4*ecx+12])
722
AS2( movd DWORD PTR [edx+4*ecx+12], mm2)
734
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
738
// now: eax = A, edi = B, edx = C, ecx = N
739
AS2( lea eax, [eax+4*ecx])
740
AS2( lea edi, [edi+4*ecx])
741
AS2( lea edx, [edx+4*ecx])
743
AS1( neg ecx) // ecx is negative index
746
AS2( test ecx, 2) // this clears carry flag
752
AS2( movd mm0, DWORD PTR [eax+4*ecx])
753
AS2( movd mm1, DWORD PTR [edi+4*ecx])
756
AS2( movd DWORD PTR [edx+4*ecx], mm0)
759
AS2( movd mm2, DWORD PTR [eax+4*ecx+4])
760
AS2( movd mm1, DWORD PTR [edi+4*ecx+4])
763
AS2( movd DWORD PTR [edx+4*ecx+4], mm2)
767
AS2( movd mm0, DWORD PTR [eax+4*ecx+8])
768
AS2( movd mm1, DWORD PTR [edi+4*ecx+8])
771
AS2( movd DWORD PTR [edx+4*ecx+8], mm0)
774
AS2( movd mm2, DWORD PTR [eax+4*ecx+12])
775
AS2( movd mm1, DWORD PTR [edi+4*ecx+12])
778
AS2( movd DWORD PTR [edx+4*ecx+12], mm2)
790
#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
792
int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
798
for (size_t i=0; i<N; i+=2)
800
AddWithCarry(u, A[i], B[i]);
802
AddWithCarry(u, A[i+1], B[i+1]);
805
return int(GetCarry(u));
808
int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
814
for (size_t i=0; i<N; i+=2)
816
SubtractWithBorrow(u, A[i], B[i]);
818
SubtractWithBorrow(u, A[i+1], B[i+1]);
821
return int(GetBorrow(u));
825
static word LinearMultiply(word *C, const word *A, word B, size_t N)
828
for(unsigned i=0; i<N; i++)
831
MultiplyWords(p, A[i], B);
832
Acc2WordsBy1(p, carry);
839
#ifndef CRYPTOPP_DOXYGEN_PROCESSING
843
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
848
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
849
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
850
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
851
Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
852
Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
857
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
858
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
859
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
860
Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
861
Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
862
Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
863
Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
864
Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
865
Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
866
Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
867
Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
868
Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
869
Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
874
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
875
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
876
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
877
Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
878
Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
879
Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
880
Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
881
Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
882
Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
883
Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
884
Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
885
Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
886
Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
887
Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
888
Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
889
Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
890
Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
891
Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
892
Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
893
Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
894
Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
895
Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
896
Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
897
Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
898
Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
899
Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
900
Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
901
Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
902
Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
911
Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
912
Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
913
Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \
914
Squ_SaveAcc(4, 2, 3) Squ_NonDiag \
919
Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
920
Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
921
Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
922
Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
923
Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
924
Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
925
Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
926
Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
927
Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
928
Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \
929
Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \
930
Squ_SaveAcc(12, 6, 7) Squ_NonDiag \
935
Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
936
Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
937
Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
938
Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
939
Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
940
Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
941
Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
942
Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
943
Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
944
Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \
945
Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \
946
Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \
947
Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \
948
Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \
949
Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \
950
Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \
951
Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \
952
Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \
953
Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \
954
Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \
955
Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \
956
Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \
957
Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \
958
Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \
959
Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \
960
Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \
961
Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \
962
Squ_SaveAcc(28, 14, 15) Squ_NonDiag \
967
Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \
972
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
973
Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2) \
974
Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0) \
979
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
980
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
981
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
982
Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
983
Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
984
Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
985
Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \
990
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
991
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
992
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
993
Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
994
Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
995
Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
996
Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
997
Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
998
Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
999
Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
1000
Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
1001
Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
1002
Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
1003
Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
1004
Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
1010
#define Mul_Begin(n) \
1014
MultiplyWords(p, A[0], B[0]) \
1015
AssignWord(c, LowWord(p)) \
1016
AssignWord(d, HighWord(p))
1018
#define Mul_Acc(i, j) \
1019
MultiplyWords(p, A[i], B[j]) \
1020
Acc2WordsBy1(c, LowWord(p)) \
1021
Acc2WordsBy1(d, HighWord(p))
1023
#define Mul_SaveAcc(k, i, j) \
1024
R[k] = LowWord(c); \
1025
Add2WordsBy1(c, d, HighWord(c)) \
1026
MultiplyWords(p, A[i], B[j]) \
1027
AssignWord(d, HighWord(p)) \
1028
Acc2WordsBy1(c, LowWord(p))
1030
#define Mul_End(n) \
1031
R[2*n-3] = LowWord(c); \
1032
Acc2WordsBy1(d, HighWord(c)) \
1033
MultiplyWords(p, A[n-1], B[n-1])\
1034
Acc2WordsBy2(d, p) \
1035
R[2*n-2] = LowWord(d); \
1036
R[2*n-1] = HighWord(d);
1038
#define Bot_SaveAcc(k, i, j) \
1039
R[k] = LowWord(c); \
1040
word e = LowWord(d) + HighWord(c); \
1043
#define Bot_Acc(i, j) \
1046
#define Bot_End(n) \
1049
#define Mul_Begin(n) \
1053
MultiplyWords(p, A[0], B[0]) \
1055
AssignWord(d, HighWord(p))
1057
#define Mul_Acc(i, j) \
1058
MulAcc(c, d, A[i], B[j])
1060
#define Mul_SaveAcc(k, i, j) \
1063
AssignWord(d, HighWord(d)) \
1064
MulAcc(c, d, A[i], B[j])
1066
#define Mul_End(k, i) \
1068
MultiplyWords(p, A[i], B[i]) \
1069
Acc2WordsBy2(p, d) \
1070
R[k+1] = LowWord(p); \
1071
R[k+2] = HighWord(p);
1073
#define Bot_SaveAcc(k, i, j) \
1078
#define Bot_Acc(i, j) \
1081
#define Bot_End(n) \
1085
#define Squ_Begin(n) \
1090
MultiplyWords(p, A[0], A[0]) \
1091
R[0] = LowWord(p); \
1092
AssignWord(e, HighWord(p)) \
1093
MultiplyWords(p, A[0], A[1]) \
1095
AssignWord(d, HighWord(p)) \
1098
#define Squ_NonDiag \
1101
#define Squ_SaveAcc(k, i, j) \
1102
Acc3WordsBy2(c, d, e) \
1104
MultiplyWords(p, A[i], A[j]) \
1106
AssignWord(d, HighWord(p)) \
1108
#define Squ_Acc(i, j) \
1109
MulAcc(c, d, A[i], A[j])
1111
#define Squ_Diag(i) \
1113
MulAcc(c, d, A[i], A[i])
1115
#define Squ_End(n) \
1116
Acc3WordsBy2(c, d, e) \
1118
MultiplyWords(p, A[n-1], A[n-1])\
1119
Acc2WordsBy2(p, e) \
1120
R[2*n-2] = LowWord(p); \
1121
R[2*n-1] = HighWord(p);
1123
void Baseline_Multiply2(word *R, const word *A, const word *B)
1128
void Baseline_Multiply4(word *R, const word *A, const word *B)
1133
void Baseline_Multiply8(word *R, const word *A, const word *B)
1138
void Baseline_Square2(word *R, const word *A)
1143
void Baseline_Square4(word *R, const word *A)
1148
void Baseline_Square8(word *R, const word *A)
1153
void Baseline_MultiplyBottom2(word *R, const word *A, const word *B)
1158
void Baseline_MultiplyBottom4(word *R, const word *A, const word *B)
1163
void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)
1168
#define Top_Begin(n) \
1172
MultiplyWords(p, A[0], B[n-2]);\
1173
AssignWord(d, HighWord(p));
1175
#define Top_Acc(i, j) \
1176
MultiplyWords(p, A[i], B[j]);\
1177
Acc2WordsBy1(d, HighWord(p));
1179
#define Top_SaveAcc0(i, j) \
1181
AssignWord(d, HighWord(d)) \
1182
MulAcc(c, d, A[i], B[j])
1184
#define Top_SaveAcc1(i, j) \
1186
Acc2WordsBy1(d, c); \
1188
AssignWord(d, HighWord(d)) \
1189
MulAcc(c, d, A[i], B[j])
1191
void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)
1194
Baseline_Multiply2(T, A, B);
1199
void Baseline_MultiplyTop4(word *R, const word *A, const word *B, word L)
1202
Top_Acc(1, 1) Top_Acc(2, 0) \
1203
Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
1204
Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
1205
Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
1209
void Baseline_MultiplyTop8(word *R, const word *A, const word *B, word L)
1212
Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
1213
Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1214
Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
1215
Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
1216
Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
1217
Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
1218
Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
1219
Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
1223
#if !CRYPTOPP_INTEGER_SSE2 // save memory by not compiling these functions when SSE2 is available
1224
void Baseline_Multiply16(word *R, const word *A, const word *B)
1229
void Baseline_Square16(word *R, const word *A)
1234
void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
1239
void Baseline_MultiplyTop16(word *R, const word *A, const word *B, word L)
1242
Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
1243
Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
1244
Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
1245
Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
1246
Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
1247
Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
1248
Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
1249
Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
1250
Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
1251
Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
1252
Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
1253
Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
1254
Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
1255
Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
1256
Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
1257
Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
1262
// ********************************************************
1264
#if CRYPTOPP_INTEGER_SSE2
1266
// Four copies of the low-16-bit mask 0x0000ffff, 16-byte aligned so the SSE2
// code below can load it with movdqa (the macros read it via [ebx]) and use
// pand to split each 32-bit lane into low/high 16-bit halves.
CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
1284
// SSE2_FinalSave(k): merge the pending high-half accumulator (xmm5, shifted
// into position by 16 bits) into xmm4 and store the resulting 8 bytes at
// [ecx+8*(k)] -- the final output words of a column pass (ecx holds the
// result pointer).
// (Fix: removed stray line-number lines interleaved into the macro body.)
#define SSE2_FinalSave(k) \
	AS2( psllq xmm5, 16) \
	AS2( paddq xmm4, xmm5) \
	AS2( movq QWORD PTR [ecx+8*(k)], xmm4)
1289
// SSE2_SaveShift(k): fold the lane-split partial sums held in xmm6/xmm7 into
// the running accumulators xmm4/xmm5, emit the finished output bytes for this
// column (a dword at [ecx+8*(k)] plus a qword at [ecx+8*(k)+2]), and shift
// the leftover carry back into xmm4 ready for the next column.
// NOTE(review): exact register roles are inferred from the instruction
// sequence visible here and in SSE2_FirstMultiply -- confirm against the
// surrounding column macros before relying on them.
// (Fix: removed stray line-number lines interleaved into the macro body.)
#define SSE2_SaveShift(k) \
	AS2( movq xmm0, xmm6) \
	AS2( punpckhqdq xmm6, xmm0) \
	AS2( movq xmm1, xmm7) \
	AS2( punpckhqdq xmm7, xmm1) \
	AS2( paddd xmm6, xmm0) \
	AS2( pslldq xmm6, 4) \
	AS2( paddd xmm7, xmm1) \
	AS2( paddd xmm4, xmm6) \
	AS2( pslldq xmm7, 4) \
	AS2( movq xmm6, xmm4) \
	AS2( paddd xmm5, xmm7) \
	AS2( movq xmm7, xmm5) \
	AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \
	AS2( psrlq xmm6, 16) \
	AS2( paddq xmm6, xmm7) \
	AS2( punpckhqdq xmm4, xmm0) \
	AS2( punpckhqdq xmm5, xmm0) \
	AS2( movq QWORD PTR [ecx+8*(k)+2], xmm6) \
	AS2( psrlq xmm6, 3*16) \
	AS2( paddd xmm4, xmm6)
1311
// Squ_SSE2_SaveShift(k): squaring variant of SSE2_SaveShift.  Folds the
// partial sums in xmm6/xmm7 into xmm4/xmm5, stores the finished bytes for
// this column at [ecx+8*(k)] and [ecx+8*(k)+2], and -- unlike the multiply
// variant -- spills the carry words to the scratch area at [esp+4] and
// [esp+12] so the diagonal-doubling code in the Squ_Column macros can pick
// them up.
// (Fix: removed stray line-number lines interleaved into the macro body.)
#define Squ_SSE2_SaveShift(k) \
	AS2( movq xmm0, xmm6) \
	AS2( punpckhqdq xmm6, xmm0) \
	AS2( movq xmm1, xmm7) \
	AS2( punpckhqdq xmm7, xmm1) \
	AS2( paddd xmm6, xmm0) \
	AS2( pslldq xmm6, 4) \
	AS2( paddd xmm7, xmm1) \
	AS2( paddd xmm4, xmm6) \
	AS2( pslldq xmm7, 4) \
	AS2( movhlps xmm6, xmm4) \
	AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \
	AS2( paddd xmm5, xmm7) \
	AS2( movhps QWORD PTR [esp+12], xmm5)\
	AS2( psrlq xmm4, 16) \
	AS2( paddq xmm4, xmm5) \
	AS2( movq QWORD PTR [ecx+8*(k)+2], xmm4) \
	AS2( psrlq xmm4, 3*16) \
	AS2( paddd xmm4, xmm6) \
	AS2( movq QWORD PTR [esp+4], xmm4)
1332
// SSE2_FirstMultiply(i): start a new column.  Multiplies the operand block at
// [esi+(i)*16] against [edi-(i)*16] and against [edx-(i)*16] with pmuludq,
// then uses the 16-bit mask at [ebx] (s_maskLow16) to split each 32-bit
// product lane into low halves (xmm4/xmm6) and high halves (xmm5/xmm7),
// initializing the four accumulators for the Acc macros that follow.
// (Fix: removed stray line-number lines interleaved into the macro body.)
#define SSE2_FirstMultiply(i) \
	AS2( movdqa xmm7, [esi+(i)*16])\
	AS2( movdqa xmm5, [edi-(i)*16])\
	AS2( pmuludq xmm5, xmm7) \
	AS2( movdqa xmm4, [ebx])\
	AS2( movdqa xmm6, xmm4) \
	AS2( pand xmm4, xmm5) \
	AS2( psrld xmm5, 16) \
	AS2( pmuludq xmm7, [edx-(i)*16])\
	AS2( pand xmm6, xmm7) \
	AS2( psrld xmm7, 16)
1344
#define Squ_Begin(n) \
1347
AS2( and esp, 0xfffffff0)\
1348
AS2( lea edi, [esp-32*n])\
1349
AS2( sub esp, 32*n+16)\
1351
AS2( mov esi, edi) \
1352
AS2( xor edx, edx) \
1354
ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
1355
ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
1356
AS2( movdqa [edi+2*edx], xmm0) \
1357
AS2( psrlq xmm0, 32) \
1358
AS2( movdqa [edi+2*edx+16], xmm0) \
1359
AS2( movdqa [edi+16*n+2*edx], xmm1) \
1360
AS2( psrlq xmm1, 32) \
1361
AS2( movdqa [edi+16*n+2*edx+16], xmm1) \
1363
AS2( cmp edx, 8*(n)) \
1365
AS2( lea edx, [edi+16*n])\
1366
SSE2_FirstMultiply(0) \
1368
#define Squ_Acc(i) \
1370
AS2( movdqa xmm1, [esi+(i)*16]) \
1371
AS2( movdqa xmm0, [edi-(i)*16]) \
1372
AS2( movdqa xmm2, [ebx]) \
1373
AS2( pmuludq xmm0, xmm1) \
1374
AS2( pmuludq xmm1, [edx-(i)*16]) \
1375
AS2( movdqa xmm3, xmm2) \
1376
AS2( pand xmm2, xmm0) \
1377
AS2( psrld xmm0, 16) \
1378
AS2( paddd xmm4, xmm2) \
1379
AS2( paddd xmm5, xmm0) \
1380
AS2( pand xmm3, xmm1) \
1381
AS2( psrld xmm1, 16) \
1382
AS2( paddd xmm6, xmm3) \
1383
AS2( paddd xmm7, xmm1) \
1386
// Squ_AccN(i) for N >= 2 all expand to a call to the label LSqu<i>, so the
// larger squaring routines reuse a single emitted copy of each accumulation
// body instead of expanding it inline at every use site.
// (Fix: removed stray line-number lines interleaved between the defines.)
#define Squ_Acc2(i) ASC(call, LSqu##i)
#define Squ_Acc3(i) Squ_Acc2(i)
#define Squ_Acc4(i) Squ_Acc2(i)
#define Squ_Acc5(i) Squ_Acc2(i)
#define Squ_Acc6(i) Squ_Acc2(i)
#define Squ_Acc7(i) Squ_Acc2(i)
#define Squ_Acc8(i) Squ_Acc2(i)
1394
#define SSE2_End(E, n) \
1395
SSE2_SaveShift(2*(n)-3) \
1396
AS2( movdqa xmm7, [esi+16]) \
1397
AS2( movdqa xmm0, [edi]) \
1398
AS2( pmuludq xmm0, xmm7) \
1399
AS2( movdqa xmm2, [ebx]) \
1400
AS2( pmuludq xmm7, [edx]) \
1401
AS2( movdqa xmm6, xmm2) \
1402
AS2( pand xmm2, xmm0) \
1403
AS2( psrld xmm0, 16) \
1404
AS2( paddd xmm4, xmm2) \
1405
AS2( paddd xmm5, xmm0) \
1406
AS2( pand xmm6, xmm7) \
1407
AS2( psrld xmm7, 16) \
1408
SSE2_SaveShift(2*(n)-2) \
1409
SSE2_FinalSave(2*(n)-1) \
1413
// Thin wrappers: the square, multiply and multiply-top code paths share
// SSE2_End, differing only in which epilogue macro is spliced in.
// (Fix: removed stray line-number lines interleaved between the defines.)
#define Squ_End(n) SSE2_End(SquEpilogue, n)
#define Mul_End(n) SSE2_End(MulEpilogue, n)
#define Top_End(n) SSE2_End(TopEpilogue, n)
1417
#define Squ_Column1(k, i) \
1418
Squ_SSE2_SaveShift(k) \
1420
SSE2_FirstMultiply(1)\
1422
AS2( paddd xmm4, xmm4) \
1423
AS2( paddd xmm5, xmm5) \
1424
AS2( movdqa xmm3, [esi]) \
1425
AS2( movq xmm1, QWORD PTR [esi+8]) \
1426
AS2( pmuludq xmm1, xmm3) \
1427
AS2( pmuludq xmm3, xmm3) \
1428
AS2( movdqa xmm0, [ebx])\
1429
AS2( movdqa xmm2, xmm0) \
1430
AS2( pand xmm0, xmm1) \
1431
AS2( psrld xmm1, 16) \
1432
AS2( paddd xmm6, xmm0) \
1433
AS2( paddd xmm7, xmm1) \
1434
AS2( pand xmm2, xmm3) \
1435
AS2( psrld xmm3, 16) \
1436
AS2( paddd xmm6, xmm6) \
1437
AS2( paddd xmm7, xmm7) \
1438
AS2( paddd xmm4, xmm2) \
1439
AS2( paddd xmm5, xmm3) \
1440
AS2( movq xmm0, QWORD PTR [esp+4])\
1441
AS2( movq xmm1, QWORD PTR [esp+12])\
1442
AS2( paddd xmm4, xmm0)\
1443
AS2( paddd xmm5, xmm1)\
1445
#define Squ_Column0(k, i) \
1446
Squ_SSE2_SaveShift(k) \
1449
SSE2_FirstMultiply(1)\
1451
AS2( paddd xmm6, xmm6) \
1452
AS2( paddd xmm7, xmm7) \
1453
AS2( paddd xmm4, xmm4) \
1454
AS2( paddd xmm5, xmm5) \
1455
AS2( movq xmm0, QWORD PTR [esp+4])\
1456
AS2( movq xmm1, QWORD PTR [esp+12])\
1457
AS2( paddd xmm4, xmm0)\
1458
AS2( paddd xmm5, xmm1)\
1460
// SSE2_MulAdd45: multiply the block at [edi] and the block at [edx] by the
// operand at [esi], split each 32-bit product lane with the mask at [ebx]
// into low halves (added into xmm4, with xmm6 freshly loaded) and high
// halves (added into xmm5, left in xmm7).  Same instruction sequence as the
// inner section of SSE2_End.
// (Fix: removed stray line-number lines interleaved into the macro body.)
#define SSE2_MulAdd45 \
	AS2( movdqa xmm7, [esi]) \
	AS2( movdqa xmm0, [edi]) \
	AS2( pmuludq xmm0, xmm7) \
	AS2( movdqa xmm2, [ebx]) \
	AS2( pmuludq xmm7, [edx]) \
	AS2( movdqa xmm6, xmm2) \
	AS2( pand xmm2, xmm0) \
	AS2( psrld xmm0, 16) \
	AS2( paddd xmm4, xmm2) \
	AS2( paddd xmm5, xmm0) \
	AS2( pand xmm6, xmm7) \
	AS2( psrld xmm7, 16)
1474
#define Mul_Begin(n) \
1477
AS2( and esp, 0xfffffff0)\
1478
AS2( sub esp, 48*n+16)\
1480
AS2( xor edx, edx) \
1482
ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
1483
ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
1484
ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
1485
AS2( movdqa [esp+20+2*edx], xmm0) \
1486
AS2( psrlq xmm0, 32) \
1487
AS2( movdqa [esp+20+2*edx+16], xmm0) \
1488
AS2( movdqa [esp+20+16*n+2*edx], xmm1) \
1489
AS2( psrlq xmm1, 32) \
1490
AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \
1491
AS2( movdqa [esp+20+32*n+2*edx], xmm2) \
1492
AS2( psrlq xmm2, 32) \
1493
AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \
1495
AS2( cmp edx, 8*(n)) \
1497
AS2( lea edi, [esp+20])\
1498
AS2( lea edx, [esp+20+16*n])\
1499
AS2( lea esi, [esp+20+32*n])\
1500
SSE2_FirstMultiply(0) \
1502
#define Mul_Acc(i) \
1504
AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \
1505
AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \
1506
AS2( movdqa xmm2, [ebx]) \
1507
AS2( pmuludq xmm0, xmm1) \
1508
AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
1509
AS2( movdqa xmm3, xmm2) \
1510
AS2( pand xmm2, xmm0) \
1511
AS2( psrld xmm0, 16) \
1512
AS2( paddd xmm4, xmm2) \
1513
AS2( paddd xmm5, xmm0) \
1514
AS2( pand xmm3, xmm1) \
1515
AS2( psrld xmm1, 16) \
1516
AS2( paddd xmm6, xmm3) \
1517
AS2( paddd xmm7, xmm1) \
1520
// Mul_AccN(i) for N >= 2 all expand to a call to the label LMul<i>, so the
// larger multiply routines reuse one emitted copy of each accumulation body
// rather than expanding it inline at every use site.
// (Fix: removed stray line-number lines interleaved between the defines.)
#define Mul_Acc2(i) ASC(call, LMul##i)
#define Mul_Acc3(i) Mul_Acc2(i)
#define Mul_Acc4(i) Mul_Acc2(i)
#define Mul_Acc5(i) Mul_Acc2(i)
#define Mul_Acc6(i) Mul_Acc2(i)
#define Mul_Acc7(i) Mul_Acc2(i)
#define Mul_Acc8(i) Mul_Acc2(i)
#define Mul_Acc9(i) Mul_Acc2(i)
#define Mul_Acc10(i) Mul_Acc2(i)
#define Mul_Acc11(i) Mul_Acc2(i)
#define Mul_Acc12(i) Mul_Acc2(i)
#define Mul_Acc13(i) Mul_Acc2(i)
#define Mul_Acc14(i) Mul_Acc2(i)
#define Mul_Acc15(i) Mul_Acc2(i)
#define Mul_Acc16(i) Mul_Acc2(i)
1536
#define Mul_Column1(k, i) \
1542
#define Mul_Column0(k, i) \
1549
// Bot_Acc(i): accumulation step for the bottom-half multiply.  The
// i/2*(1-(i-2*(i/2))*2) index expression alternates the offset sign with the
// parity of i (equivalent to +/- i/2 blocks), walking the two operand copies
// toward each other.  Only two accumulators are kept (xmm4 via paddq, xmm6
// via paddd) because the upper halves of the products are discarded in a
// bottom-half multiply.
// (Fix: removed stray line-number lines interleaved into the macro body.)
#define Bot_Acc(i) \
	AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \
	AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \
	AS2( pmuludq xmm0, xmm1) \
	AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
	AS2( paddq xmm4, xmm0) \
	AS2( paddd xmm6, xmm1)
1557
#define Bot_SaveAcc(k) \
1561
AS2( movdqa xmm6, [esi]) \
1562
AS2( movdqa xmm0, [edi]) \
1563
AS2( pmuludq xmm0, xmm6) \
1564
AS2( paddq xmm4, xmm0) \
1565
AS2( psllq xmm5, 16) \
1566
AS2( paddq xmm4, xmm5) \
1567
AS2( pmuludq xmm6, [edx])
1569
#define Bot_End(n) \
1570
AS2( movhlps xmm7, xmm6) \
1571
AS2( paddd xmm6, xmm7) \
1572
AS2( psllq xmm6, 32) \
1573
AS2( paddd xmm4, xmm6) \
1574
AS2( movq QWORD PTR [ecx+8*((n)-1)], xmm4) \
1578
#define Top_Begin(n) \
1581
AS2( and esp, 0xfffffff0)\
1582
AS2( sub esp, 48*n+16)\
1584
AS2( xor edx, edx) \
1586
ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
1587
ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
1588
ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
1589
AS2( movdqa [esp+20+2*edx], xmm0) \
1590
AS2( psrlq xmm0, 32) \
1591
AS2( movdqa [esp+20+2*edx+16], xmm0) \
1592
AS2( movdqa [esp+20+16*n+2*edx], xmm1) \
1593
AS2( psrlq xmm1, 32) \
1594
AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \
1595
AS2( movdqa [esp+20+32*n+2*edx], xmm2) \
1596
AS2( psrlq xmm2, 32) \
1597
AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \
1599
AS2( cmp edx, 8*(n)) \
1601
AS2( mov eax, esi) \
1602
AS2( lea edi, [esp+20+00*n+16*(n/2-1)])\
1603
AS2( lea edx, [esp+20+16*n+16*(n/2-1)])\
1604
AS2( lea esi, [esp+20+32*n+16*(n/2-1)])\
1605
AS2( pxor xmm4, xmm4)\
1606
AS2( pxor xmm5, xmm5)
1608
// Top_Acc(i), SSE2 variant: multiply one 64-bit lane pair and keep only the
// top 16 bits of each product (psrlq by 48), accumulating them into xmm5 --
// an estimate of the carries flowing into the retained upper words of a
// top-half multiply.  Uses the same parity-alternating offset expression as
// Bot_Acc/Mul_Acc.
// (Fix: removed stray line-number lines interleaved into the macro body.)
#define Top_Acc(i) \
	AS2( movq xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8]) \
	AS2( pmuludq xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
	AS2( psrlq xmm0, 48) \
	AS2( paddd xmm5, xmm0)
1614
#define Top_Column0(i) \
1615
AS2( psllq xmm5, 32) \
1621
#define Top_Column1(i) \
1627
AS2( movd xmm0, eax)\
1628
AS2( movd xmm1, [ecx+4])\
1629
AS2( psrld xmm1, 16)\
1630
AS2( pcmpgtd xmm1, xmm0)\
1631
AS2( psrld xmm1, 31)\
1632
AS2( paddd xmm4, xmm1)\
1634
void SSE2_Square4(word *C, const word *A)
1641
void SSE2_Square8(word *C, const word *A)
1657
void SSE2_Square16(word *C, const word *A)
1662
Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1681
void SSE2_Square32(word *C, const word *A)
1685
Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1719
void SSE2_Multiply4(word *C, const word *A, const word *B)
1731
void SSE2_Multiply8(word *C, const word *A, const word *B)
1736
Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1747
void SSE2_Multiply16(word *C, const word *A, const word *B)
1752
Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1771
void SSE2_Multiply32(word *C, const word *A, const word *B)
1775
Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1809
void SSE2_MultiplyBottom4(word *C, const word *A, const word *B)
1812
Bot_SaveAcc(0) Bot_Acc(2)
1816
void SSE2_MultiplyBottom8(word *C, const word *A, const word *B)
1821
Mul_Acc(3) Mul_Acc(2)
1826
Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1830
void SSE2_MultiplyBottom16(word *C, const word *A, const word *B)
1835
Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1844
Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1848
void SSE2_MultiplyBottom32(word *C, const word *A, const word *B)
1853
Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1870
Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1874
void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L)
1877
Top_Acc(3) Top_Acc(2) Top_Acc(1)
1880
Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1889
void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L)
1892
Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1895
Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1908
void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
1911
Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1914
Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1935
#endif // #if CRYPTOPP_INTEGER_SSE2
1937
// ********************************************************
1939
typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
1012
1940
typedef void (* PMul)(word *C, const word *A, const word *B);
1941
typedef void (* PSqu)(word *C, const word *A);
1942
typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
1014
static PAddSub s_pAdd, s_pSub;
1015
#ifdef SSE2_INTRINSICS_AVAILABLE
1016
static PMul s_pMul4, s_pMul8, s_pMul8B;
1944
#if CRYPTOPP_INTEGER_SSE2
1945
static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
1946
static size_t s_recursionLimit = 8;
1948
static const size_t s_recursionLimit = 16;
1019
static void SetPentiumFunctionPointers()
1951
static PMul s_pMul[9], s_pBot[9];
1952
static PSqu s_pSqu[9];
1953
static PMulTop s_pTop[9];
1955
static void SetFunctionPointers()
1023
s_pAdd = &P4Optimized::Add;
1024
s_pSub = &P4Optimized::Subtract;
1028
s_pAdd = &PentiumOptimized::Add;
1029
s_pSub = &PentiumOptimized::Subtract;
1957
s_pMul[0] = &Baseline_Multiply2;
1958
s_pBot[0] = &Baseline_MultiplyBottom2;
1959
s_pSqu[0] = &Baseline_Square2;
1960
s_pTop[0] = &Baseline_MultiplyTop2;
1961
s_pTop[1] = &Baseline_MultiplyTop4;
1032
#ifdef SSE2_INTRINSICS_AVAILABLE
1963
#if CRYPTOPP_INTEGER_SSE2
1035
s_pMul4 = &P4Optimized::Multiply4;
1036
s_pMul8 = &P4Optimized::Multiply8;
1037
s_pMul8B = &P4Optimized::Multiply8Bottom;
1966
#if _MSC_VER != 1200 || defined(NDEBUG)
1974
s_recursionLimit = 32;
1976
s_pMul[1] = &SSE2_Multiply4;
1977
s_pMul[2] = &SSE2_Multiply8;
1978
s_pMul[4] = &SSE2_Multiply16;
1979
s_pMul[8] = &SSE2_Multiply32;
1981
s_pBot[1] = &SSE2_MultiplyBottom4;
1982
s_pBot[2] = &SSE2_MultiplyBottom8;
1983
s_pBot[4] = &SSE2_MultiplyBottom16;
1984
s_pBot[8] = &SSE2_MultiplyBottom32;
1986
s_pSqu[1] = &SSE2_Square4;
1987
s_pSqu[2] = &SSE2_Square8;
1988
s_pSqu[4] = &SSE2_Square16;
1989
s_pSqu[8] = &SSE2_Square32;
1991
s_pTop[2] = &SSE2_MultiplyTop8;
1992
s_pTop[4] = &SSE2_MultiplyTop16;
1993
s_pTop[8] = &SSE2_MultiplyTop32;
1041
s_pMul4 = &PentiumOptimized::Multiply4;
1042
s_pMul8 = &PentiumOptimized::Multiply8;
1043
s_pMul8B = &PentiumOptimized::Multiply8Bottom;
1998
s_pMul[1] = &Baseline_Multiply4;
1999
s_pMul[2] = &Baseline_Multiply8;
2001
s_pBot[1] = &Baseline_MultiplyBottom4;
2002
s_pBot[2] = &Baseline_MultiplyBottom8;
2004
s_pSqu[1] = &Baseline_Square4;
2005
s_pSqu[2] = &Baseline_Square8;
2007
s_pTop[2] = &Baseline_MultiplyTop8;
2009
#if !CRYPTOPP_INTEGER_SSE2
2010
s_pMul[4] = &Baseline_Multiply16;
2011
s_pBot[4] = &Baseline_MultiplyBottom16;
2012
s_pSqu[4] = &Baseline_Square16;
2013
s_pTop[4] = &Baseline_MultiplyTop16;
1050
s_sse2Enabled = false;
1051
SetPentiumFunctionPointers();
1054
class LowLevel : public PentiumOptimized
1057
inline static int Add(word *C, const word *A, const word *B, size_t N)
1058
{return s_pAdd(C, A, B, N);}
1059
inline static int Subtract(word *C, const word *A, const word *B, size_t N)
1060
{return s_pSub(C, A, B, N);}
1061
inline static void Square4(word *R, const word *A)
1062
{Multiply4(R, A, A);}
1063
#ifdef SSE2_INTRINSICS_AVAILABLE
1064
inline static void Multiply4(word *C, const word *A, const word *B)
1066
inline static void Multiply8(word *C, const word *A, const word *B)
1068
inline static void Multiply8Bottom(word *C, const word *A, const word *B)
1069
{s_pMul8B(C, A, B);}
1073
// use some tricks to share assembly code between MSVC and GCC
1075
#define CRYPTOPP_NAKED __declspec(naked)
1076
#define AS1(x) __asm x
1077
#define AS2(x, y) __asm x, y
1078
#define AddPrologue \
1083
__asm mov ecx, [esp+20] \
1084
__asm mov edx, [esp+24] \
1085
__asm mov ebx, [esp+28] \
1086
__asm mov esi, [esp+32]
1087
#define AddEpilogue \
1093
#define MulPrologue \
1098
__asm mov ecx, [esp+28] \
1099
__asm mov esi, [esp+24] \
1101
#define MulEpilogue \
1109
#define CRYPTOPP_NAKED
1110
#define AS1(x) #x ";"
1111
#define AS2(x, y) #x ", " #y ";"
1112
#define AddPrologue \
1113
__asm__ __volatile__ \
1115
"push %%ebx;" /* save this manually, in case of -fPIC */ \
1117
".intel_syntax noprefix;" \
1119
#define AddEpilogue \
1121
".att_syntax prefix;" \
1124
: "c" (C), "d" (A), "m" (B), "S" (N) \
1125
: "%edi", "memory", "cc" \
1127
#define MulPrologue \
1128
__asm__ __volatile__ \
1130
"push %%ebx;" /* save this manually, in case of -fPIC */ \
1133
".intel_syntax noprefix;"
1134
#define MulEpilogue \
1138
".att_syntax prefix;" \
1140
: "rm" (Z), "S" (X), "c" (Y) \
1141
: "%eax", "%edx", "%edi", "memory", "cc" \
1145
CRYPTOPP_NAKED int PentiumOptimized::Add(word *C, const word *A, const word *B, size_t N)
1149
// now: ebx = B, ecx = C, edx = A, esi = N
1150
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
1151
AS2( xor eax, eax) // clear eax
1153
AS2( sub eax, esi) // eax is a negative index from end of B
1154
AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
1156
AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag
1157
AS1( jz loopendAdd) // if no dwords then nothing to do
1160
AS2( mov esi,[edx]) // load lower word of A
1161
AS2( mov ebp,[edx+4]) // load higher word of A
1163
AS2( mov edi,[ebx+8*eax]) // load lower word of B
1164
AS2( lea edx,[edx+8]) // advance A and C
1166
AS2( adc esi,edi) // add lower words
1167
AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
1169
AS2( adc ebp,edi) // add higher words
1170
AS1( inc eax) // advance B
1172
AS2( mov [edx+ecx-8],esi) // store lower word result
1173
AS2( mov [edx+ecx-4],ebp) // store higher word result
1175
AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero
1178
AS2( adc eax, 0) // store carry into eax (return result register)
1183
CRYPTOPP_NAKED int PentiumOptimized::Subtract(word *C, const word *A, const word *B, size_t N)
1187
// now: ebx = B, ecx = C, edx = A, esi = N
1188
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
1189
AS2( xor eax, eax) // clear eax
1191
AS2( sub eax, esi) // eax is a negative index from end of B
1192
AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
1194
AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag
1195
AS1( jz loopendSub) // if no dwords then nothing to do
1198
AS2( mov esi,[edx]) // load lower word of A
1199
AS2( mov ebp,[edx+4]) // load higher word of A
1201
AS2( mov edi,[ebx+8*eax]) // load lower word of B
1202
AS2( lea edx,[edx+8]) // advance A and C
1204
AS2( sbb esi,edi) // subtract lower words
1205
AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
1207
AS2( sbb ebp,edi) // subtract higher words
1208
AS1( inc eax) // advance B
1210
AS2( mov [edx+ecx-8],esi) // store lower word result
1211
AS2( mov [edx+ecx-4],ebp) // store higher word result
1213
AS1( jnz loopstartSub) // loop until eax overflows and becomes zero
1216
AS2( adc eax, 0) // store carry into eax (return result register)
1221
// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
1223
CRYPTOPP_NAKED int P4Optimized::Add(word *C, const word *A, const word *B, size_t N)
1227
// now: ebx = B, ecx = C, edx = A, esi = N
1230
AS1( jz loopendAddP4) // if no dwords then nothing to do
1232
AS2( mov edi, [edx])
1233
AS2( mov ebp, [ebx])
1234
AS1( jmp carry1AddP4)
1236
AS1(loopstartAddP4:)
1237
AS2( mov edi, [edx+8])
1240
AS2( mov ebp, [ebx])
1242
AS1( jc carry1AddP4)
1248
AS2( mov [ecx], edi)
1249
AS2( mov edi, [edx+4])
1250
AS2( cmovc eax, ebp)
1251
AS2( mov ebp, [ebx+4])
1254
AS1( jc carry2AddP4)
1260
AS2( cmovc eax, ebp)
1261
AS2( mov [ecx+4], edi)
1263
AS1( jnz loopstartAddP4)
1270
CRYPTOPP_NAKED int P4Optimized::Subtract(word *C, const word *A, const word *B, size_t N)
1274
// now: ebx = B, ecx = C, edx = A, esi = N
1277
AS1( jz loopendSubP4) // if no dwords then nothing to do
1279
AS2( mov edi, [edx])
1280
AS2( mov ebp, [ebx])
1281
AS1( jmp carry1SubP4)
1283
AS1(loopstartSubP4:)
1284
AS2( mov edi, [edx+8])
1287
AS2( mov ebp, [ebx])
1289
AS1( jc carry1SubP4)
1295
AS2( mov [ecx], edi)
1296
AS2( mov edi, [edx+4])
1297
AS2( cmovc eax, ebp)
1298
AS2( mov ebp, [ebx+4])
1301
AS1( jc carry2SubP4)
1307
AS2( cmovc eax, ebp)
1308
AS2( mov [ecx+4], edi)
1310
AS1( jnz loopstartSubP4)
1317
// multiply assembly code originally contributed by Leonard Janke
1319
#define MulStartup \
1324
#define MulShiftCarry \
1329
#define MulAccumulateBottom(i,j) \
1330
AS2(mov eax, [ecx+4*j]) \
1331
AS2(imul eax, dword ptr [esi+4*i]) \
1334
#define MulAccumulate(i,j) \
1335
AS2(mov eax, [ecx+4*j]) \
1336
AS1(mul dword ptr [esi+4*i]) \
1341
#define MulStoreDigit(i) \
1343
AS2(mov edi, [esp]) \
1344
AS2(mov [edi+4*i], ebp)
1346
#define MulLastDiagonal(digits) \
1347
AS2(mov eax, [ecx+4*(digits-1)]) \
1348
AS1(mul dword ptr [esi+4*(digits-1)]) \
1351
AS2(mov edi, [esp]) \
1352
AS2(mov [edi+4*(2*digits-2)], ebp) \
1353
AS2(mov [edi+4*(2*digits-1)], edx)
1355
CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
1358
// now: [esp] = Z, esi = X, ecx = Y
1397
CRYPTOPP_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
1400
// now: [esp] = Z, esi = X, ecx = Y
1511
CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y)
1514
// now: [esp] = Z, esi = X, ecx = Y
1565
MulAccumulateBottom(7,0)
1566
MulAccumulateBottom(6,1)
1567
MulAccumulateBottom(5,2)
1568
MulAccumulateBottom(4,3)
1569
MulAccumulateBottom(3,4)
1570
MulAccumulateBottom(2,5)
1571
MulAccumulateBottom(1,6)
1572
MulAccumulateBottom(0,7)
1580
#else // not x86 - no processor specific code at this layer
1582
typedef Portable LowLevel;
1586
#ifdef SSE2_INTRINSICS_AVAILABLE
1589
#define CRYPTOPP_FASTCALL
1591
#define CRYPTOPP_FASTCALL __fastcall
1594
static void CRYPTOPP_FASTCALL P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)
1596
__m128i a3210 = _mm_load_si128(A);
1597
__m128i b3210 = _mm_load_si128(B);
1601
__m128i z = _mm_setzero_si128();
1602
__m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);
1605
__m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
1606
__m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
1607
__m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
1608
__m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
1609
__m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
1610
C[1] = _mm_add_epi64(a1b0, a0b1);
1612
__m128i a31 = _mm_srli_epi64(a3210, 32);
1613
__m128i b31 = _mm_srli_epi64(b3210, 32);
1614
__m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);
1617
__m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
1618
__m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
1619
__m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
1620
__m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
1621
__m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
1622
sum = _mm_add_epi64(a1b1, a0b2);
1623
C[2] = _mm_add_epi64(sum, a2b0);
1625
__m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
1626
__m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
1627
__m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
1628
__m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
1629
__m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
1630
__m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
1631
__m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
1632
__m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
1633
__m128i sum1 = _mm_add_epi64(a3b0, a1b2);
1634
sum = _mm_add_epi64(a2b1, a0b3);
1635
C[3] = _mm_add_epi64(sum, sum1);
1637
__m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
1638
__m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
1639
__m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
1640
__m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
1641
sum = _mm_add_epi64(a2b2, a3b1);
1642
C[4] = _mm_add_epi64(sum, a1b3);
1644
__m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
1645
__m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
1646
__m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
1647
__m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
1648
__m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
1649
C[5] = _mm_add_epi64(a3b2, a2b3);
1652
void P4Optimized::Multiply4(word *C, const word *A, const word *B)
1655
const word *w = (word *)temp;
1656
const __m64 *mw = (__m64 *)w;
1658
P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1664
__m64 w1 = _mm_cvtsi32_si64(w[1]);
1675
__m64 w26 = _mm_cvtsi32_si64(w[26]);
1677
s1 = _mm_add_si64(w1, w4);
1678
C[1] = _mm_cvtsi64_si32(s1);
1679
s1 = _mm_srli_si64(s1, 32);
1681
s2 = _mm_add_si64(w6, w8);
1682
s1 = _mm_add_si64(s1, s2);
1683
C[2] = _mm_cvtsi64_si32(s1);
1684
s1 = _mm_srli_si64(s1, 32);
1686
s2 = _mm_add_si64(w10, w12);
1687
s1 = _mm_add_si64(s1, s2);
1688
C[3] = _mm_cvtsi64_si32(s1);
1689
s1 = _mm_srli_si64(s1, 32);
1691
s2 = _mm_add_si64(w14, w16);
1692
s1 = _mm_add_si64(s1, s2);
1693
C[4] = _mm_cvtsi64_si32(s1);
1694
s1 = _mm_srli_si64(s1, 32);
1696
s2 = _mm_add_si64(w18, w20);
1697
s1 = _mm_add_si64(s1, s2);
1698
C[5] = _mm_cvtsi64_si32(s1);
1699
s1 = _mm_srli_si64(s1, 32);
1701
s2 = _mm_add_si64(w22, w26);
1702
s1 = _mm_add_si64(s1, s2);
1703
C[6] = _mm_cvtsi64_si32(s1);
1704
s1 = _mm_srli_si64(s1, 32);
1706
C[7] = _mm_cvtsi64_si32(s1) + w[27];
1710
void P4Optimized::Multiply8(word *C, const word *A, const word *B)
1713
const word *w = (word *)temp;
1714
const __m64 *mw = (__m64 *)w;
1715
const word *x = (word *)temp+7*4;
1716
const __m64 *mx = (__m64 *)x;
1717
const word *y = (word *)temp+7*4*2;
1718
const __m64 *my = (__m64 *)y;
1719
const word *z = (word *)temp+7*4*3;
1720
const __m64 *mz = (__m64 *)z;
1722
P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1724
P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
1726
P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
1728
P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
1732
__m64 s1, s2, s3, s4;
1734
__m64 w1 = _mm_cvtsi32_si64(w[1]);
1745
__m64 w26 = _mm_cvtsi32_si64(w[26]);
1746
__m64 w27 = _mm_cvtsi32_si64(w[27]);
1748
__m64 x0 = _mm_cvtsi32_si64(x[0]);
1749
__m64 x1 = _mm_cvtsi32_si64(x[1]);
1760
__m64 x26 = _mm_cvtsi32_si64(x[26]);
1761
__m64 x27 = _mm_cvtsi32_si64(x[27]);
1763
__m64 y0 = _mm_cvtsi32_si64(y[0]);
1764
__m64 y1 = _mm_cvtsi32_si64(y[1]);
1775
__m64 y26 = _mm_cvtsi32_si64(y[26]);
1776
__m64 y27 = _mm_cvtsi32_si64(y[27]);
1778
__m64 z0 = _mm_cvtsi32_si64(z[0]);
1779
__m64 z1 = _mm_cvtsi32_si64(z[1]);
1790
__m64 z26 = _mm_cvtsi32_si64(z[26]);
1792
s1 = _mm_add_si64(w1, w4);
1793
C[1] = _mm_cvtsi64_si32(s1);
1794
s1 = _mm_srli_si64(s1, 32);
1796
s2 = _mm_add_si64(w6, w8);
1797
s1 = _mm_add_si64(s1, s2);
1798
C[2] = _mm_cvtsi64_si32(s1);
1799
s1 = _mm_srli_si64(s1, 32);
1801
s2 = _mm_add_si64(w10, w12);
1802
s1 = _mm_add_si64(s1, s2);
1803
C[3] = _mm_cvtsi64_si32(s1);
1804
s1 = _mm_srli_si64(s1, 32);
1806
s3 = _mm_add_si64(x0, y0);
1807
s2 = _mm_add_si64(w14, w16);
1808
s1 = _mm_add_si64(s1, s3);
1809
s1 = _mm_add_si64(s1, s2);
1810
C[4] = _mm_cvtsi64_si32(s1);
1811
s1 = _mm_srli_si64(s1, 32);
1813
s3 = _mm_add_si64(x1, y1);
1814
s4 = _mm_add_si64(x4, y4);
1815
s1 = _mm_add_si64(s1, w18);
1816
s3 = _mm_add_si64(s3, s4);
1817
s1 = _mm_add_si64(s1, w20);
1818
s1 = _mm_add_si64(s1, s3);
1819
C[5] = _mm_cvtsi64_si32(s1);
1820
s1 = _mm_srli_si64(s1, 32);
1822
s3 = _mm_add_si64(x6, y6);
1823
s4 = _mm_add_si64(x8, y8);
1824
s1 = _mm_add_si64(s1, w22);
1825
s3 = _mm_add_si64(s3, s4);
1826
s1 = _mm_add_si64(s1, w26);
1827
s1 = _mm_add_si64(s1, s3);
1828
C[6] = _mm_cvtsi64_si32(s1);
1829
s1 = _mm_srli_si64(s1, 32);
1831
s3 = _mm_add_si64(x10, y10);
1832
s4 = _mm_add_si64(x12, y12);
1833
s1 = _mm_add_si64(s1, w27);
1834
s3 = _mm_add_si64(s3, s4);
1835
s1 = _mm_add_si64(s1, s3);
1836
C[7] = _mm_cvtsi64_si32(s1);
1837
s1 = _mm_srli_si64(s1, 32);
1839
s3 = _mm_add_si64(x14, y14);
1840
s4 = _mm_add_si64(x16, y16);
1841
s1 = _mm_add_si64(s1, z0);
1842
s3 = _mm_add_si64(s3, s4);
1843
s1 = _mm_add_si64(s1, s3);
1844
C[8] = _mm_cvtsi64_si32(s1);
1845
s1 = _mm_srli_si64(s1, 32);
1847
s3 = _mm_add_si64(x18, y18);
1848
s4 = _mm_add_si64(x20, y20);
1849
s1 = _mm_add_si64(s1, z1);
1850
s3 = _mm_add_si64(s3, s4);
1851
s1 = _mm_add_si64(s1, z4);
1852
s1 = _mm_add_si64(s1, s3);
1853
C[9] = _mm_cvtsi64_si32(s1);
1854
s1 = _mm_srli_si64(s1, 32);
1856
s3 = _mm_add_si64(x22, y22);
1857
s4 = _mm_add_si64(x26, y26);
1858
s1 = _mm_add_si64(s1, z6);
1859
s3 = _mm_add_si64(s3, s4);
1860
s1 = _mm_add_si64(s1, z8);
1861
s1 = _mm_add_si64(s1, s3);
1862
C[10] = _mm_cvtsi64_si32(s1);
1863
s1 = _mm_srli_si64(s1, 32);
1865
s3 = _mm_add_si64(x27, y27);
1866
s1 = _mm_add_si64(s1, z10);
1867
s1 = _mm_add_si64(s1, z12);
1868
s1 = _mm_add_si64(s1, s3);
1869
C[11] = _mm_cvtsi64_si32(s1);
1870
s1 = _mm_srli_si64(s1, 32);
1872
s3 = _mm_add_si64(z14, z16);
1873
s1 = _mm_add_si64(s1, s3);
1874
C[12] = _mm_cvtsi64_si32(s1);
1875
s1 = _mm_srli_si64(s1, 32);
1877
s3 = _mm_add_si64(z18, z20);
1878
s1 = _mm_add_si64(s1, s3);
1879
C[13] = _mm_cvtsi64_si32(s1);
1880
s1 = _mm_srli_si64(s1, 32);
1882
s3 = _mm_add_si64(z22, z26);
1883
s1 = _mm_add_si64(s1, s3);
1884
C[14] = _mm_cvtsi64_si32(s1);
1885
s1 = _mm_srli_si64(s1, 32);
1887
C[15] = z[27] + _mm_cvtsi64_si32(s1);
1891
// Bottom half of an 8x8-word (256-bit x 256-bit) multiply for the
// Pentium 4 optimized path: computes only the low 8 words of the 16-word
// product A*B into C, using MMX/SSE2 64-bit intrinsics to accumulate
// 32-bit partial products with carries kept in the high half of __m64.
// NOTE(review): this extraction is incomplete — the bare numeric lines
// below are embedded line numbers from the original file, and several
// pieces (the `temp` buffer declaration, the enclosing braces, the C[0]
// store, and the w4..w27 / x4..x16 / y4..y16 loads) fall in elided
// lines. Verify against the full source before modifying.
void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
1894
// w/x/y alias the three partial products produced by P4_Mul below;
// mw/mx/my view the same regions as __m64 (presumably used in elided
// lines — confirm against the full source).
const word *w = (word *)temp;
1895
const __m64 *mw = (__m64 *)w;
1896
const word *x = (word *)temp+7*4;
1897
const __m64 *mx = (__m64 *)x;
1898
const word *y = (word *)temp+7*4*2;
1899
const __m64 *my = (__m64 *)y;
1901
// Three 4x4-word partial products: A_lo*B_lo, A_hi*B_lo, A_lo*B_hi.
// A_hi*B_hi is not computed — it only contributes above word 7, which
// the "bottom" routine discards.
P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1903
P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
1905
P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
1909
__m64 s1, s2, s3, s4;
1911
// Load 32-bit partial-product words into MMX registers so that 64-bit
// adds can sum a column while carrying into the high 32 bits.
__m64 w1 = _mm_cvtsi32_si64(w[1]);
1922
__m64 w26 = _mm_cvtsi32_si64(w[26]);
1924
__m64 x0 = _mm_cvtsi32_si64(x[0]);
1925
__m64 x1 = _mm_cvtsi32_si64(x[1]);
1930
__m64 y0 = _mm_cvtsi32_si64(y[0]);
1931
__m64 y1 = _mm_cvtsi32_si64(y[1]);
1936
// Column-by-column accumulation: for each output word, sum the relevant
// partial products into s1, store the low 32 bits to C[i], then shift
// right by 32 so the carry propagates into the next column.
s1 = _mm_add_si64(w1, w4);
1937
C[1] = _mm_cvtsi64_si32(s1);
1938
s1 = _mm_srli_si64(s1, 32);
1940
s2 = _mm_add_si64(w6, w8);
1941
s1 = _mm_add_si64(s1, s2);
1942
C[2] = _mm_cvtsi64_si32(s1);
1943
s1 = _mm_srli_si64(s1, 32);
1945
s2 = _mm_add_si64(w10, w12);
1946
s1 = _mm_add_si64(s1, s2);
1947
C[3] = _mm_cvtsi64_si32(s1);
1948
s1 = _mm_srli_si64(s1, 32);
1950
// From column 4 onward the x (A_hi*B_lo) and y (A_lo*B_hi) partial
// products start contributing alongside w (A_lo*B_lo).
s3 = _mm_add_si64(x0, y0);
1951
s2 = _mm_add_si64(w14, w16);
1952
s1 = _mm_add_si64(s1, s3);
1953
s1 = _mm_add_si64(s1, s2);
1954
C[4] = _mm_cvtsi64_si32(s1);
1955
s1 = _mm_srli_si64(s1, 32);
1957
s3 = _mm_add_si64(x1, y1);
1958
s4 = _mm_add_si64(x4, y4);
1959
s1 = _mm_add_si64(s1, w18);
1960
s3 = _mm_add_si64(s3, s4);
1961
s1 = _mm_add_si64(s1, w20);
1962
s1 = _mm_add_si64(s1, s3);
1963
C[5] = _mm_cvtsi64_si32(s1);
1964
s1 = _mm_srli_si64(s1, 32);
1966
s3 = _mm_add_si64(x6, y6);
1967
s4 = _mm_add_si64(x8, y8);
1968
s1 = _mm_add_si64(s1, w22);
1969
s3 = _mm_add_si64(s3, s4);
1970
s1 = _mm_add_si64(s1, w26);
1971
s1 = _mm_add_si64(s1, s3);
1972
C[6] = _mm_cvtsi64_si32(s1);
1973
s1 = _mm_srli_si64(s1, 32);
1975
// Final word of the bottom half: remaining carry plus leftover partial
// products, summed in scalar arithmetic; anything that would carry past
// C[7] is intentionally discarded.
C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
1979
#endif // #ifdef SSE2_INTRINSICS_AVAILABLE
2018
// Dispatcher for multi-word addition: C = A + B over N words, returning
// the carry out (cf. Portable::Add earlier in this file, which returns
// the final carry as int).
// NOTE(review): the `{`, `#else`, `#endif`, and `}` lines of this
// function fall in elided lines of this extraction; the bare numeric
// lines are embedded line numbers from the original file.
inline int Add(word *C, const word *A, const word *B, size_t N)
2020
#if CRYPTOPP_INTEGER_SSE2
2021
// s_pAdd is presumably a runtime-selected (CPU-feature-based) function
// pointer — TODO confirm; note the different argument order (N, C, A, B).
return s_pAdd(N, C, A, B);
2023
// Portable fallback when SSE2 integer code is not compiled in.
return Baseline_Add(N, C, A, B);
2027
// Dispatcher for multi-word subtraction: C = A - B over N words,
// returning the borrow out (cf. Portable::Subtract earlier in this
// file, which returns the final borrow as int).
// NOTE(review): the `{`, `#else`, `#endif`, and `}` lines of this
// function fall in elided lines of this extraction; the bare numeric
// lines are embedded line numbers from the original file.
inline int Subtract(word *C, const word *A, const word *B, size_t N)
2029
#if CRYPTOPP_INTEGER_SSE2
2030
// s_pSub is presumably a runtime-selected (CPU-feature-based) function
// pointer — TODO confirm; note the different argument order (N, C, A, B).
return s_pSub(N, C, A, B);
2032
// Portable fallback when SSE2 integer code is not compiled in.
return Baseline_Sub(N, C, A, B);
1981
2036
// ********************************************************
1984
2040
#define A1 (A+N2)