458
445
// ********************************************************
463
static int Add(word *C, const word *A, const word *B, size_t N);
464
static int Subtract(word *C, const word *A, const word *B, size_t N);
466
static inline void Multiply2(word *C, const word *A, const word *B);
467
static inline word Multiply2Add(word *C, const word *A, const word *B);
468
static void Multiply4(word *C, const word *A, const word *B);
469
static void Multiply8(word *C, const word *A, const word *B);
470
static inline unsigned int MultiplyRecursionLimit() {return 8;}
472
static inline void Multiply2Bottom(word *C, const word *A, const word *B);
473
static void Multiply4Bottom(word *C, const word *A, const word *B);
474
static void Multiply8Bottom(word *C, const word *A, const word *B);
475
static inline unsigned int MultiplyBottomRecursionLimit() {return 8;}
477
static void Square2(word *R, const word *A);
478
static void Square4(word *R, const word *A);
479
static void Square8(word *R, const word *A) {assert(false);}
480
static inline unsigned int SquareRecursionLimit() {return 4;}
483
int Portable::Add(word *C, const word *A, const word *B, size_t N)
488
for (unsigned int i = 0; i < N; i+=2)
490
u = DWord(A[i]) + B[i] + u.GetHighHalf();
491
C[i] = u.GetLowHalf();
492
u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf();
493
C[i+1] = u.GetLowHalf();
495
return int(u.GetHighHalf());
498
int Portable::Subtract(word *C, const word *A, const word *B, size_t N)
503
for (unsigned int i = 0; i < N; i+=2)
505
u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow();
506
C[i] = u.GetLowHalf();
507
u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow();
508
C[i+1] = u.GetLowHalf();
510
return int(0-u.GetHighHalf());
513
void Portable::Multiply2(word *C, const word *A, const word *B)
523
d = (dword)(A1-A0)*(B0-B1);
528
d = (dword)s*(word)(B0-B1);
534
d = (word)(A1-A0)*(dword)s;
539
d = (dword)(A0-A1)*(B1-B0);
542
// this segment is the branchless equivalent of above
543
word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
544
unsigned int ai = A[1] < A[0];
545
unsigned int bi = B[0] < B[1];
546
unsigned int di = ai & bi;
547
DWord d = DWord::Multiply(D[di], D[di+2]);
549
unsigned int si = ai + !bi;
552
DWord A0B0 = DWord::Multiply(A[0], B[0]);
553
C[0] = A0B0.GetLowHalf();
555
DWord A1B1 = DWord::Multiply(A[1], B[1]);
556
DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf();
557
C[1] = t.GetLowHalf();
559
t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s;
560
C[2] = t.GetLowHalf();
561
C[3] = t.GetHighHalf();
564
inline void Portable::Multiply2Bottom(word *C, const word *A, const word *B)
566
DWord t = DWord::Multiply(A[0], B[0]);
567
C[0] = t.GetLowHalf();
568
C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0];
571
word Portable::Multiply2Add(word *C, const word *A, const word *B)
573
word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
574
unsigned int ai = A[1] < A[0];
575
unsigned int bi = B[0] < B[1];
576
unsigned int di = ai & bi;
577
DWord d = DWord::Multiply(D[di], D[di+2]);
579
unsigned int si = ai + !bi;
582
DWord A0B0 = DWord::Multiply(A[0], B[0]);
583
DWord t = A0B0 + C[0];
584
C[0] = t.GetLowHalf();
586
DWord A1B1 = DWord::Multiply(A[1], B[1]);
587
t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf() + C[1];
588
C[1] = t.GetLowHalf();
590
t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2];
591
C[2] = t.GetLowHalf();
593
t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3];
594
C[3] = t.GetLowHalf();
595
return t.GetHighHalf();
598
#define MulAcc(x, y) \
599
p = DWord::MultiplyAndAdd(A[x], B[y], c); \
600
c = p.GetLowHalf(); \
601
p = (DWord) d + p.GetHighHalf(); \
602
d = p.GetLowHalf(); \
603
e += p.GetHighHalf();
605
#define SaveMulAcc(s, x, y) \
607
p = DWord::MultiplyAndAdd(A[x], B[y], d); \
608
c = p.GetLowHalf(); \
609
p = (DWord) e + p.GetHighHalf(); \
610
d = p.GetLowHalf(); \
613
#define SquAcc(x, y) \
614
q = DWord::Multiply(A[x], A[y]); \
616
c = p.GetLowHalf(); \
617
p = (DWord) d + p.GetHighHalf(); \
618
d = p.GetLowHalf(); \
619
e += p.GetHighHalf(); \
621
c = p.GetLowHalf(); \
622
p = (DWord) d + p.GetHighHalf(); \
623
d = p.GetLowHalf(); \
624
e += p.GetHighHalf();
626
#define SaveSquAcc(s, x, y) \
628
q = DWord::Multiply(A[x], A[y]); \
630
c = p.GetLowHalf(); \
631
p = (DWord) e + p.GetHighHalf(); \
632
d = p.GetLowHalf(); \
633
e = p.GetHighHalf(); \
635
c = p.GetLowHalf(); \
636
p = (DWord) d + p.GetHighHalf(); \
637
d = p.GetLowHalf(); \
638
e += p.GetHighHalf();
640
void Portable::Multiply4(word *R, const word *A, const word *B)
645
p = DWord::Multiply(A[0], B[0]);
646
R[0] = p.GetLowHalf();
670
p = DWord::MultiplyAndAdd(A[3], B[3], d);
671
R[6] = p.GetLowHalf();
672
R[7] = e + p.GetHighHalf();
675
void Portable::Square2(word *R, const word *A)
680
p = DWord::Multiply(A[0], A[0]);
681
R[0] = p.GetLowHalf();
688
p = DWord::MultiplyAndAdd(A[1], A[1], d);
689
R[2] = p.GetLowHalf();
690
R[3] = e + p.GetHighHalf();
693
void Portable::Square4(word *R, const word *A)
696
// VC60 workaround: MSVC 6.0 has an optimization bug that makes
697
// (dword)A*B where either A or B has been cast to a dword before
698
// very expensive. Revisit this function when this
706
p = DWord::Multiply(A[0], A[0]);
707
R[0] = p.GetLowHalf();
725
p = DWord::MultiplyAndAdd(A[3], A[3], d);
726
R[6] = p.GetLowHalf();
727
R[7] = e + p.GetHighHalf();
731
void Portable::Multiply8(word *R, const word *A, const word *B)
736
p = DWord::Multiply(A[0], B[0]);
737
R[0] = p.GetLowHalf();
804
SaveMulAcc(10, 4, 7);
809
SaveMulAcc(11, 5, 7);
813
SaveMulAcc(12, 6, 7);
817
p = DWord::MultiplyAndAdd(A[7], B[7], d);
818
R[14] = p.GetLowHalf();
819
R[15] = e + p.GetHighHalf();
822
void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
827
p = DWord::Multiply(A[0], B[0]);
828
R[0] = p.GetLowHalf();
840
R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
843
void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
848
p = DWord::Multiply(A[0], B[0]);
849
R[0] = p.GetLowHalf();
887
R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
888
A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
896
#ifdef CRYPTOPP_X86ASM_AVAILABLE
898
// ************** x86 feature detection ***************
900
static bool s_sse2Enabled = true;
902
static void CpuId(word32 input, word32 *output)
907
// save ebx in case -fPIC is being used
908
"push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
909
: "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
926
#ifdef SSE2_INTRINSICS_AVAILABLE
928
static jmp_buf s_env;
929
static void SigIllHandler(int)
935
static bool HasSSE2()
942
if ((cpuid[3] & (1 << 26)) == 0)
948
__asm xorpd xmm0, xmm0 // executing SSE2 instruction
956
typedef void (*SigHandler)(int);
958
SigHandler oldHandler = signal(SIGILL, SigIllHandler);
959
if (oldHandler == SIG_ERR)
966
__asm __volatile ("xorps %xmm0, %xmm0");
968
signal(SIGILL, oldHandler);
979
std::swap(cpuid[2], cpuid[3]);
980
if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
984
return ((cpuid[0] >> 8) & 0xf) == 0xf;
987
// ************** Pentium/P4 optimizations ***************
989
class PentiumOptimized : public Portable
992
static int Add(word *C, const word *A, const word *B, size_t N);
993
static int Subtract(word *C, const word *A, const word *B, size_t N);
994
static void Multiply4(word *C, const word *A, const word *B);
995
static void Multiply8(word *C, const word *A, const word *B);
996
static void Multiply8Bottom(word *C, const word *A, const word *B);
1002
static int Add(word *C, const word *A, const word *B, size_t N);
1003
static int Subtract(word *C, const word *A, const word *B, size_t N);
1004
#ifdef SSE2_INTRINSICS_AVAILABLE
1005
static void Multiply4(word *C, const word *A, const word *B);
1006
static void Multiply8(word *C, const word *A, const word *B);
1007
static void Multiply8Bottom(word *C, const word *A, const word *B);
1011
typedef int (* PAddSub)(word *C, const word *A, const word *B, size_t N);
447
// use some tricks to share assembly code between MSVC and GCC
448
#if defined(__GNUC__)
449
#define AddPrologue \
451
__asm__ __volatile__ \
453
".intel_syntax noprefix;"
454
#define AddEpilogue \
455
".att_syntax prefix;" \
457
: "d" (C), "a" (A), "D" (B), "c" (N) \
458
: "%esi", "memory", "cc" \
461
#define MulPrologue \
462
__asm__ __volatile__ \
464
".intel_syntax noprefix;" \
467
#define MulEpilogue \
469
".att_syntax prefix;" \
471
: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \
472
: "%esi", "memory", "cc" \
474
#define SquPrologue MulPrologue
475
#define SquEpilogue \
477
".att_syntax prefix;" \
479
: "d" (s_maskLow16), "c" (C), "a" (A) \
480
: "%esi", "%edi", "memory", "cc" \
482
#define TopPrologue MulPrologue
483
#define TopEpilogue \
485
".att_syntax prefix;" \
487
: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \
491
#define AddPrologue \
494
__asm mov eax, [esp+12] \
495
__asm mov edi, [esp+16]
496
#define AddEpilogue \
501
#define SaveEBX __asm push ebx
502
#define RestoreEBX __asm pop ebx
507
#define SquPrologue \
511
AS2( lea ebx, s_maskLow16)
512
#define MulPrologue \
517
AS2( lea ebx, s_maskLow16)
518
#define TopPrologue \
524
AS2( lea ebx, s_maskLow16)
525
#define SquEpilogue RestoreEBX
526
#define MulEpilogue RestoreEBX
527
#define TopEpilogue RestoreEBX
530
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
532
int Baseline_Add(size_t N, word *C, const word *A, const word *B);
533
int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
535
#elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__) && defined(CRYPTOPP_WORD128_AVAILABLE)
536
int Baseline_Add(size_t N, word *C, const word *A, const word *B)
544
AS2( mov %0,[%3+8*%1])
545
AS2( add %0,[%4+8*%1])
546
AS2( mov [%2+8*%1],%0)
548
AS2( mov %0,[%3+8*%1+8])
549
AS2( adc %0,[%4+8*%1+8])
550
AS2( mov [%2+8*%1+8],%0)
553
AS2( mov %0,[%3+8*%1])
554
AS2( adc %0,[%4+8*%1])
555
AS2( mov [%2+8*%1],%0)
561
: "=&r" (result), "+c" (N)
562
: "r" (C+N), "r" (A+N), "r" (B+N)
568
int Baseline_Sub(size_t N, word *C, const word *A, const word *B)
576
AS2( mov %0,[%3+8*%1])
577
AS2( sub %0,[%4+8*%1])
578
AS2( mov [%2+8*%1],%0)
580
AS2( mov %0,[%3+8*%1+8])
581
AS2( sbb %0,[%4+8*%1+8])
582
AS2( mov [%2+8*%1+8],%0)
585
AS2( mov %0,[%3+8*%1])
586
AS2( sbb %0,[%4+8*%1])
587
AS2( mov [%2+8*%1],%0)
593
: "=&r" (result), "+c" (N)
594
: "r" (C+N), "r" (A+N), "r" (B+N)
599
#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
600
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
604
// now: eax = A, edi = B, edx = C, ecx = N
605
AS2( lea eax, [eax+4*ecx])
606
AS2( lea edi, [edi+4*ecx])
607
AS2( lea edx, [edx+4*ecx])
609
AS1( neg ecx) // ecx is negative index
610
AS2( test ecx, 2) // this clears carry flag
616
ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero
617
AS2( mov esi,[eax+4*ecx])
618
AS2( adc esi,[edi+4*ecx])
619
AS2( mov [edx+4*ecx],esi)
620
AS2( mov esi,[eax+4*ecx+4])
621
AS2( adc esi,[edi+4*ecx+4])
622
AS2( mov [edx+4*ecx+4],esi)
624
AS2( mov esi,[eax+4*ecx+8])
625
AS2( adc esi,[edi+4*ecx+8])
626
AS2( mov [edx+4*ecx+8],esi)
627
AS2( mov esi,[eax+4*ecx+12])
628
AS2( adc esi,[edi+4*ecx+12])
629
AS2( mov [edx+4*ecx+12],esi)
631
AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2
636
AS1( setc al) // store carry into eax (return result register)
641
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
645
// now: eax = A, edi = B, edx = C, ecx = N
646
AS2( lea eax, [eax+4*ecx])
647
AS2( lea edi, [edi+4*ecx])
648
AS2( lea edx, [edx+4*ecx])
650
AS1( neg ecx) // ecx is negative index
651
AS2( test ecx, 2) // this clears carry flag
657
ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero
658
AS2( mov esi,[eax+4*ecx])
659
AS2( sbb esi,[edi+4*ecx])
660
AS2( mov [edx+4*ecx],esi)
661
AS2( mov esi,[eax+4*ecx+4])
662
AS2( sbb esi,[edi+4*ecx+4])
663
AS2( mov [edx+4*ecx+4],esi)
665
AS2( mov esi,[eax+4*ecx+8])
666
AS2( sbb esi,[edi+4*ecx+8])
667
AS2( mov [edx+4*ecx+8],esi)
668
AS2( mov esi,[eax+4*ecx+12])
669
AS2( sbb esi,[edi+4*ecx+12])
670
AS2( mov [edx+4*ecx+12],esi)
672
AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2
677
AS1( setc al) // store carry into eax (return result register)
682
#if CRYPTOPP_INTEGER_SSE2
683
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
687
// now: eax = A, edi = B, edx = C, ecx = N
688
AS2( lea eax, [eax+4*ecx])
689
AS2( lea edi, [edi+4*ecx])
690
AS2( lea edx, [edx+4*ecx])
692
AS1( neg ecx) // ecx is negative index
695
AS2( test ecx, 2) // this clears carry flag
701
AS2( movd mm0, DWORD PTR [eax+4*ecx])
702
AS2( movd mm1, DWORD PTR [edi+4*ecx])
705
AS2( movd DWORD PTR [edx+4*ecx], mm2)
708
AS2( movd mm0, DWORD PTR [eax+4*ecx+4])
709
AS2( movd mm1, DWORD PTR [edi+4*ecx+4])
712
AS2( movd DWORD PTR [edx+4*ecx+4], mm2)
716
AS2( movd mm0, DWORD PTR [eax+4*ecx+8])
717
AS2( movd mm1, DWORD PTR [edi+4*ecx+8])
720
AS2( movd DWORD PTR [edx+4*ecx+8], mm2)
723
AS2( movd mm0, DWORD PTR [eax+4*ecx+12])
724
AS2( movd mm1, DWORD PTR [edi+4*ecx+12])
727
AS2( movd DWORD PTR [edx+4*ecx+12], mm2)
739
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
743
// now: eax = A, edi = B, edx = C, ecx = N
744
AS2( lea eax, [eax+4*ecx])
745
AS2( lea edi, [edi+4*ecx])
746
AS2( lea edx, [edx+4*ecx])
748
AS1( neg ecx) // ecx is negative index
751
AS2( test ecx, 2) // this clears carry flag
757
AS2( movd mm0, DWORD PTR [eax+4*ecx])
758
AS2( movd mm1, DWORD PTR [edi+4*ecx])
761
AS2( movd DWORD PTR [edx+4*ecx], mm0)
764
AS2( movd mm2, DWORD PTR [eax+4*ecx+4])
765
AS2( movd mm1, DWORD PTR [edi+4*ecx+4])
768
AS2( movd DWORD PTR [edx+4*ecx+4], mm2)
772
AS2( movd mm0, DWORD PTR [eax+4*ecx+8])
773
AS2( movd mm1, DWORD PTR [edi+4*ecx+8])
776
AS2( movd DWORD PTR [edx+4*ecx+8], mm0)
779
AS2( movd mm2, DWORD PTR [eax+4*ecx+12])
780
AS2( movd mm1, DWORD PTR [edi+4*ecx+12])
783
AS2( movd DWORD PTR [edx+4*ecx+12], mm2)
795
#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
797
int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
803
for (size_t i=0; i<N; i+=2)
805
AddWithCarry(u, A[i], B[i]);
807
AddWithCarry(u, A[i+1], B[i+1]);
810
return int(GetCarry(u));
813
int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
819
for (size_t i=0; i<N; i+=2)
821
SubtractWithBorrow(u, A[i], B[i]);
823
SubtractWithBorrow(u, A[i+1], B[i+1]);
826
return int(GetBorrow(u));
830
static word LinearMultiply(word *C, const word *A, word B, size_t N)
833
for(unsigned i=0; i<N; i++)
836
MultiplyWords(p, A[i], B);
837
Acc2WordsBy1(p, carry);
844
#ifndef CRYPTOPP_DOXYGEN_PROCESSING
848
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
853
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
854
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
855
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
856
Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
857
Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
862
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
863
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
864
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
865
Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
866
Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
867
Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
868
Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
869
Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
870
Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
871
Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
872
Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
873
Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
874
Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
879
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
880
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
881
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
882
Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
883
Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
884
Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
885
Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
886
Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
887
Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
888
Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
889
Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
890
Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
891
Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
892
Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
893
Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
894
Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
895
Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
896
Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
897
Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
898
Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
899
Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
900
Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
901
Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
902
Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
903
Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
904
Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
905
Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
906
Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
907
Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
916
Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
917
Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
918
Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \
919
Squ_SaveAcc(4, 2, 3) Squ_NonDiag \
924
Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
925
Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
926
Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
927
Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
928
Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
929
Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
930
Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
931
Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
932
Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
933
Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \
934
Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \
935
Squ_SaveAcc(12, 6, 7) Squ_NonDiag \
940
Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
941
Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
942
Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
943
Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
944
Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
945
Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
946
Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
947
Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
948
Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
949
Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \
950
Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \
951
Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \
952
Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \
953
Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \
954
Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \
955
Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \
956
Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \
957
Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \
958
Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \
959
Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \
960
Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \
961
Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \
962
Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \
963
Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \
964
Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \
965
Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \
966
Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \
967
Squ_SaveAcc(28, 14, 15) Squ_NonDiag \
972
Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \
977
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
978
Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2) \
979
Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0) \
984
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
985
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
986
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
987
Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
988
Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
989
Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
990
Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \
995
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
996
Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
997
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
998
Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
999
Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
1000
Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
1001
Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1002
Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
1003
Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
1004
Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
1005
Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
1006
Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
1007
Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
1008
Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
1009
Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
1015
#define Mul_Begin(n) \
1019
MultiplyWords(p, A[0], B[0]) \
1020
AssignWord(c, LowWord(p)) \
1021
AssignWord(d, HighWord(p))
1023
#define Mul_Acc(i, j) \
1024
MultiplyWords(p, A[i], B[j]) \
1025
Acc2WordsBy1(c, LowWord(p)) \
1026
Acc2WordsBy1(d, HighWord(p))
1028
#define Mul_SaveAcc(k, i, j) \
1029
R[k] = LowWord(c); \
1030
Add2WordsBy1(c, d, HighWord(c)) \
1031
MultiplyWords(p, A[i], B[j]) \
1032
AssignWord(d, HighWord(p)) \
1033
Acc2WordsBy1(c, LowWord(p))
1035
#define Mul_End(n) \
1036
R[2*n-3] = LowWord(c); \
1037
Acc2WordsBy1(d, HighWord(c)) \
1038
MultiplyWords(p, A[n-1], B[n-1])\
1039
Acc2WordsBy2(d, p) \
1040
R[2*n-2] = LowWord(d); \
1041
R[2*n-1] = HighWord(d);
1043
#define Bot_SaveAcc(k, i, j) \
1044
R[k] = LowWord(c); \
1045
word e = LowWord(d) + HighWord(c); \
1048
#define Bot_Acc(i, j) \
1051
#define Bot_End(n) \
1054
#define Mul_Begin(n) \
1058
MultiplyWords(p, A[0], B[0]) \
1060
AssignWord(d, HighWord(p))
1062
#define Mul_Acc(i, j) \
1063
MulAcc(c, d, A[i], B[j])
1065
#define Mul_SaveAcc(k, i, j) \
1068
AssignWord(d, HighWord(d)) \
1069
MulAcc(c, d, A[i], B[j])
1071
#define Mul_End(k, i) \
1073
MultiplyWords(p, A[i], B[i]) \
1074
Acc2WordsBy2(p, d) \
1075
R[k+1] = LowWord(p); \
1076
R[k+2] = HighWord(p);
1078
#define Bot_SaveAcc(k, i, j) \
1083
#define Bot_Acc(i, j) \
1086
#define Bot_End(n) \
1090
#define Squ_Begin(n) \
1095
MultiplyWords(p, A[0], A[0]) \
1096
R[0] = LowWord(p); \
1097
AssignWord(e, HighWord(p)) \
1098
MultiplyWords(p, A[0], A[1]) \
1100
AssignWord(d, HighWord(p)) \
1103
#define Squ_NonDiag \
1106
#define Squ_SaveAcc(k, i, j) \
1107
Acc3WordsBy2(c, d, e) \
1109
MultiplyWords(p, A[i], A[j]) \
1111
AssignWord(d, HighWord(p)) \
1113
#define Squ_Acc(i, j) \
1114
MulAcc(c, d, A[i], A[j])
1116
#define Squ_Diag(i) \
1118
MulAcc(c, d, A[i], A[i])
1120
#define Squ_End(n) \
1121
Acc3WordsBy2(c, d, e) \
1123
MultiplyWords(p, A[n-1], A[n-1])\
1124
Acc2WordsBy2(p, e) \
1125
R[2*n-2] = LowWord(p); \
1126
R[2*n-1] = HighWord(p);
1128
void Baseline_Multiply2(word *R, const word *A, const word *B)
1133
void Baseline_Multiply4(word *R, const word *A, const word *B)
1138
void Baseline_Multiply8(word *R, const word *A, const word *B)
1143
void Baseline_Square2(word *R, const word *A)
1148
void Baseline_Square4(word *R, const word *A)
1153
void Baseline_Square8(word *R, const word *A)
1158
void Baseline_MultiplyBottom2(word *R, const word *A, const word *B)
1163
void Baseline_MultiplyBottom4(word *R, const word *A, const word *B)
1168
void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)
1173
#define Top_Begin(n) \
1177
MultiplyWords(p, A[0], B[n-2]);\
1178
AssignWord(d, HighWord(p));
1180
#define Top_Acc(i, j) \
1181
MultiplyWords(p, A[i], B[j]);\
1182
Acc2WordsBy1(d, HighWord(p));
1184
#define Top_SaveAcc0(i, j) \
1186
AssignWord(d, HighWord(d)) \
1187
MulAcc(c, d, A[i], B[j])
1189
#define Top_SaveAcc1(i, j) \
1191
Acc2WordsBy1(d, c); \
1193
AssignWord(d, HighWord(d)) \
1194
MulAcc(c, d, A[i], B[j])
1196
void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)
1199
Baseline_Multiply2(T, A, B);
1204
void Baseline_MultiplyTop4(word *R, const word *A, const word *B, word L)
1207
Top_Acc(1, 1) Top_Acc(2, 0) \
1208
Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
1209
Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
1210
Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
1214
void Baseline_MultiplyTop8(word *R, const word *A, const word *B, word L)
1217
Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
1218
Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1219
Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
1220
Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
1221
Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
1222
Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
1223
Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
1224
Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
1228
#if !CRYPTOPP_INTEGER_SSE2 // save memory by not compiling these functions when SSE2 is available
1229
void Baseline_Multiply16(word *R, const word *A, const word *B)
1234
void Baseline_Square16(word *R, const word *A)
1239
void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
1244
void Baseline_MultiplyTop16(word *R, const word *A, const word *B, word L)
1247
Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
1248
Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
1249
Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
1250
Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
1251
Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
1252
Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
1253
Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
1254
Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
1255
Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
1256
Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
1257
Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
1258
Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
1259
Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
1260
Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
1261
Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
1262
Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
1267
// ********************************************************
1269
#if CRYPTOPP_INTEGER_SSE2
1271
CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
1289
#define SSE2_FinalSave(k) \
1290
AS2( psllq xmm5, 16) \
1291
AS2( paddq xmm4, xmm5) \
1292
AS2( movq QWORD PTR [ecx+8*(k)], xmm4)
1294
#define SSE2_SaveShift(k) \
1295
AS2( movq xmm0, xmm6) \
1296
AS2( punpckhqdq xmm6, xmm0) \
1297
AS2( movq xmm1, xmm7) \
1298
AS2( punpckhqdq xmm7, xmm1) \
1299
AS2( paddd xmm6, xmm0) \
1300
AS2( pslldq xmm6, 4) \
1301
AS2( paddd xmm7, xmm1) \
1302
AS2( paddd xmm4, xmm6) \
1303
AS2( pslldq xmm7, 4) \
1304
AS2( movq xmm6, xmm4) \
1305
AS2( paddd xmm5, xmm7) \
1306
AS2( movq xmm7, xmm5) \
1307
AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \
1308
AS2( psrlq xmm6, 16) \
1309
AS2( paddq xmm6, xmm7) \
1310
AS2( punpckhqdq xmm4, xmm0) \
1311
AS2( punpckhqdq xmm5, xmm0) \
1312
AS2( movq QWORD PTR [ecx+8*(k)+2], xmm6) \
1313
AS2( psrlq xmm6, 3*16) \
1314
AS2( paddd xmm4, xmm6) \
1316
#define Squ_SSE2_SaveShift(k) \
1317
AS2( movq xmm0, xmm6) \
1318
AS2( punpckhqdq xmm6, xmm0) \
1319
AS2( movq xmm1, xmm7) \
1320
AS2( punpckhqdq xmm7, xmm1) \
1321
AS2( paddd xmm6, xmm0) \
1322
AS2( pslldq xmm6, 4) \
1323
AS2( paddd xmm7, xmm1) \
1324
AS2( paddd xmm4, xmm6) \
1325
AS2( pslldq xmm7, 4) \
1326
AS2( movhlps xmm6, xmm4) \
1327
AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \
1328
AS2( paddd xmm5, xmm7) \
1329
AS2( movhps QWORD PTR [esp+12], xmm5)\
1330
AS2( psrlq xmm4, 16) \
1331
AS2( paddq xmm4, xmm5) \
1332
AS2( movq QWORD PTR [ecx+8*(k)+2], xmm4) \
1333
AS2( psrlq xmm4, 3*16) \
1334
AS2( paddd xmm4, xmm6) \
1335
AS2( movq QWORD PTR [esp+4], xmm4)\
1337
#define SSE2_FirstMultiply(i) \
1338
AS2( movdqa xmm7, [esi+(i)*16])\
1339
AS2( movdqa xmm5, [edi-(i)*16])\
1340
AS2( pmuludq xmm5, xmm7) \
1341
AS2( movdqa xmm4, [ebx])\
1342
AS2( movdqa xmm6, xmm4) \
1343
AS2( pand xmm4, xmm5) \
1344
AS2( psrld xmm5, 16) \
1345
AS2( pmuludq xmm7, [edx-(i)*16])\
1346
AS2( pand xmm6, xmm7) \
1347
AS2( psrld xmm7, 16)
1349
#define Squ_Begin(n) \
1352
AS2( and esp, 0xfffffff0)\
1353
AS2( lea edi, [esp-32*n])\
1354
AS2( sub esp, 32*n+16)\
1356
AS2( mov esi, edi) \
1357
AS2( xor edx, edx) \
1359
ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
1360
ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
1361
AS2( movdqa [edi+2*edx], xmm0) \
1362
AS2( psrlq xmm0, 32) \
1363
AS2( movdqa [edi+2*edx+16], xmm0) \
1364
AS2( movdqa [edi+16*n+2*edx], xmm1) \
1365
AS2( psrlq xmm1, 32) \
1366
AS2( movdqa [edi+16*n+2*edx+16], xmm1) \
1368
AS2( cmp edx, 8*(n)) \
1370
AS2( lea edx, [edi+16*n])\
1371
SSE2_FirstMultiply(0) \
1373
#define Squ_Acc(i) \
1375
AS2( movdqa xmm1, [esi+(i)*16]) \
1376
AS2( movdqa xmm0, [edi-(i)*16]) \
1377
AS2( movdqa xmm2, [ebx]) \
1378
AS2( pmuludq xmm0, xmm1) \
1379
AS2( pmuludq xmm1, [edx-(i)*16]) \
1380
AS2( movdqa xmm3, xmm2) \
1381
AS2( pand xmm2, xmm0) \
1382
AS2( psrld xmm0, 16) \
1383
AS2( paddd xmm4, xmm2) \
1384
AS2( paddd xmm5, xmm0) \
1385
AS2( pand xmm3, xmm1) \
1386
AS2( psrld xmm1, 16) \
1387
AS2( paddd xmm6, xmm3) \
1388
AS2( paddd xmm7, xmm1) \
1391
#define Squ_Acc2(i) ASC(call, LSqu##i)
1392
#define Squ_Acc3(i) Squ_Acc2(i)
1393
#define Squ_Acc4(i) Squ_Acc2(i)
1394
#define Squ_Acc5(i) Squ_Acc2(i)
1395
#define Squ_Acc6(i) Squ_Acc2(i)
1396
#define Squ_Acc7(i) Squ_Acc2(i)
1397
#define Squ_Acc8(i) Squ_Acc2(i)
1399
#define SSE2_End(E, n) \
1400
SSE2_SaveShift(2*(n)-3) \
1401
AS2( movdqa xmm7, [esi+16]) \
1402
AS2( movdqa xmm0, [edi]) \
1403
AS2( pmuludq xmm0, xmm7) \
1404
AS2( movdqa xmm2, [ebx]) \
1405
AS2( pmuludq xmm7, [edx]) \
1406
AS2( movdqa xmm6, xmm2) \
1407
AS2( pand xmm2, xmm0) \
1408
AS2( psrld xmm0, 16) \
1409
AS2( paddd xmm4, xmm2) \
1410
AS2( paddd xmm5, xmm0) \
1411
AS2( pand xmm6, xmm7) \
1412
AS2( psrld xmm7, 16) \
1413
SSE2_SaveShift(2*(n)-2) \
1414
SSE2_FinalSave(2*(n)-1) \
1418
#define Squ_End(n) SSE2_End(SquEpilogue, n)
1419
#define Mul_End(n) SSE2_End(MulEpilogue, n)
1420
#define Top_End(n) SSE2_End(TopEpilogue, n)
1422
#define Squ_Column1(k, i) \
1423
Squ_SSE2_SaveShift(k) \
1425
SSE2_FirstMultiply(1)\
1427
AS2( paddd xmm4, xmm4) \
1428
AS2( paddd xmm5, xmm5) \
1429
AS2( movdqa xmm3, [esi]) \
1430
AS2( movq xmm1, QWORD PTR [esi+8]) \
1431
AS2( pmuludq xmm1, xmm3) \
1432
AS2( pmuludq xmm3, xmm3) \
1433
AS2( movdqa xmm0, [ebx])\
1434
AS2( movdqa xmm2, xmm0) \
1435
AS2( pand xmm0, xmm1) \
1436
AS2( psrld xmm1, 16) \
1437
AS2( paddd xmm6, xmm0) \
1438
AS2( paddd xmm7, xmm1) \
1439
AS2( pand xmm2, xmm3) \
1440
AS2( psrld xmm3, 16) \
1441
AS2( paddd xmm6, xmm6) \
1442
AS2( paddd xmm7, xmm7) \
1443
AS2( paddd xmm4, xmm2) \
1444
AS2( paddd xmm5, xmm3) \
1445
AS2( movq xmm0, QWORD PTR [esp+4])\
1446
AS2( movq xmm1, QWORD PTR [esp+12])\
1447
AS2( paddd xmm4, xmm0)\
1448
AS2( paddd xmm5, xmm1)\
1450
#define Squ_Column0(k, i) \
1451
Squ_SSE2_SaveShift(k) \
1454
SSE2_FirstMultiply(1)\
1456
AS2( paddd xmm6, xmm6) \
1457
AS2( paddd xmm7, xmm7) \
1458
AS2( paddd xmm4, xmm4) \
1459
AS2( paddd xmm5, xmm5) \
1460
AS2( movq xmm0, QWORD PTR [esp+4])\
1461
AS2( movq xmm1, QWORD PTR [esp+12])\
1462
AS2( paddd xmm4, xmm0)\
1463
AS2( paddd xmm5, xmm1)\
1465
#define SSE2_MulAdd45 \
1466
AS2( movdqa xmm7, [esi]) \
1467
AS2( movdqa xmm0, [edi]) \
1468
AS2( pmuludq xmm0, xmm7) \
1469
AS2( movdqa xmm2, [ebx]) \
1470
AS2( pmuludq xmm7, [edx]) \
1471
AS2( movdqa xmm6, xmm2) \
1472
AS2( pand xmm2, xmm0) \
1473
AS2( psrld xmm0, 16) \
1474
AS2( paddd xmm4, xmm2) \
1475
AS2( paddd xmm5, xmm0) \
1476
AS2( pand xmm6, xmm7) \
1477
AS2( psrld xmm7, 16)
1479
#define Mul_Begin(n) \
1482
AS2( and esp, 0xfffffff0)\
1483
AS2( sub esp, 48*n+16)\
1485
AS2( xor edx, edx) \
1487
ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
1488
ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
1489
ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
1490
AS2( movdqa [esp+20+2*edx], xmm0) \
1491
AS2( psrlq xmm0, 32) \
1492
AS2( movdqa [esp+20+2*edx+16], xmm0) \
1493
AS2( movdqa [esp+20+16*n+2*edx], xmm1) \
1494
AS2( psrlq xmm1, 32) \
1495
AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \
1496
AS2( movdqa [esp+20+32*n+2*edx], xmm2) \
1497
AS2( psrlq xmm2, 32) \
1498
AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \
1500
AS2( cmp edx, 8*(n)) \
1502
AS2( lea edi, [esp+20])\
1503
AS2( lea edx, [esp+20+16*n])\
1504
AS2( lea esi, [esp+20+32*n])\
1505
SSE2_FirstMultiply(0) \
1507
#define Mul_Acc(i) \
1509
AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \
1510
AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \
1511
AS2( movdqa xmm2, [ebx]) \
1512
AS2( pmuludq xmm0, xmm1) \
1513
AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
1514
AS2( movdqa xmm3, xmm2) \
1515
AS2( pand xmm2, xmm0) \
1516
AS2( psrld xmm0, 16) \
1517
AS2( paddd xmm4, xmm2) \
1518
AS2( paddd xmm5, xmm0) \
1519
AS2( pand xmm3, xmm1) \
1520
AS2( psrld xmm1, 16) \
1521
AS2( paddd xmm6, xmm3) \
1522
AS2( paddd xmm7, xmm1) \
1525
#define Mul_Acc2(i) ASC(call, LMul##i)
1526
#define Mul_Acc3(i) Mul_Acc2(i)
1527
#define Mul_Acc4(i) Mul_Acc2(i)
1528
#define Mul_Acc5(i) Mul_Acc2(i)
1529
#define Mul_Acc6(i) Mul_Acc2(i)
1530
#define Mul_Acc7(i) Mul_Acc2(i)
1531
#define Mul_Acc8(i) Mul_Acc2(i)
1532
#define Mul_Acc9(i) Mul_Acc2(i)
1533
#define Mul_Acc10(i) Mul_Acc2(i)
1534
#define Mul_Acc11(i) Mul_Acc2(i)
1535
#define Mul_Acc12(i) Mul_Acc2(i)
1536
#define Mul_Acc13(i) Mul_Acc2(i)
1537
#define Mul_Acc14(i) Mul_Acc2(i)
1538
#define Mul_Acc15(i) Mul_Acc2(i)
1539
#define Mul_Acc16(i) Mul_Acc2(i)
1541
#define Mul_Column1(k, i) \
1547
#define Mul_Column0(k, i) \
1554
#define Bot_Acc(i) \
1555
AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \
1556
AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \
1557
AS2( pmuludq xmm0, xmm1) \
1558
AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
1559
AS2( paddq xmm4, xmm0) \
1560
AS2( paddd xmm6, xmm1)
1562
#define Bot_SaveAcc(k) \
1566
AS2( movdqa xmm6, [esi]) \
1567
AS2( movdqa xmm0, [edi]) \
1568
AS2( pmuludq xmm0, xmm6) \
1569
AS2( paddq xmm4, xmm0) \
1570
AS2( psllq xmm5, 16) \
1571
AS2( paddq xmm4, xmm5) \
1572
AS2( pmuludq xmm6, [edx])
1574
#define Bot_End(n) \
1575
AS2( movhlps xmm7, xmm6) \
1576
AS2( paddd xmm6, xmm7) \
1577
AS2( psllq xmm6, 32) \
1578
AS2( paddd xmm4, xmm6) \
1579
AS2( movq QWORD PTR [ecx+8*((n)-1)], xmm4) \
1583
#define Top_Begin(n) \
1586
AS2( and esp, 0xfffffff0)\
1587
AS2( sub esp, 48*n+16)\
1589
AS2( xor edx, edx) \
1591
ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
1592
ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
1593
ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
1594
AS2( movdqa [esp+20+2*edx], xmm0) \
1595
AS2( psrlq xmm0, 32) \
1596
AS2( movdqa [esp+20+2*edx+16], xmm0) \
1597
AS2( movdqa [esp+20+16*n+2*edx], xmm1) \
1598
AS2( psrlq xmm1, 32) \
1599
AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \
1600
AS2( movdqa [esp+20+32*n+2*edx], xmm2) \
1601
AS2( psrlq xmm2, 32) \
1602
AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \
1604
AS2( cmp edx, 8*(n)) \
1606
AS2( mov eax, esi) \
1607
AS2( lea edi, [esp+20+00*n+16*(n/2-1)])\
1608
AS2( lea edx, [esp+20+16*n+16*(n/2-1)])\
1609
AS2( lea esi, [esp+20+32*n+16*(n/2-1)])\
1610
AS2( pxor xmm4, xmm4)\
1611
AS2( pxor xmm5, xmm5)
1613
#define Top_Acc(i) \
1614
AS2( movq xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8]) \
1615
AS2( pmuludq xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
1616
AS2( psrlq xmm0, 48) \
1617
AS2( paddd xmm5, xmm0)\
1619
#define Top_Column0(i) \
1620
AS2( psllq xmm5, 32) \
1626
#define Top_Column1(i) \
1632
AS2( movd xmm0, eax)\
1633
AS2( movd xmm1, [ecx+4])\
1634
AS2( psrld xmm1, 16)\
1635
AS2( pcmpgtd xmm1, xmm0)\
1636
AS2( psrld xmm1, 31)\
1637
AS2( paddd xmm4, xmm1)\
1639
void SSE2_Square4(word *C, const word *A)
1646
void SSE2_Square8(word *C, const word *A)
1662
void SSE2_Square16(word *C, const word *A)
1667
Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1686
void SSE2_Square32(word *C, const word *A)
1690
Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1724
void SSE2_Multiply4(word *C, const word *A, const word *B)
1736
void SSE2_Multiply8(word *C, const word *A, const word *B)
1741
Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1752
void SSE2_Multiply16(word *C, const word *A, const word *B)
1757
Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1776
void SSE2_Multiply32(word *C, const word *A, const word *B)
1780
Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1814
void SSE2_MultiplyBottom4(word *C, const word *A, const word *B)
1817
Bot_SaveAcc(0) Bot_Acc(2)
1821
void SSE2_MultiplyBottom8(word *C, const word *A, const word *B)
1826
Mul_Acc(3) Mul_Acc(2)
1831
Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1835
void SSE2_MultiplyBottom16(word *C, const word *A, const word *B)
1840
Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1849
Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1853
void SSE2_MultiplyBottom32(word *C, const word *A, const word *B)
1858
Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1875
Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1879
void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L)
1882
Top_Acc(3) Top_Acc(2) Top_Acc(1)
1885
Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1894
void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L)
1897
Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1900
Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1913
void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
1916
Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1919
Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1940
#endif // #if CRYPTOPP_INTEGER_SSE2
1942
// ********************************************************
1944
typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
1012
1945
typedef void (* PMul)(word *C, const word *A, const word *B);
1946
typedef void (* PSqu)(word *C, const word *A);
1947
typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
1014
static PAddSub s_pAdd, s_pSub;
1015
#ifdef SSE2_INTRINSICS_AVAILABLE
1016
static PMul s_pMul4, s_pMul8, s_pMul8B;
1949
#if CRYPTOPP_INTEGER_SSE2
1950
static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
1951
static size_t s_recursionLimit = 8;
1953
static const size_t s_recursionLimit = 16;
1019
static void SetPentiumFunctionPointers()
1956
static PMul s_pMul[9], s_pBot[9];
1957
static PSqu s_pSqu[9];
1958
static PMulTop s_pTop[9];
1960
static void SetFunctionPointers()
1023
s_pAdd = &P4Optimized::Add;
1024
s_pSub = &P4Optimized::Subtract;
1028
s_pAdd = &PentiumOptimized::Add;
1029
s_pSub = &PentiumOptimized::Subtract;
1962
s_pMul[0] = &Baseline_Multiply2;
1963
s_pBot[0] = &Baseline_MultiplyBottom2;
1964
s_pSqu[0] = &Baseline_Square2;
1965
s_pTop[0] = &Baseline_MultiplyTop2;
1966
s_pTop[1] = &Baseline_MultiplyTop4;
1032
#ifdef SSE2_INTRINSICS_AVAILABLE
1968
#if CRYPTOPP_INTEGER_SSE2
1035
s_pMul4 = &P4Optimized::Multiply4;
1036
s_pMul8 = &P4Optimized::Multiply8;
1037
s_pMul8B = &P4Optimized::Multiply8Bottom;
1971
#if _MSC_VER != 1200 || defined(NDEBUG)
1979
s_recursionLimit = 32;
1981
s_pMul[1] = &SSE2_Multiply4;
1982
s_pMul[2] = &SSE2_Multiply8;
1983
s_pMul[4] = &SSE2_Multiply16;
1984
s_pMul[8] = &SSE2_Multiply32;
1986
s_pBot[1] = &SSE2_MultiplyBottom4;
1987
s_pBot[2] = &SSE2_MultiplyBottom8;
1988
s_pBot[4] = &SSE2_MultiplyBottom16;
1989
s_pBot[8] = &SSE2_MultiplyBottom32;
1991
s_pSqu[1] = &SSE2_Square4;
1992
s_pSqu[2] = &SSE2_Square8;
1993
s_pSqu[4] = &SSE2_Square16;
1994
s_pSqu[8] = &SSE2_Square32;
1996
s_pTop[2] = &SSE2_MultiplyTop8;
1997
s_pTop[4] = &SSE2_MultiplyTop16;
1998
s_pTop[8] = &SSE2_MultiplyTop32;
1041
s_pMul4 = &PentiumOptimized::Multiply4;
1042
s_pMul8 = &PentiumOptimized::Multiply8;
1043
s_pMul8B = &PentiumOptimized::Multiply8Bottom;
2003
s_pMul[1] = &Baseline_Multiply4;
2004
s_pMul[2] = &Baseline_Multiply8;
2006
s_pBot[1] = &Baseline_MultiplyBottom4;
2007
s_pBot[2] = &Baseline_MultiplyBottom8;
2009
s_pSqu[1] = &Baseline_Square4;
2010
s_pSqu[2] = &Baseline_Square8;
2012
s_pTop[2] = &Baseline_MultiplyTop8;
2014
#if !CRYPTOPP_INTEGER_SSE2
2015
s_pMul[4] = &Baseline_Multiply16;
2016
s_pBot[4] = &Baseline_MultiplyBottom16;
2017
s_pSqu[4] = &Baseline_Square16;
2018
s_pTop[4] = &Baseline_MultiplyTop16;
1050
s_sse2Enabled = false;
1051
SetPentiumFunctionPointers();
1054
class LowLevel : public PentiumOptimized
1057
inline static int Add(word *C, const word *A, const word *B, size_t N)
1058
{return s_pAdd(C, A, B, N);}
1059
inline static int Subtract(word *C, const word *A, const word *B, size_t N)
1060
{return s_pSub(C, A, B, N);}
1061
inline static void Square4(word *R, const word *A)
1062
{Multiply4(R, A, A);}
1063
#ifdef SSE2_INTRINSICS_AVAILABLE
1064
inline static void Multiply4(word *C, const word *A, const word *B)
1066
inline static void Multiply8(word *C, const word *A, const word *B)
1068
inline static void Multiply8Bottom(word *C, const word *A, const word *B)
1069
{s_pMul8B(C, A, B);}
1073
// use some tricks to share assembly code between MSVC and GCC
1075
#define CRYPTOPP_NAKED __declspec(naked)
1076
#define AS1(x) __asm x
1077
#define AS2(x, y) __asm x, y
1078
#define AddPrologue \
1083
__asm mov ecx, [esp+20] \
1084
__asm mov edx, [esp+24] \
1085
__asm mov ebx, [esp+28] \
1086
__asm mov esi, [esp+32]
1087
#define AddEpilogue \
1093
#define MulPrologue \
1098
__asm mov ecx, [esp+28] \
1099
__asm mov esi, [esp+24] \
1101
#define MulEpilogue \
1109
#define CRYPTOPP_NAKED
1110
#define AS1(x) #x ";"
1111
#define AS2(x, y) #x ", " #y ";"
1112
#define AddPrologue \
1113
__asm__ __volatile__ \
1115
"push %%ebx;" /* save this manually, in case of -fPIC */ \
1117
".intel_syntax noprefix;" \
1119
#define AddEpilogue \
1121
".att_syntax prefix;" \
1124
: "c" (C), "d" (A), "m" (B), "S" (N) \
1125
: "%edi", "memory", "cc" \
1127
#define MulPrologue \
1128
__asm__ __volatile__ \
1130
"push %%ebx;" /* save this manually, in case of -fPIC */ \
1133
".intel_syntax noprefix;"
1134
#define MulEpilogue \
1138
".att_syntax prefix;" \
1140
: "rm" (Z), "S" (X), "c" (Y) \
1141
: "%eax", "%edx", "%edi", "memory", "cc" \
1145
CRYPTOPP_NAKED int PentiumOptimized::Add(word *C, const word *A, const word *B, size_t N)
1149
// now: ebx = B, ecx = C, edx = A, esi = N
1150
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
1151
AS2( xor eax, eax) // clear eax
1153
AS2( sub eax, esi) // eax is a negative index from end of B
1154
AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
1156
AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag
1157
AS1( jz loopendAdd) // if no dwords then nothing to do
1160
AS2( mov esi,[edx]) // load lower word of A
1161
AS2( mov ebp,[edx+4]) // load higher word of A
1163
AS2( mov edi,[ebx+8*eax]) // load lower word of B
1164
AS2( lea edx,[edx+8]) // advance A and C
1166
AS2( adc esi,edi) // add lower words
1167
AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
1169
AS2( adc ebp,edi) // add higher words
1170
AS1( inc eax) // advance B
1172
AS2( mov [edx+ecx-8],esi) // store lower word result
1173
AS2( mov [edx+ecx-4],ebp) // store higher word result
1175
AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero
1178
AS2( adc eax, 0) // store carry into eax (return result register)
1183
CRYPTOPP_NAKED int PentiumOptimized::Subtract(word *C, const word *A, const word *B, size_t N)
1187
// now: ebx = B, ecx = C, edx = A, esi = N
1188
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
1189
AS2( xor eax, eax) // clear eax
1191
AS2( sub eax, esi) // eax is a negative index from end of B
1192
AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
1194
AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag
1195
AS1( jz loopendSub) // if no dwords then nothing to do
1198
AS2( mov esi,[edx]) // load lower word of A
1199
AS2( mov ebp,[edx+4]) // load higher word of A
1201
AS2( mov edi,[ebx+8*eax]) // load lower word of B
1202
AS2( lea edx,[edx+8]) // advance A and C
1204
AS2( sbb esi,edi) // subtract lower words
1205
AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
1207
AS2( sbb ebp,edi) // subtract higher words
1208
AS1( inc eax) // advance B
1210
AS2( mov [edx+ecx-8],esi) // store lower word result
1211
AS2( mov [edx+ecx-4],ebp) // store higher word result
1213
AS1( jnz loopstartSub) // loop until eax overflows and becomes zero
1216
AS2( adc eax, 0) // store carry into eax (return result register)
1221
// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
1223
CRYPTOPP_NAKED int P4Optimized::Add(word *C, const word *A, const word *B, size_t N)
1227
// now: ebx = B, ecx = C, edx = A, esi = N
1230
AS1( jz loopendAddP4) // if no dwords then nothing to do
1232
AS2( mov edi, [edx])
1233
AS2( mov ebp, [ebx])
1234
AS1( jmp carry1AddP4)
1236
AS1(loopstartAddP4:)
1237
AS2( mov edi, [edx+8])
1240
AS2( mov ebp, [ebx])
1242
AS1( jc carry1AddP4)
1248
AS2( mov [ecx], edi)
1249
AS2( mov edi, [edx+4])
1250
AS2( cmovc eax, ebp)
1251
AS2( mov ebp, [ebx+4])
1254
AS1( jc carry2AddP4)
1260
AS2( cmovc eax, ebp)
1261
AS2( mov [ecx+4], edi)
1263
AS1( jnz loopstartAddP4)
1270
CRYPTOPP_NAKED int P4Optimized::Subtract(word *C, const word *A, const word *B, size_t N)
1274
// now: ebx = B, ecx = C, edx = A, esi = N
1277
AS1( jz loopendSubP4) // if no dwords then nothing to do
1279
AS2( mov edi, [edx])
1280
AS2( mov ebp, [ebx])
1281
AS1( jmp carry1SubP4)
1283
AS1(loopstartSubP4:)
1284
AS2( mov edi, [edx+8])
1287
AS2( mov ebp, [ebx])
1289
AS1( jc carry1SubP4)
1295
AS2( mov [ecx], edi)
1296
AS2( mov edi, [edx+4])
1297
AS2( cmovc eax, ebp)
1298
AS2( mov ebp, [ebx+4])
1301
AS1( jc carry2SubP4)
1307
AS2( cmovc eax, ebp)
1308
AS2( mov [ecx+4], edi)
1310
AS1( jnz loopstartSubP4)
1317
// multiply assembly code originally contributed by Leonard Janke
1319
#define MulStartup \
1324
#define MulShiftCarry \
1329
#define MulAccumulateBottom(i,j) \
1330
AS2(mov eax, [ecx+4*j]) \
1331
AS2(imul eax, dword ptr [esi+4*i]) \
1334
#define MulAccumulate(i,j) \
1335
AS2(mov eax, [ecx+4*j]) \
1336
AS1(mul dword ptr [esi+4*i]) \
1341
#define MulStoreDigit(i) \
1343
AS2(mov edi, [esp]) \
1344
AS2(mov [edi+4*i], ebp)
1346
#define MulLastDiagonal(digits) \
1347
AS2(mov eax, [ecx+4*(digits-1)]) \
1348
AS1(mul dword ptr [esi+4*(digits-1)]) \
1351
AS2(mov edi, [esp]) \
1352
AS2(mov [edi+4*(2*digits-2)], ebp) \
1353
AS2(mov [edi+4*(2*digits-1)], edx)
1355
CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
1358
// now: [esp] = Z, esi = X, ecx = Y
1397
CRYPTOPP_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
1400
// now: [esp] = Z, esi = X, ecx = Y
1511
CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y)
1514
// now: [esp] = Z, esi = X, ecx = Y
1565
MulAccumulateBottom(7,0)
1566
MulAccumulateBottom(6,1)
1567
MulAccumulateBottom(5,2)
1568
MulAccumulateBottom(4,3)
1569
MulAccumulateBottom(3,4)
1570
MulAccumulateBottom(2,5)
1571
MulAccumulateBottom(1,6)
1572
MulAccumulateBottom(0,7)
1580
#else // not x86 - no processor specific code at this layer
1582
typedef Portable LowLevel;
1586
#ifdef SSE2_INTRINSICS_AVAILABLE
1589
#define CRYPTOPP_FASTCALL
1591
#define CRYPTOPP_FASTCALL __fastcall
1594
static void CRYPTOPP_FASTCALL P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)
1596
__m128i a3210 = _mm_load_si128(A);
1597
__m128i b3210 = _mm_load_si128(B);
1601
__m128i z = _mm_setzero_si128();
1602
__m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);
1605
__m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
1606
__m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
1607
__m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
1608
__m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
1609
__m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
1610
C[1] = _mm_add_epi64(a1b0, a0b1);
1612
__m128i a31 = _mm_srli_epi64(a3210, 32);
1613
__m128i b31 = _mm_srli_epi64(b3210, 32);
1614
__m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);
1617
__m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
1618
__m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
1619
__m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
1620
__m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
1621
__m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
1622
sum = _mm_add_epi64(a1b1, a0b2);
1623
C[2] = _mm_add_epi64(sum, a2b0);
1625
__m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
1626
__m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
1627
__m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
1628
__m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
1629
__m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
1630
__m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
1631
__m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
1632
__m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
1633
__m128i sum1 = _mm_add_epi64(a3b0, a1b2);
1634
sum = _mm_add_epi64(a2b1, a0b3);
1635
C[3] = _mm_add_epi64(sum, sum1);
1637
__m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
1638
__m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
1639
__m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
1640
__m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
1641
sum = _mm_add_epi64(a2b2, a3b1);
1642
C[4] = _mm_add_epi64(sum, a1b3);
1644
__m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
1645
__m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
1646
__m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
1647
__m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
1648
__m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
1649
C[5] = _mm_add_epi64(a3b2, a2b3);
1652
void P4Optimized::Multiply4(word *C, const word *A, const word *B)
1655
const word *w = (word *)temp;
1656
const __m64 *mw = (__m64 *)w;
1658
P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1664
__m64 w1 = _mm_cvtsi32_si64(w[1]);
1675
__m64 w26 = _mm_cvtsi32_si64(w[26]);
1677
s1 = _mm_add_si64(w1, w4);
1678
C[1] = _mm_cvtsi64_si32(s1);
1679
s1 = _mm_srli_si64(s1, 32);
1681
s2 = _mm_add_si64(w6, w8);
1682
s1 = _mm_add_si64(s1, s2);
1683
C[2] = _mm_cvtsi64_si32(s1);
1684
s1 = _mm_srli_si64(s1, 32);
1686
s2 = _mm_add_si64(w10, w12);
1687
s1 = _mm_add_si64(s1, s2);
1688
C[3] = _mm_cvtsi64_si32(s1);
1689
s1 = _mm_srli_si64(s1, 32);
1691
s2 = _mm_add_si64(w14, w16);
1692
s1 = _mm_add_si64(s1, s2);
1693
C[4] = _mm_cvtsi64_si32(s1);
1694
s1 = _mm_srli_si64(s1, 32);
1696
s2 = _mm_add_si64(w18, w20);
1697
s1 = _mm_add_si64(s1, s2);
1698
C[5] = _mm_cvtsi64_si32(s1);
1699
s1 = _mm_srli_si64(s1, 32);
1701
s2 = _mm_add_si64(w22, w26);
1702
s1 = _mm_add_si64(s1, s2);
1703
C[6] = _mm_cvtsi64_si32(s1);
1704
s1 = _mm_srli_si64(s1, 32);
1706
C[7] = _mm_cvtsi64_si32(s1) + w[27];
1710
void P4Optimized::Multiply8(word *C, const word *A, const word *B)
1713
const word *w = (word *)temp;
1714
const __m64 *mw = (__m64 *)w;
1715
const word *x = (word *)temp+7*4;
1716
const __m64 *mx = (__m64 *)x;
1717
const word *y = (word *)temp+7*4*2;
1718
const __m64 *my = (__m64 *)y;
1719
const word *z = (word *)temp+7*4*3;
1720
const __m64 *mz = (__m64 *)z;
1722
P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1724
P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
1726
P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
1728
P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
1732
__m64 s1, s2, s3, s4;
1734
__m64 w1 = _mm_cvtsi32_si64(w[1]);
1745
__m64 w26 = _mm_cvtsi32_si64(w[26]);
1746
__m64 w27 = _mm_cvtsi32_si64(w[27]);
1748
__m64 x0 = _mm_cvtsi32_si64(x[0]);
1749
__m64 x1 = _mm_cvtsi32_si64(x[1]);
1760
__m64 x26 = _mm_cvtsi32_si64(x[26]);
1761
__m64 x27 = _mm_cvtsi32_si64(x[27]);
1763
__m64 y0 = _mm_cvtsi32_si64(y[0]);
1764
__m64 y1 = _mm_cvtsi32_si64(y[1]);
1775
__m64 y26 = _mm_cvtsi32_si64(y[26]);
1776
__m64 y27 = _mm_cvtsi32_si64(y[27]);
1778
__m64 z0 = _mm_cvtsi32_si64(z[0]);
1779
__m64 z1 = _mm_cvtsi32_si64(z[1]);
1790
__m64 z26 = _mm_cvtsi32_si64(z[26]);
1792
s1 = _mm_add_si64(w1, w4);
1793
C[1] = _mm_cvtsi64_si32(s1);
1794
s1 = _mm_srli_si64(s1, 32);
1796
s2 = _mm_add_si64(w6, w8);
1797
s1 = _mm_add_si64(s1, s2);
1798
C[2] = _mm_cvtsi64_si32(s1);
1799
s1 = _mm_srli_si64(s1, 32);
1801
s2 = _mm_add_si64(w10, w12);
1802
s1 = _mm_add_si64(s1, s2);
1803
C[3] = _mm_cvtsi64_si32(s1);
1804
s1 = _mm_srli_si64(s1, 32);
1806
s3 = _mm_add_si64(x0, y0);
1807
s2 = _mm_add_si64(w14, w16);
1808
s1 = _mm_add_si64(s1, s3);
1809
s1 = _mm_add_si64(s1, s2);
1810
C[4] = _mm_cvtsi64_si32(s1);
1811
s1 = _mm_srli_si64(s1, 32);
1813
s3 = _mm_add_si64(x1, y1);
1814
s4 = _mm_add_si64(x4, y4);
1815
s1 = _mm_add_si64(s1, w18);
1816
s3 = _mm_add_si64(s3, s4);
1817
s1 = _mm_add_si64(s1, w20);
1818
s1 = _mm_add_si64(s1, s3);
1819
C[5] = _mm_cvtsi64_si32(s1);
1820
s1 = _mm_srli_si64(s1, 32);
1822
s3 = _mm_add_si64(x6, y6);
1823
s4 = _mm_add_si64(x8, y8);
1824
s1 = _mm_add_si64(s1, w22);
1825
s3 = _mm_add_si64(s3, s4);
1826
s1 = _mm_add_si64(s1, w26);
1827
s1 = _mm_add_si64(s1, s3);
1828
C[6] = _mm_cvtsi64_si32(s1);
1829
s1 = _mm_srli_si64(s1, 32);
1831
s3 = _mm_add_si64(x10, y10);
1832
s4 = _mm_add_si64(x12, y12);
1833
s1 = _mm_add_si64(s1, w27);
1834
s3 = _mm_add_si64(s3, s4);
1835
s1 = _mm_add_si64(s1, s3);
1836
C[7] = _mm_cvtsi64_si32(s1);
1837
s1 = _mm_srli_si64(s1, 32);
1839
s3 = _mm_add_si64(x14, y14);
1840
s4 = _mm_add_si64(x16, y16);
1841
s1 = _mm_add_si64(s1, z0);
1842
s3 = _mm_add_si64(s3, s4);
1843
s1 = _mm_add_si64(s1, s3);
1844
C[8] = _mm_cvtsi64_si32(s1);
1845
s1 = _mm_srli_si64(s1, 32);
1847
s3 = _mm_add_si64(x18, y18);
1848
s4 = _mm_add_si64(x20, y20);
1849
s1 = _mm_add_si64(s1, z1);
1850
s3 = _mm_add_si64(s3, s4);
1851
s1 = _mm_add_si64(s1, z4);
1852
s1 = _mm_add_si64(s1, s3);
1853
C[9] = _mm_cvtsi64_si32(s1);
1854
s1 = _mm_srli_si64(s1, 32);
1856
s3 = _mm_add_si64(x22, y22);
1857
s4 = _mm_add_si64(x26, y26);
1858
s1 = _mm_add_si64(s1, z6);
1859
s3 = _mm_add_si64(s3, s4);
1860
s1 = _mm_add_si64(s1, z8);
1861
s1 = _mm_add_si64(s1, s3);
1862
C[10] = _mm_cvtsi64_si32(s1);
1863
s1 = _mm_srli_si64(s1, 32);
1865
s3 = _mm_add_si64(x27, y27);
1866
s1 = _mm_add_si64(s1, z10);
1867
s1 = _mm_add_si64(s1, z12);
1868
s1 = _mm_add_si64(s1, s3);
1869
C[11] = _mm_cvtsi64_si32(s1);
1870
s1 = _mm_srli_si64(s1, 32);
1872
s3 = _mm_add_si64(z14, z16);
1873
s1 = _mm_add_si64(s1, s3);
1874
C[12] = _mm_cvtsi64_si32(s1);
1875
s1 = _mm_srli_si64(s1, 32);
1877
s3 = _mm_add_si64(z18, z20);
1878
s1 = _mm_add_si64(s1, s3);
1879
C[13] = _mm_cvtsi64_si32(s1);
1880
s1 = _mm_srli_si64(s1, 32);
1882
s3 = _mm_add_si64(z22, z26);
1883
s1 = _mm_add_si64(s1, s3);
1884
C[14] = _mm_cvtsi64_si32(s1);
1885
s1 = _mm_srli_si64(s1, 32);
1887
C[15] = z[27] + _mm_cvtsi64_si32(s1);
1891
// Computes the low half of an 8x8-word product: C[0..7] = (A * B) mod 2^(8*WORD_BITS).
// Strategy: three 4x4-word partial products (computed by P4_Mul into one scratch
// buffer, viewed through the w/x/y aliases below), then a Comba-style column
// accumulation using 64-bit MMX registers — the low 32 bits of each column sum
// are stored into C[i] and the high bits are shifted down as the carry into the
// next column.
//
// NOTE(review): this extraction is incomplete — gaps in the interleaved original
// line numbers show that the declaration of `temp`, the store of C[0], the
// function braces, and many `__m64` loads (w4..w27, x4..x16, y4..y16, ...) were
// dropped. Restore those lines from the upstream source before compiling.
void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
1894
// w, x, y alias three consecutive 7-element regions of the scratch buffer,
// holding the three P4_Mul partial products (word view and __m64 view of each).
const word *w = (word *)temp;
1895
const __m64 *mw = (__m64 *)w;
1896
const word *x = (word *)temp+7*4;
1897
const __m64 *mx = (__m64 *)x;
1898
const word *y = (word *)temp+7*4*2;
1899
const __m64 *my = (__m64 *)y;
1901
// w = A(low4) * B(low4); x = A(high4) * B(low4); y = A(low4) * B(high4).
// The A(high)*B(high) product is not needed for the bottom half.
P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1903
P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
1905
P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
1909
// s1 carries the running column sum; s2..s4 pair up addends to shorten
// the dependency chain on the P4's MMX adder.
__m64 s1, s2, s3, s4;
1911
__m64 w1 = _mm_cvtsi32_si64(w[1]);
1922
__m64 w26 = _mm_cvtsi32_si64(w[26]);
1924
__m64 x0 = _mm_cvtsi32_si64(x[0]);
1925
__m64 x1 = _mm_cvtsi32_si64(x[1]);
1930
__m64 y0 = _mm_cvtsi32_si64(y[0]);
1931
__m64 y1 = _mm_cvtsi32_si64(y[1]);
1936
// Column 1: store low 32 bits, keep the carry in s1's high bits.
s1 = _mm_add_si64(w1, w4);
1937
C[1] = _mm_cvtsi64_si32(s1);
1938
s1 = _mm_srli_si64(s1, 32);
1940
// Column 2.
s2 = _mm_add_si64(w6, w8);
1941
s1 = _mm_add_si64(s1, s2);
1942
C[2] = _mm_cvtsi64_si32(s1);
1943
s1 = _mm_srli_si64(s1, 32);
1945
// Column 3.
s2 = _mm_add_si64(w10, w12);
1946
s1 = _mm_add_si64(s1, s2);
1947
C[3] = _mm_cvtsi64_si32(s1);
1948
s1 = _mm_srli_si64(s1, 32);
1950
// Column 4: first column where the x and y partial products contribute.
s3 = _mm_add_si64(x0, y0);
1951
s2 = _mm_add_si64(w14, w16);
1952
s1 = _mm_add_si64(s1, s3);
1953
s1 = _mm_add_si64(s1, s2);
1954
C[4] = _mm_cvtsi64_si32(s1);
1955
s1 = _mm_srli_si64(s1, 32);
1957
// Column 5.
s3 = _mm_add_si64(x1, y1);
1958
s4 = _mm_add_si64(x4, y4);
1959
s1 = _mm_add_si64(s1, w18);
1960
s3 = _mm_add_si64(s3, s4);
1961
s1 = _mm_add_si64(s1, w20);
1962
s1 = _mm_add_si64(s1, s3);
1963
C[5] = _mm_cvtsi64_si32(s1);
1964
s1 = _mm_srli_si64(s1, 32);
1966
// Column 6.
s3 = _mm_add_si64(x6, y6);
1967
s4 = _mm_add_si64(x8, y8);
1968
s1 = _mm_add_si64(s1, w22);
1969
s3 = _mm_add_si64(s3, s4);
1970
s1 = _mm_add_si64(s1, w26);
1971
s1 = _mm_add_si64(s1, s3);
1972
C[6] = _mm_cvtsi64_si32(s1);
1973
s1 = _mm_srli_si64(s1, 32);
1975
// Column 7 (top word of the bottom half): any further carry is discarded,
// so plain scalar additions suffice here.
C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
#endif // #ifdef SSE2_INTRINSICS_AVAILABLE
// Adds two N-word integers: C = A + B, returning the carry out (0 or 1).
// Dispatches to the runtime-selected SSE2 routine when that build option is
// enabled, otherwise to the portable baseline implementation.
// (Reconstructed: the extraction had dropped the braces and the #else/#endif
// directives and interleaved stray line-number tokens.)
inline int Add(word *C, const word *A, const word *B, size_t N)
{
#if CRYPTOPP_INTEGER_SSE2
	return s_pAdd(N, C, A, B);
#else
	return Baseline_Add(N, C, A, B);
#endif
}
// Subtracts two N-word integers: C = A - B, returning the borrow out (0 or 1).
// Dispatches to the runtime-selected SSE2 routine when that build option is
// enabled, otherwise to the portable baseline implementation.
// (Reconstructed: the extraction had dropped the braces and the #else/#endif
// directives and interleaved stray line-number tokens.)
inline int Subtract(word *C, const word *A, const word *B, size_t N)
{
#if CRYPTOPP_INTEGER_SSE2
	return s_pSub(N, C, A, B);
#else
	return Baseline_Sub(N, C, A, B);
#endif
}
// ********************************************************

#define A1 (A+N2)