894
834
#undef SaveSquAcc
896
#ifdef CRYPTOPP_X86ASM_AVAILABLE
898
// ************** x86 feature detection ***************
900
static bool s_sse2Enabled = true;
902
static void CpuId(word32 input, word32 *output)
907
// save ebx in case -fPIC is being used
908
"push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
909
: "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
836
// CodeWarrior defines _MSC_VER
837
#if defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86) && (_M_IX86<=700)
839
class PentiumOptimized : public Portable
842
static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N);
843
static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N);
844
// TODO test this with .NET #if _MSC_VER < 1300
845
static inline void Square4(word *R, const word *A)
847
// VC60 workaround: MSVC 6.0 has an optimization bug that makes
848
// (dword)A*B where either A or B has been cast to a dword before
849
// very expensive. Revisit this function when this
856
typedef PentiumOptimized LowLevel;
858
__declspec(naked) word __fastcall PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
867
mov esi, [esp+24] ; N
868
mov ebx, [esp+20] ; B
870
// now: ebx = B, ecx = C, edx = A, esi = N
872
sub ecx, edx // hold the distance between C & A so we can add this to A to get C
873
xor eax, eax // clear eax
875
sub eax, esi // eax is a negative index from end of B
876
lea ebx, [ebx+4*esi] // ebx is end of B
878
sar eax, 1 // unit of eax is now dwords; this also clears the carry flag
879
jz loopend // if no dwords then nothing to do
882
mov esi,[edx] // load lower word of A
883
mov ebp,[edx+4] // load higher word of A
885
mov edi,[ebx+8*eax] // load lower word of B
886
lea edx,[edx+8] // advance A and C
888
adc esi,edi // add lower words
889
mov edi,[ebx+8*eax+4] // load higher word of B
891
adc ebp,edi // add higher words
894
mov [edx+ecx-8],esi // store lower word result
895
mov [edx+ecx-4],ebp // store higher word result
897
jnz loopstart // loop until eax overflows and becomes zero
900
adc eax, 0 // store carry into eax (return result register)
909
__declspec(naked) word __fastcall PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
918
mov esi, [esp+24] ; N
919
mov ebx, [esp+20] ; B
938
mov edi,[ebx+8*eax+4]
958
#ifdef SSE2_INTRINSICS_AVAILABLE
960
static bool GetSSE2Capability()
926
#ifdef SSE2_INTRINSICS_AVAILABLE
928
static jmp_buf s_env;
929
static void SigIllHandler(int)
935
static bool HasSSE2()
942
if ((cpuid[3] & (1 << 26)) == 0)
948
__asm xorpd xmm0, xmm0 // executing SSE2 instruction
956
typedef void (*SigHandler)(int);
958
SigHandler oldHandler = signal(SIGILL, SigIllHandler);
959
if (oldHandler == SIG_ERR)
966
__asm __volatile ("xorps %xmm0, %xmm0");
968
signal(SIGILL, oldHandler);
979
std::swap(cpuid[2], cpuid[3]);
980
if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
984
return ((cpuid[0] >> 8) & 0xf) == 0xf;
987
// ************** Pentium/P4 optimizations ***************
989
class PentiumOptimized : public Portable
992
static int Add(word *C, const word *A, const word *B, size_t N);
993
static int Subtract(word *C, const word *A, const word *B, size_t N);
994
static void Multiply4(word *C, const word *A, const word *B);
995
static void Multiply8(word *C, const word *A, const word *B);
996
static void Multiply8Bottom(word *C, const word *A, const word *B);
1002
static int Add(word *C, const word *A, const word *B, size_t N);
1003
static int Subtract(word *C, const word *A, const word *B, size_t N);
1004
#ifdef SSE2_INTRINSICS_AVAILABLE
1005
static void Multiply4(word *C, const word *A, const word *B);
1006
static void Multiply8(word *C, const word *A, const word *B);
1007
static void Multiply8Bottom(word *C, const word *A, const word *B);
1011
typedef int (* PAddSub)(word *C, const word *A, const word *B, size_t N);
1012
typedef void (* PMul)(word *C, const word *A, const word *B);
1014
static PAddSub s_pAdd, s_pSub;
1015
#ifdef SSE2_INTRINSICS_AVAILABLE
1016
static PMul s_pMul4, s_pMul8, s_pMul8B;
1019
static void SetPentiumFunctionPointers()
1023
s_pAdd = &P4Optimized::Add;
1024
s_pSub = &P4Optimized::Subtract;
1028
s_pAdd = &PentiumOptimized::Add;
1029
s_pSub = &PentiumOptimized::Subtract;
1032
#ifdef SSE2_INTRINSICS_AVAILABLE
1035
s_pMul4 = &P4Optimized::Multiply4;
1036
s_pMul8 = &P4Optimized::Multiply8;
1037
s_pMul8B = &P4Optimized::Multiply8Bottom;
1041
s_pMul4 = &PentiumOptimized::Multiply4;
1042
s_pMul8 = &PentiumOptimized::Multiply8;
1043
s_pMul8B = &PentiumOptimized::Multiply8Bottom;
971
return (b & (1 << 26)) != 0;
974
bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true;
1048
976
void DisableSSE2()
1050
s_sse2Enabled = false;
1051
SetPentiumFunctionPointers();
1054
class LowLevel : public PentiumOptimized
978
g_sse2Enabled = false;
981
static inline bool HasSSE2()
983
if (g_sse2Enabled && !g_sse2DetectionDone)
985
g_sse2Detected = GetSSE2Capability();
986
g_sse2DetectionDone = true;
988
return g_sse2Enabled && g_sse2Detected;
991
class P4Optimized : public PentiumOptimized
1057
inline static int Add(word *C, const word *A, const word *B, size_t N)
1058
{return s_pAdd(C, A, B, N);}
1059
inline static int Subtract(word *C, const word *A, const word *B, size_t N)
1060
{return s_pSub(C, A, B, N);}
1061
inline static void Square4(word *R, const word *A)
1062
{Multiply4(R, A, A);}
1063
#ifdef SSE2_INTRINSICS_AVAILABLE
1064
inline static void Multiply4(word *C, const word *A, const word *B)
1066
inline static void Multiply8(word *C, const word *A, const word *B)
1068
inline static void Multiply8Bottom(word *C, const word *A, const word *B)
1069
{s_pMul8B(C, A, B);}
994
static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N);
995
static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N);
996
static void Multiply4(word *C, const word *A, const word *B);
997
static void Multiply8(word *C, const word *A, const word *B);
998
static inline void Square4(word *R, const word *A)
1002
static void Multiply8Bottom(word *C, const word *A, const word *B);
1073
// use some tricks to share assembly code between MSVC and GCC
1075
#define CRYPTOPP_NAKED __declspec(naked)
1076
#define AS1(x) __asm x
1077
#define AS2(x, y) __asm x, y
1078
#define AddPrologue \
1083
__asm mov ecx, [esp+20] \
1084
__asm mov edx, [esp+24] \
1085
__asm mov ebx, [esp+28] \
1086
__asm mov esi, [esp+32]
1087
#define AddEpilogue \
1093
#define MulPrologue \
1098
__asm mov ecx, [esp+28] \
1099
__asm mov esi, [esp+24] \
1101
#define MulEpilogue \
1109
#define CRYPTOPP_NAKED
1110
#define AS1(x) #x ";"
1111
#define AS2(x, y) #x ", " #y ";"
1112
#define AddPrologue \
1113
__asm__ __volatile__ \
1115
"push %%ebx;" /* save this manually, in case of -fPIC */ \
1117
".intel_syntax noprefix;" \
1119
#define AddEpilogue \
1121
".att_syntax prefix;" \
1124
: "c" (C), "d" (A), "m" (B), "S" (N) \
1125
: "%edi", "memory", "cc" \
1127
#define MulPrologue \
1128
__asm__ __volatile__ \
1130
"push %%ebx;" /* save this manually, in case of -fPIC */ \
1133
".intel_syntax noprefix;"
1134
#define MulEpilogue \
1138
".att_syntax prefix;" \
1140
: "rm" (Z), "S" (X), "c" (Y) \
1141
: "%eax", "%edx", "%edi", "memory", "cc" \
1145
CRYPTOPP_NAKED int PentiumOptimized::Add(word *C, const word *A, const word *B, size_t N)
1149
// now: ebx = B, ecx = C, edx = A, esi = N
1150
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
1151
AS2( xor eax, eax) // clear eax
1153
AS2( sub eax, esi) // eax is a negative index from end of B
1154
AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
1156
AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag
1157
AS1( jz loopendAdd) // if no dwords then nothing to do
1160
AS2( mov esi,[edx]) // load lower word of A
1161
AS2( mov ebp,[edx+4]) // load higher word of A
1163
AS2( mov edi,[ebx+8*eax]) // load lower word of B
1164
AS2( lea edx,[edx+8]) // advance A and C
1166
AS2( adc esi,edi) // add lower words
1167
AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
1169
AS2( adc ebp,edi) // add higher words
1170
AS1( inc eax) // advance B
1172
AS2( mov [edx+ecx-8],esi) // store lower word result
1173
AS2( mov [edx+ecx-4],ebp) // store higher word result
1175
AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero
1178
AS2( adc eax, 0) // store carry into eax (return result register)
1183
CRYPTOPP_NAKED int PentiumOptimized::Subtract(word *C, const word *A, const word *B, size_t N)
1187
// now: ebx = B, ecx = C, edx = A, esi = N
1188
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
1189
AS2( xor eax, eax) // clear eax
1191
AS2( sub eax, esi) // eax is a negative index from end of B
1192
AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
1194
AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag
1195
AS1( jz loopendSub) // if no dwords then nothing to do
1198
AS2( mov esi,[edx]) // load lower word of A
1199
AS2( mov ebp,[edx+4]) // load higher word of A
1201
AS2( mov edi,[ebx+8*eax]) // load lower word of B
1202
AS2( lea edx,[edx+8]) // advance A and C
1204
AS2( sbb esi,edi) // subtract lower words
1205
AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
1207
AS2( sbb ebp,edi) // subtract higher words
1208
AS1( inc eax) // advance B
1210
AS2( mov [edx+ecx-8],esi) // store lower word result
1211
AS2( mov [edx+ecx-4],ebp) // store higher word result
1213
AS1( jnz loopstartSub) // loop until eax overflows and becomes zero
1216
AS2( adc eax, 0) // store carry into eax (return result register)
1221
// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
1223
CRYPTOPP_NAKED int P4Optimized::Add(word *C, const word *A, const word *B, size_t N)
1227
// now: ebx = B, ecx = C, edx = A, esi = N
1230
AS1( jz loopendAddP4) // if no dwords then nothing to do
1232
AS2( mov edi, [edx])
1233
AS2( mov ebp, [ebx])
1234
AS1( jmp carry1AddP4)
1236
AS1(loopstartAddP4:)
1237
AS2( mov edi, [edx+8])
1240
AS2( mov ebp, [ebx])
1242
AS1( jc carry1AddP4)
1248
AS2( mov [ecx], edi)
1249
AS2( mov edi, [edx+4])
1250
AS2( cmovc eax, ebp)
1251
AS2( mov ebp, [ebx+4])
1254
AS1( jc carry2AddP4)
1260
AS2( cmovc eax, ebp)
1261
AS2( mov [ecx+4], edi)
1263
AS1( jnz loopstartAddP4)
1270
CRYPTOPP_NAKED int P4Optimized::Subtract(word *C, const word *A, const word *B, size_t N)
1274
// now: ebx = B, ecx = C, edx = A, esi = N
1277
AS1( jz loopendSubP4) // if no dwords then nothing to do
1279
AS2( mov edi, [edx])
1280
AS2( mov ebp, [ebx])
1281
AS1( jmp carry1SubP4)
1283
AS1(loopstartSubP4:)
1284
AS2( mov edi, [edx+8])
1287
AS2( mov ebp, [ebx])
1289
AS1( jc carry1SubP4)
1295
AS2( mov [ecx], edi)
1296
AS2( mov edi, [edx+4])
1297
AS2( cmovc eax, ebp)
1298
AS2( mov ebp, [ebx+4])
1301
AS1( jc carry2SubP4)
1307
AS2( cmovc eax, ebp)
1308
AS2( mov [ecx+4], edi)
1310
AS1( jnz loopstartSubP4)
1317
// multiply assembly code originally contributed by Leonard Janke
1319
#define MulStartup \
1324
#define MulShiftCarry \
1329
#define MulAccumulateBottom(i,j) \
1330
AS2(mov eax, [ecx+4*j]) \
1331
AS2(imul eax, dword ptr [esi+4*i]) \
1334
#define MulAccumulate(i,j) \
1335
AS2(mov eax, [ecx+4*j]) \
1336
AS1(mul dword ptr [esi+4*i]) \
1341
#define MulStoreDigit(i) \
1343
AS2(mov edi, [esp]) \
1344
AS2(mov [edi+4*i], ebp)
1346
#define MulLastDiagonal(digits) \
1347
AS2(mov eax, [ecx+4*(digits-1)]) \
1348
AS1(mul dword ptr [esi+4*(digits-1)]) \
1351
AS2(mov edi, [esp]) \
1352
AS2(mov [edi+4*(2*digits-2)], ebp) \
1353
AS2(mov [edi+4*(2*digits-1)], edx)
1355
CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
1358
// now: [esp] = Z, esi = X, ecx = Y
1397
CRYPTOPP_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
1400
// now: [esp] = Z, esi = X, ecx = Y
1511
CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y)
1514
// now: [esp] = Z, esi = X, ecx = Y
1565
MulAccumulateBottom(7,0)
1566
MulAccumulateBottom(6,1)
1567
MulAccumulateBottom(5,2)
1568
MulAccumulateBottom(4,3)
1569
MulAccumulateBottom(3,4)
1570
MulAccumulateBottom(2,5)
1571
MulAccumulateBottom(1,6)
1572
MulAccumulateBottom(0,7)
1580
#else // not x86 - no processor specific code at this layer
1582
typedef Portable LowLevel;
1586
#ifdef SSE2_INTRINSICS_AVAILABLE
1589
#define CRYPTOPP_FASTCALL
1591
#define CRYPTOPP_FASTCALL __fastcall
1594
static void CRYPTOPP_FASTCALL P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)
1005
static void __fastcall P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)
1596
1007
__m128i a3210 = _mm_load_si128(A);
1597
1008
__m128i b3210 = _mm_load_si128(B);
1969
1380
s3 = _mm_add_si64(s3, s4);
1970
1381
s1 = _mm_add_si64(s1, w26);
1971
1382
s1 = _mm_add_si64(s1, s3);
1972
C[6] = _mm_cvtsi64_si32(s1);
1973
s1 = _mm_srli_si64(s1, 32);
1383
C[6] = _m_to_int(s1);
1384
s1 = _m_psrlqi(s1, 32);
1975
C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
1386
C[7] = _m_to_int(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
1390
__declspec(naked) word __fastcall P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)
1401
mov ebx, [esp+20] // B
1402
mov esi, [esp+24] // N
1404
// now: ebx = B, ecx = C, edx = A, esi = N
1407
jz loopend // if no dwords then nothing to do
1461
__declspec(naked) word __fastcall P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
1472
mov ebx, [esp+20] // B
1473
mov esi, [esp+24] // N
1475
// now: ebx = B, ecx = C, edx = A, esi = N
1478
jz loopend // if no dwords then nothing to do
1979
1532
#endif // #ifdef SSE2_INTRINSICS_AVAILABLE
1534
#elif defined(__GNUC__) && defined(__i386__)
1536
class PentiumOptimized : public Portable
1539
#ifndef __pic__ // -fpic uses up a register, leaving too few for the asm code
1540
static word Add(word *C, const word *A, const word *B, unsigned int N);
1541
static word Subtract(word *C, const word *A, const word *B, unsigned int N);
1543
static void Square4(word *R, const word *A);
1544
static void Multiply4(word *C, const word *A, const word *B);
1545
static void Multiply8(word *C, const word *A, const word *B);
1548
typedef PentiumOptimized LowLevel;
1550
// Add and Subtract assembly code originally contributed by Alister Lee
1553
__attribute__((regparm(3))) word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
1557
register word carry, temp;
1559
__asm__ __volatile__(
1564
"lea (%1,%4,4), %1;"
1571
"mov (%1,%0,8), %5;"
1574
"mov 4(%1,%0,8), %5;"
1577
"mov %4, -8(%3, %2);"
1578
"mov %%ebp, -4(%3, %2);"
1585
: "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)
1586
: : "cc", "memory");
1591
__attribute__((regparm(3))) word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
1595
register word carry, temp;
1597
__asm__ __volatile__(
1602
"lea (%1,%4,4), %1;"
1609
"mov (%1,%0,8), %5;"
1612
"mov 4(%1,%0,8), %5;"
1615
"mov %4, -8(%3, %2);"
1616
"mov %%ebp, -4(%3, %2);"
1623
: "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)
1624
: : "cc", "memory");
1630
// Comba square and multiply assembly code originally contributed by Leonard Janke
1632
#define SqrStartup \
1636
"xor %%ebp, %%ebp\n\t" \
1637
"xor %%ebx, %%ebx\n\t" \
1638
"xor %%ecx, %%ecx\n\t"
1640
#define SqrShiftCarry \
1641
"mov %%ebx, %%ebp\n\t" \
1642
"mov %%ecx, %%ebx\n\t" \
1643
"xor %%ecx, %%ecx\n\t"
1645
#define SqrAccumulate(i,j) \
1646
"mov 4*"#j"(%%esi), %%eax\n\t" \
1647
"mull 4*"#i"(%%esi)\n\t" \
1648
"add %%eax, %%ebp\n\t" \
1649
"adc %%edx, %%ebx\n\t" \
1650
"adc %%ch, %%cl\n\t" \
1651
"add %%eax, %%ebp\n\t" \
1652
"adc %%edx, %%ebx\n\t" \
1653
"adc %%ch, %%cl\n\t"
1655
#define SqrAccumulateCentre(i) \
1656
"mov 4*"#i"(%%esi), %%eax\n\t" \
1657
"mull 4*"#i"(%%esi)\n\t" \
1658
"add %%eax, %%ebp\n\t" \
1659
"adc %%edx, %%ebx\n\t" \
1660
"adc %%ch, %%cl\n\t"
1662
#define SqrStoreDigit(X) \
1663
"mov %%ebp, 4*"#X"(%%edi)\n\t" \
1665
#define SqrLastDiagonal(digits) \
1666
"mov 4*("#digits"-1)(%%esi), %%eax\n\t" \
1667
"mull 4*("#digits"-1)(%%esi)\n\t" \
1668
"add %%eax, %%ebp\n\t" \
1669
"adc %%edx, %%ebx\n\t" \
1670
"mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
1671
"mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t"
1673
#define SqrCleanup \
1678
void PentiumOptimized::Square4(word* Y, const word* X)
1680
__asm__ __volatile__(
1683
SqrAccumulateCentre(0)
1692
SqrAccumulateCentre(1)
1702
SqrAccumulateCentre(2)
1716
: "eax", "ecx", "edx", "ebp", "memory"
1720
#define MulStartup \
1725
"mov %%eax, %%ebx \n\t" \
1726
"xor %%ebp, %%ebp\n\t" \
1727
"xor %%edi, %%edi\n\t" \
1728
"xor %%ecx, %%ecx\n\t"
1730
#define MulShiftCarry \
1731
"mov %%edx, %%ebp\n\t" \
1732
"mov %%ecx, %%edi\n\t" \
1733
"xor %%ecx, %%ecx\n\t"
1735
#define MulAccumulate(i,j) \
1736
"mov 4*"#j"(%%ebx), %%eax\n\t" \
1737
"mull 4*"#i"(%%esi)\n\t" \
1738
"add %%eax, %%ebp\n\t" \
1739
"adc %%edx, %%edi\n\t" \
1740
"adc %%ch, %%cl\n\t"
1742
#define MulStoreDigit(X) \
1743
"mov %%edi, %%edx \n\t" \
1744
"mov (%%esp), %%edi \n\t" \
1745
"mov %%ebp, 4*"#X"(%%edi)\n\t" \
1746
"mov %%edi, (%%esp)\n\t"
1748
#define MulLastDiagonal(digits) \
1749
"mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \
1750
"mull 4*("#digits"-1)(%%esi)\n\t" \
1751
"add %%eax, %%ebp\n\t" \
1752
"adc %%edi, %%edx\n\t" \
1753
"mov (%%esp), %%edi\n\t" \
1754
"mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
1755
"mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t"
1757
#define MulCleanup \
1763
void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
1765
__asm__ __volatile__(
1805
: "D" (Z), "S" (X), "a" (Y)
1806
: "%ecx", "%edx", "memory"
1810
void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
1812
__asm__ __volatile__(
1924
: "D" (Z), "S" (X), "a" (Y)
1925
: "%ecx", "%edx", "memory"
1929
#else // no processor specific code at this layer
1931
typedef Portable LowLevel;
1981
1935
// ********************************************************
1995
1949
#define R2 (R+N)
1996
1950
#define R3 (R+N+N2)
1952
//VC60 workaround: compiler bug triggered without the extra dummy parameters
1998
1954
// R[2*N] - result = A*B
1999
1955
// T[2*N] - temporary work space
2000
1956
// A[N] --- multiplier
2001
1957
// B[N] --- multiplicant
2003
void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)
1960
void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);
1963
inline void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
2005
1965
assert(N>=2 && N%2==0);
2007
if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)
2008
LowLevel::Multiply8(R, A, B);
2009
else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)
2010
LowLevel::Multiply4(R, A, B);
1967
if (P::MultiplyRecursionLimit() >= 8 && N==8)
1968
P::Multiply8(R, A, B);
1969
else if (P::MultiplyRecursionLimit() >= 4 && N==4)
1970
P::Multiply4(R, A, B);
2012
LowLevel::Multiply2(R, A, B);
1972
P::Multiply2(R, A, B);
1974
DoRecursiveMultiply<P>(R, T, A, B, N, NULL); // VC60 workaround: needs this NULL
1978
void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy)
1980
const unsigned int N2 = N/2;
1983
int aComp = Compare(A0, A1, N2);
1984
int bComp = Compare(B0, B1, N2);
1986
switch (2*aComp + aComp + bComp)
2015
const size_t N2 = N/2;
2018
int aComp = Compare(A0, A1, N2);
2019
int bComp = Compare(B0, B1, N2);
2021
switch (2*aComp + aComp + bComp)
2024
LowLevel::Subtract(R0, A1, A0, N2);
2025
LowLevel::Subtract(R1, B0, B1, N2);
2026
RecursiveMultiply(T0, T2, R0, R1, N2);
2027
LowLevel::Subtract(T1, T1, R0, N2);
2031
LowLevel::Subtract(R0, A1, A0, N2);
2032
LowLevel::Subtract(R1, B0, B1, N2);
2033
RecursiveMultiply(T0, T2, R0, R1, N2);
2037
LowLevel::Subtract(R0, A0, A1, N2);
2038
LowLevel::Subtract(R1, B1, B0, N2);
2039
RecursiveMultiply(T0, T2, R0, R1, N2);
2043
LowLevel::Subtract(R0, A1, A0, N2);
2044
LowLevel::Subtract(R1, B0, B1, N2);
2045
RecursiveMultiply(T0, T2, R0, R1, N2);
2046
LowLevel::Subtract(T1, T1, R1, N2);
2054
RecursiveMultiply(R0, T2, A0, B0, N2);
2055
RecursiveMultiply(R2, T2, A1, B1, N2);
2057
// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
2059
carry += LowLevel::Add(T0, T0, R0, N);
2060
carry += LowLevel::Add(T0, T0, R2, N);
2061
carry += LowLevel::Add(R1, R1, T0, N);
2063
assert (carry >= 0 && carry <= 2);
2064
Increment(R3, N2, carry);
1989
P::Subtract(R0, A1, A0, N2);
1990
P::Subtract(R1, B0, B1, N2);
1991
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
1992
P::Subtract(T1, T1, R0, N2);
1996
P::Subtract(R0, A1, A0, N2);
1997
P::Subtract(R1, B0, B1, N2);
1998
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
2002
P::Subtract(R0, A0, A1, N2);
2003
P::Subtract(R1, B1, B0, N2);
2004
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
2008
P::Subtract(R0, A1, A0, N2);
2009
P::Subtract(R1, B0, B1, N2);
2010
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
2011
P::Subtract(T1, T1, R1, N2);
2019
RecursiveMultiply<P>(R0, T2, A0, B0, N2);
2020
RecursiveMultiply<P>(R2, T2, A1, B1, N2);
2022
// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
2024
carry += P::Add(T0, T0, R0, N);
2025
carry += P::Add(T0, T0, R2, N);
2026
carry += P::Add(R1, R1, T0, N);
2028
assert (carry >= 0 && carry <= 2);
2029
Increment(R3, N2, carry);
2068
2032
// R[2*N] - result = A*A
2069
2033
// T[2*N] - temporary work space
2070
2034
// A[N] --- number to be squared
2072
void RecursiveSquare(word *R, word *T, const word *A, size_t N)
2037
void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL);
2040
inline void RecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL)
2074
2042
assert(N && N%2==0);
2075
if (LowLevel::SquareRecursionLimit() >= 8 && N==8)
2076
LowLevel::Square8(R, A);
2077
if (LowLevel::SquareRecursionLimit() >= 4 && N==4)
2078
LowLevel::Square4(R, A);
2043
if (P::SquareRecursionLimit() >= 8 && N==8)
2045
if (P::SquareRecursionLimit() >= 4 && N==4)
2080
LowLevel::Square2(R, A);
2083
const size_t N2 = N/2;
2085
RecursiveSquare(R0, T2, A0, N2);
2086
RecursiveSquare(R2, T2, A1, N2);
2087
RecursiveMultiply(T0, T2, A0, A1, N2);
2089
int carry = LowLevel::Add(R1, R1, T0, N);
2090
carry += LowLevel::Add(R1, R1, T0, N);
2091
Increment(R3, N2, carry);
2050
DoRecursiveSquare<P>(R, T, A, N, NULL); // VC60 workaround: needs this NULL
2054
void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy)
2056
const unsigned int N2 = N/2;
2058
RecursiveSquare<P>(R0, T2, A0, N2);
2059
RecursiveSquare<P>(R2, T2, A1, N2);
2060
RecursiveMultiply<P>(T0, T2, A0, A1, N2);
2062
word carry = P::Add(R1, R1, T0, N);
2063
carry += P::Add(R1, R1, T0, N);
2064
Increment(R3, N2, carry);
2095
2067
// R[N] - bottom half of A*B
2149
2130
switch (2*aComp + aComp + bComp)
2152
LowLevel::Subtract(R0, A1, A0, N2);
2153
LowLevel::Subtract(R1, B0, B1, N2);
2154
RecursiveMultiply(T0, T2, R0, R1, N2);
2155
LowLevel::Subtract(T1, T1, R0, N2);
2133
P::Subtract(R0, A1, A0, N2);
2134
P::Subtract(R1, B0, B1, N2);
2135
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
2136
P::Subtract(T1, T1, R0, N2);
2159
LowLevel::Subtract(R0, A1, A0, N2);
2160
LowLevel::Subtract(R1, B0, B1, N2);
2161
RecursiveMultiply(T0, T2, R0, R1, N2);
2140
P::Subtract(R0, A1, A0, N2);
2141
P::Subtract(R1, B0, B1, N2);
2142
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
2165
LowLevel::Subtract(R0, A0, A1, N2);
2166
LowLevel::Subtract(R1, B1, B0, N2);
2167
RecursiveMultiply(T0, T2, R0, R1, N2);
2146
P::Subtract(R0, A0, A1, N2);
2147
P::Subtract(R1, B1, B0, N2);
2148
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
2171
LowLevel::Subtract(R0, A1, A0, N2);
2172
LowLevel::Subtract(R1, B0, B1, N2);
2173
RecursiveMultiply(T0, T2, R0, R1, N2);
2174
LowLevel::Subtract(T1, T1, R1, N2);
2152
P::Subtract(R0, A1, A0, N2);
2153
P::Subtract(R1, B0, B1, N2);
2154
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
2155
P::Subtract(T1, T1, R1, N2);
2201
inline int Add(word *C, const word *A, const word *B, size_t N)
2182
inline word Add(word *C, const word *A, const word *B, unsigned int N)
2203
2184
return LowLevel::Add(C, A, B, N);
2206
inline int Subtract(word *C, const word *A, const word *B, size_t N)
2187
inline word Subtract(word *C, const word *A, const word *B, unsigned int N)
2208
2189
return LowLevel::Subtract(C, A, B, N);
2211
inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N)
2213
RecursiveMultiply(R, T, A, B, N);
2216
inline void Square(word *R, word *T, const word *A, size_t N)
2218
RecursiveSquare(R, T, A, N);
2221
inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
2223
RecursiveMultiplyBottom(R, T, A, B, N);
2226
inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
2228
RecursiveMultiplyTop(R, T, L, A, B, N);
2231
static word LinearMultiply(word *C, const word *A, word B, size_t N)
2192
inline void Multiply(word *R, word *T, const word *A, const word *B, unsigned int N)
2194
#ifdef SSE2_INTRINSICS_AVAILABLE
2196
RecursiveMultiply<P4Optimized>(R, T, A, B, N);
2199
RecursiveMultiply<LowLevel>(R, T, A, B, N);
2202
inline void Square(word *R, word *T, const word *A, unsigned int N)
2204
#ifdef SSE2_INTRINSICS_AVAILABLE
2206
RecursiveSquare<P4Optimized>(R, T, A, N);
2209
RecursiveSquare<LowLevel>(R, T, A, N);
2212
inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
2214
#ifdef SSE2_INTRINSICS_AVAILABLE
2216
RecursiveMultiplyBottom<P4Optimized>(R, T, A, B, N);
2219
RecursiveMultiplyBottom<LowLevel>(R, T, A, B, N);
2222
inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
2224
#ifdef SSE2_INTRINSICS_AVAILABLE
2226
RecursiveMultiplyTop<P4Optimized>(R, T, L, A, B, N);
2229
RecursiveMultiplyTop<LowLevel>(R, T, L, A, B, N);
2232
static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
2234
2235
for(unsigned i=0; i<N; i++)
4156
4141
MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m) // modulus must be odd
4157
4142
: ModularArithmetic(m),
4158
m_u((word)0, m_modulus.reg.size()),
4159
m_workspace(5*m_modulus.reg.size())
4143
u((word)0, modulus.reg.size()),
4144
workspace(5*modulus.reg.size())
4161
if (!m_modulus.IsOdd())
4146
if (!modulus.IsOdd())
4162
4147
throw InvalidArgument("MontgomeryRepresentation: Montgomery representation requires an odd modulus");
4164
RecursiveInverseModPower2(m_u.reg, m_workspace, m_modulus.reg, m_modulus.reg.size());
4149
RecursiveInverseModPower2(u.reg, workspace, modulus.reg, modulus.reg.size());
4167
4152
const Integer& MontgomeryRepresentation::Multiply(const Integer &a, const Integer &b) const
4169
word *const T = m_workspace.begin();
4170
word *const R = m_result.reg.begin();
4171
const size_t N = m_modulus.reg.size();
4154
word *const T = workspace.begin();
4155
word *const R = result.reg.begin();
4156
const unsigned int N = modulus.reg.size();
4172
4157
assert(a.reg.size()<=N && b.reg.size()<=N);
4174
4159
AsymmetricMultiply(T, T+2*N, a.reg, a.reg.size(), b.reg, b.reg.size());
4175
4160
SetWords(T+a.reg.size()+b.reg.size(), 0, 2*N-a.reg.size()-b.reg.size());
4176
MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4161
MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);
4180
4165
const Integer& MontgomeryRepresentation::Square(const Integer &a) const
4182
word *const T = m_workspace.begin();
4183
word *const R = m_result.reg.begin();
4184
const size_t N = m_modulus.reg.size();
4167
word *const T = workspace.begin();
4168
word *const R = result.reg.begin();
4169
const unsigned int N = modulus.reg.size();
4185
4170
assert(a.reg.size()<=N);
4187
4172
CryptoPP::Square(T, T+2*N, a.reg, a.reg.size());
4188
4173
SetWords(T+2*a.reg.size(), 0, 2*N-2*a.reg.size());
4189
MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4174
MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);
4193
4178
Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
4195
word *const T = m_workspace.begin();
4196
word *const R = m_result.reg.begin();
4197
const size_t N = m_modulus.reg.size();
4180
word *const T = workspace.begin();
4181
word *const R = result.reg.begin();
4182
const unsigned int N = modulus.reg.size();
4198
4183
assert(a.reg.size()<=N);
4200
4185
CopyWords(T, a.reg, a.reg.size());
4201
4186
SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
4202
MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4187
MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);
4206
4191
const Integer& MontgomeryRepresentation::MultiplicativeInverse(const Integer &a) const
4208
4193
// return (EuclideanMultiplicativeInverse(a, modulus)<<(2*WORD_BITS*modulus.reg.size()))%modulus;
4209
word *const T = m_workspace.begin();
4210
word *const R = m_result.reg.begin();
4211
const size_t N = m_modulus.reg.size();
4194
word *const T = workspace.begin();
4195
word *const R = result.reg.begin();
4196
const unsigned int N = modulus.reg.size();
4212
4197
assert(a.reg.size()<=N);
4214
4199
CopyWords(T, a.reg, a.reg.size());
4215
4200
SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
4216
MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4217
unsigned k = AlmostInverse(R, T, R, N, m_modulus.reg, N);
4201
MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);
4202
unsigned k = AlmostInverse(R, T, R, N, modulus.reg, N);
4219
4204
// cout << "k=" << k << " N*32=" << 32*N << endl;
4221
4206
if (k>N*WORD_BITS)
4222
DivideByPower2Mod(R, R, k-N*WORD_BITS, m_modulus.reg, N);
4207
DivideByPower2Mod(R, R, k-N*WORD_BITS, modulus.reg, N);
4224
MultiplyByPower2Mod(R, R, N*WORD_BITS-k, m_modulus.reg, N);
4209
MultiplyByPower2Mod(R, R, N*WORD_BITS-k, modulus.reg, N);