~zooko/cryptopp/trunk : revision 106

17

18

#include <iostream>

19

20

#ifdef _M_X64

21

#include <Intrin.h>

22

#endif

23

24

20

#ifdef SSE2_INTRINSICS_AVAILABLE

25

#ifdef __GNUC__

26

#include <xmmintrin.h>

27

#include <signal.h>

28

#include <setjmp.h>

29

#ifdef CRYPTOPP_MEMALIGN_AVAILABLE

30

#include <malloc.h>

31

#else

32

#include <stdlib.h>

33

#endif

34

#else

35

#include <emmintrin.h>

36

#endif

21

#include <emmintrin.h>

37

22

#elif defined(_MSC_VER) && defined(_M_IX86)

38

#pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.")

39

#elif defined(__GNUC__) && defined(__i386__)

40

#warning "You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled."

23

#pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.")

41

24

#endif

42

25

43

26

NAMESPACE_BEGIN(CryptoPP)

44

27

45

bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)

28

bool FunctionAssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)

46

29

{

47

30

if (valueType != typeid(Integer))

48

31

return false;

50

33

return true;

51

34

}

52

35

53

#ifdef SSE2_INTRINSICS_AVAILABLE

36

static const char s_RunAtStartup = (AssignIntToInteger = FunctionAssignIntToInteger, 0);

37

38

#if defined(SSE2_INTRINSICS_AVAILABLE) || defined(_MSC_VER)

54

39

template <class T>

55

40

CPP_TYPENAME AllocatorBase<T>::pointer AlignedAllocator<T>::allocate(size_type n, const void *)

56

41

{

57

CheckSize(n);

58

if (n == 0)

59

return NULL;

42

#ifdef SSE2_INTRINSICS_AVAILABLE

60

43

if (n >= 4)

61

{

62

void *p;

63

#ifdef CRYPTOPP_MM_MALLOC_AVAILABLE

64

while (!(p = _mm_malloc(sizeof(T)*n, 16)))

65

#elif defined(CRYPTOPP_MEMALIGN_AVAILABLE)

66

while (!(p = memalign(16, sizeof(T)*n)))

67

#elif defined(CRYPTOPP_MALLOC_ALIGNMENT_IS_16)

68

while (!(p = malloc(sizeof(T)*n)))

69

#else

70

while (!(p = (byte *)malloc(sizeof(T)*n + 8))) // assume malloc alignment is at least 8

71

#endif

72

CallNewHandler();

73

74

#ifdef CRYPTOPP_NO_ALIGNED_ALLOC

75

assert(m_pBlock == NULL);

76

m_pBlock = p;

77

if (!IsAlignedOn(p, 16))

78

{

79

assert(IsAlignedOn(p, 8));

80

p = (byte *)p + 8;

81

}

82

#endif

83

84

assert(IsAlignedOn(p, 16));

85

return (T*)p;

86

}

87

return new T[n];

44

return (T *)_mm_malloc(sizeof(T)*n, 16);

45

else

46

#endif

47

return new T[n];

88

48

}

89

49

90

50

template <class T>

91

51

void AlignedAllocator<T>::deallocate(void *p, size_type n)

92

52

{

93

53

memset(p, 0, n*sizeof(T));

54

#ifdef SSE2_INTRINSICS_AVAILABLE

94

55

if (n >= 4)

95

{

96

#ifdef CRYPTOPP_MM_MALLOC_AVAILABLE

97

_mm_free(p);

98

#elif defined(CRYPTOPP_NO_ALIGNED_ALLOC)

99

assert(m_pBlock == p || (byte *)m_pBlock+8 == p);

100

free(m_pBlock);

101

m_pBlock = NULL;

102

#else

103

free(p);

104

#endif

105

}

56

_mm_free(p);

106

57

else

107

delete [] (T *)p;

58

#endif

59

delete [] p;

108

60

}

109

110

template class CRYPTOPP_DLL AlignedAllocator<word>;

111

61

#endif

112

62

113

static int Compare(const word *A, const word *B, size_t N)

63

static int Compare(const word *A, const word *B, unsigned int N)

114

64

{

115

65

while (N--)

116

66

if (A[N] > B[N])

121

71

return 0;

122

72

}

123

73

124

static int Increment(word *A, size_t N, word B=1)

74

static word Increment(word *A, unsigned int N, word B=1)

125

75

{

126

76

assert(N);

127

77

word t = A[0];

134

84

return 1;

135

85

}

136

86

137

static int Decrement(word *A, size_t N, word B=1)

87

static word Decrement(word *A, unsigned int N, word B=1)

138

88

{

139

89

assert(N);

140

90

word t = A[0];

147

97

return 1;

148

98

}

149

99

150

static void TwosComplement(word *A, size_t N)

100

static void TwosComplement(word *A, unsigned int N)

151

101

{

152

102

Decrement(A, N);

153

103

for (unsigned i=0; i<N; i++)

208

158

__asm__("mulq %3" : "=d" (r.m_halfs.high), "=a" (r.m_halfs.low) : "a" (a), "rm" (b) : "cc");

209

159

#elif defined(__mips64)

210

160

__asm__("dmultu %2,%3" : "=h" (r.m_halfs.high), "=l" (r.m_halfs.low) : "r" (a), "r" (b));

211

#elif defined(_M_X64)

212

r.m_halfs.low = _umul128(a, b, &r.m_halfs.high);

213

161

#elif defined(_M_IX86)

214

162

// for testing

215

163

word64 t = (word64)a * b;

460

408

class Portable

461

409

{

462

410

public:

463

static int Add(word *C, const word *A, const word *B, size_t N);

464

static int Subtract(word *C, const word *A, const word *B, size_t N);

411

static word Add(word *C, const word *A, const word *B, unsigned int N);

412

static word Subtract(word *C, const word *A, const word *B, unsigned int N);

465

413

466

414

static inline void Multiply2(word *C, const word *A, const word *B);

467

415

static inline word Multiply2Add(word *C, const word *A, const word *B);

480

428

static inline unsigned int SquareRecursionLimit() {return 4;}

481

429

};

482

430

483

int Portable::Add(word *C, const word *A, const word *B, size_t N)

431

word Portable::Add(word *C, const word *A, const word *B, unsigned int N)

484

432

{

485

433

assert (N%2 == 0);

486

434

492

440

u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf();

493

441

C[i+1] = u.GetLowHalf();

494

442

}

495

return int(u.GetHighHalf());

443

return u.GetHighHalf();

496

444

}

497

445

498

int Portable::Subtract(word *C, const word *A, const word *B, size_t N)

446

word Portable::Subtract(word *C, const word *A, const word *B, unsigned int N)

499

447

{

500

448

assert (N%2 == 0);

501

449

507

455

u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow();

508

456

C[i+1] = u.GetLowHalf();

509

457

}

510

return int(0-u.GetHighHalf());

458

return 0-u.GetHighHalf();

511

459

}

512

460

513

461

void Portable::Multiply2(word *C, const word *A, const word *B)

692

640

693

641

void Portable::Square4(word *R, const word *A)

694

642

{

695

#ifdef _MSC_VER

696

// VC60 workaround: MSVC 6.0 has an optimization bug that makes

697

// (dword)A*B where either A or B has been cast to a dword before

698

// very expensive. Revisit this function when this

699

// bug is fixed.

700

Multiply4(R, A, A);

701

#else

702

643

const word *B = A;

703

644

DWord p, q;

704

645

word c, d, e;

725

666

p = DWord::MultiplyAndAdd(A[3], A[3], d);

726

667

R[6] = p.GetLowHalf();

727

668

R[7] = e + p.GetHighHalf();

728

#endif

729

669

}

730

670

731

671

void Portable::Multiply8(word *R, const word *A, const word *B)

893

833

#undef SquAcc

894

834

#undef SaveSquAcc

895

835

896

#ifdef CRYPTOPP_X86ASM_AVAILABLE

897

898

// ************** x86 feature detection ***************

899

900

static bool s_sse2Enabled = true;

901

902

static void CpuId(word32 input, word32 *output)

903

{

904

#ifdef __GNUC__

905

__asm__

906

(

907

// save ebx in case -fPIC is being used

908

"push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"

909

: "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])

910

: "a" (input)

911

);

912

#else

913

__asm

914

{

915

mov eax, input

836

// CodeWarrior defines _MSC_VER

837

#if defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86) && (_M_IX86<=700)

838

839

class PentiumOptimized : public Portable

840

{

841

public:

842

static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N);

843

static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N);

844

// TODO test this with .NET #if _MSC_VER < 1300

845

static inline void Square4(word *R, const word *A)

846

{

847

// VC60 workaround: MSVC 6.0 has an optimization bug that makes

848

// (dword)A*B where either A or B has been cast to a dword before

849

// very expensive. Revisit this function when this

850

// bug is fixed.

851

Multiply4(R, A, A);

852

}

853

//#endif

854

};

855

856

typedef PentiumOptimized LowLevel;

857

858

__declspec(naked) word __fastcall PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)

859

{

860

__asm

861

{

862

push ebp

863

push ebx

864

push esi

865

push edi

866

867

mov esi, [esp+24] ; N

868

mov ebx, [esp+20] ; B

869

870

// now: ebx = B, ecx = C, edx = A, esi = N

871

872

sub ecx, edx // hold the distance between C & A so we can add this to A to get C

873

xor eax, eax // clear eax

874

875

sub eax, esi // eax is a negative index from end of B

876

lea ebx, [ebx+4*esi] // ebx is end of B

877

878

sar eax, 1 // unit of eax is now dwords; this also clears the carry flag

879

jz loopend // if no dwords then nothing to do

880

881

loopstart:

882

mov esi,[edx] // load lower word of A

883

mov ebp,[edx+4] // load higher word of A

884

885

mov edi,[ebx+8*eax] // load lower word of B

886

lea edx,[edx+8] // advance A and C

887

888

adc esi,edi // add lower words

889

mov edi,[ebx+8*eax+4] // load higher word of B

890

891

adc ebp,edi // add higher words

892

inc eax // advance B

893

894

mov [edx+ecx-8],esi // store lower word result

895

mov [edx+ecx-4],ebp // store higher word result

896

897

jnz loopstart // loop until eax overflows and becomes zero

898

899

loopend:

900

adc eax, 0 // store carry into eax (return result register)

901

pop edi

902

pop esi

903

pop ebx

904

pop ebp

905

ret 8

906

}

907

}

908

909

__declspec(naked) word __fastcall PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)

910

{

911

__asm

912

{

913

push ebp

914

push ebx

915

push esi

916

push edi

917

918

mov esi, [esp+24] ; N

919

mov ebx, [esp+20] ; B

920

921

sub ecx, edx

922

xor eax, eax

923

924

sub eax, esi

925

lea ebx, [ebx+4*esi]

926

927

sar eax, 1

928

jz loopend

929

930

loopstart:

931

mov esi,[edx]

932

mov ebp,[edx+4]

933

934

mov edi,[ebx+8*eax]

935

lea edx,[edx+8]

936

937

sbb esi,edi

938

mov edi,[ebx+8*eax+4]

939

940

sbb ebp,edi

941

inc eax

942

943

mov [edx+ecx-8],esi

944

mov [edx+ecx-4],ebp

945

946

jnz loopstart

947

948

loopend:

949

adc eax, 0

950

pop edi

951

pop esi

952

pop ebx

953

pop ebp

954

ret 8

955

}

956

}

957

958

#ifdef SSE2_INTRINSICS_AVAILABLE

959

960

static bool GetSSE2Capability()

961

{

962

word32 b;

963

964

__asm

965

{

966

mov eax, 1

916

967

cpuid

917

mov edi, output

918

mov [edi], eax

919

mov [edi+4], ebx

920

mov [edi+8], ecx

921

mov [edi+12], edx

922

}

923

#endif

924

}

925

926

#ifdef SSE2_INTRINSICS_AVAILABLE

927

#ifndef _MSC_VER

928

static jmp_buf s_env;

929

static void SigIllHandler(int)

930

{

931

longjmp(s_env, 1);

932

}

933

#endif

934

935

static bool HasSSE2()

936

{

937

if (!s_sse2Enabled)

938

return false;

939

940

word32 cpuid[4];

941

CpuId(1, cpuid);

942

if ((cpuid[3] & (1 << 26)) == 0)

943

return false;

944

945

#ifdef _MSC_VER

946

__try

947

{

948

__asm xorpd xmm0, xmm0 // executing SSE2 instruction

949

}

950

__except (1)

951

{

952

return false;

953

}

954

return true;

955

#else

956

typedef void (*SigHandler)(int);

957

958

SigHandler oldHandler = signal(SIGILL, SigIllHandler);

959

if (oldHandler == SIG_ERR)

960

return false;

961

962

bool result = true;

963

if (setjmp(s_env))

964

result = false;

965

else

966

__asm __volatile ("xorps %xmm0, %xmm0");

967

968

signal(SIGILL, oldHandler);

969

return result;

970

#endif

971

}

972

#endif

973

974

static bool IsP4()

975

{

976

word32 cpuid[4];

977

978

CpuId(0, cpuid);

979

std::swap(cpuid[2], cpuid[3]);

980

if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)

981

return false;

982

983

CpuId(1, cpuid);

984

return ((cpuid[0] >> 8) & 0xf) == 0xf;

985

}

986

987

// ************** Pentium/P4 optimizations ***************

988

989

class PentiumOptimized : public Portable

990

{

991

public:

992

static int Add(word *C, const word *A, const word *B, size_t N);

993

static int Subtract(word *C, const word *A, const word *B, size_t N);

994

static void Multiply4(word *C, const word *A, const word *B);

995

static void Multiply8(word *C, const word *A, const word *B);

996

static void Multiply8Bottom(word *C, const word *A, const word *B);

997

};

998

999

class P4Optimized

1000

{

1001

public:

1002

static int Add(word *C, const word *A, const word *B, size_t N);

1003

static int Subtract(word *C, const word *A, const word *B, size_t N);

1004

#ifdef SSE2_INTRINSICS_AVAILABLE

1005

static void Multiply4(word *C, const word *A, const word *B);

1006

static void Multiply8(word *C, const word *A, const word *B);

1007

static void Multiply8Bottom(word *C, const word *A, const word *B);

1008

#endif

1009

};

1010

1011

typedef int (* PAddSub)(word *C, const word *A, const word *B, size_t N);

1012

typedef void (* PMul)(word *C, const word *A, const word *B);

1013

1014

static PAddSub s_pAdd, s_pSub;

1015

#ifdef SSE2_INTRINSICS_AVAILABLE

1016

static PMul s_pMul4, s_pMul8, s_pMul8B;

1017

#endif

1018

1019

static void SetPentiumFunctionPointers()

1020

{

1021

if (IsP4())

1022

{

1023

s_pAdd = &P4Optimized::Add;

1024

s_pSub = &P4Optimized::Subtract;

1025

}

1026

else

1027

{

1028

s_pAdd = &PentiumOptimized::Add;

1029

s_pSub = &PentiumOptimized::Subtract;

1030

}

1031

1032

#ifdef SSE2_INTRINSICS_AVAILABLE

1033

if (HasSSE2())

1034

{

1035

s_pMul4 = &P4Optimized::Multiply4;

1036

s_pMul8 = &P4Optimized::Multiply8;

1037

s_pMul8B = &P4Optimized::Multiply8Bottom;

1038

}

1039

else

1040

{

1041

s_pMul4 = &PentiumOptimized::Multiply4;

1042

s_pMul8 = &PentiumOptimized::Multiply8;

1043

s_pMul8B = &PentiumOptimized::Multiply8Bottom;

1044

}

1045

#endif

1046

}

968

mov b, edx

969

}

970

971

return (b & (1 << 26)) != 0;

972

}

973

974

bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true;

1047

975

1048

976

void DisableSSE2()

1049

977

{

1050

s_sse2Enabled = false;

1051

SetPentiumFunctionPointers();

1052

}

1053

1054

class LowLevel : public PentiumOptimized

978

g_sse2Enabled = false;

979

}

980

981

static inline bool HasSSE2()

982

{

983

if (g_sse2Enabled && !g_sse2DetectionDone)

984

{

985

g_sse2Detected = GetSSE2Capability();

986

g_sse2DetectionDone = true;

987

}

988

return g_sse2Enabled && g_sse2Detected;

989

}

990

991

class P4Optimized : public PentiumOptimized

1055

992

{

1056

993

public:

1057

inline static int Add(word *C, const word *A, const word *B, size_t N)

1058

{return s_pAdd(C, A, B, N);}

1059

inline static int Subtract(word *C, const word *A, const word *B, size_t N)

1060

{return s_pSub(C, A, B, N);}

1061

inline static void Square4(word *R, const word *A)

1062

{Multiply4(R, A, A);}

1063

#ifdef SSE2_INTRINSICS_AVAILABLE

1064

inline static void Multiply4(word *C, const word *A, const word *B)

1065

{s_pMul4(C, A, B);}

1066

inline static void Multiply8(word *C, const word *A, const word *B)

1067

{s_pMul8(C, A, B);}

1068

inline static void Multiply8Bottom(word *C, const word *A, const word *B)

1069

{s_pMul8B(C, A, B);}

1070

#endif

994

static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N);

995

static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N);

996

static void Multiply4(word *C, const word *A, const word *B);

997

static void Multiply8(word *C, const word *A, const word *B);

998

static inline void Square4(word *R, const word *A)

999

{

1000

Multiply4(R, A, A);

1001

}

1002

static void Multiply8Bottom(word *C, const word *A, const word *B);

1071

1003

};

1072

1004

1073

// use some tricks to share assembly code between MSVC and GCC

1074

#ifdef _MSC_VER

1075

#define CRYPTOPP_NAKED __declspec(naked)

1076

#define AS1(x) __asm x

1077

#define AS2(x, y) __asm x, y

1078

#define AddPrologue \

1079

__asm push ebp \

1080

__asm push ebx \

1081

__asm push esi \

1082

__asm push edi \

1083

__asm mov ecx, [esp+20] \

1084

__asm mov edx, [esp+24] \

1085

__asm mov ebx, [esp+28] \

1086

__asm mov esi, [esp+32]

1087

#define AddEpilogue \

1088

__asm pop edi \

1089

__asm pop esi \

1090

__asm pop ebx \

1091

__asm pop ebp \

1092

__asm ret

1093

#define MulPrologue \

1094

__asm push ebp \

1095

__asm push ebx \

1096

__asm push esi \

1097

__asm push edi \

1098

__asm mov ecx, [esp+28] \

1099

__asm mov esi, [esp+24] \

1100

__asm push [esp+20]

1101

#define MulEpilogue \

1102

__asm add esp, 4 \

1103

__asm pop edi \

1104

__asm pop esi \

1105

__asm pop ebx \

1106

__asm pop ebp \

1107

__asm ret

1108

#else

1109

#define CRYPTOPP_NAKED

1110

#define AS1(x) #x ";"

1111

#define AS2(x, y) #x ", " #y ";"

1112

#define AddPrologue \

1113

__asm__ __volatile__ \

1114

( \

1115

"push %%ebx;" /* save this manually, in case of -fPIC */ \

1116

"mov %2, %%ebx;" \

1117

".intel_syntax noprefix;" \

1118

"push ebp;"

1119

#define AddEpilogue \

1120

"pop ebp;" \

1121

".att_syntax prefix;" \

1122

"pop %%ebx;" \

1123

: \

1124

: "c" (C), "d" (A), "m" (B), "S" (N) \

1125

: "%edi", "memory", "cc" \

1126

);

1127

#define MulPrologue \

1128

__asm__ __volatile__ \

1129

( \

1130

"push %%ebx;" /* save this manually, in case of -fPIC */ \

1131

"push %%ebp;" \

1132

"push %0;" \

1133

".intel_syntax noprefix;"

1134

#define MulEpilogue \

1135

"add esp, 4;" \

1136

"pop ebp;" \

1137

"pop ebx;" \

1138

".att_syntax prefix;" \

1139

: \

1140

: "rm" (Z), "S" (X), "c" (Y) \

1141

: "%eax", "%edx", "%edi", "memory", "cc" \

1142

);

1143

#endif

1144

1145

CRYPTOPP_NAKED int PentiumOptimized::Add(word *C, const word *A, const word *B, size_t N)

1146

{

1147

AddPrologue

1148

1149

// now: ebx = B, ecx = C, edx = A, esi = N

1150

AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C

1151

AS2( xor eax, eax) // clear eax

1152

1153

AS2( sub eax, esi) // eax is a negative index from end of B

1154

AS2( lea ebx, [ebx+4*esi]) // ebx is end of B

1155

1156

AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag

1157

AS1( jz loopendAdd) // if no dwords then nothing to do

1158

1159

AS1(loopstartAdd:)

1160

AS2( mov esi,[edx]) // load lower word of A

1161

AS2( mov ebp,[edx+4]) // load higher word of A

1162

1163

AS2( mov edi,[ebx+8*eax]) // load lower word of B

1164

AS2( lea edx,[edx+8]) // advance A and C

1165

1166

AS2( adc esi,edi) // add lower words

1167

AS2( mov edi,[ebx+8*eax+4]) // load higher word of B

1168

1169

AS2( adc ebp,edi) // add higher words

1170

AS1( inc eax) // advance B

1171

1172

AS2( mov [edx+ecx-8],esi) // store lower word result

1173

AS2( mov [edx+ecx-4],ebp) // store higher word result

1174

1175

AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero

1176

1177

AS1(loopendAdd:)

1178

AS2( adc eax, 0) // store carry into eax (return result register)

1179

1180

AddEpilogue

1181

}

1182

1183

CRYPTOPP_NAKED int PentiumOptimized::Subtract(word *C, const word *A, const word *B, size_t N)

1184

{

1185

AddPrologue

1186

1187

// now: ebx = B, ecx = C, edx = A, esi = N

1188

AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C

1189

AS2( xor eax, eax) // clear eax

1190

1191

AS2( sub eax, esi) // eax is a negative index from end of B

1192

AS2( lea ebx, [ebx+4*esi]) // ebx is end of B

1193

1194

AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag

1195

AS1( jz loopendSub) // if no dwords then nothing to do

1196

1197

AS1(loopstartSub:)

1198

AS2( mov esi,[edx]) // load lower word of A

1199

AS2( mov ebp,[edx+4]) // load higher word of A

1200

1201

AS2( mov edi,[ebx+8*eax]) // load lower word of B

1202

AS2( lea edx,[edx+8]) // advance A and C

1203

1204

AS2( sbb esi,edi) // subtract lower words

1205

AS2( mov edi,[ebx+8*eax+4]) // load higher word of B

1206

1207

AS2( sbb ebp,edi) // subtract higher words

1208

AS1( inc eax) // advance B

1209

1210

AS2( mov [edx+ecx-8],esi) // store lower word result

1211

AS2( mov [edx+ecx-4],ebp) // store higher word result

1212

1213

AS1( jnz loopstartSub) // loop until eax overflows and becomes zero

1214

1215

AS1(loopendSub:)

1216

AS2( adc eax, 0) // store carry into eax (return result register)

1217

1218

AddEpilogue

1219

}

1220

1221

// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.

1222

1223

CRYPTOPP_NAKED int P4Optimized::Add(word *C, const word *A, const word *B, size_t N)

1224

{

1225

AddPrologue

1226

1227

// now: ebx = B, ecx = C, edx = A, esi = N

1228

AS2( xor eax, eax)

1229

AS1( neg esi)

1230

AS1( jz loopendAddP4) // if no dwords then nothing to do

1231

1232

AS2( mov edi, [edx])

1233

AS2( mov ebp, [ebx])

1234

AS1( jmp carry1AddP4)

1235

1236

AS1(loopstartAddP4:)

1237

AS2( mov edi, [edx+8])

1238

AS2( add ecx, 8)

1239

AS2( add edx, 8)

1240

AS2( mov ebp, [ebx])

1241

AS2( add edi, eax)

1242

AS1( jc carry1AddP4)

1243

AS2( xor eax, eax)

1244

1245

AS1(carry1AddP4:)

1246

AS2( add edi, ebp)

1247

AS2( mov ebp, 1)

1248

AS2( mov [ecx], edi)

1249

AS2( mov edi, [edx+4])

1250

AS2( cmovc eax, ebp)

1251

AS2( mov ebp, [ebx+4])

1252

AS2( add ebx, 8)

1253

AS2( add edi, eax)

1254

AS1( jc carry2AddP4)

1255

AS2( xor eax, eax)

1256

1257

AS1(carry2AddP4:)

1258

AS2( add edi, ebp)

1259

AS2( mov ebp, 1)

1260

AS2( cmovc eax, ebp)

1261

AS2( mov [ecx+4], edi)

1262

AS2( add esi, 2)

1263

AS1( jnz loopstartAddP4)

1264

1265

AS1(loopendAddP4:)

1266

1267

AddEpilogue

1268

}

1269

1270

CRYPTOPP_NAKED int P4Optimized::Subtract(word *C, const word *A, const word *B, size_t N)

1271

{

1272

AddPrologue

1273

1274

// now: ebx = B, ecx = C, edx = A, esi = N

1275

AS2( xor eax, eax)

1276

AS1( neg esi)

1277

AS1( jz loopendSubP4) // if no dwords then nothing to do

1278

1279

AS2( mov edi, [edx])

1280

AS2( mov ebp, [ebx])

1281

AS1( jmp carry1SubP4)

1282

1283

AS1(loopstartSubP4:)

1284

AS2( mov edi, [edx+8])

1285

AS2( add edx, 8)

1286

AS2( add ecx, 8)

1287

AS2( mov ebp, [ebx])

1288

AS2( sub edi, eax)

1289

AS1( jc carry1SubP4)

1290

AS2( xor eax, eax)

1291

1292

AS1(carry1SubP4:)

1293

AS2( sub edi, ebp)

1294

AS2( mov ebp, 1)

1295

AS2( mov [ecx], edi)

1296

AS2( mov edi, [edx+4])

1297

AS2( cmovc eax, ebp)

1298

AS2( mov ebp, [ebx+4])

1299

AS2( add ebx, 8)

1300

AS2( sub edi, eax)

1301

AS1( jc carry2SubP4)

1302

AS2( xor eax, eax)

1303

1304

AS1(carry2SubP4:)

1305

AS2( sub edi, ebp)

1306

AS2( mov ebp, 1)

1307

AS2( cmovc eax, ebp)

1308

AS2( mov [ecx+4], edi)

1309

AS2( add esi, 2)

1310

AS1( jnz loopstartSubP4)

1311

1312

AS1(loopendSubP4:)

1313

1314

AddEpilogue

1315

}

1316

1317

// multiply assembly code originally contributed by Leonard Janke

1318

1319

#define MulStartup \

1320

AS2(xor ebp, ebp) \

1321

AS2(xor edi, edi) \

1322

AS2(xor ebx, ebx)

1323

1324

#define MulShiftCarry \

1325

AS2(mov ebp, edx) \

1326

AS2(mov edi, ebx) \

1327

AS2(xor ebx, ebx)

1328

1329

#define MulAccumulateBottom(i,j) \

1330

AS2(mov eax, [ecx+4*j]) \

1331

AS2(imul eax, dword ptr [esi+4*i]) \

1332

AS2(add ebp, eax)

1333

1334

#define MulAccumulate(i,j) \

1335

AS2(mov eax, [ecx+4*j]) \

1336

AS1(mul dword ptr [esi+4*i]) \

1337

AS2(add ebp, eax) \

1338

AS2(adc edi, edx) \

1339

AS2(adc bl, bh)

1340

1341

#define MulStoreDigit(i) \

1342

AS2(mov edx, edi) \

1343

AS2(mov edi, [esp]) \

1344

AS2(mov [edi+4*i], ebp)

1345

1346

#define MulLastDiagonal(digits) \

1347

AS2(mov eax, [ecx+4*(digits-1)]) \

1348

AS1(mul dword ptr [esi+4*(digits-1)]) \

1349

AS2(add ebp, eax) \

1350

AS2(adc edx, edi) \

1351

AS2(mov edi, [esp]) \

1352

AS2(mov [edi+4*(2*digits-2)], ebp) \

1353

AS2(mov [edi+4*(2*digits-1)], edx)

1354

1355

CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)

1356

{

1357

MulPrologue

1358

// now: [esp] = Z, esi = X, ecx = Y

1359

MulStartup

1360

MulAccumulate(0,0)

1361

MulStoreDigit(0)

1362

MulShiftCarry

1363

1364

MulAccumulate(1,0)

1365

MulAccumulate(0,1)

1366

MulStoreDigit(1)

1367

MulShiftCarry

1368

1369

MulAccumulate(2,0)

1370

MulAccumulate(1,1)

1371

MulAccumulate(0,2)

1372

MulStoreDigit(2)

1373

MulShiftCarry

1374

1375

MulAccumulate(3,0)

1376

MulAccumulate(2,1)

1377

MulAccumulate(1,2)

1378

MulAccumulate(0,3)

1379

MulStoreDigit(3)

1380

MulShiftCarry

1381

1382

MulAccumulate(3,1)

1383

MulAccumulate(2,2)

1384

MulAccumulate(1,3)

1385

MulStoreDigit(4)

1386

MulShiftCarry

1387

1388

MulAccumulate(3,2)

1389

MulAccumulate(2,3)

1390

MulStoreDigit(5)

1391

MulShiftCarry

1392

1393

MulLastDiagonal(4)

1394

MulEpilogue

1395

}

1396

1397

CRYPTOPP_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)

1398

{

1399

MulPrologue

1400

// now: [esp] = Z, esi = X, ecx = Y

1401

MulStartup

1402

MulAccumulate(0,0)

1403

MulStoreDigit(0)

1404

MulShiftCarry

1405

1406

MulAccumulate(1,0)

1407

MulAccumulate(0,1)

1408

MulStoreDigit(1)

1409

MulShiftCarry

1410

1411

MulAccumulate(2,0)

1412

MulAccumulate(1,1)

1413

MulAccumulate(0,2)

1414

MulStoreDigit(2)

1415

MulShiftCarry

1416

1417

MulAccumulate(3,0)

1418

MulAccumulate(2,1)

1419

MulAccumulate(1,2)

1420

MulAccumulate(0,3)

1421

MulStoreDigit(3)

1422

MulShiftCarry

1423

1424

MulAccumulate(4,0)

1425

MulAccumulate(3,1)

1426

MulAccumulate(2,2)

1427

MulAccumulate(1,3)

1428

MulAccumulate(0,4)

1429

MulStoreDigit(4)

1430

MulShiftCarry

1431

1432

MulAccumulate(5,0)

1433

MulAccumulate(4,1)

1434

MulAccumulate(3,2)

1435

MulAccumulate(2,3)

1436

MulAccumulate(1,4)

1437

MulAccumulate(0,5)

1438

MulStoreDigit(5)

1439

MulShiftCarry

1440

1441

MulAccumulate(6,0)

1442

MulAccumulate(5,1)

1443

MulAccumulate(4,2)

1444

MulAccumulate(3,3)

1445

MulAccumulate(2,4)

1446

MulAccumulate(1,5)

1447

MulAccumulate(0,6)

1448

MulStoreDigit(6)

1449

MulShiftCarry

1450

1451

MulAccumulate(7,0)

1452

MulAccumulate(6,1)

1453

MulAccumulate(5,2)

1454

MulAccumulate(4,3)

1455

MulAccumulate(3,4)

1456

MulAccumulate(2,5)

1457

MulAccumulate(1,6)

1458

MulAccumulate(0,7)

1459

MulStoreDigit(7)

1460

MulShiftCarry

1461

1462

MulAccumulate(7,1)

1463

MulAccumulate(6,2)

1464

MulAccumulate(5,3)

1465

MulAccumulate(4,4)

1466

MulAccumulate(3,5)

1467

MulAccumulate(2,6)

1468

MulAccumulate(1,7)

1469

MulStoreDigit(8)

1470

MulShiftCarry

1471

1472

MulAccumulate(7,2)

1473

MulAccumulate(6,3)

1474

MulAccumulate(5,4)

1475

MulAccumulate(4,5)

1476

MulAccumulate(3,6)

1477

MulAccumulate(2,7)

1478

MulStoreDigit(9)

1479

MulShiftCarry

1480

1481

MulAccumulate(7,3)

1482

MulAccumulate(6,4)

1483

MulAccumulate(5,5)

1484

MulAccumulate(4,6)

1485

MulAccumulate(3,7)

1486

MulStoreDigit(10)

1487

MulShiftCarry

1488

1489

MulAccumulate(7,4)

1490

MulAccumulate(6,5)

1491

MulAccumulate(5,6)

1492

MulAccumulate(4,7)

1493

MulStoreDigit(11)

1494

MulShiftCarry

1495

1496

MulAccumulate(7,5)

1497

MulAccumulate(6,6)

1498

MulAccumulate(5,7)

1499

MulStoreDigit(12)

1500

MulShiftCarry

1501

1502

MulAccumulate(7,6)

1503

MulAccumulate(6,7)

1504

MulStoreDigit(13)

1505

MulShiftCarry

1506

1507

MulLastDiagonal(8)

1508

MulEpilogue

1509

}

1510

1511

CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y)

1512

{

1513

MulPrologue

1514

// now: [esp] = Z, esi = X, ecx = Y

1515

MulStartup

1516

MulAccumulate(0,0)

1517

MulStoreDigit(0)

1518

MulShiftCarry

1519

1520

MulAccumulate(1,0)

1521

MulAccumulate(0,1)

1522

MulStoreDigit(1)

1523

MulShiftCarry

1524

1525

MulAccumulate(2,0)

1526

MulAccumulate(1,1)

1527

MulAccumulate(0,2)

1528

MulStoreDigit(2)

1529

MulShiftCarry

1530

1531

MulAccumulate(3,0)

1532

MulAccumulate(2,1)

1533

MulAccumulate(1,2)

1534

MulAccumulate(0,3)

1535

MulStoreDigit(3)

1536

MulShiftCarry

1537

1538

MulAccumulate(4,0)

1539

MulAccumulate(3,1)

1540

MulAccumulate(2,2)

1541

MulAccumulate(1,3)

1542

MulAccumulate(0,4)

1543

MulStoreDigit(4)

1544

MulShiftCarry

1545

1546

MulAccumulate(5,0)

1547

MulAccumulate(4,1)

1548

MulAccumulate(3,2)

1549

MulAccumulate(2,3)

1550

MulAccumulate(1,4)

1551

MulAccumulate(0,5)

1552

MulStoreDigit(5)

1553

MulShiftCarry

1554

1555

MulAccumulate(6,0)

1556

MulAccumulate(5,1)

1557

MulAccumulate(4,2)

1558

MulAccumulate(3,3)

1559

MulAccumulate(2,4)

1560

MulAccumulate(1,5)

1561

MulAccumulate(0,6)

1562

MulStoreDigit(6)

1563

MulShiftCarry

1564

1565

MulAccumulateBottom(7,0)

1566

MulAccumulateBottom(6,1)

1567

MulAccumulateBottom(5,2)

1568

MulAccumulateBottom(4,3)

1569

MulAccumulateBottom(3,4)

1570

MulAccumulateBottom(2,5)

1571

MulAccumulateBottom(1,6)

1572

MulAccumulateBottom(0,7)

1573

MulStoreDigit(7)

1574

MulEpilogue

1575

}

1576

1577

#undef AS1

1578

#undef AS2

1579

1580

#else // not x86 - no processor specific code at this layer

1581

1582

typedef Portable LowLevel;

1583

1584

#endif

1585

1586

#ifdef SSE2_INTRINSICS_AVAILABLE

1587

1588

#ifdef __GNUC__

1589

#define CRYPTOPP_FASTCALL

1590

#else

1591

#define CRYPTOPP_FASTCALL __fastcall

1592

#endif

1593

1594

static void CRYPTOPP_FASTCALL P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)

1005

static void __fastcall P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)

1595

1006

{

1596

1007

__m128i a3210 = _mm_load_si128(A);

1597

1008

__m128i b3210 = _mm_load_si128(B);

1661

1072

1662

1073

__m64 s1, s2;

1663

1074

1664

__m64 w1 = _mm_cvtsi32_si64(w[1]);

1075

__m64 w1 = _m_from_int(w[1]);

1665

1076

__m64 w4 = mw[2];

1666

1077

__m64 w6 = mw[3];

1667

1078

__m64 w8 = mw[4];

1672

1083

__m64 w18 = mw[9];

1673

1084

__m64 w20 = mw[10];

1674

1085

__m64 w22 = mw[11];

1675

__m64 w26 = _mm_cvtsi32_si64(w[26]);

1086

__m64 w26 = _m_from_int(w[26]);

1676

1087

1677

1088

s1 = _mm_add_si64(w1, w4);

1678

C[1] = _mm_cvtsi64_si32(s1);

1679

s1 = _mm_srli_si64(s1, 32);

1089

C[1] = _m_to_int(s1);

1090

s1 = _m_psrlqi(s1, 32);

1680

1091

1681

1092

s2 = _mm_add_si64(w6, w8);

1682

1093

s1 = _mm_add_si64(s1, s2);

1683

C[2] = _mm_cvtsi64_si32(s1);

1684

s1 = _mm_srli_si64(s1, 32);

1094

C[2] = _m_to_int(s1);

1095

s1 = _m_psrlqi(s1, 32);

1685

1096

1686

1097

s2 = _mm_add_si64(w10, w12);

1687

1098

s1 = _mm_add_si64(s1, s2);

1688

C[3] = _mm_cvtsi64_si32(s1);

1689

s1 = _mm_srli_si64(s1, 32);

1099

C[3] = _m_to_int(s1);

1100

s1 = _m_psrlqi(s1, 32);

1690

1101

1691

1102

s2 = _mm_add_si64(w14, w16);

1692

1103

s1 = _mm_add_si64(s1, s2);

1693

C[4] = _mm_cvtsi64_si32(s1);

1694

s1 = _mm_srli_si64(s1, 32);

1104

C[4] = _m_to_int(s1);

1105

s1 = _m_psrlqi(s1, 32);

1695

1106

1696

1107

s2 = _mm_add_si64(w18, w20);

1697

1108

s1 = _mm_add_si64(s1, s2);

1698

C[5] = _mm_cvtsi64_si32(s1);

1699

s1 = _mm_srli_si64(s1, 32);

1109

C[5] = _m_to_int(s1);

1110

s1 = _m_psrlqi(s1, 32);

1700

1111

1701

1112

s2 = _mm_add_si64(w22, w26);

1702

1113

s1 = _mm_add_si64(s1, s2);

1703

C[6] = _mm_cvtsi64_si32(s1);

1704

s1 = _mm_srli_si64(s1, 32);

1114

C[6] = _m_to_int(s1);

1115

s1 = _m_psrlqi(s1, 32);

1705

1116

1706

C[7] = _mm_cvtsi64_si32(s1) + w[27];

1117

C[7] = _m_to_int(s1) + w[27];

1707

1118

_mm_empty();

1708

1119

}

1709

1120

1731

1142

1732

1143

__m64 s1, s2, s3, s4;

1733

1144

1734

__m64 w1 = _mm_cvtsi32_si64(w[1]);

1145

__m64 w1 = _m_from_int(w[1]);

1735

1146

__m64 w4 = mw[2];

1736

1147

__m64 w6 = mw[3];

1737

1148

__m64 w8 = mw[4];

1742

1153

__m64 w18 = mw[9];

1743

1154

__m64 w20 = mw[10];

1744

1155

__m64 w22 = mw[11];

1745

__m64 w26 = _mm_cvtsi32_si64(w[26]);

1746

__m64 w27 = _mm_cvtsi32_si64(w[27]);

1156

__m64 w26 = _m_from_int(w[26]);

1157

__m64 w27 = _m_from_int(w[27]);

1747

1158

1748

__m64 x0 = _mm_cvtsi32_si64(x[0]);

1749

__m64 x1 = _mm_cvtsi32_si64(x[1]);

1159

__m64 x0 = _m_from_int(x[0]);

1160

__m64 x1 = _m_from_int(x[1]);

1750

1161

__m64 x4 = mx[2];

1751

1162

__m64 x6 = mx[3];

1752

1163

__m64 x8 = mx[4];

1757

1168

__m64 x18 = mx[9];

1758

1169

__m64 x20 = mx[10];

1759

1170

__m64 x22 = mx[11];

1760

__m64 x26 = _mm_cvtsi32_si64(x[26]);

1761

__m64 x27 = _mm_cvtsi32_si64(x[27]);

1171

__m64 x26 = _m_from_int(x[26]);

1172

__m64 x27 = _m_from_int(x[27]);

1762

1173

1763

__m64 y0 = _mm_cvtsi32_si64(y[0]);

1764

__m64 y1 = _mm_cvtsi32_si64(y[1]);

1174

__m64 y0 = _m_from_int(y[0]);

1175

__m64 y1 = _m_from_int(y[1]);

1765

1176

__m64 y4 = my[2];

1766

1177

__m64 y6 = my[3];

1767

1178

__m64 y8 = my[4];

1772

1183

__m64 y18 = my[9];

1773

1184

__m64 y20 = my[10];

1774

1185

__m64 y22 = my[11];

1775

__m64 y26 = _mm_cvtsi32_si64(y[26]);

1776

__m64 y27 = _mm_cvtsi32_si64(y[27]);

1186

__m64 y26 = _m_from_int(y[26]);

1187

__m64 y27 = _m_from_int(y[27]);

1777

1188

1778

__m64 z0 = _mm_cvtsi32_si64(z[0]);

1779

__m64 z1 = _mm_cvtsi32_si64(z[1]);

1189

__m64 z0 = _m_from_int(z[0]);

1190

__m64 z1 = _m_from_int(z[1]);

1780

1191

__m64 z4 = mz[2];

1781

1192

__m64 z6 = mz[3];

1782

1193

__m64 z8 = mz[4];

1787

1198

__m64 z18 = mz[9];

1788

1199

__m64 z20 = mz[10];

1789

1200

__m64 z22 = mz[11];

1790

__m64 z26 = _mm_cvtsi32_si64(z[26]);

1201

__m64 z26 = _m_from_int(z[26]);

1791

1202

1792

1203

s1 = _mm_add_si64(w1, w4);

1793

C[1] = _mm_cvtsi64_si32(s1);

1794

s1 = _mm_srli_si64(s1, 32);

1204

C[1] = _m_to_int(s1);

1205

s1 = _m_psrlqi(s1, 32);

1795

1206

1796

1207

s2 = _mm_add_si64(w6, w8);

1797

1208

s1 = _mm_add_si64(s1, s2);

1798

C[2] = _mm_cvtsi64_si32(s1);

1799

s1 = _mm_srli_si64(s1, 32);

1209

C[2] = _m_to_int(s1);

1210

s1 = _m_psrlqi(s1, 32);

1800

1211

1801

1212

s2 = _mm_add_si64(w10, w12);

1802

1213

s1 = _mm_add_si64(s1, s2);

1803

C[3] = _mm_cvtsi64_si32(s1);

1804

s1 = _mm_srli_si64(s1, 32);

1214

C[3] = _m_to_int(s1);

1215

s1 = _m_psrlqi(s1, 32);

1805

1216

1806

1217

s3 = _mm_add_si64(x0, y0);

1807

1218

s2 = _mm_add_si64(w14, w16);

1808

1219

s1 = _mm_add_si64(s1, s3);

1809

1220

s1 = _mm_add_si64(s1, s2);

1810

C[4] = _mm_cvtsi64_si32(s1);

1811

s1 = _mm_srli_si64(s1, 32);

1221

C[4] = _m_to_int(s1);

1222

s1 = _m_psrlqi(s1, 32);

1812

1223

1813

1224

s3 = _mm_add_si64(x1, y1);

1814

1225

s4 = _mm_add_si64(x4, y4);

1816

1227

s3 = _mm_add_si64(s3, s4);

1817

1228

s1 = _mm_add_si64(s1, w20);

1818

1229

s1 = _mm_add_si64(s1, s3);

1819

C[5] = _mm_cvtsi64_si32(s1);

1820

s1 = _mm_srli_si64(s1, 32);

1230

C[5] = _m_to_int(s1);

1231

s1 = _m_psrlqi(s1, 32);

1821

1232

1822

1233

s3 = _mm_add_si64(x6, y6);

1823

1234

s4 = _mm_add_si64(x8, y8);

1825

1236

s3 = _mm_add_si64(s3, s4);

1826

1237

s1 = _mm_add_si64(s1, w26);

1827

1238

s1 = _mm_add_si64(s1, s3);

1828

C[6] = _mm_cvtsi64_si32(s1);

1829

s1 = _mm_srli_si64(s1, 32);

1239

C[6] = _m_to_int(s1);

1240

s1 = _m_psrlqi(s1, 32);

1830

1241

1831

1242

s3 = _mm_add_si64(x10, y10);

1832

1243

s4 = _mm_add_si64(x12, y12);

1833

1244

s1 = _mm_add_si64(s1, w27);

1834

1245

s3 = _mm_add_si64(s3, s4);

1835

1246

s1 = _mm_add_si64(s1, s3);

1836

C[7] = _mm_cvtsi64_si32(s1);

1837

s1 = _mm_srli_si64(s1, 32);

1247

C[7] = _m_to_int(s1);

1248

s1 = _m_psrlqi(s1, 32);

1838

1249

1839

1250

s3 = _mm_add_si64(x14, y14);

1840

1251

s4 = _mm_add_si64(x16, y16);

1841

1252

s1 = _mm_add_si64(s1, z0);

1842

1253

s3 = _mm_add_si64(s3, s4);

1843

1254

s1 = _mm_add_si64(s1, s3);

1844

C[8] = _mm_cvtsi64_si32(s1);

1845

s1 = _mm_srli_si64(s1, 32);

1255

C[8] = _m_to_int(s1);

1256

s1 = _m_psrlqi(s1, 32);

1846

1257

1847

1258

s3 = _mm_add_si64(x18, y18);

1848

1259

s4 = _mm_add_si64(x20, y20);

1850

1261

s3 = _mm_add_si64(s3, s4);

1851

1262

s1 = _mm_add_si64(s1, z4);

1852

1263

s1 = _mm_add_si64(s1, s3);

1853

C[9] = _mm_cvtsi64_si32(s1);

1854

s1 = _mm_srli_si64(s1, 32);

1264

C[9] = _m_to_int(s1);

1265

s1 = _m_psrlqi(s1, 32);

1855

1266

1856

1267

s3 = _mm_add_si64(x22, y22);

1857

1268

s4 = _mm_add_si64(x26, y26);

1859

1270

s3 = _mm_add_si64(s3, s4);

1860

1271

s1 = _mm_add_si64(s1, z8);

1861

1272

s1 = _mm_add_si64(s1, s3);

1862

C[10] = _mm_cvtsi64_si32(s1);

1863

s1 = _mm_srli_si64(s1, 32);

1273

C[10] = _m_to_int(s1);

1274

s1 = _m_psrlqi(s1, 32);

1864

1275

1865

1276

s3 = _mm_add_si64(x27, y27);

1866

1277

s1 = _mm_add_si64(s1, z10);

1867

1278

s1 = _mm_add_si64(s1, z12);

1868

1279

s1 = _mm_add_si64(s1, s3);

1869

C[11] = _mm_cvtsi64_si32(s1);

1870

s1 = _mm_srli_si64(s1, 32);

1280

C[11] = _m_to_int(s1);

1281

s1 = _m_psrlqi(s1, 32);

1871

1282

1872

1283

s3 = _mm_add_si64(z14, z16);

1873

1284

s1 = _mm_add_si64(s1, s3);

1874

C[12] = _mm_cvtsi64_si32(s1);

1875

s1 = _mm_srli_si64(s1, 32);

1285

C[12] = _m_to_int(s1);

1286

s1 = _m_psrlqi(s1, 32);

1876

1287

1877

1288

s3 = _mm_add_si64(z18, z20);

1878

1289

s1 = _mm_add_si64(s1, s3);

1879

C[13] = _mm_cvtsi64_si32(s1);

1880

s1 = _mm_srli_si64(s1, 32);

1290

C[13] = _m_to_int(s1);

1291

s1 = _m_psrlqi(s1, 32);

1881

1292

1882

1293

s3 = _mm_add_si64(z22, z26);

1883

1294

s1 = _mm_add_si64(s1, s3);

1884

C[14] = _mm_cvtsi64_si32(s1);

1885

s1 = _mm_srli_si64(s1, 32);

1295

C[14] = _m_to_int(s1);

1296

s1 = _m_psrlqi(s1, 32);

1886

1297

1887

C[15] = z[27] + _mm_cvtsi64_si32(s1);

1298

C[15] = z[27] + _m_to_int(s1);

1888

1299

_mm_empty();

1889

1300

}

1890

1301

1908

1319

1909

1320

__m64 s1, s2, s3, s4;

1910

1321

1911

__m64 w1 = _mm_cvtsi32_si64(w[1]);

1322

__m64 w1 = _m_from_int(w[1]);

1912

1323

__m64 w4 = mw[2];

1913

1324

__m64 w6 = mw[3];

1914

1325

__m64 w8 = mw[4];

1919

1330

__m64 w18 = mw[9];

1920

1331

__m64 w20 = mw[10];

1921

1332

__m64 w22 = mw[11];

1922

__m64 w26 = _mm_cvtsi32_si64(w[26]);

1333

__m64 w26 = _m_from_int(w[26]);

1923

1334

1924

__m64 x0 = _mm_cvtsi32_si64(x[0]);

1925

__m64 x1 = _mm_cvtsi32_si64(x[1]);

1335

__m64 x0 = _m_from_int(x[0]);

1336

__m64 x1 = _m_from_int(x[1]);

1926

1337

__m64 x4 = mx[2];

1927

1338

__m64 x6 = mx[3];

1928

1339

__m64 x8 = mx[4];

1929

1340

1930

__m64 y0 = _mm_cvtsi32_si64(y[0]);

1931

__m64 y1 = _mm_cvtsi32_si64(y[1]);

1341

__m64 y0 = _m_from_int(y[0]);

1342

__m64 y1 = _m_from_int(y[1]);

1932

1343

__m64 y4 = my[2];

1933

1344

__m64 y6 = my[3];

1934

1345

__m64 y8 = my[4];

1935

1346

1936

1347

s1 = _mm_add_si64(w1, w4);

1937

C[1] = _mm_cvtsi64_si32(s1);

1938

s1 = _mm_srli_si64(s1, 32);

1348

C[1] = _m_to_int(s1);

1349

s1 = _m_psrlqi(s1, 32);

1939

1350

1940

1351

s2 = _mm_add_si64(w6, w8);

1941

1352

s1 = _mm_add_si64(s1, s2);

1942

C[2] = _mm_cvtsi64_si32(s1);

1943

s1 = _mm_srli_si64(s1, 32);

1353

C[2] = _m_to_int(s1);

1354

s1 = _m_psrlqi(s1, 32);

1944

1355

1945

1356

s2 = _mm_add_si64(w10, w12);

1946

1357

s1 = _mm_add_si64(s1, s2);

1947

C[3] = _mm_cvtsi64_si32(s1);

1948

s1 = _mm_srli_si64(s1, 32);

1358

C[3] = _m_to_int(s1);

1359

s1 = _m_psrlqi(s1, 32);

1949

1360

1950

1361

s3 = _mm_add_si64(x0, y0);

1951

1362

s2 = _mm_add_si64(w14, w16);

1952

1363

s1 = _mm_add_si64(s1, s3);

1953

1364

s1 = _mm_add_si64(s1, s2);

1954

C[4] = _mm_cvtsi64_si32(s1);

1955

s1 = _mm_srli_si64(s1, 32);

1365

C[4] = _m_to_int(s1);

1366

s1 = _m_psrlqi(s1, 32);

1956

1367

1957

1368

s3 = _mm_add_si64(x1, y1);

1958

1369

s4 = _mm_add_si64(x4, y4);

1960

1371

s3 = _mm_add_si64(s3, s4);

1961

1372

s1 = _mm_add_si64(s1, w20);

1962

1373

s1 = _mm_add_si64(s1, s3);

1963

C[5] = _mm_cvtsi64_si32(s1);

1964

s1 = _mm_srli_si64(s1, 32);

1374

C[5] = _m_to_int(s1);

1375

s1 = _m_psrlqi(s1, 32);

1965

1376

1966

1377

s3 = _mm_add_si64(x6, y6);

1967

1378

s4 = _mm_add_si64(x8, y8);

1969

1380

s3 = _mm_add_si64(s3, s4);

1970

1381

s1 = _mm_add_si64(s1, w26);

1971

1382

s1 = _mm_add_si64(s1, s3);

1972

C[6] = _mm_cvtsi64_si32(s1);

1973

s1 = _mm_srli_si64(s1, 32);

1383

C[6] = _m_to_int(s1);

1384

s1 = _m_psrlqi(s1, 32);

1974

1385

1975

C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];

1386

C[7] = _m_to_int(s1) + w[27] + x[10] + y[10] + x[12] + y[12];

1976

1387

_mm_empty();

1977

1388

}

1978

1389

1390

__declspec(naked) word __fastcall P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)

1391

{

1392

__asm

1393

{

1394

sub esp, 16

1395

xor eax, eax

1396

mov [esp], edi

1397

mov [esp+4], esi

1398

mov [esp+8], ebx

1399

mov [esp+12], ebp

1400

1401

mov ebx, [esp+20] // B

1402

mov esi, [esp+24] // N

1403

1404

// now: ebx = B, ecx = C, edx = A, esi = N

1405

1406

neg esi

1407

jz loopend // if no dwords then nothing to do

1408

1409

mov edi, [edx]

1410

mov ebp, [ebx]

1411

1412

loopstart:

1413

add edi, eax

1414

jc carry1

1415

1416

xor eax, eax

1417

1418

carry1continue:

1419

add edi, ebp

1420

mov ebp, 1

1421

mov [ecx], edi

1422

mov edi, [edx+4]

1423

cmovc eax, ebp

1424

mov ebp, [ebx+4]

1425

lea ebx, [ebx+8]

1426

add edi, eax

1427

jc carry2

1428

1429

xor eax, eax

1430

1431

carry2continue:

1432

add edi, ebp

1433

mov ebp, 1

1434

cmovc eax, ebp

1435

mov [ecx+4], edi

1436

add ecx, 8

1437

mov edi, [edx+8]

1438

add edx, 8

1439

add esi, 2

1440

mov ebp, [ebx]

1441

jnz loopstart

1442

1443

loopend:

1444

mov edi, [esp]

1445

mov esi, [esp+4]

1446

mov ebx, [esp+8]

1447

mov ebp, [esp+12]

1448

add esp, 16

1449

ret 8

1450

1451

carry1:

1452

mov eax, 1

1453

jmp carry1continue

1454

1455

carry2:

1456

mov eax, 1

1457

jmp carry2continue

1458

}

1459

}

1460

1461

__declspec(naked) word __fastcall P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)

1462

{

1463

__asm

1464

{

1465

sub esp, 16

1466

xor eax, eax

1467

mov [esp], edi

1468

mov [esp+4], esi

1469

mov [esp+8], ebx

1470

mov [esp+12], ebp

1471

1472

mov ebx, [esp+20] // B

1473

mov esi, [esp+24] // N

1474

1475

// now: ebx = B, ecx = C, edx = A, esi = N

1476

1477

neg esi

1478

jz loopend // if no dwords then nothing to do

1479

1480

mov edi, [edx]

1481

mov ebp, [ebx]

1482

1483

loopstart:

1484

sub edi, eax

1485

jc carry1

1486

1487

xor eax, eax

1488

1489

carry1continue:

1490

sub edi, ebp

1491

mov ebp, 1

1492

mov [ecx], edi

1493

mov edi, [edx+4]

1494

cmovc eax, ebp

1495

mov ebp, [ebx+4]

1496

lea ebx, [ebx+8]

1497

sub edi, eax

1498

jc carry2

1499

1500

xor eax, eax

1501

1502

carry2continue:

1503

sub edi, ebp

1504

mov ebp, 1

1505

cmovc eax, ebp

1506

mov [ecx+4], edi

1507

add ecx, 8

1508

mov edi, [edx+8]

1509

add edx, 8

1510

add esi, 2

1511

mov ebp, [ebx]

1512

jnz loopstart

1513

1514

loopend:

1515

mov edi, [esp]

1516

mov esi, [esp+4]

1517

mov ebx, [esp+8]

1518

mov ebp, [esp+12]

1519

add esp, 16

1520

ret 8

1521

1522

carry1:

1523

mov eax, 1

1524

jmp carry1continue

1525

1526

carry2:

1527

mov eax, 1

1528

jmp carry2continue

1529

}

1530

}

1531

1979

1532

#endif // #ifdef SSE2_INTRINSICS_AVAILABLE

1980

1533

1534

#elif defined(__GNUC__) && defined(__i386__)

1535

1536

class PentiumOptimized : public Portable

1537

{

1538

public:

1539

#ifndef __pic__ // -fpic uses up a register, leaving too few for the asm code

1540

static word Add(word *C, const word *A, const word *B, unsigned int N);

1541

static word Subtract(word *C, const word *A, const word *B, unsigned int N);

1542

#endif

1543

static void Square4(word *R, const word *A);

1544

static void Multiply4(word *C, const word *A, const word *B);

1545

static void Multiply8(word *C, const word *A, const word *B);

1546

};

1547

1548

typedef PentiumOptimized LowLevel;

1549

1550

// Add and Subtract assembly code originally contributed by Alister Lee

1551

1552

#ifndef __pic__

1553

__attribute__((regparm(3))) word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)

1554

{

1555

assert (N%2 == 0);

1556

1557

register word carry, temp;

1558

1559

__asm__ __volatile__(

1560

"push %%ebp;"

1561

"sub %3, %2;"

1562

"xor %0, %0;"

1563

"sub %4, %0;"

1564

"lea (%1,%4,4), %1;"

1565

"sar $1, %0;"

1566

"jz 1f;"

1567

1568

"0:;"

1569

"mov 0(%3), %4;"

1570

"mov 4(%3), %%ebp;"

1571

"mov (%1,%0,8), %5;"

1572

"lea 8(%3), %3;"

1573

"adc %5, %4;"

1574

"mov 4(%1,%0,8), %5;"

1575

"adc %5, %%ebp;"

1576

"inc %0;"

1577

"mov %4, -8(%3, %2);"

1578

"mov %%ebp, -4(%3, %2);"

1579

"jnz 0b;"

1580

1581

"1:;"

1582

"adc $0, %0;"

1583

"pop %%ebp;"

1584

1585

: "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)

1586

: : "cc", "memory");

1587

1588

return carry;

1589

}

1590

1591

__attribute__((regparm(3))) word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)

1592

{

1593

assert (N%2 == 0);

1594

1595

register word carry, temp;

1596

1597

__asm__ __volatile__(

1598

"push %%ebp;"

1599

"sub %3, %2;"

1600

"xor %0, %0;"

1601

"sub %4, %0;"

1602

"lea (%1,%4,4), %1;"

1603

"sar $1, %0;"

1604

"jz 1f;"

1605

1606

"0:;"

1607

"mov 0(%3), %4;"

1608

"mov 4(%3), %%ebp;"

1609

"mov (%1,%0,8), %5;"

1610

"lea 8(%3), %3;"

1611

"sbb %5, %4;"

1612

"mov 4(%1,%0,8), %5;"

1613

"sbb %5, %%ebp;"

1614

"inc %0;"

1615

"mov %4, -8(%3, %2);"

1616

"mov %%ebp, -4(%3, %2);"

1617

"jnz 0b;"

1618

1619

"1:;"

1620

"adc $0, %0;"

1621

"pop %%ebp;"

1622

1623

: "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)

1624

: : "cc", "memory");

1625

1626

return carry;

1627

}

1628

#endif // __pic__

1629

1630

// Comba square and multiply assembly code originally contributed by Leonard Janke

1631

1632

#define SqrStartup \

1633

"push %%ebp\n\t" \

1634

"push %%esi\n\t" \

1635

"push %%ebx\n\t" \

1636

"xor %%ebp, %%ebp\n\t" \

1637

"xor %%ebx, %%ebx\n\t" \

1638

"xor %%ecx, %%ecx\n\t"

1639

1640

#define SqrShiftCarry \

1641

"mov %%ebx, %%ebp\n\t" \

1642

"mov %%ecx, %%ebx\n\t" \

1643

"xor %%ecx, %%ecx\n\t"

1644

1645

#define SqrAccumulate(i,j) \

1646

"mov 4*"#j"(%%esi), %%eax\n\t" \

1647

"mull 4*"#i"(%%esi)\n\t" \

1648

"add %%eax, %%ebp\n\t" \

1649

"adc %%edx, %%ebx\n\t" \

1650

"adc %%ch, %%cl\n\t" \

1651

"add %%eax, %%ebp\n\t" \

1652

"adc %%edx, %%ebx\n\t" \

1653

"adc %%ch, %%cl\n\t"

1654

1655

#define SqrAccumulateCentre(i) \

1656

"mov 4*"#i"(%%esi), %%eax\n\t" \

1657

"mull 4*"#i"(%%esi)\n\t" \

1658

"add %%eax, %%ebp\n\t" \

1659

"adc %%edx, %%ebx\n\t" \

1660

"adc %%ch, %%cl\n\t"

1661

1662

#define SqrStoreDigit(X) \

1663

"mov %%ebp, 4*"#X"(%%edi)\n\t" \

1664

1665

#define SqrLastDiagonal(digits) \

1666

"mov 4*("#digits"-1)(%%esi), %%eax\n\t" \

1667

"mull 4*("#digits"-1)(%%esi)\n\t" \

1668

"add %%eax, %%ebp\n\t" \

1669

"adc %%edx, %%ebx\n\t" \

1670

"mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \

1671

"mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t"

1672

1673

#define SqrCleanup \

1674

"pop %%ebx\n\t" \

1675

"pop %%esi\n\t" \

1676

"pop %%ebp\n\t"

1677

1678

void PentiumOptimized::Square4(word* Y, const word* X)

1679

{

1680

__asm__ __volatile__(

1681

SqrStartup

1682

1683

SqrAccumulateCentre(0)

1684

SqrStoreDigit(0)

1685

SqrShiftCarry

1686

1687

SqrAccumulate(1,0)

1688

SqrStoreDigit(1)

1689

SqrShiftCarry

1690

1691

SqrAccumulate(2,0)

1692

SqrAccumulateCentre(1)

1693

SqrStoreDigit(2)

1694

SqrShiftCarry

1695

1696

SqrAccumulate(3,0)

1697

SqrAccumulate(2,1)

1698

SqrStoreDigit(3)

1699

SqrShiftCarry

1700

1701

SqrAccumulate(3,1)

1702

SqrAccumulateCentre(2)

1703

SqrStoreDigit(4)

1704

SqrShiftCarry

1705

1706

SqrAccumulate(3,2)

1707

SqrStoreDigit(5)

1708

SqrShiftCarry

1709

1710

SqrLastDiagonal(4)

1711

1712

SqrCleanup

1713

1714

:

1715

: "D" (Y), "S" (X)

1716

: "eax", "ecx", "edx", "ebp", "memory"

1717

);

1718

}

1719

1720

#define MulStartup \

1721

"push %%ebp\n\t" \

1722

"push %%esi\n\t" \

1723

"push %%ebx\n\t" \

1724

"push %%edi\n\t" \

1725

"mov %%eax, %%ebx \n\t" \

1726

"xor %%ebp, %%ebp\n\t" \

1727

"xor %%edi, %%edi\n\t" \

1728

"xor %%ecx, %%ecx\n\t"

1729

1730

#define MulShiftCarry \

1731

"mov %%edx, %%ebp\n\t" \

1732

"mov %%ecx, %%edi\n\t" \

1733

"xor %%ecx, %%ecx\n\t"

1734

1735

#define MulAccumulate(i,j) \

1736

"mov 4*"#j"(%%ebx), %%eax\n\t" \

1737

"mull 4*"#i"(%%esi)\n\t" \

1738

"add %%eax, %%ebp\n\t" \

1739

"adc %%edx, %%edi\n\t" \

1740

"adc %%ch, %%cl\n\t"

1741

1742

#define MulStoreDigit(X) \

1743

"mov %%edi, %%edx \n\t" \

1744

"mov (%%esp), %%edi \n\t" \

1745

"mov %%ebp, 4*"#X"(%%edi)\n\t" \

1746

"mov %%edi, (%%esp)\n\t"

1747

1748

#define MulLastDiagonal(digits) \

1749

"mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \

1750

"mull 4*("#digits"-1)(%%esi)\n\t" \

1751

"add %%eax, %%ebp\n\t" \

1752

"adc %%edi, %%edx\n\t" \

1753

"mov (%%esp), %%edi\n\t" \

1754

"mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \

1755

"mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t"

1756

1757

#define MulCleanup \

1758

"pop %%edi\n\t" \

1759

"pop %%ebx\n\t" \

1760

"pop %%esi\n\t" \

1761

"pop %%ebp\n\t"

1762

1763

void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)

1764

{

1765

__asm__ __volatile__(

1766

MulStartup

1767

MulAccumulate(0,0)

1768

MulStoreDigit(0)

1769

MulShiftCarry

1770

1771

MulAccumulate(1,0)

1772

MulAccumulate(0,1)

1773

MulStoreDigit(1)

1774

MulShiftCarry

1775

1776

MulAccumulate(2,0)

1777

MulAccumulate(1,1)

1778

MulAccumulate(0,2)

1779

MulStoreDigit(2)

1780

MulShiftCarry

1781

1782

MulAccumulate(3,0)

1783

MulAccumulate(2,1)

1784

MulAccumulate(1,2)

1785

MulAccumulate(0,3)

1786

MulStoreDigit(3)

1787

MulShiftCarry

1788

1789

MulAccumulate(3,1)

1790

MulAccumulate(2,2)

1791

MulAccumulate(1,3)

1792

MulStoreDigit(4)

1793

MulShiftCarry

1794

1795

MulAccumulate(3,2)

1796

MulAccumulate(2,3)

1797

MulStoreDigit(5)

1798

MulShiftCarry

1799

1800

MulLastDiagonal(4)

1801

1802

MulCleanup

1803

1804

:

1805

: "D" (Z), "S" (X), "a" (Y)

1806

: "%ecx", "%edx", "memory"

1807

);

1808

}

1809

1810

void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)

1811

{

1812

__asm__ __volatile__(

1813

MulStartup

1814

MulAccumulate(0,0)

1815

MulStoreDigit(0)

1816

MulShiftCarry

1817

1818

MulAccumulate(1,0)

1819

MulAccumulate(0,1)

1820

MulStoreDigit(1)

1821

MulShiftCarry

1822

1823

MulAccumulate(2,0)

1824

MulAccumulate(1,1)

1825

MulAccumulate(0,2)

1826

MulStoreDigit(2)

1827

MulShiftCarry

1828

1829

MulAccumulate(3,0)

1830

MulAccumulate(2,1)

1831

MulAccumulate(1,2)

1832

MulAccumulate(0,3)

1833

MulStoreDigit(3)

1834

MulShiftCarry

1835

1836

MulAccumulate(4,0)

1837

MulAccumulate(3,1)

1838

MulAccumulate(2,2)

1839

MulAccumulate(1,3)

1840

MulAccumulate(0,4)

1841

MulStoreDigit(4)

1842

MulShiftCarry

1843

1844

MulAccumulate(5,0)

1845

MulAccumulate(4,1)

1846

MulAccumulate(3,2)

1847

MulAccumulate(2,3)

1848

MulAccumulate(1,4)

1849

MulAccumulate(0,5)

1850

MulStoreDigit(5)

1851

MulShiftCarry

1852

1853

MulAccumulate(6,0)

1854

MulAccumulate(5,1)

1855

MulAccumulate(4,2)

1856

MulAccumulate(3,3)

1857

MulAccumulate(2,4)

1858

MulAccumulate(1,5)

1859

MulAccumulate(0,6)

1860

MulStoreDigit(6)

1861

MulShiftCarry

1862

1863

MulAccumulate(7,0)

1864

MulAccumulate(6,1)

1865

MulAccumulate(5,2)

1866

MulAccumulate(4,3)

1867

MulAccumulate(3,4)

1868

MulAccumulate(2,5)

1869

MulAccumulate(1,6)

1870

MulAccumulate(0,7)

1871

MulStoreDigit(7)

1872

MulShiftCarry

1873

1874

MulAccumulate(7,1)

1875

MulAccumulate(6,2)

1876

MulAccumulate(5,3)

1877

MulAccumulate(4,4)

1878

MulAccumulate(3,5)

1879

MulAccumulate(2,6)

1880

MulAccumulate(1,7)

1881

MulStoreDigit(8)

1882

MulShiftCarry

1883

1884

MulAccumulate(7,2)

1885

MulAccumulate(6,3)

1886

MulAccumulate(5,4)

1887

MulAccumulate(4,5)

1888

MulAccumulate(3,6)

1889

MulAccumulate(2,7)

1890

MulStoreDigit(9)

1891

MulShiftCarry

1892

1893

MulAccumulate(7,3)

1894

MulAccumulate(6,4)

1895

MulAccumulate(5,5)

1896

MulAccumulate(4,6)

1897

MulAccumulate(3,7)

1898

MulStoreDigit(10)

1899

MulShiftCarry

1900

1901

MulAccumulate(7,4)

1902

MulAccumulate(6,5)

1903

MulAccumulate(5,6)

1904

MulAccumulate(4,7)

1905

MulStoreDigit(11)

1906

MulShiftCarry

1907

1908

MulAccumulate(7,5)

1909

MulAccumulate(6,6)

1910

MulAccumulate(5,7)

1911

MulStoreDigit(12)

1912

MulShiftCarry

1913

1914

MulAccumulate(7,6)

1915

MulAccumulate(6,7)

1916

MulStoreDigit(13)

1917

MulShiftCarry

1918

1919

MulLastDiagonal(8)

1920

1921

MulCleanup

1922

1923

:

1924

: "D" (Z), "S" (X), "a" (Y)

1925

: "%ecx", "%edx", "memory"

1926

);

1927

}

1928

1929

#else // no processor specific code at this layer

1930

1931

typedef Portable LowLevel;

1932

1933

#endif

1934

1981

1935

// ********************************************************

1982

1936

1983

1937

#define A0 A

1995

1949

#define R2 (R+N)

1996

1950

#define R3 (R+N+N2)

1997

1951

1952

//VC60 workaround: compiler bug triggered without the extra dummy parameters

1953

1998

1954

// R[2*N] - result = A*B

1999

1955

// T[2*N] - temporary work space

2000

1956

// A[N] --- multiplier

2001

1957

// B[N] --- multiplicant

2002

1958

2003

void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)

1959

template <class P>

1960

void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);

1961

1962

template <class P>

1963

inline void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)

2004

1964

{

2005

1965

assert(N>=2 && N%2==0);

2006

1966

2007

if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)

2008

LowLevel::Multiply8(R, A, B);

2009

else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)

2010

LowLevel::Multiply4(R, A, B);

1967

if (P::MultiplyRecursionLimit() >= 8 && N==8)

1968

P::Multiply8(R, A, B);

1969

else if (P::MultiplyRecursionLimit() >= 4 && N==4)

1970

P::Multiply4(R, A, B);

2011

1971

else if (N==2)

2012

LowLevel::Multiply2(R, A, B);

1972

P::Multiply2(R, A, B);

2013

1973

else

1974

DoRecursiveMultiply(R, T, A, B, N, NULL); // VC60 workaround: needs this NULL

1975

}

1976

1977

template <class P>

1978

void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy)

1979

{

1980

const unsigned int N2 = N/2;

1981

int carry;

1982

1983

int aComp = Compare(A0, A1, N2);

1984

int bComp = Compare(B0, B1, N2);

1985

1986

switch (2*aComp + aComp + bComp)

2014

1987

{

2015

const size_t N2 = N/2;

2016

int carry;

2017

2018

int aComp = Compare(A0, A1, N2);

2019

int bComp = Compare(B0, B1, N2);

2020

2021

switch (2*aComp + aComp + bComp)

2022

{

2023

case -4:

2024

LowLevel::Subtract(R0, A1, A0, N2);

2025

LowLevel::Subtract(R1, B0, B1, N2);

2026

RecursiveMultiply(T0, T2, R0, R1, N2);

2027

LowLevel::Subtract(T1, T1, R0, N2);

2028

carry = -1;

2029

break;

2030

case -2:

2031

LowLevel::Subtract(R0, A1, A0, N2);

2032

LowLevel::Subtract(R1, B0, B1, N2);

2033

RecursiveMultiply(T0, T2, R0, R1, N2);

2034

carry = 0;

2035

break;

2036

case 2:

2037

LowLevel::Subtract(R0, A0, A1, N2);

2038

LowLevel::Subtract(R1, B1, B0, N2);

2039

RecursiveMultiply(T0, T2, R0, R1, N2);

2040

carry = 0;

2041

break;

2042

case 4:

2043

LowLevel::Subtract(R0, A1, A0, N2);

2044

LowLevel::Subtract(R1, B0, B1, N2);

2045

RecursiveMultiply(T0, T2, R0, R1, N2);

2046

LowLevel::Subtract(T1, T1, R1, N2);

2047

carry = -1;

2048

break;

2049

default:

2050

SetWords(T0, 0, N);

2051

carry = 0;

2052

}

2053

2054

RecursiveMultiply(R0, T2, A0, B0, N2);

2055

RecursiveMultiply(R2, T2, A1, B1, N2);

2056

2057

// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1

2058

2059

carry += LowLevel::Add(T0, T0, R0, N);

2060

carry += LowLevel::Add(T0, T0, R2, N);

2061

carry += LowLevel::Add(R1, R1, T0, N);

2062

2063

assert (carry >= 0 && carry <= 2);

2064

Increment(R3, N2, carry);

1988

case -4:

1989

P::Subtract(R0, A1, A0, N2);

1990

P::Subtract(R1, B0, B1, N2);

1991

RecursiveMultiply(T0, T2, R0, R1, N2);

1992

P::Subtract(T1, T1, R0, N2);

1993

carry = -1;

1994

break;

1995

case -2:

1996

P::Subtract(R0, A1, A0, N2);

1997

P::Subtract(R1, B0, B1, N2);

1998

RecursiveMultiply(T0, T2, R0, R1, N2);

1999

carry = 0;

2000

break;

2001

case 2:

2002

P::Subtract(R0, A0, A1, N2);

2003

P::Subtract(R1, B1, B0, N2);

2004

RecursiveMultiply(T0, T2, R0, R1, N2);

2005

carry = 0;

2006

break;

2007

case 4:

2008

P::Subtract(R0, A1, A0, N2);

2009

P::Subtract(R1, B0, B1, N2);

2010

RecursiveMultiply(T0, T2, R0, R1, N2);

2011

P::Subtract(T1, T1, R1, N2);

2012

carry = -1;

2013

break;

2014

default:

2015

SetWords(T0, 0, N);

2016

carry = 0;

2065

2017

}

2018

2019

RecursiveMultiply(R0, T2, A0, B0, N2);

2020

RecursiveMultiply(R2, T2, A1, B1, N2);

2021

2022

// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1

2023

2024

carry += P::Add(T0, T0, R0, N);

2025

carry += P::Add(T0, T0, R2, N);

2026

carry += P::Add(R1, R1, T0, N);

2027

2028

assert (carry >= 0 && carry <= 2);

2029

Increment(R3, N2, carry);

2066

2030

}

2067

2031

2068

2032

// R[2*N] - result = A*A

2069

2033

// T[2*N] - temporary work space

2070

2034

// A[N] --- number to be squared

2071

2035

2072

void RecursiveSquare(word *R, word *T, const word *A, size_t N)

2036

template <class P>

2037

void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL);

2038

2039

template <class P>

2040

inline void RecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL)

2073

2041

{

2074

2042

assert(N && N%2==0);

2075

if (LowLevel::SquareRecursionLimit() >= 8 && N==8)

2076

LowLevel::Square8(R, A);

2077

if (LowLevel::SquareRecursionLimit() >= 4 && N==4)

2078

LowLevel::Square4(R, A);

2043

if (P::SquareRecursionLimit() >= 8 && N==8)

2044

P::Square8(R, A);

2045

if (P::SquareRecursionLimit() >= 4 && N==4)

2046

P::Square4(R, A);

2079

2047

else if (N==2)

2080

LowLevel::Square2(R, A);

2048

P::Square2(R, A);

2081

2049

else

2082

{

2083

const size_t N2 = N/2;

2084

2085

RecursiveSquare(R0, T2, A0, N2);

2086

RecursiveSquare(R2, T2, A1, N2);

2087

RecursiveMultiply(T0, T2, A0, A1, N2);

2088

2089

int carry = LowLevel::Add(R1, R1, T0, N);

2090

carry += LowLevel::Add(R1, R1, T0, N);

2091

Increment(R3, N2, carry);

2092

}

2050

DoRecursiveSquare(R, T, A, N, NULL); // VC60 workaround: needs this NULL

2051

}

2052

2053

template <class P>

2054

void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy)

2055

{

2056

const unsigned int N2 = N/2;

2057

2058

RecursiveSquare(R0, T2, A0, N2);

2059

RecursiveSquare(R2, T2, A1, N2);

2060

RecursiveMultiply(T0, T2, A0, A1, N2);

2061

2062

word carry = P::Add(R1, R1, T0, N);

2063

carry += P::Add(R1, R1, T0, N);

2064

Increment(R3, N2, carry);

2093

2065

}

2094

2066

2095

2067

// R[N] - bottom half of A*B

2097

2069

// A[N] - multiplier

2098

2070

// B[N] - multiplicant

2099

2071

2100

void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)

2072

template <class P>

2073

void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);

2074

2075

template <class P>

2076

inline void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)

2101

2077

{

2102

2078

assert(N>=2 && N%2==0);

2103

if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8)

2104

LowLevel::Multiply8Bottom(R, A, B);

2105

else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4)

2106

LowLevel::Multiply4Bottom(R, A, B);

2079

if (P::MultiplyBottomRecursionLimit() >= 8 && N==8)

2080

P::Multiply8Bottom(R, A, B);

2081

else if (P::MultiplyBottomRecursionLimit() >= 4 && N==4)

2082

P::Multiply4Bottom(R, A, B);

2107

2083

else if (N==2)

2108

LowLevel::Multiply2Bottom(R, A, B);

2084

P::Multiply2Bottom(R, A, B);

2109

2085

else

2110

{

2111

const size_t N2 = N/2;

2112

2113

RecursiveMultiply(R, T, A0, B0, N2);

2114

RecursiveMultiplyBottom(T0, T1, A1, B0, N2);

2115

LowLevel::Add(R1, R1, T0, N2);

2116

RecursiveMultiplyBottom(T0, T1, A0, B1, N2);

2117

LowLevel::Add(R1, R1, T0, N2);

2118

}

2086

DoRecursiveMultiplyBottom(R, T, A, B, N, NULL);

2087

}

2088

2089

template <class P>

2090

void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy)

2091

{

2092

const unsigned int N2 = N/2;

2093

2094

RecursiveMultiply(R, T, A0, B0, N2);

2095

RecursiveMultiplyBottom(T0, T1, A1, B0, N2);

2096

P::Add(R1, R1, T0, N2);

2097

RecursiveMultiplyBottom(T0, T1, A0, B1, N2);

2098

P::Add(R1, R1, T0, N2);

2119

2099

}

2120

2100

2121

2101

// R[N] --- upper half of A*B

2124

2104

// A[N] --- multiplier

2125

2105

// B[N] --- multiplicant

2126

2106

2127

void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)

2107

template <class P>

2108

void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N, const P *dummy=NULL)

2128

2109

{

2129

2110

assert(N>=2 && N%2==0);

2130

2111

2131

2112

if (N==4)

2132

2113

{

2133

LowLevel::Multiply4(T, A, B);

2114

P::Multiply4(T, A, B);

2134

2115

memcpy(R, T+4, 4*WORD_SIZE);

2135

2116

}

2136

2117

else if (N==2)

2137

2118

{

2138

LowLevel::Multiply2(T, A, B);

2119

P::Multiply2(T, A, B);

2139

2120

memcpy(R, T+2, 2*WORD_SIZE);

2140

2121

}

2141

2122

else

2142

2123

{

2143

const size_t N2 = N/2;

2124

const unsigned int N2 = N/2;

2144

2125

int carry;

2145

2126

2146

2127

int aComp = Compare(A0, A1, N2);

2149

2130

switch (2*aComp + aComp + bComp)

2150

2131

{

2151

2132

case -4:

2152

LowLevel::Subtract(R0, A1, A0, N2);

2153

LowLevel::Subtract(R1, B0, B1, N2);

2154

RecursiveMultiply(T0, T2, R0, R1, N2);

2155

LowLevel::Subtract(T1, T1, R0, N2);

2133

P::Subtract(R0, A1, A0, N2);

2134

P::Subtract(R1, B0, B1, N2);

2135

RecursiveMultiply(T0, T2, R0, R1, N2);

2136

P::Subtract(T1, T1, R0, N2);

2156

2137

carry = -1;

2157

2138

break;

2158

2139

case -2:

2159

LowLevel::Subtract(R0, A1, A0, N2);

2160

LowLevel::Subtract(R1, B0, B1, N2);

2161

RecursiveMultiply(T0, T2, R0, R1, N2);

2140

P::Subtract(R0, A1, A0, N2);

2141

P::Subtract(R1, B0, B1, N2);

2142

RecursiveMultiply(T0, T2, R0, R1, N2);

2162

2143

carry = 0;

2163

2144

break;

2164

2145

case 2:

2165

LowLevel::Subtract(R0, A0, A1, N2);

2166

LowLevel::Subtract(R1, B1, B0, N2);

2167

RecursiveMultiply(T0, T2, R0, R1, N2);

2146

P::Subtract(R0, A0, A1, N2);

2147

P::Subtract(R1, B1, B0, N2);

2148

RecursiveMultiply(T0, T2, R0, R1, N2);

2168

2149

carry = 0;

2169

2150

break;

2170

2151

case 4:

2171

LowLevel::Subtract(R0, A1, A0, N2);

2172

LowLevel::Subtract(R1, B0, B1, N2);

2173

RecursiveMultiply(T0, T2, R0, R1, N2);

2174

LowLevel::Subtract(T1, T1, R1, N2);

2152

P::Subtract(R0, A1, A0, N2);

2153

P::Subtract(R1, B0, B1, N2);

2154

RecursiveMultiply(T0, T2, R0, R1, N2);

2155

P::Subtract(T1, T1, R1, N2);

2175

2156

carry = -1;

2176

2157

break;

2177

2158

default:

2179

2160

carry = 0;

2180

2161

}

2181

2162

2182

RecursiveMultiply(T2, R0, A1, B1, N2);

2163

RecursiveMultiply(T2, R0, A1, B1, N2);

2183

2164

2184

2165

// now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1

2185

2166

2186

int c2 = LowLevel::Subtract(R0, L+N2, L, N2);

2187

c2 += LowLevel::Subtract(R0, R0, T0, N2);

2188

int t = (Compare(R0, T2, N2) == -1);

2167

word c2 = P::Subtract(R0, L+N2, L, N2);

2168

c2 += P::Subtract(R0, R0, T0, N2);

2169

word t = (Compare(R0, T2, N2) == -1);

2189

2170

2190

2171

carry += t;

2191

2172

carry += Increment(R0, N2, c2+t);

2192

carry += LowLevel::Add(R0, R0, T1, N2);

2193

carry += LowLevel::Add(R0, R0, T3, N2);

2173

carry += P::Add(R0, R0, T1, N2);

2174

carry += P::Add(R0, R0, T3, N2);

2194

2175

assert (carry >= 0 && carry <= 2);

2195

2176

2196

2177

CopyWords(R1, T3, N2);

2198

2179

}

2199

2180

}

2200

2181

2201

inline int Add(word *C, const word *A, const word *B, size_t N)

2182

inline word Add(word *C, const word *A, const word *B, unsigned int N)

2202

2183

{

2203

2184

return LowLevel::Add(C, A, B, N);

2204

2185

}

2205

2186

2206

inline int Subtract(word *C, const word *A, const word *B, size_t N)

2187

inline word Subtract(word *C, const word *A, const word *B, unsigned int N)

2207

2188

{

2208

2189

return LowLevel::Subtract(C, A, B, N);

2209

2190

}

2210

2191

2211

inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N)

2212

{

2213

RecursiveMultiply(R, T, A, B, N);

2214

}

2215

2216

inline void Square(word *R, word *T, const word *A, size_t N)

2217

{

2218

RecursiveSquare(R, T, A, N);

2219

}

2220

2221

inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)

2222

{

2223

RecursiveMultiplyBottom(R, T, A, B, N);

2224

}

2225

2226

inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)

2227

{

2228

RecursiveMultiplyTop(R, T, L, A, B, N);

2229

}

2230

2231

static word LinearMultiply(word *C, const word *A, word B, size_t N)

2192

inline void Multiply(word *R, word *T, const word *A, const word *B, unsigned int N)

2193

{

2194

#ifdef SSE2_INTRINSICS_AVAILABLE

2195

if (HasSSE2())

2196

RecursiveMultiply<P4Optimized>(R, T, A, B, N);

2197

else

2198

#endif

2199

RecursiveMultiply<LowLevel>(R, T, A, B, N);

2200

}

2201

2202

inline void Square(word *R, word *T, const word *A, unsigned int N)

2203

{

2204

#ifdef SSE2_INTRINSICS_AVAILABLE

2205

if (HasSSE2())

2206

RecursiveSquare<P4Optimized>(R, T, A, N);

2207

else

2208

#endif

2209

RecursiveSquare<LowLevel>(R, T, A, N);

2210

}

2211

2212

inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)

2213

{

2214

#ifdef SSE2_INTRINSICS_AVAILABLE

2215

if (HasSSE2())

2216

RecursiveMultiplyBottom<P4Optimized>(R, T, A, B, N);

2217

else

2218

#endif

2219

RecursiveMultiplyBottom<LowLevel>(R, T, A, B, N);

2220

}

2221

2222

inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)

2223

{

2224

#ifdef SSE2_INTRINSICS_AVAILABLE

2225

if (HasSSE2())

2226

RecursiveMultiplyTop<P4Optimized>(R, T, L, A, B, N);

2227

else

2228

#endif

2229

RecursiveMultiplyTop<LowLevel>(R, T, L, A, B, N);

2230

}

2231

2232

static word LinearMultiply(word *C, const word *A, word B, unsigned int N)

2232

2233

{

2233

2234

word carry=0;

2234

2235

for(unsigned i=0; i<N; i++)

2245

2246

// A[NA] ---- multiplier

2246

2247

// B[NB] ---- multiplicant

2247

2248

void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word *B, size_t NB)

2249

void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)

2249

2250

{

2250

2251

if (NA == NB)

2251

2252

{

2287

2288

Multiply(R, T, A, B, NA);

2288

2289

CopyWords(T+2*NA, R+NA, NA);

2289

2290

size_t i;

2291

unsigned i;

2291

2292

2293

for (i=2*NA; i<NB; i+=2*NA)

2293

2294

Multiply(T+NA+i, T, A, B+i, NA);

2302

2303

// T[3*N/2] - temporary work space

2303

2304

// A[N] ----- an odd number as input

2304

2305

void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N)

2306

void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N)

2306

2307

{

2307

2308

if (N==2)

2308

2309

{

2315

2316

}

2316

2317

else

2317

2318

{

2318

const size_t N2 = N/2;

2319

const unsigned int N2 = N/2;

2319

2320

RecursiveInverseModPower2(R0, T0, A0, N2);

2320

2321

T0[0] = 1;

2321

2322

SetWords(T0+1, 0, N2-1);

2333

2334

// M[N] --- modulus

2334

2335

// U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)

2335

2336

void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, size_t N)

2337

void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, unsigned int N)

2337

2338

{

2338

2339

MultiplyBottom(R, T, X, U, N);

2339

2340

MultiplyTop(T, T+N, X, R, M, N);

2351

2352

// U[N/2] - multiplicative inverse of M mod 2**(WORD_BITS*N/2)

2352

2353

// V[N] --- 2**(WORD_BITS*3*N/2) mod M

2353

2354

void HalfMontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, const word *V, size_t N)

2355

void HalfMontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, const word *V, unsigned int N)

2355

2356

{

2356

2357

assert(N%2==0 && N>=4);

2357

2358

2365

2366

#define X2 (X+N)

2366

2367

#define X3 (X+N+N2)

2367

2368

const size_t N2 = N/2;

2369

const unsigned int N2 = N/2;

2369

2370

Multiply(T0, T2, V0, X3, N2);

2370

2371

int c2 = Add(T0, T0, X0, N);

2371

2372

MultiplyBottom(T3, T2, T0, U, N2);

2499

2500

}

2500

2501

2502

// for use by Divide(), corrects the underestimated quotient {Q1,Q0}

2502

static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, size_t N)

2503

static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, unsigned int N)

2503

2504

{

2504

2505

assert(N && N%2==0);

2505

2506

2536

2537

// A[NA] -------- dividend

2537

2538

// B[NB] -------- divisor

2538

2539

void Divide(word *R, word *Q, word *T, const word *A, size_t NA, const word *B, size_t NB)

2540

void Divide(word *R, word *Q, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)

2540

2541

{

2541

2542

assert(NA && NB && NA%2==0 && NB%2==0);

2542

2543

assert(B[NB-1] || B[NB-2]);

2580

2581

BT[1] = TB[NB-1] + (BT[0]==0);

2581

2582

2583

// start reducing TA mod TB, 2 words at a time

2583

for (size_t i=NA-2; i>=NB; i-=2)

2584

for (unsigned i=NA-2; i>=NB; i-=2)

2584

2585

{

2585

2586

AtomicDivide(Q+i-NB, TA+i-2, BT);

2586

2587

CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);

2591

2592

ShiftWordsRightByBits(R, NB, shiftBits);

2592

2593

}

2593

2594

static inline size_t EvenWordCount(const word *X, size_t N)

2595

static inline unsigned int EvenWordCount(const word *X, unsigned int N)

2595

2596

{

2596

2597

while (N && X[N-2]==0 && X[N-1]==0)

2597

2598

N-=2;

2604

2605

// A[NA] -- number to take inverse of

2605

2606

// M[N] --- modulus

2606

2607

unsigned int AlmostInverse(word *R, word *T, const word *A, size_t NA, const word *M, size_t N)

2608

unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA, const word *M, unsigned int N)

2608

2609

{

2609

2610

assert(NA<=N && N && N%2==0);

2610

2611

2612

2613

word *c = T+N;

2613

2614

word *f = T+2*N;

2614

2615

word *g = T+3*N;

2615

size_t bcLen=2, fgLen=EvenWordCount(M, N);

2616

unsigned int bcLen=2, fgLen=EvenWordCount(M, N);

2616

2617

unsigned int k=0, s=0;

2617

2618

2619

SetWords(T, 0, 3*N);

2690

2691

// A[N] - input

2691

2692

// M[N] - modulus

2692

2693

void DivideByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)

2694

void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M, unsigned int N)

2694

2695

{

2695

2696

CopyWords(R, A, N);

2696

2697

2711

2712

// A[N] - input

2712

2713

// M[N] - modulus

2713

2714

void MultiplyByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)

2715

void MultiplyByPower2Mod(word *R, const word *A, unsigned int k, const word *M, unsigned int N)

2715

2716

{

2716

2717

CopyWords(R, A, N);

2717

2718

2722

2723

2724

// ******************************************************************

2724

2725

InitializeInteger::InitializeInteger()

2726

{

2727

if (!g_pAssignIntToInteger)

2728

{

2729

#ifdef CRYPTOPP_X86ASM_AVAILABLE

2730

SetPentiumFunctionPointers();

2731

#endif

2732

g_pAssignIntToInteger = AssignIntToInteger;

2733

}

2734

}

2735

2736

2726

static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};

2737

2727

2738

static inline size_t RoundupSize(size_t n)

2728

static inline unsigned int RoundupSize(unsigned int n)

2739

2729

{

2740

2730

if (n<=8)

2741

2731

return RoundupSizeTable[n];

2745

2735

return 32;

2746

2736

else if (n<=64)

2747

2737

return 64;

2748

else return size_t(1) << BitPrecision(n-1);

2738

else return 1U << BitPrecision(n-1);

2749

2739

}

2750

2740

2751

2741

Integer::Integer()

2793

2783

if (ByteCount() > sizeof(long))

2794

2784

return false;

2795

2785

2796

unsigned long value = (unsigned long)reg[0];

2797

value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);

2786

unsigned long value = reg[0];

2787

value += SafeLeftShift<WORD_BITS, unsigned long>(reg[1]);

2798

2788

2799

2789

if (sign==POSITIVE)

2800

2790

return (signed long)value >= 0;

2806

2796

{

2807

2797

assert(IsConvertableToLong());

2808

2798

2809

unsigned long value = (unsigned long)reg[0];

2810

value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);

2799

unsigned long value = reg[0];

2800

value += SafeLeftShift<WORD_BITS, unsigned long>(reg[1]);

2811

2801

return sign==POSITIVE ? value : -(signed long)value;

2812

2802

}

2813

2803

2814

Integer::Integer(BufferedTransformation &encodedInteger, size_t byteCount, Signedness s)

2804

Integer::Integer(BufferedTransformation &encodedInteger, unsigned int byteCount, Signedness s)

2815

2805

{

2816

2806

Decode(encodedInteger, byteCount, s);

2817

2807

}

2818

2808

2819

Integer::Integer(const byte *encodedInteger, size_t byteCount, Signedness s)

2809

Integer::Integer(const byte *encodedInteger, unsigned int byteCount, Signedness s)

2820

2810

{

2821

2811

Decode(encodedInteger, byteCount, s);

2822

2812

}

2826

2816

BERDecode(bt);

2827

2817

}

2828

2818

2829

Integer::Integer(RandomNumberGenerator &rng, size_t bitcount)

2819

Integer::Integer(RandomNumberGenerator &rng, unsigned int bitcount)

2830

2820

{

2831

2821

Randomize(rng, bitcount);

2832

2822

}

2837

2827

throw Integer::RandomNumberNotFound();

2838

2828

}

2839

2829

2840

Integer Integer::Power2(size_t e)

2830

Integer Integer::Power2(unsigned int e)

2841

2831

{

2842

2832

Integer r((word)0, BitsToWords(e+1));

2843

2833

r.SetBit(e);

2884

2874

return *this;

2885

2875

}

2886

2876

2887

bool Integer::GetBit(size_t n) const

2877

bool Integer::GetBit(unsigned int n) const

2888

2878

{

2889

2879

if (n/WORD_BITS >= reg.size())

2890

2880

return 0;

2892

2882

return bool((reg[n/WORD_BITS] >> (n % WORD_BITS)) & 1);

2893

2883

}

2894

2884

2895

void Integer::SetBit(size_t n, bool value)

2885

void Integer::SetBit(unsigned int n, bool value)

2896

2886

{

2897

2887

if (value)

2898

2888

{

2906

2896

}

2907

2897

}

2908

2898

2909

byte Integer::GetByte(size_t n) const

2899

byte Integer::GetByte(unsigned int n) const

2910

2900

{

2911

2901

if (n/WORD_SIZE >= reg.size())

2912

2902

return 0;

2914

2904

return byte(reg[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));

2915

2905

}

2916

2906

2917

void Integer::SetByte(size_t n, byte value)

2907

void Integer::SetByte(unsigned int n, byte value)

2918

2908

{

2919

2909

reg.CleanGrow(RoundupSize(BytesToWords(n+1)));

2920

2910

reg[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));

2921

2911

reg[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));

2922

2912

}

2923

2913

2924

lword Integer::GetBits(size_t i, size_t n) const

2914

unsigned long Integer::GetBits(unsigned int i, unsigned int n) const

2925

2915

{

2926

lword v = 0;

2927

assert(n <= sizeof(v)*8);

2916

assert(n <= sizeof(unsigned long)*8);

2917

unsigned long v = 0;

2928

2918

for (unsigned int j=0; j<n; j++)

2929

v |= lword(GetBit(i+j)) << j;

2919

v |= GetBit(i+j) << j;

2930

2920

return v;

2931

2921

}

2932

2922

2950

2940

std::swap(sign, a.sign);

2951

2941

}

2952

2942

2953

Integer::Integer(word value, size_t length)

2943

Integer::Integer(word value, unsigned int length)

2954

2944

: reg(RoundupSize(length)), sign(POSITIVE)

2955

2945

{

2956

2946

reg[0] = value;

2960

2950

template <class T>

2961

2951

static Integer StringToInteger(const T *str)

2962

2952

{

2963

int radix;

2953

word radix;

2964

2954

// GCC workaround

2955

// std::char_traits doesn't exist in GCC 2.x

2965

2956

// std::char_traits<wchar_t>::length() not defined in GCC 3.2 and STLport 4.5.3

2966

2957

unsigned int length;

2967

2958

for (length = 0; str[length] != 0; length++) {}

2994

2985

2995

2986

for (unsigned i=0; i<length; i++)

2996

2987

{

2997

int digit;

2988

word digit;

2998

2989

2999

2990

if (str[i] >= '0' && str[i] <= '9')

3000

2991

digit = str[i] - '0';

3032

3023

3033

3024

unsigned int Integer::WordCount() const

3034

3025

{

3035

return (unsigned int)CountWords(reg, reg.size());

3026

return CountWords(reg, reg.size());

3036

3027

}

3037

3028

3038

3029

unsigned int Integer::ByteCount() const

3053

3044

return 0;

3054

3045

}

3055

3046

3056

void Integer::Decode(const byte *input, size_t inputLen, Signedness s)

3047

void Integer::Decode(const byte *input, unsigned int inputLen, Signedness s)

3057

3048

{

3058

3049

StringStore store(input, inputLen);

3059

3050

Decode(store, inputLen, s);

3060

3051

}

3061

3052

3062

void Integer::Decode(BufferedTransformation &bt, size_t inputLen, Signedness s)

3053

void Integer::Decode(BufferedTransformation &bt, unsigned int inputLen, Signedness s)

3063

3054

{

3064

3055

assert(bt.MaxRetrievable() >= inputLen);

3065

3056

3076

3067

3077

3068

reg.CleanNew(RoundupSize(BytesToWords(inputLen)));

3078

3069

3079

for (size_t i=inputLen; i > 0; i--)

3070

for (unsigned int i=inputLen; i > 0; i--)

3080

3071

{

3081

3072

bt.Get(b);

3082

3073

reg[(i-1)/WORD_SIZE] |= word(b) << ((i-1)%WORD_SIZE)*8;

3084

3075

3085

3076

if (sign == NEGATIVE)

3086

3077

{

3087

for (size_t i=inputLen; i<reg.size()*WORD_SIZE; i++)

3078

for (unsigned i=inputLen; i<reg.size()*WORD_SIZE; i++)

3088

3079

reg[i/WORD_SIZE] |= word(0xff) << (i%WORD_SIZE)*8;

3089

3080

TwosComplement(reg, reg.size());

3090

3081

}

3091

3082

}

3092

3083

3093

size_t Integer::MinEncodedSize(Signedness signedness) const

3084

unsigned int Integer::MinEncodedSize(Signedness signedness) const

3094

3085

{

3095

3086

unsigned int outputLen = STDMAX(1U, ByteCount());

3096

3087

if (signedness == UNSIGNED)

3102

3093

return outputLen;

3103

3094

}

3104

3095

3105

void Integer::Encode(byte *output, size_t outputLen, Signedness signedness) const

3096

unsigned int Integer::Encode(byte *output, unsigned int outputLen, Signedness signedness) const

3106

3097

{

3107

3098

ArraySink sink(output, outputLen);

3108

Encode(sink, outputLen, signedness);

3099

return Encode(sink, outputLen, signedness);

3109

3100

}

3110

3101

3111

void Integer::Encode(BufferedTransformation &bt, size_t outputLen, Signedness signedness) const

3102

unsigned int Integer::Encode(BufferedTransformation &bt, unsigned int outputLen, Signedness signedness) const

3112

3103

{

3113

3104

if (signedness == UNSIGNED || NotNegative())

3114

3105

{

3115

for (size_t i=outputLen; i > 0; i--)

3106

for (unsigned int i=outputLen; i > 0; i--)

3116

3107

bt.Put(GetByte(i-1));

3117

3108

}

3118

3109

else

3119

3110

{

3120

3111

// take two's complement of *this

3121

Integer temp = Integer::Power2(8*STDMAX((size_t)ByteCount(), outputLen)) + *this;

3122

temp.Encode(bt, outputLen, UNSIGNED);

3112

Integer temp = Integer::Power2(8*STDMAX(ByteCount(), outputLen)) + *this;

3113

for (unsigned i=0; i<outputLen; i++)

3114

bt.Put(temp.GetByte(outputLen-i-1));

3123

3115

}

3116

return outputLen;

3124

3117

}

3125

3118

3126

3119

void Integer::DEREncode(BufferedTransformation &bt) const

3130

3123

enc.MessageEnd();

3131

3124

}

3132

3125

3133

void Integer::BERDecode(const byte *input, size_t len)

3126

void Integer::BERDecode(const byte *input, unsigned int len)

3134

3127

{

3135

3128

StringStore store(input, len);

3136

3129

BERDecode(store);

3141

3134

BERGeneralDecoder dec(bt, INTEGER);

3142

3135

if (!dec.IsDefiniteLength() || dec.MaxRetrievable() < dec.RemainingLength())

3143

3136

BERDecodeError();

3144

Decode(dec, (size_t)dec.RemainingLength(), SIGNED);

3137

Decode(dec, dec.RemainingLength(), SIGNED);

3145

3138

dec.MessageEnd();

3146

3139

}

3147

3140

3148

void Integer::DEREncodeAsOctetString(BufferedTransformation &bt, size_t length) const

3141

void Integer::DEREncodeAsOctetString(BufferedTransformation &bt, unsigned int length) const

3149

3142

{

3150

3143

DERGeneralEncoder enc(bt, OCTET_STRING);

3151

3144

Encode(enc, length);

3152

3145

enc.MessageEnd();

3153

3146

}

3154

3147

3155

void Integer::BERDecodeAsOctetString(BufferedTransformation &bt, size_t length)

3148

void Integer::BERDecodeAsOctetString(BufferedTransformation &bt, unsigned int length)

3156

3149

{

3157

3150

BERGeneralDecoder dec(bt, OCTET_STRING);

3158

3151

if (!dec.IsDefiniteLength() || dec.RemainingLength() != length)

3161

3154

dec.MessageEnd();

3162

3155

}

3163

3156

3164

size_t Integer::OpenPGPEncode(byte *output, size_t len) const

3157

unsigned int Integer::OpenPGPEncode(byte *output, unsigned int len) const

3165

3158

{

3166

3159

ArraySink sink(output, len);

3167

3160

return OpenPGPEncode(sink);

3168

3161

}

3169

3162

3170

size_t Integer::OpenPGPEncode(BufferedTransformation &bt) const

3163

unsigned int Integer::OpenPGPEncode(BufferedTransformation &bt) const

3171

3164

{

3172

3165

word16 bitCount = BitCount();

3173

3166

bt.PutWord16(bitCount);

3174

size_t byteCount = BitsToBytes(bitCount);

3175

Encode(bt, byteCount);

3176

return 2 + byteCount;

3167

return 2 + Encode(bt, BitsToBytes(bitCount));

3177

3168

}

3178

3169

3179

void Integer::OpenPGPDecode(const byte *input, size_t len)

3170

void Integer::OpenPGPDecode(const byte *input, unsigned int len)

3180

3171

{

3181

3172

StringStore store(input, len);

3182

3173

OpenPGPDecode(store);

3190

3181

Decode(bt, BitsToBytes(bitCount));

3191

3182

}

3192

3183

3193

void Integer::Randomize(RandomNumberGenerator &rng, size_t nbits)

3184

void Integer::Randomize(RandomNumberGenerator &rng, unsigned int nbits)

3194

3185

{

3195

const size_t nbytes = nbits/8 + 1;

3186

const unsigned int nbytes = nbits/8 + 1;

3196

3187

SecByteBlock buf(nbytes);

3197

3188

rng.GenerateBlock(buf, nbytes);

3198

3189

if (nbytes)

3225

3216

class KDF2_RNG : public RandomNumberGenerator

3226

3217

{

3227

3218

public:

3228

KDF2_RNG(const byte *seed, size_t seedSize)

3219

KDF2_RNG(const byte *seed, unsigned int seedSize)

3229

3220

: m_counter(0), m_counterAndSeed(seedSize + 4)

3230

3221

{

3231

3222

memcpy(m_counterAndSeed + 4, seed, seedSize);

3287

3278

DEREncodeOctetString(seq, seed.begin(), seed.size());

3288

3279

seq.MessageEnd();

3289

3280

3290

SecByteBlock finalSeed((size_t)bq.MaxRetrievable());

3281

SecByteBlock finalSeed(bq.MaxRetrievable());

3291

3282

bq.Get(finalSeed, finalSeed.size());

3292

3283

kdf2Rng.reset(new KDF2_RNG(finalSeed.begin(), finalSeed.size()));

3293

3284

}

3463

3454

3464

3455

void PositiveAdd(Integer &sum, const Integer &a, const Integer& b)

3465

3456

{

3466

int carry;

3457

word carry;

3467

3458

if (a.reg.size() == b.reg.size())

3468

3459

carry = Add(sum.reg, a.reg, b.reg, a.reg.size());

3469

3460

else if (a.reg.size() > b.reg.size())

3525

3516

}

3526

3517

}

3527

3518

3528

// MSVC .NET 2003 workaround

3529

template <class T> inline const T& STDMAX2(const T& a, const T& b)

3530

{

3531

return a < b ? b : a;

3532

}

3533

3534

3519

Integer Integer::Plus(const Integer& b) const

3535

3520

{

3536

Integer sum((word)0, STDMAX2(reg.size(), b.reg.size()));

3521

Integer sum((word)0, STDMAX(reg.size(), b.reg.size()));

3537

3522

if (NotNegative())

3538

3523

{

3539

3524

if (b.NotNegative())

3579

3564

3580

3565

Integer Integer::Minus(const Integer& b) const

3581

3566

{

3582

Integer diff((word)0, STDMAX2(reg.size(), b.reg.size()));

3567

Integer diff((word)0, STDMAX(reg.size(), b.reg.size()));

3583

3568

if (NotNegative())

3584

3569

{

3585

3570

if (b.NotNegative())

3623

3608

return *this;

3624

3609

}

3625

3610

3626

Integer& Integer::operator<<=(size_t n)

3611

Integer& Integer::operator<<=(unsigned int n)

3627

3612

{

3628

const size_t wordCount = WordCount();

3629

const size_t shiftWords = n / WORD_BITS;

3630

const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);

3613

const unsigned int wordCount = WordCount();

3614

const unsigned int shiftWords = n / WORD_BITS;

3615

const unsigned int shiftBits = n % WORD_BITS;

3631

3616

3632

3617

reg.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));

3633

3618

ShiftWordsLeftByWords(reg, wordCount + shiftWords, shiftWords);

3635

3620

return *this;

3636

3621

}

3637

3622

3638

Integer& Integer::operator>>=(size_t n)

3623

Integer& Integer::operator>>=(unsigned int n)

3639

3624

{

3640

const size_t wordCount = WordCount();

3641

const size_t shiftWords = n / WORD_BITS;

3642

const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);

3625

const unsigned int wordCount = WordCount();

3626

const unsigned int shiftWords = n / WORD_BITS;

3627

const unsigned int shiftBits = n % WORD_BITS;

3643

3628

3644

3629

ShiftWordsRightByWords(reg, wordCount, shiftWords);

3645

3630

if (wordCount > shiftWords)

3651

3636

3652

3637

void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)

3653

3638

{

3654

size_t aSize = RoundupSize(a.WordCount());

3655

size_t bSize = RoundupSize(b.WordCount());

3639

unsigned aSize = RoundupSize(a.WordCount());

3640

unsigned bSize = RoundupSize(b.WordCount());

3656

3641

3657

3642

product.reg.CleanNew(RoundupSize(aSize+bSize));

3658

3643

product.sign = Integer::POSITIVE;

3750

3735

q = a;

3751

3736

q >>= n;

3752

3737

3753

const size_t wordCount = BitsToWords(n);

3738

const unsigned int wordCount = BitsToWords(n);

3754

3739

if (wordCount <= a.WordCount())

3755

3740

{

3756

3741

r.reg.resize(RoundupSize(wordCount));

3757

3742

CopyWords(r.reg, a.reg, wordCount);

3758

3743

SetWords(r.reg+wordCount, 0, r.reg.size()-wordCount);

3759

3744

if (n % WORD_BITS != 0)

3760

r.reg[wordCount-1] %= (word(1) << (n % WORD_BITS));

3745

r.reg[wordCount-1] %= (1 << (n % WORD_BITS));

3761

3746

}

3762

3747

else

3763

3748

{

4009

3994

OID oid(seq);

4010

3995

if (oid != ASN1::prime_field())

4011

3996

BERDecodeError();

4012

m_modulus.BERDecode(seq);

3997

modulus.BERDecode(seq);

4013

3998

seq.MessageEnd();

4014

m_result.reg.resize(m_modulus.reg.size());

3999

result.reg.resize(modulus.reg.size());

4015

4000

}

4016

4001

4017

4002

void ModularArithmetic::DEREncode(BufferedTransformation &bt) const

4018

4003

{

4019

4004

DERSequenceEncoder seq(bt);

4020

4005

ASN1::prime_field().DEREncode(seq);

4021

m_modulus.DEREncode(seq);

4006

modulus.DEREncode(seq);

4022

4007

seq.MessageEnd();

4023

4008

}

4024

4009

4034

4019

4035

4020

const Integer& ModularArithmetic::Half(const Integer &a) const

4036

4021

{

4037

if (a.reg.size()==m_modulus.reg.size())

4022

if (a.reg.size()==modulus.reg.size())

4038

4023

{

4039

CryptoPP::DivideByPower2Mod(m_result.reg.begin(), a.reg, 1, m_modulus.reg, a.reg.size());

4040

return m_result;

4024

CryptoPP::DivideByPower2Mod(result.reg.begin(), a.reg, 1, modulus.reg, a.reg.size());

4025

return result;

4041

4026

}

4042

4027

else

4043

return m_result1 = (a.IsEven() ? (a >> 1) : ((a+m_modulus) >> 1));

4028

return result1 = (a.IsEven() ? (a >> 1) : ((a+modulus) >> 1));

4044

4029

}

4045

4030

4046

4031

const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const

4047

4032

{

4048

if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())

4033

if (a.reg.size()==modulus.reg.size() && b.reg.size()==modulus.reg.size())

4049

4034

{

4050

if (CryptoPP::Add(m_result.reg.begin(), a.reg, b.reg, a.reg.size())

4051

|| Compare(m_result.reg, m_modulus.reg, a.reg.size()) >= 0)

4035

if (CryptoPP::Add(result.reg.begin(), a.reg, b.reg, a.reg.size())

4036

|| Compare(result.reg, modulus.reg, a.reg.size()) >= 0)

4052

4037

{

4053

CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());

4038

CryptoPP::Subtract(result.reg.begin(), result.reg, modulus.reg, a.reg.size());

4054

4039

}

4055

return m_result;

4040

return result;

4056

4041

}

4057

4042

else

4058

4043

{

4059

m_result1 = a+b;

4060

if (m_result1 >= m_modulus)

4061

m_result1 -= m_modulus;

4062

return m_result1;

4044

result1 = a+b;

4045

if (result1 >= modulus)

4046

result1 -= modulus;

4047

return result1;

4063

4048

}

4064

4049

}

4065

4050

4066

4051

Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const

4067

4052

{

4068

if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())

4053

if (a.reg.size()==modulus.reg.size() && b.reg.size()==modulus.reg.size())

4069

4054

{

4070

4055

if (CryptoPP::Add(a.reg, a.reg, b.reg, a.reg.size())

4071

|| Compare(a.reg, m_modulus.reg, a.reg.size()) >= 0)

4056

|| Compare(a.reg, modulus.reg, a.reg.size()) >= 0)

4072

4057

{

4073

CryptoPP::Subtract(a.reg, a.reg, m_modulus.reg, a.reg.size());

4058

CryptoPP::Subtract(a.reg, a.reg, modulus.reg, a.reg.size());

4074

4059

}

4075

4060

}

4076

4061

else

4077

4062

{

4078

4063

a+=b;

4079

if (a>=m_modulus)

4080

a-=m_modulus;

4064

if (a>=modulus)

4065

a-=modulus;

4081

4066

}

4082

4067

4083

4068

return a;

4085

4070

4086

4071

const Integer& ModularArithmetic::Subtract(const Integer &a, const Integer &b) const

4087

4072

{

4088

if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())

4073

if (a.reg.size()==modulus.reg.size() && b.reg.size()==modulus.reg.size())

4089

4074

{

4090

if (CryptoPP::Subtract(m_result.reg.begin(), a.reg, b.reg, a.reg.size()))

4091

CryptoPP::Add(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());

4092

return m_result;

4075

if (CryptoPP::Subtract(result.reg.begin(), a.reg, b.reg, a.reg.size()))

4076

CryptoPP::Add(result.reg.begin(), result.reg, modulus.reg, a.reg.size());

4077

return result;

4093

4078

}

4094

4079

else

4095

4080

{

4096

m_result1 = a-b;

4097

if (m_result1.IsNegative())

4098

m_result1 += m_modulus;

4099

return m_result1;

4081

result1 = a-b;

4082

if (result1.IsNegative())

4083

result1 += modulus;

4084

return result1;

4100

4085

}

4101

4086

}

4102

4087

4103

4088

Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const

4104

4089

{

4105

if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())

4090

if (a.reg.size()==modulus.reg.size() && b.reg.size()==modulus.reg.size())

4106

4091

{

4107

4092

if (CryptoPP::Subtract(a.reg, a.reg, b.reg, a.reg.size()))

4108

CryptoPP::Add(a.reg, a.reg, m_modulus.reg, a.reg.size());

4093

CryptoPP::Add(a.reg, a.reg, modulus.reg, a.reg.size());

4109

4094

}

4110

4095

else

4111

4096

{

4112

4097

a-=b;

4113

4098

if (a.IsNegative())

4114

a+=m_modulus;

4099

a+=modulus;

4115

4100

}

4116

4101

4117

4102

return a;

4122

4107

if (!a)

4123

4108

return a;

4124

4109

4125

CopyWords(m_result.reg.begin(), m_modulus.reg, m_modulus.reg.size());

4126

if (CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, a.reg, a.reg.size()))

4127

Decrement(m_result.reg.begin()+a.reg.size(), m_modulus.reg.size()-a.reg.size());

4110

CopyWords(result.reg.begin(), modulus.reg, modulus.reg.size());

4111

if (CryptoPP::Subtract(result.reg.begin(), result.reg, a.reg, a.reg.size()))

4112

Decrement(result.reg.begin()+a.reg.size(), 1, modulus.reg.size()-a.reg.size());

4128

4113

4129

return m_result;

4114

return result;

4130

4115

}

4131

4116

4132

4117

Integer ModularArithmetic::CascadeExponentiate(const Integer &x, const Integer &e1, const Integer &y, const Integer &e2) const

4133

4118

{

4134

if (m_modulus.IsOdd())

4119

if (modulus.IsOdd())

4135

4120

{

4136

MontgomeryRepresentation dr(m_modulus);

4121

MontgomeryRepresentation dr(modulus);

4137

4122

return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1, dr.ConvertIn(y), e2));

4138

4123

}

4139

4124

else

4142

4127

4143

4128

void ModularArithmetic::SimultaneousExponentiate(Integer *results, const Integer &base, const Integer *exponents, unsigned int exponentsCount) const

4144

4129

{

4145

if (m_modulus.IsOdd())

4130

if (modulus.IsOdd())

4146

4131

{

4147

MontgomeryRepresentation dr(m_modulus);

4132

MontgomeryRepresentation dr(modulus);

4148

4133

dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents, exponentsCount);

4149

4134

for (unsigned int i=0; i<exponentsCount; i++)

4150

4135

results[i] = dr.ConvertOut(results[i]);

4155

4140

4156

4141

MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m) // modulus must be odd

4157

4142

: ModularArithmetic(m),

4158

m_u((word)0, m_modulus.reg.size()),

4159

m_workspace(5*m_modulus.reg.size())

4143

u((word)0, modulus.reg.size()),

4144

workspace(5*modulus.reg.size())

4160

4145

{

4161

if (!m_modulus.IsOdd())

4146

if (!modulus.IsOdd())

4162

4147

throw InvalidArgument("MontgomeryRepresentation: Montgomery representation requires an odd modulus");

4163

4148

4164

RecursiveInverseModPower2(m_u.reg, m_workspace, m_modulus.reg, m_modulus.reg.size());

4149

RecursiveInverseModPower2(u.reg, workspace, modulus.reg, modulus.reg.size());

4165

4150

}

4166

4151

4167

4152

const Integer& MontgomeryRepresentation::Multiply(const Integer &a, const Integer &b) const

4168

4153

{

4169

word *const T = m_workspace.begin();

4170

word *const R = m_result.reg.begin();

4171

const size_t N = m_modulus.reg.size();

4154

word *const T = workspace.begin();

4155

word *const R = result.reg.begin();

4156

const unsigned int N = modulus.reg.size();

4172

4157

assert(a.reg.size()<=N && b.reg.size()<=N);

4173

4158

4174

4159

AsymmetricMultiply(T, T+2*N, a.reg, a.reg.size(), b.reg, b.reg.size());

4175

4160

SetWords(T+a.reg.size()+b.reg.size(), 0, 2*N-a.reg.size()-b.reg.size());

4176

MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);

4177

return m_result;

4161

MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);

4162

return result;

4178

4163

}

4179

4164

4180

4165

const Integer& MontgomeryRepresentation::Square(const Integer &a) const

4181

4166

{

4182

word *const T = m_workspace.begin();

4183

word *const R = m_result.reg.begin();

4184

const size_t N = m_modulus.reg.size();

4167

word *const T = workspace.begin();

4168

word *const R = result.reg.begin();

4169

const unsigned int N = modulus.reg.size();

4185

4170

assert(a.reg.size()<=N);

4186

4171

4187

4172

CryptoPP::Square(T, T+2*N, a.reg, a.reg.size());

4188

4173

SetWords(T+2*a.reg.size(), 0, 2*N-2*a.reg.size());

4189

MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);

4190

return m_result;

4174

MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);

4175

return result;

4191

4176

}

4192

4177

4193

4178

Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const

4194

4179

{

4195

word *const T = m_workspace.begin();

4196

word *const R = m_result.reg.begin();

4197

const size_t N = m_modulus.reg.size();

4180

word *const T = workspace.begin();

4181

word *const R = result.reg.begin();

4182

const unsigned int N = modulus.reg.size();

4198

4183

assert(a.reg.size()<=N);

4199

4184

4200

4185

CopyWords(T, a.reg, a.reg.size());

4201

4186

SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());

4202

MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);

4203

return m_result;

4187

MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);

4188

return result;

4204

4189

}

4205

4190

4206

4191

const Integer& MontgomeryRepresentation::MultiplicativeInverse(const Integer &a) const

4207

4192

{

4208

4193

// return (EuclideanMultiplicativeInverse(a, modulus)<<(2*WORD_BITS*modulus.reg.size()))%modulus;

4209

word *const T = m_workspace.begin();

4210

word *const R = m_result.reg.begin();

4211

const size_t N = m_modulus.reg.size();

4194

word *const T = workspace.begin();

4195

word *const R = result.reg.begin();

4196

const unsigned int N = modulus.reg.size();

4212

4197

assert(a.reg.size()<=N);

4213

4198

4214

4199

CopyWords(T, a.reg, a.reg.size());

4215

4200

SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());

4216

MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);

4217

unsigned k = AlmostInverse(R, T, R, N, m_modulus.reg, N);

4201

MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);

4202

unsigned k = AlmostInverse(R, T, R, N, modulus.reg, N);

4218

4203

4219

4204

// cout << "k=" << k << " N*32=" << 32*N << endl;

4220

4205

4221

4206

if (k>N*WORD_BITS)

4222

DivideByPower2Mod(R, R, k-N*WORD_BITS, m_modulus.reg, N);

4207

DivideByPower2Mod(R, R, k-N*WORD_BITS, modulus.reg, N);

4223

4208

else

4224

MultiplyByPower2Mod(R, R, N*WORD_BITS-k, m_modulus.reg, N);

4209

MultiplyByPower2Mod(R, R, N*WORD_BITS-k, modulus.reg, N);

4225

4210

4226

return m_result;

4211

return result;

4227

4212

}

4228

4213

4229

4214

NAMESPACE_END