~zooko/cryptopp/trunk

1 by weidai
Initial revision
1
// integer.cpp - written and placed in the public domain by Wei Dai
2
// contains public domain code contributed by Alister Lee and Leonard Janke
3
4
#include "pch.h"
75 by weidai
create DLL version, fix GetNextIV() bug in CTR and OFB modes
5
6
#ifndef CRYPTOPP_IMPORTS
7
1 by weidai
Initial revision
8
#include "integer.h"
9
#include "modarith.h"
10
#include "nbtheory.h"
11
#include "asn.h"
12
#include "oids.h"
13
#include "words.h"
14
#include "algparam.h"
15
#include "pubkey.h"		// for P1363_KDF2
16
#include "sha.h"
270 by weidai
MMX/SSE2 optimizations
17
#include "cpu.h"
1 by weidai
Initial revision
18
19
#include <iostream>
20
315 by weidai
fix compile for x64, DLL and VC 6
21
#if _MSC_VER >= 1400
270 by weidai
MMX/SSE2 optimizations
22
	#include <intrin.h>
23
#endif
24
25
#ifdef __DECCXX
26
	#include <c_asm.h>
27
#endif
28
29
#ifdef CRYPTOPP_MSVC6_NO_PP
30
	#pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.")
1 by weidai
Initial revision
31
#endif
32
532 by noloader
Added GCC_DIAGNOSTIC_AWARE to help suppress some warnings on contemporary compilers. The macro was needed to help with managing old compilers, like GCC 4.2.1, present on OpenBSD
33
#if GCC_DIAGNOSTIC_AWARE
34
# pragma GCC diagnostic ignored "-Wunused-value"
35
# pragma GCC diagnostic ignored "-Wunused-variable"
36
#endif
37
315 by weidai
fix compile for x64, DLL and VC 6
38
#define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86)
39
1 by weidai
Initial revision
40
NAMESPACE_BEGIN(CryptoPP)
41
181 by weidai
changes done for FIPS-140 lab code drop
42
bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)
27 by weidai
various changes for 5.1
43
{
44
	if (valueType != typeid(Integer))
45
		return false;
46
	*reinterpret_cast<Integer *>(pInteger) = *reinterpret_cast<const int *>(pInt);
47
	return true;
48
}
49
270 by weidai
MMX/SSE2 optimizations
50
inline static int Compare(const word *A, const word *B, size_t N)
1 by weidai
Initial revision
51
{
52
	while (N--)
53
		if (A[N] > B[N])
54
			return 1;
55
		else if (A[N] < B[N])
56
			return -1;
57
58
	return 0;
59
}
60
270 by weidai
MMX/SSE2 optimizations
61
inline static int Increment(word *A, size_t N, word B=1)
1 by weidai
Initial revision
62
{
63
	assert(N);
64
	word t = A[0];
65
	A[0] = t+B;
66
	if (A[0] >= t)
67
		return 0;
68
	for (unsigned i=1; i<N; i++)
69
		if (++A[i])
70
			return 0;
71
	return 1;
72
}
73
270 by weidai
MMX/SSE2 optimizations
74
inline static int Decrement(word *A, size_t N, word B=1)
1 by weidai
Initial revision
75
{
76
	assert(N);
77
	word t = A[0];
78
	A[0] = t-B;
79
	if (A[0] <= t)
80
		return 0;
81
	for (unsigned i=1; i<N; i++)
82
		if (A[i]--)
83
			return 0;
84
	return 1;
85
}
86
184 by weidai
port to MSVC .NET 2005 beta 2
87
static void TwosComplement(word *A, size_t N)
1 by weidai
Initial revision
88
{
89
	Decrement(A, N);
90
	for (unsigned i=0; i<N; i++)
91
		A[i] = ~A[i];
92
}
93
100 by weidai
fix bugs in 64-bit CPU support
94
static word AtomicInverseModPower2(word A)
95
{
96
	assert(A%2==1);
97
98
	word R=A%8;
99
100
	for (unsigned i=3; i<WORD_BITS; i*=2)
1 by weidai
Initial revision
101
		R = R*(2-R*A);
102
103
	assert(R*A==1);
100 by weidai
fix bugs in 64-bit CPU support
104
	return R;
105
}
106
107
// ********************************************************
108
409 by weidai
fix compile with GCC 4.0.1 on MacOS X 64-bit
109
#if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || (defined(__x86_64__) && defined(CRYPTOPP_WORD128_AVAILABLE))
315 by weidai
fix compile for x64, DLL and VC 6
110
	#define Declare2Words(x)			word x##0, x##1;
111
	#define AssignWord(a, b)			a##0 = b; a##1 = 0;
112
	#define Add2WordsBy1(a, b, c)		a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
113
	#define LowWord(a)					a##0
114
	#define HighWord(a)					a##1
115
	#ifdef _MSC_VER
411 by weidai
port to Sun Studio 12's 64-bit C++ Compiler 5.9 Patch 124864-09 2008/12/16
116
		#define MultiplyWordsLoHi(p0, p1, a, b)		p0 = _umul128(a, b, &p1);
387 by weidai
fix compile with ICC 10
117
		#ifndef __INTEL_COMPILER
118
			#define Double3Words(c, d)		d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2;
119
		#endif
315 by weidai
fix compile for x64, DLL and VC 6
120
	#elif defined(__DECCXX)
411 by weidai
port to Sun Studio 12's 64-bit C++ Compiler 5.9 Patch 124864-09 2008/12/16
121
		#define MultiplyWordsLoHi(p0, p1, a, b)		p0 = a*b; p1 = asm("umulh %a0, %a1, %v0", a, b);
387 by weidai
fix compile with ICC 10
122
	#elif defined(__x86_64__)
477 by weidai
port to Sun Studio 12u1 Sun C++ 5.10 SunOS_i386 128229-02 2009/09/21
123
		#if defined(__SUNPRO_CC) && __SUNPRO_CC < 0x5100
411 by weidai
port to Sun Studio 12's 64-bit C++ Compiler 5.9 Patch 124864-09 2008/12/16
124
			// Sun Studio's gcc-style inline assembly is heavily bugged as of version 5.9 Patch 124864-09 2008/12/16, but this one works
125
			#define MultiplyWordsLoHi(p0, p1, a, b)		asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "r"(b) : "cc");
126
		#else
127
			#define MultiplyWordsLoHi(p0, p1, a, b)		asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
128
			#define MulAcc(c, d, a, b)		asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
129
			#define Double3Words(c, d)		asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
130
			#define Acc2WordsBy1(a, b)		asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc");
131
			#define Acc2WordsBy2(a, b)		asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
132
			#define Acc3WordsBy2(c, d, e)	asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc");
133
		#endif
315 by weidai
fix compile for x64, DLL and VC 6
134
	#endif
411 by weidai
port to Sun Studio 12's 64-bit C++ Compiler 5.9 Patch 124864-09 2008/12/16
135
	#define MultiplyWords(p, a, b)		MultiplyWordsLoHi(p##0, p##1, a, b)
315 by weidai
fix compile for x64, DLL and VC 6
136
	#ifndef Double3Words
137
		#define Double3Words(c, d)		d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2;
138
	#endif
139
	#ifndef Acc2WordsBy2
140
		#define Acc2WordsBy2(a, b)		a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
141
	#endif
142
	#define AddWithCarry(u, a, b)		{word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
143
	#define SubtractWithBorrow(u, a, b)	{word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
144
	#define GetCarry(u)					u##1
145
	#define GetBorrow(u)				u##1
146
#else
270 by weidai
MMX/SSE2 optimizations
147
	#define Declare2Words(x)			dword x;
148
	#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
149
		#define MultiplyWords(p, a, b)		p = __emulu(a, b);
150
	#else
151
		#define MultiplyWords(p, a, b)		p = (dword)a*b;
152
	#endif
153
	#define AssignWord(a, b)			a = b;
154
	#define Add2WordsBy1(a, b, c)		a = b + c;
155
	#define Acc2WordsBy2(a, b)			a += b;
315 by weidai
fix compile for x64, DLL and VC 6
156
	#define LowWord(a)					word(a)
157
	#define HighWord(a)					word(a>>WORD_BITS)
158
	#define Double3Words(c, d)			d = 2*d + (c>>(WORD_BITS-1)); c *= 2;
270 by weidai
MMX/SSE2 optimizations
159
	#define AddWithCarry(u, a, b)		u = dword(a) + b + GetCarry(u);
160
	#define SubtractWithBorrow(u, a, b)	u = dword(a) - b - GetBorrow(u);
161
	#define GetCarry(u)					HighWord(u)
162
	#define GetBorrow(u)				word(u>>(WORD_BITS*2-1))
315 by weidai
fix compile for x64, DLL and VC 6
163
#endif
164
#ifndef MulAcc
165
	#define MulAcc(c, d, a, b)			MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p));
166
#endif
167
#ifndef Acc2WordsBy1
270 by weidai
MMX/SSE2 optimizations
168
	#define Acc2WordsBy1(a, b)			Add2WordsBy1(a, a, b)
315 by weidai
fix compile for x64, DLL and VC 6
169
#endif
170
#ifndef Acc3WordsBy2
171
	#define Acc3WordsBy2(c, d, e)		Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e));
270 by weidai
MMX/SSE2 optimizations
172
#endif
173
100 by weidai
fix bugs in 64-bit CPU support
174
class DWord
175
{
176
public:
177
	DWord() {}
178
179
#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
180
	explicit DWord(word low)
181
	{
182
		m_whole = low;
183
	}
184
#else
185
	explicit DWord(word low)
186
	{
187
		m_halfs.low = low;
188
		m_halfs.high = 0;
189
	}
190
#endif
191
192
	DWord(word low, word high)
193
	{
194
		m_halfs.low = low;
195
		m_halfs.high = high;
196
	}
197
198
	static DWord Multiply(word a, word b)
199
	{
200
		DWord r;
201
		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
202
			r.m_whole = (dword)a * b;
411 by weidai
port to Sun Studio 12's 64-bit C++ Compiler 5.9 Patch 124864-09 2008/12/16
203
		#elif defined(MultiplyWordsLoHi)
204
			MultiplyWordsLoHi(r.m_halfs.low, r.m_halfs.high, a, b);
100 by weidai
fix bugs in 64-bit CPU support
205
		#endif
206
		return r;
207
	}
208
209
	static DWord MultiplyAndAdd(word a, word b, word c)
210
	{
211
		DWord r = Multiply(a, b);
212
		return r += c;
213
	}
214
215
	DWord & operator+=(word a)
216
	{
217
		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
218
			m_whole = m_whole + a;
219
		#else
220
			m_halfs.low += a;
221
			m_halfs.high += (m_halfs.low < a);
222
		#endif
223
		return *this;
224
	}
225
226
	DWord operator+(word a)
227
	{
228
		DWord r;
229
		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
230
			r.m_whole = m_whole + a;
231
		#else
232
			r.m_halfs.low = m_halfs.low + a;
233
			r.m_halfs.high = m_halfs.high + (r.m_halfs.low < a);
234
		#endif
235
		return r;
236
	}
237
238
	DWord operator-(DWord a)
239
	{
240
		DWord r;
241
		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
242
			r.m_whole = m_whole - a.m_whole;
243
		#else
244
			r.m_halfs.low = m_halfs.low - a.m_halfs.low;
245
			r.m_halfs.high = m_halfs.high - a.m_halfs.high - (r.m_halfs.low > m_halfs.low);
246
		#endif
247
		return r;
248
	}
249
250
	DWord operator-(word a)
251
	{
252
		DWord r;
253
		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
254
			r.m_whole = m_whole - a;
255
		#else
256
			r.m_halfs.low = m_halfs.low - a;
257
			r.m_halfs.high = m_halfs.high - (r.m_halfs.low > m_halfs.low);
258
		#endif
259
		return r;
260
	}
261
262
	// returns quotient, which must fit in a word
263
	word operator/(word divisor);
264
265
	word operator%(word a);
266
267
	bool operator!() const
268
	{
269
	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
270
		return !m_whole;
271
	#else
272
		return !m_halfs.high && !m_halfs.low;
273
	#endif
274
	}
275
276
	word GetLowHalf() const {return m_halfs.low;}
277
	word GetHighHalf() const {return m_halfs.high;}
278
	word GetHighHalfAsBorrow() const {return 0-m_halfs.high;}
279
280
private:
281
	union
282
	{
283
	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
284
		dword m_whole;
285
	#endif
286
		struct
287
		{
288
		#ifdef IS_LITTLE_ENDIAN
289
			word low;
290
			word high;
291
		#else
292
			word high;
293
			word low;
294
		#endif
295
		} m_halfs;
296
	};
297
};
298
299
class Word
300
{
301
public:
302
	Word() {}
303
304
	Word(word value)
305
	{
306
		m_whole = value;
307
	}
308
309
	Word(hword low, hword high)
310
	{
311
		m_whole = low | (word(high) << (WORD_BITS/2));
312
	}
313
314
	static Word Multiply(hword a, hword b)
315
	{
316
		Word r;
317
		r.m_whole = (word)a * b;
318
		return r;
319
	}
320
321
	Word operator-(Word a)
322
	{
323
		Word r;
324
		r.m_whole = m_whole - a.m_whole;
325
		return r;
326
	}
327
328
	Word operator-(hword a)
329
	{
330
		Word r;
331
		r.m_whole = m_whole - a;
332
		return r;
333
	}
334
335
	// returns quotient, which must fit in a word
336
	hword operator/(hword divisor)
337
	{
338
		return hword(m_whole / divisor);
339
	}
340
341
	bool operator!() const
342
	{
343
		return !m_whole;
344
	}
345
346
	word GetWhole() const {return m_whole;}
347
	hword GetLowHalf() const {return hword(m_whole);}
348
	hword GetHighHalf() const {return hword(m_whole>>(WORD_BITS/2));}
349
	hword GetHighHalfAsBorrow() const {return 0-hword(m_whole>>(WORD_BITS/2));}
350
	
351
private:
352
	word m_whole;
353
};
354
355
// do a 3 word by 2 word divide, returns quotient and leaves remainder in A
356
template <class S, class D>
357
S DivideThreeWordsByTwo(S *A, S B0, S B1, D *dummy=NULL)
358
{
359
	// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a S
360
	assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
361
362
	// estimate the quotient: do a 2 S by 1 S divide
363
	S Q;
364
	if (S(B1+1) == 0)
365
		Q = A[2];
443 by weidai
fix Integer operator<< output on Windows x64
366
	else if (B1 > 0)
100 by weidai
fix bugs in 64-bit CPU support
367
		Q = D(A[1], A[2]) / S(B1+1);
443 by weidai
fix Integer operator<< output on Windows x64
368
	else
369
		Q = D(A[0], A[1]) / B0;
100 by weidai
fix bugs in 64-bit CPU support
370
371
	// now subtract Q*B from A
372
	D p = D::Multiply(B0, Q);
373
	D u = (D) A[0] - p.GetLowHalf();
374
	A[0] = u.GetLowHalf();
375
	u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - D::Multiply(B1, Q);
376
	A[1] = u.GetLowHalf();
377
	A[2] += u.GetHighHalf();
378
379
	// Q <= actual quotient, so fix it
380
	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
381
	{
382
		u = (D) A[0] - B0;
383
		A[0] = u.GetLowHalf();
384
		u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
385
		A[1] = u.GetLowHalf();
386
		A[2] += u.GetHighHalf();
387
		Q++;
388
		assert(Q);	// shouldn't overflow
389
	}
390
391
	return Q;
392
}
393
394
// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
395
template <class S, class D>
396
inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
397
{
398
	if (!B) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
399
		return D(Ah.GetLowHalf(), Ah.GetHighHalf());
400
	else
401
	{
402
		S Q[2];
403
		T[0] = Al.GetLowHalf();
404
		T[1] = Al.GetHighHalf(); 
405
		T[2] = Ah.GetLowHalf();
406
		T[3] = Ah.GetHighHalf();
407
		Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(), B.GetHighHalf());
408
		Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
409
		return D(Q[0], Q[1]);
410
	}
411
}
412
413
// returns quotient, which must fit in a word
414
inline word DWord::operator/(word a)
415
{
416
	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
417
		return word(m_whole / a);
418
	#else
419
		hword r[4];
420
		return DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a).GetWhole();
421
	#endif
422
}
423
424
inline word DWord::operator%(word a)
425
{
426
	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
427
		return word(m_whole % a);
428
	#else
429
		if (a < (word(1) << (WORD_BITS/2)))
430
		{
431
			hword h = hword(a);
432
			word r = m_halfs.high % h;
433
			r = ((m_halfs.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
434
			return hword((hword(m_halfs.low) + (r << (WORD_BITS/2))) % h);
435
		}
436
		else
437
		{
438
			hword r[4];
439
			DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a);
440
			return Word(r[0], r[1]).GetWhole();
441
		}
442
	#endif
1 by weidai
Initial revision
443
}
444
445
// ********************************************************
446
109 by weidai
enable SSE2 intrinsics on GCC 3.3 or later
447
// use some tricks to share assembly code between MSVC and GCC
270 by weidai
MMX/SSE2 optimizations
448
#if defined(__GNUC__)
113 by weidai
unify GCC and MSVC multiplication code
449
	#define AddPrologue \
351 by weidai
revert to int return value for Add and Sub
450
		int result;	\
113 by weidai
unify GCC and MSVC multiplication code
451
		__asm__ __volatile__ \
109 by weidai
enable SSE2 intrinsics on GCC 3.3 or later
452
		( \
270 by weidai
MMX/SSE2 optimizations
453
			".intel_syntax noprefix;"
113 by weidai
unify GCC and MSVC multiplication code
454
	#define AddEpilogue \
455
			".att_syntax prefix;" \
307 by weidai
fix compile with Intel compiler
456
					: "=a" (result)\
457
					: "d" (C), "a" (A), "D" (B), "c" (N) \
270 by weidai
MMX/SSE2 optimizations
458
					: "%esi", "memory", "cc" \
307 by weidai
fix compile with Intel compiler
459
		);\
460
		return result;
113 by weidai
unify GCC and MSVC multiplication code
461
	#define MulPrologue \
462
		__asm__ __volatile__ \
463
		( \
270 by weidai
MMX/SSE2 optimizations
464
			".intel_syntax noprefix;" \
465
			AS1(	push	ebx) \
466
			AS2(	mov		ebx, edx)
113 by weidai
unify GCC and MSVC multiplication code
467
	#define MulEpilogue \
270 by weidai
MMX/SSE2 optimizations
468
			AS1(	pop		ebx) \
469
			".att_syntax prefix;" \
470
			: \
471
			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \
472
			: "%esi", "memory", "cc" \
473
		);
474
	#define SquPrologue		MulPrologue
475
	#define SquEpilogue	\
476
			AS1(	pop		ebx) \
477
			".att_syntax prefix;" \
478
			: \
479
			: "d" (s_maskLow16), "c" (C), "a" (A) \
480
			: "%esi", "%edi", "memory", "cc" \
481
		);
482
	#define TopPrologue		MulPrologue
483
	#define TopEpilogue	\
484
			AS1(	pop		ebx) \
485
			".att_syntax prefix;" \
486
			: \
487
			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \
488
			: "memory", "cc" \
489
		);
490
#else
491
	#define AddPrologue \
307 by weidai
fix compile with Intel compiler
492
		__asm	push edi \
270 by weidai
MMX/SSE2 optimizations
493
		__asm	push esi \
494
		__asm	mov		eax, [esp+12] \
307 by weidai
fix compile with Intel compiler
495
		__asm	mov		edi, [esp+16]
270 by weidai
MMX/SSE2 optimizations
496
	#define AddEpilogue \
497
		__asm	pop esi \
307 by weidai
fix compile with Intel compiler
498
		__asm	pop edi \
270 by weidai
MMX/SSE2 optimizations
499
		__asm	ret 8
315 by weidai
fix compile for x64, DLL and VC 6
500
#if _MSC_VER < 1300
501
	#define SaveEBX		__asm push ebx
502
	#define RestoreEBX	__asm pop ebx
503
#else
504
	#define SaveEBX
505
	#define RestoreEBX
506
#endif
270 by weidai
MMX/SSE2 optimizations
507
	#define SquPrologue					\
508
		AS2(	mov		eax, A)			\
509
		AS2(	mov		ecx, C)			\
315 by weidai
fix compile for x64, DLL and VC 6
510
		SaveEBX							\
270 by weidai
MMX/SSE2 optimizations
511
		AS2(	lea		ebx, s_maskLow16)
512
	#define MulPrologue					\
513
		AS2(	mov		eax, A)			\
514
		AS2(	mov		edi, B)			\
515
		AS2(	mov		ecx, C)			\
315 by weidai
fix compile for x64, DLL and VC 6
516
		SaveEBX							\
270 by weidai
MMX/SSE2 optimizations
517
		AS2(	lea		ebx, s_maskLow16)
518
	#define TopPrologue					\
519
		AS2(	mov		eax, A)			\
520
		AS2(	mov		edi, B)			\
521
		AS2(	mov		ecx, C)			\
522
		AS2(	mov		esi, L)			\
315 by weidai
fix compile for x64, DLL and VC 6
523
		SaveEBX							\
270 by weidai
MMX/SSE2 optimizations
524
		AS2(	lea		ebx, s_maskLow16)
315 by weidai
fix compile for x64, DLL and VC 6
525
	#define SquEpilogue		RestoreEBX
526
	#define MulEpilogue		RestoreEBX
527
	#define TopEpilogue		RestoreEBX
270 by weidai
MMX/SSE2 optimizations
528
#endif
529
315 by weidai
fix compile for x64, DLL and VC 6
530
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
270 by weidai
MMX/SSE2 optimizations
531
extern "C" {
351 by weidai
revert to int return value for Add and Sub
532
int Baseline_Add(size_t N, word *C, const word *A, const word *B);
533
int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
315 by weidai
fix compile for x64, DLL and VC 6
534
}
432 by weidai
fix compile on OpenBSD 4.4
535
#elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__) && defined(CRYPTOPP_WORD128_AVAILABLE)
351 by weidai
revert to int return value for Add and Sub
536
int Baseline_Add(size_t N, word *C, const word *A, const word *B)
315 by weidai
fix compile for x64, DLL and VC 6
537
{
538
	word result;
539
	__asm__ __volatile__
540
	(
541
	".intel_syntax;"
542
	AS1(	neg		%1)
543
	ASJ(	jz,		1, f)
544
	AS2(	mov		%0,[%3+8*%1])
545
	AS2(	add		%0,[%4+8*%1])
546
	AS2(	mov		[%2+8*%1],%0)
547
	ASL(0)
548
	AS2(	mov		%0,[%3+8*%1+8])
549
	AS2(	adc		%0,[%4+8*%1+8])
550
	AS2(	mov		[%2+8*%1+8],%0)
551
	AS2(	lea		%1,[%1+2])
552
	ASJ(	jrcxz,	1, f)
553
	AS2(	mov		%0,[%3+8*%1])
554
	AS2(	adc		%0,[%4+8*%1])
555
	AS2(	mov		[%2+8*%1],%0)
556
	ASJ(	jmp,	0, b)
557
	ASL(1)
558
	AS2(	mov		%0, 0)
559
	AS2(	adc		%0, %0)
560
	".att_syntax;"
402 by weidai
fixes for GCC 4.3.2 (reports from Chris Morgan and DiegoT)
561
	: "=&r" (result), "+c" (N)
562
	: "r" (C+N), "r" (A+N), "r" (B+N)
315 by weidai
fix compile for x64, DLL and VC 6
563
	: "memory", "cc"
564
	);
351 by weidai
revert to int return value for Add and Sub
565
	return (int)result;
315 by weidai
fix compile for x64, DLL and VC 6
566
}
567
351 by weidai
revert to int return value for Add and Sub
568
int Baseline_Sub(size_t N, word *C, const word *A, const word *B)
315 by weidai
fix compile for x64, DLL and VC 6
569
{
570
	word result;
571
	__asm__ __volatile__
572
	(
573
	".intel_syntax;"
574
	AS1(	neg		%1)
575
	ASJ(	jz,		1, f)
576
	AS2(	mov		%0,[%3+8*%1])
577
	AS2(	sub		%0,[%4+8*%1])
578
	AS2(	mov		[%2+8*%1],%0)
579
	ASL(0)
580
	AS2(	mov		%0,[%3+8*%1+8])
581
	AS2(	sbb		%0,[%4+8*%1+8])
582
	AS2(	mov		[%2+8*%1+8],%0)
583
	AS2(	lea		%1,[%1+2])
584
	ASJ(	jrcxz,	1, f)
585
	AS2(	mov		%0,[%3+8*%1])
586
	AS2(	sbb		%0,[%4+8*%1])
587
	AS2(	mov		[%2+8*%1],%0)
588
	ASJ(	jmp,	0, b)
589
	ASL(1)
590
	AS2(	mov		%0, 0)
591
	AS2(	adc		%0, %0)
592
	".att_syntax;"
402 by weidai
fixes for GCC 4.3.2 (reports from Chris Morgan and DiegoT)
593
	: "=&r" (result), "+c" (N)
594
	: "r" (C+N), "r" (A+N), "r" (B+N)
315 by weidai
fix compile for x64, DLL and VC 6
595
	: "memory", "cc"
596
	);
351 by weidai
revert to int return value for Add and Sub
597
	return (int)result;
315 by weidai
fix compile for x64, DLL and VC 6
598
}
599
#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
351 by weidai
revert to int return value for Add and Sub
600
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
270 by weidai
MMX/SSE2 optimizations
601
{
602
	AddPrologue
603
307 by weidai
fix compile with Intel compiler
604
	// now: eax = A, edi = B, edx = C, ecx = N
270 by weidai
MMX/SSE2 optimizations
605
	AS2(	lea		eax, [eax+4*ecx])
307 by weidai
fix compile with Intel compiler
606
	AS2(	lea		edi, [edi+4*ecx])
270 by weidai
MMX/SSE2 optimizations
607
	AS2(	lea		edx, [edx+4*ecx])
608
609
	AS1(	neg		ecx)				// ecx is negative index
610
	AS2(	test	ecx, 2)				// this clears carry flag
611
	ASJ(	jz,		0, f)
612
	AS2(	sub		ecx, 2)
613
	ASJ(	jmp,	1, f)
614
615
	ASL(0)
616
	ASJ(	jecxz,	2, f)				// loop until ecx overflows and becomes zero
617
	AS2(	mov		esi,[eax+4*ecx])
307 by weidai
fix compile with Intel compiler
618
	AS2(	adc		esi,[edi+4*ecx])
270 by weidai
MMX/SSE2 optimizations
619
	AS2(	mov		[edx+4*ecx],esi)
620
	AS2(	mov		esi,[eax+4*ecx+4])
307 by weidai
fix compile with Intel compiler
621
	AS2(	adc		esi,[edi+4*ecx+4])
270 by weidai
MMX/SSE2 optimizations
622
	AS2(	mov		[edx+4*ecx+4],esi)
623
	ASL(1)
624
	AS2(	mov		esi,[eax+4*ecx+8])
307 by weidai
fix compile with Intel compiler
625
	AS2(	adc		esi,[edi+4*ecx+8])
270 by weidai
MMX/SSE2 optimizations
626
	AS2(	mov		[edx+4*ecx+8],esi)
627
	AS2(	mov		esi,[eax+4*ecx+12])
307 by weidai
fix compile with Intel compiler
628
	AS2(	adc		esi,[edi+4*ecx+12])
270 by weidai
MMX/SSE2 optimizations
629
	AS2(	mov		[edx+4*ecx+12],esi)
630
631
	AS2(	lea		ecx,[ecx+4])		// advance index, avoid inc which causes slowdown on Intel Core 2
632
	ASJ(	jmp,	0, b)
633
634
	ASL(2)
635
	AS2(	mov		eax, 0)
636
	AS1(	setc	al)					// store carry into eax (return result register)
637
638
	AddEpilogue
639
}
640
351 by weidai
revert to int return value for Add and Sub
641
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
270 by weidai
MMX/SSE2 optimizations
642
{
643
	AddPrologue
644
307 by weidai
fix compile with Intel compiler
645
	// now: eax = A, edi = B, edx = C, ecx = N
270 by weidai
MMX/SSE2 optimizations
646
	AS2(	lea		eax, [eax+4*ecx])
307 by weidai
fix compile with Intel compiler
647
	AS2(	lea		edi, [edi+4*ecx])
270 by weidai
MMX/SSE2 optimizations
648
	AS2(	lea		edx, [edx+4*ecx])
649
650
	AS1(	neg		ecx)				// ecx is negative index
651
	AS2(	test	ecx, 2)				// this clears carry flag
652
	ASJ(	jz,		0, f)
653
	AS2(	sub		ecx, 2)
654
	ASJ(	jmp,	1, f)
655
656
	ASL(0)
657
	ASJ(	jecxz,	2, f)				// loop until ecx overflows and becomes zero
658
	AS2(	mov		esi,[eax+4*ecx])
307 by weidai
fix compile with Intel compiler
659
	AS2(	sbb		esi,[edi+4*ecx])
270 by weidai
MMX/SSE2 optimizations
660
	AS2(	mov		[edx+4*ecx],esi)
661
	AS2(	mov		esi,[eax+4*ecx+4])
307 by weidai
fix compile with Intel compiler
662
	AS2(	sbb		esi,[edi+4*ecx+4])
270 by weidai
MMX/SSE2 optimizations
663
	AS2(	mov		[edx+4*ecx+4],esi)
664
	ASL(1)
665
	AS2(	mov		esi,[eax+4*ecx+8])
307 by weidai
fix compile with Intel compiler
666
	AS2(	sbb		esi,[edi+4*ecx+8])
270 by weidai
MMX/SSE2 optimizations
667
	AS2(	mov		[edx+4*ecx+8],esi)
668
	AS2(	mov		esi,[eax+4*ecx+12])
307 by weidai
fix compile with Intel compiler
669
	AS2(	sbb		esi,[edi+4*ecx+12])
270 by weidai
MMX/SSE2 optimizations
670
	AS2(	mov		[edx+4*ecx+12],esi)
671
672
	AS2(	lea		ecx,[ecx+4])		// advance index, avoid inc which causes slowdown on Intel Core 2
673
	ASJ(	jmp,	0, b)
674
675
	ASL(2)
676
	AS2(	mov		eax, 0)
677
	AS1(	setc	al)					// store carry into eax (return result register)
678
679
	AddEpilogue
680
}
681
315 by weidai
fix compile for x64, DLL and VC 6
682
#if CRYPTOPP_INTEGER_SSE2
351 by weidai
revert to int return value for Add and Sub
683
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
270 by weidai
MMX/SSE2 optimizations
684
{
685
	AddPrologue
686
307 by weidai
fix compile with Intel compiler
687
	// now: eax = A, edi = B, edx = C, ecx = N
270 by weidai
MMX/SSE2 optimizations
688
	AS2(	lea		eax, [eax+4*ecx])
307 by weidai
fix compile with Intel compiler
689
	AS2(	lea		edi, [edi+4*ecx])
270 by weidai
MMX/SSE2 optimizations
690
	AS2(	lea		edx, [edx+4*ecx])
691
692
	AS1(	neg		ecx)				// ecx is negative index
693
	AS2(	pxor    mm2, mm2)
694
	ASJ(	jz,		2, f)
695
	AS2(	test	ecx, 2)				// this clears carry flag
696
	ASJ(	jz,		0, f)
697
	AS2(	sub		ecx, 2)
698
	ASJ(	jmp,	1, f)
699
700
	ASL(0)
701
	AS2(	movd     mm0, DWORD PTR [eax+4*ecx])
307 by weidai
fix compile with Intel compiler
702
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx])
270 by weidai
MMX/SSE2 optimizations
703
	AS2(	paddq    mm0, mm1)
704
	AS2(	paddq	 mm2, mm0)
705
	AS2(	movd	 DWORD PTR [edx+4*ecx], mm2)
706
	AS2(	psrlq    mm2, 32)
707
708
	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+4])
307 by weidai
fix compile with Intel compiler
709
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+4])
270 by weidai
MMX/SSE2 optimizations
710
	AS2(	paddq    mm0, mm1)
711
	AS2(	paddq	 mm2, mm0)
712
	AS2(	movd	 DWORD PTR [edx+4*ecx+4], mm2)
713
	AS2(	psrlq    mm2, 32)
714
715
	ASL(1)
716
	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+8])
307 by weidai
fix compile with Intel compiler
717
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+8])
270 by weidai
MMX/SSE2 optimizations
718
	AS2(	paddq    mm0, mm1)
719
	AS2(	paddq	 mm2, mm0)
720
	AS2(	movd	 DWORD PTR [edx+4*ecx+8], mm2)
721
	AS2(	psrlq    mm2, 32)
722
723
	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+12])
307 by weidai
fix compile with Intel compiler
724
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+12])
270 by weidai
MMX/SSE2 optimizations
725
	AS2(	paddq    mm0, mm1)
726
	AS2(	paddq	 mm2, mm0)
727
	AS2(	movd	 DWORD PTR [edx+4*ecx+12], mm2)
728
	AS2(	psrlq    mm2, 32)
729
730
	AS2(	add		ecx, 4)
731
	ASJ(	jnz,	0, b)
732
733
	ASL(2)
734
	AS2(	movd	eax, mm2)
735
	AS1(	emms)
736
737
	AddEpilogue
738
}
351 by weidai
revert to int return value for Add and Sub
739
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
270 by weidai
MMX/SSE2 optimizations
740
{
741
	AddPrologue
742
307 by weidai
fix compile with Intel compiler
743
	// now: eax = A, edi = B, edx = C, ecx = N
270 by weidai
MMX/SSE2 optimizations
744
	AS2(	lea		eax, [eax+4*ecx])
307 by weidai
fix compile with Intel compiler
745
	AS2(	lea		edi, [edi+4*ecx])
270 by weidai
MMX/SSE2 optimizations
746
	AS2(	lea		edx, [edx+4*ecx])
747
748
	AS1(	neg		ecx)				// ecx is negative index
749
	AS2(	pxor    mm2, mm2)
750
	ASJ(	jz,		2, f)
751
	AS2(	test	ecx, 2)				// this clears carry flag
752
	ASJ(	jz,		0, f)
753
	AS2(	sub		ecx, 2)
754
	ASJ(	jmp,	1, f)
755
756
	ASL(0)
757
	AS2(	movd     mm0, DWORD PTR [eax+4*ecx])
307 by weidai
fix compile with Intel compiler
758
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx])
270 by weidai
MMX/SSE2 optimizations
759
	AS2(	psubq    mm0, mm1)
760
	AS2(	psubq	 mm0, mm2)
761
	AS2(	movd	 DWORD PTR [edx+4*ecx], mm0)
762
	AS2(	psrlq    mm0, 63)
763
764
	AS2(	movd     mm2, DWORD PTR [eax+4*ecx+4])
307 by weidai
fix compile with Intel compiler
765
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+4])
270 by weidai
MMX/SSE2 optimizations
766
	AS2(	psubq    mm2, mm1)
767
	AS2(	psubq	 mm2, mm0)
768
	AS2(	movd	 DWORD PTR [edx+4*ecx+4], mm2)
769
	AS2(	psrlq    mm2, 63)
770
771
	ASL(1)
772
	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+8])
307 by weidai
fix compile with Intel compiler
773
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+8])
270 by weidai
MMX/SSE2 optimizations
774
	AS2(	psubq    mm0, mm1)
775
	AS2(	psubq	 mm0, mm2)
776
	AS2(	movd	 DWORD PTR [edx+4*ecx+8], mm0)
777
	AS2(	psrlq    mm0, 63)
778
779
	AS2(	movd     mm2, DWORD PTR [eax+4*ecx+12])
307 by weidai
fix compile with Intel compiler
780
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+12])
270 by weidai
MMX/SSE2 optimizations
781
	AS2(	psubq    mm2, mm1)
782
	AS2(	psubq	 mm2, mm0)
783
	AS2(	movd	 DWORD PTR [edx+4*ecx+12], mm2)
784
	AS2(	psrlq    mm2, 63)
785
786
	AS2(	add		ecx, 4)
787
	ASJ(	jnz,	0, b)
788
789
	ASL(2)
790
	AS2(	movd	eax, mm2)
791
	AS1(	emms)
792
793
	AddEpilogue
794
}
307 by weidai
fix compile with Intel compiler
795
#endif	// #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
270 by weidai
MMX/SSE2 optimizations
796
#else
351 by weidai
revert to int return value for Add and Sub
797
int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
270 by weidai
MMX/SSE2 optimizations
798
{
799
	assert (N%2 == 0);
800
801
	Declare2Words(u);
308 by weidai
fix compile on Sun CC
802
	AssignWord(u, 0);
270 by weidai
MMX/SSE2 optimizations
803
	for (size_t i=0; i<N; i+=2)
804
	{
805
		AddWithCarry(u, A[i], B[i]);
806
		C[i] = LowWord(u);
807
		AddWithCarry(u, A[i+1], B[i+1]);
808
		C[i+1] = LowWord(u);
809
	}
810
	return int(GetCarry(u));
811
}
812
351 by weidai
revert to int return value for Add and Sub
813
int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
270 by weidai
MMX/SSE2 optimizations
814
{
815
	assert (N%2 == 0);
816
817
	Declare2Words(u);
308 by weidai
fix compile on Sun CC
818
	AssignWord(u, 0);
270 by weidai
MMX/SSE2 optimizations
819
	for (size_t i=0; i<N; i+=2)
820
	{
821
		SubtractWithBorrow(u, A[i], B[i]);
822
		C[i] = LowWord(u);
823
		SubtractWithBorrow(u, A[i+1], B[i+1]);
824
		C[i+1] = LowWord(u);
825
	}
826
	return int(GetBorrow(u));
827
}
828
#endif
829
830
static word LinearMultiply(word *C, const word *A, word B, size_t N)
831
{
832
	word carry=0;
833
	for(unsigned i=0; i<N; i++)
834
	{
835
		Declare2Words(p);
836
		MultiplyWords(p, A[i], B);
837
		Acc2WordsBy1(p, carry);
838
		C[i] = LowWord(p);
839
		carry = HighWord(p);
840
	}
841
	return carry;
842
}
843
363 by weidai
fix possible branch prediction analysis (BPA) vulnerability
844
#ifndef CRYPTOPP_DOXYGEN_PROCESSING
845
270 by weidai
MMX/SSE2 optimizations
846
#define Mul_2 \
847
	Mul_Begin(2) \
848
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
315 by weidai
fix compile for x64, DLL and VC 6
849
	Mul_End(1, 1)
270 by weidai
MMX/SSE2 optimizations
850
851
#define Mul_4 \
852
	Mul_Begin(4) \
853
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
854
	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
855
	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
856
	Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
857
	Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
315 by weidai
fix compile for x64, DLL and VC 6
858
	Mul_End(5, 3)
270 by weidai
MMX/SSE2 optimizations
859
860
#define Mul_8 \
861
	Mul_Begin(8) \
862
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
863
	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
864
	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
865
	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
866
	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
867
	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
868
	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
869
	Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
870
	Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
871
	Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
872
	Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
873
	Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
874
	Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
315 by weidai
fix compile for x64, DLL and VC 6
875
	Mul_End(13, 7)
270 by weidai
MMX/SSE2 optimizations
876
877
#define Mul_16 \
878
	Mul_Begin(16) \
879
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
880
	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
881
	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
882
	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
883
	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
884
	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
885
	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
886
	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
887
	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
888
	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
889
	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
890
	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
891
	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
892
	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
893
	Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
894
	Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
895
	Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
896
	Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
897
	Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
898
	Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
899
	Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
900
	Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
901
	Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
902
	Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
903
	Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
904
	Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
905
	Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
906
	Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
907
	Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
315 by weidai
fix compile for x64, DLL and VC 6
908
	Mul_End(29, 15)
270 by weidai
MMX/SSE2 optimizations
909
910
#define Squ_2 \
911
	Squ_Begin(2) \
912
	Squ_End(2)
913
914
#define Squ_4 \
915
	Squ_Begin(4) \
916
	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
917
	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
918
	Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \
919
	Squ_SaveAcc(4, 2, 3) Squ_NonDiag \
920
	Squ_End(4)
921
922
#define Squ_8 \
923
	Squ_Begin(8) \
924
	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
925
	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
926
	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
927
	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
928
	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
929
	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
930
	Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
931
	Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5)  Squ_NonDiag \
932
	Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
933
	Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \
934
	Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \
935
	Squ_SaveAcc(12, 6, 7) Squ_NonDiag \
936
	Squ_End(8)
937
938
#define Squ_16 \
939
	Squ_Begin(16) \
940
	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
941
	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
942
	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
943
	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
944
	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
945
	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
946
	Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
947
	Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
948
	Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
949
	Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \
950
	Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \
951
	Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \
952
	Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \
953
	Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \
954
	Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \
955
	Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \
956
	Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \
957
	Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \
958
	Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \
959
	Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \
960
	Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \
961
	Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \
962
	Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \
963
	Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \
964
	Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \
965
	Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \
966
	Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \
967
	Squ_SaveAcc(28, 14, 15) Squ_NonDiag \
968
	Squ_End(16)
969
970
#define Bot_2 \
971
	Mul_Begin(2) \
972
	Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \
973
	Bot_End(2)
974
975
#define Bot_4 \
976
	Mul_Begin(4) \
977
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
978
	Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2)  \
979
	Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0)  \
980
	Bot_End(4)
981
982
#define Bot_8 \
983
	Mul_Begin(8) \
984
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
985
	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
986
	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
987
	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
988
	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
989
	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
990
	Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \
991
	Bot_End(8)
992
993
#define Bot_16 \
994
	Mul_Begin(16) \
995
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
996
	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
997
	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
998
	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
999
	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
1000
	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
1001
	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1002
	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
1003
	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
1004
	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
1005
	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
1006
	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
1007
	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
1008
	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
1009
	Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
1010
	Bot_End(16)
363 by weidai
fix possible branch prediction analysis (BPA) vulnerability
1011
	
1012
#endif
270 by weidai
MMX/SSE2 optimizations
1013
315 by weidai
fix compile for x64, DLL and VC 6
1014
#if 0
270 by weidai
MMX/SSE2 optimizations
1015
#define Mul_Begin(n)				\
1016
	Declare2Words(p)				\
1017
	Declare2Words(c)				\
1018
	Declare2Words(d)				\
1019
	MultiplyWords(p, A[0], B[0])	\
1020
	AssignWord(c, LowWord(p))		\
1021
	AssignWord(d, HighWord(p))
1022
1023
#define Mul_Acc(i, j)				\
1024
	MultiplyWords(p, A[i], B[j])	\
1025
	Acc2WordsBy1(c, LowWord(p))		\
1026
	Acc2WordsBy1(d, HighWord(p))
1027
1028
#define Mul_SaveAcc(k, i, j) 		\
1029
	R[k] = LowWord(c);				\
1030
	Add2WordsBy1(c, d, HighWord(c))	\
1031
	MultiplyWords(p, A[i], B[j])	\
1032
	AssignWord(d, HighWord(p))		\
1033
	Acc2WordsBy1(c, LowWord(p))
1034
1035
#define Mul_End(n)					\
1036
	R[2*n-3] = LowWord(c);			\
1037
	Acc2WordsBy1(d, HighWord(c))	\
1038
	MultiplyWords(p, A[n-1], B[n-1])\
1039
	Acc2WordsBy2(d, p)				\
1040
	R[2*n-2] = LowWord(d);			\
1041
	R[2*n-1] = HighWord(d);
1042
1043
#define Bot_SaveAcc(k, i, j)		\
1044
	R[k] = LowWord(c);				\
1045
	word e = LowWord(d) + HighWord(c);	\
1046
	e += A[i] * B[j];
1047
1048
#define Bot_Acc(i, j)	\
1049
	e += A[i] * B[j];
1050
1051
#define Bot_End(n)		\
1052
	R[n-1] = e;
315 by weidai
fix compile for x64, DLL and VC 6
1053
#else
270 by weidai
MMX/SSE2 optimizations
1054
#define Mul_Begin(n)				\
1055
	Declare2Words(p)				\
1056
	word c;	\
1057
	Declare2Words(d)				\
1058
	MultiplyWords(p, A[0], B[0])	\
1059
	c = LowWord(p);		\
1060
	AssignWord(d, HighWord(p))
1061
1062
#define Mul_Acc(i, j)				\
315 by weidai
fix compile for x64, DLL and VC 6
1063
	MulAcc(c, d, A[i], B[j])
270 by weidai
MMX/SSE2 optimizations
1064
1065
#define Mul_SaveAcc(k, i, j) 		\
1066
	R[k] = c;				\
315 by weidai
fix compile for x64, DLL and VC 6
1067
	c = LowWord(d);	\
270 by weidai
MMX/SSE2 optimizations
1068
	AssignWord(d, HighWord(d))	\
315 by weidai
fix compile for x64, DLL and VC 6
1069
	MulAcc(c, d, A[i], B[j])
270 by weidai
MMX/SSE2 optimizations
1070
315 by weidai
fix compile for x64, DLL and VC 6
1071
#define Mul_End(k, i)					\
1072
	R[k] = c;			\
1073
	MultiplyWords(p, A[i], B[i])	\
1074
	Acc2WordsBy2(p, d)				\
1075
	R[k+1] = LowWord(p);			\
1076
	R[k+2] = HighWord(p);
270 by weidai
MMX/SSE2 optimizations
1077
1078
#define Bot_SaveAcc(k, i, j)		\
1079
	R[k] = c;				\
1080
	c = LowWord(d);	\
1081
	c += A[i] * B[j];
1082
1083
#define Bot_Acc(i, j)	\
1084
	c += A[i] * B[j];
1085
1086
#define Bot_End(n)		\
1087
	R[n-1] = c;
315 by weidai
fix compile for x64, DLL and VC 6
1088
#endif
270 by weidai
MMX/SSE2 optimizations
1089
1090
#define Squ_Begin(n)				\
1091
	Declare2Words(p)				\
315 by weidai
fix compile for x64, DLL and VC 6
1092
	word c;				\
270 by weidai
MMX/SSE2 optimizations
1093
	Declare2Words(d)				\
1094
	Declare2Words(e)				\
1095
	MultiplyWords(p, A[0], A[0])	\
1096
	R[0] = LowWord(p);				\
1097
	AssignWord(e, HighWord(p))		\
1098
	MultiplyWords(p, A[0], A[1])	\
315 by weidai
fix compile for x64, DLL and VC 6
1099
	c = LowWord(p);		\
270 by weidai
MMX/SSE2 optimizations
1100
	AssignWord(d, HighWord(p))		\
1101
	Squ_NonDiag						\
1102
1103
#define Squ_NonDiag				\
315 by weidai
fix compile for x64, DLL and VC 6
1104
	Double3Words(c, d)
270 by weidai
MMX/SSE2 optimizations
1105
1106
#define Squ_SaveAcc(k, i, j) 		\
315 by weidai
fix compile for x64, DLL and VC 6
1107
	Acc3WordsBy2(c, d, e)			\
1108
	R[k] = c;				\
270 by weidai
MMX/SSE2 optimizations
1109
	MultiplyWords(p, A[i], A[j])	\
315 by weidai
fix compile for x64, DLL and VC 6
1110
	c = LowWord(p);		\
270 by weidai
MMX/SSE2 optimizations
1111
	AssignWord(d, HighWord(p))		\
1112
1113
#define Squ_Acc(i, j)				\
315 by weidai
fix compile for x64, DLL and VC 6
1114
	MulAcc(c, d, A[i], A[j])
270 by weidai
MMX/SSE2 optimizations
1115
1116
#define Squ_Diag(i)					\
1117
	Squ_NonDiag						\
315 by weidai
fix compile for x64, DLL and VC 6
1118
	MulAcc(c, d, A[i], A[i])
270 by weidai
MMX/SSE2 optimizations
1119
1120
#define Squ_End(n)					\
315 by weidai
fix compile for x64, DLL and VC 6
1121
	Acc3WordsBy2(c, d, e)			\
1122
	R[2*n-3] = c;			\
270 by weidai
MMX/SSE2 optimizations
1123
	MultiplyWords(p, A[n-1], A[n-1])\
315 by weidai
fix compile for x64, DLL and VC 6
1124
	Acc2WordsBy2(p, e)				\
1125
	R[2*n-2] = LowWord(p);			\
1126
	R[2*n-1] = HighWord(p);
270 by weidai
MMX/SSE2 optimizations
1127
1128
void Baseline_Multiply2(word *R, const word *A, const word *B)
1129
{
1130
	Mul_2
1131
}
1132
1133
void Baseline_Multiply4(word *R, const word *A, const word *B)
1134
{
1135
	Mul_4
1136
}
1137
1138
void Baseline_Multiply8(word *R, const word *A, const word *B)
1139
{
1140
	Mul_8
1141
}
1142
1143
void Baseline_Square2(word *R, const word *A)
1144
{
1145
	Squ_2
1146
}
1147
1148
void Baseline_Square4(word *R, const word *A)
1149
{
1150
	Squ_4
1151
}
1152
1153
void Baseline_Square8(word *R, const word *A)
1154
{
1155
	Squ_8
1156
}
1157
1158
void Baseline_MultiplyBottom2(word *R, const word *A, const word *B)
1159
{
1160
	Bot_2
1161
}
1162
1163
void Baseline_MultiplyBottom4(word *R, const word *A, const word *B)
1164
{
1165
	Bot_4
1166
}
1167
1168
void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)
1169
{
1170
	Bot_8
1171
}
1172
315 by weidai
fix compile for x64, DLL and VC 6
1173
#define Top_Begin(n)				\
1174
	Declare2Words(p)				\
1175
	word c;	\
1176
	Declare2Words(d)				\
1177
	MultiplyWords(p, A[0], B[n-2]);\
1178
	AssignWord(d, HighWord(p));
1179
1180
#define Top_Acc(i, j)	\
1181
	MultiplyWords(p, A[i], B[j]);\
1182
	Acc2WordsBy1(d, HighWord(p));
1183
1184
#define Top_SaveAcc0(i, j) 		\
1185
	c = LowWord(d);	\
1186
	AssignWord(d, HighWord(d))	\
1187
	MulAcc(c, d, A[i], B[j])
1188
1189
#define Top_SaveAcc1(i, j) 		\
1190
	c = L<c; \
1191
	Acc2WordsBy1(d, c);	\
1192
	c = LowWord(d);	\
1193
	AssignWord(d, HighWord(d))	\
1194
	MulAcc(c, d, A[i], B[j])
1195
1196
void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)
1197
{
1198
	word T[4];
1199
	Baseline_Multiply2(T, A, B);
1200
	R[0] = T[2];
1201
	R[1] = T[3];
1202
}
1203
1204
void Baseline_MultiplyTop4(word *R, const word *A, const word *B, word L)
1205
{
1206
	Top_Begin(4)
1207
	Top_Acc(1, 1) Top_Acc(2, 0)  \
1208
	Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
1209
	Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
1210
	Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
1211
	Mul_End(1, 3)
1212
}
1213
1214
void Baseline_MultiplyTop8(word *R, const word *A, const word *B, word L)
1215
{
1216
	Top_Begin(8)
1217
	Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
1218
	Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1219
	Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
1220
	Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
1221
	Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
1222
	Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
1223
	Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
1224
	Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
1225
	Mul_End(5, 7)
1226
}
1227
1228
#if !CRYPTOPP_INTEGER_SSE2	// save memory by not compiling these functions when SSE2 is available
270 by weidai
MMX/SSE2 optimizations
1229
void Baseline_Multiply16(word *R, const word *A, const word *B)
1230
{
1231
	Mul_16
1232
}
1233
1234
void Baseline_Square16(word *R, const word *A)
1235
{
1236
	Squ_16
1237
}
1238
1239
void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
1240
{
1241
	Bot_16
1242
}
315 by weidai
fix compile for x64, DLL and VC 6
1243
1244
void Baseline_MultiplyTop16(word *R, const word *A, const word *B, word L)
1245
{
1246
	Top_Begin(16)
1247
	Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
1248
	Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
1249
	Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
1250
	Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
1251
	Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
1252
	Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
1253
	Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
1254
	Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
1255
	Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
1256
	Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
1257
	Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
1258
	Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
1259
	Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
1260
	Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
1261
	Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
1262
	Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
1263
	Mul_End(13, 15)
1264
}
1265
#endif
270 by weidai
MMX/SSE2 optimizations
1266
1267
// ********************************************************
1268
315 by weidai
fix compile for x64, DLL and VC 6
1269
#if CRYPTOPP_INTEGER_SSE2
270 by weidai
MMX/SSE2 optimizations
1270
1271
CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
1272
1273
#undef Mul_Begin
1274
#undef Mul_Acc
315 by weidai
fix compile for x64, DLL and VC 6
1275
#undef Top_Begin
1276
#undef Top_Acc
270 by weidai
MMX/SSE2 optimizations
1277
#undef Squ_Acc
1278
#undef Squ_NonDiag
1279
#undef Squ_Diag
1280
#undef Squ_SaveAcc
1281
#undef Squ_Begin
1282
#undef Mul_SaveAcc
1283
#undef Bot_Acc
1284
#undef Bot_SaveAcc
1285
#undef Bot_End
1286
#undef Squ_End
1287
#undef Mul_End
1288
1289
#define SSE2_FinalSave(k)			\
1290
	AS2(	psllq		xmm5, 16)	\
1291
	AS2(	paddq		xmm4, xmm5)	\
1292
	AS2(	movq		QWORD PTR [ecx+8*(k)], xmm4)
1293
1294
#define SSE2_SaveShift(k)			\
1295
	AS2(	movq		xmm0, xmm6)	\
1296
	AS2(	punpckhqdq	xmm6, xmm0)	\
1297
	AS2(	movq		xmm1, xmm7)	\
1298
	AS2(	punpckhqdq	xmm7, xmm1)	\
1299
	AS2(	paddd		xmm6, xmm0)	\
1300
	AS2(	pslldq		xmm6, 4)	\
1301
	AS2(	paddd		xmm7, xmm1)	\
1302
	AS2(	paddd		xmm4, xmm6)	\
1303
	AS2(	pslldq		xmm7, 4)	\
1304
	AS2(	movq		xmm6, xmm4)	\
1305
	AS2(	paddd		xmm5, xmm7)	\
1306
	AS2(	movq		xmm7, xmm5)	\
1307
	AS2(	movd		DWORD PTR [ecx+8*(k)], xmm4)	\
1308
	AS2(	psrlq		xmm6, 16)	\
1309
	AS2(	paddq		xmm6, xmm7)	\
1310
	AS2(	punpckhqdq	xmm4, xmm0)	\
1311
	AS2(	punpckhqdq	xmm5, xmm0)	\
1312
	AS2(	movq		QWORD PTR [ecx+8*(k)+2], xmm6)	\
1313
	AS2(	psrlq		xmm6, 3*16)	\
1314
	AS2(	paddd		xmm4, xmm6)	\
1315
1316
#define Squ_SSE2_SaveShift(k)			\
1317
	AS2(	movq		xmm0, xmm6)	\
1318
	AS2(	punpckhqdq	xmm6, xmm0)	\
1319
	AS2(	movq		xmm1, xmm7)	\
1320
	AS2(	punpckhqdq	xmm7, xmm1)	\
1321
	AS2(	paddd		xmm6, xmm0)	\
1322
	AS2(	pslldq		xmm6, 4)	\
1323
	AS2(	paddd		xmm7, xmm1)	\
1324
	AS2(	paddd		xmm4, xmm6)	\
1325
	AS2(	pslldq		xmm7, 4)	\
1326
	AS2(	movhlps		xmm6, xmm4)	\
1327
	AS2(	movd		DWORD PTR [ecx+8*(k)], xmm4)	\
1328
	AS2(	paddd		xmm5, xmm7)	\
1329
	AS2(	movhps		QWORD PTR [esp+12], xmm5)\
1330
	AS2(	psrlq		xmm4, 16)	\
1331
	AS2(	paddq		xmm4, xmm5)	\
1332
	AS2(	movq		QWORD PTR [ecx+8*(k)+2], xmm4)	\
1333
	AS2(	psrlq		xmm4, 3*16)	\
1334
	AS2(	paddd		xmm4, xmm6)	\
1335
	AS2(	movq		QWORD PTR [esp+4], xmm4)\
1336
1337
#define SSE2_FirstMultiply(i)				\
1338
	AS2(	movdqa		xmm7, [esi+(i)*16])\
1339
	AS2(	movdqa		xmm5, [edi-(i)*16])\
1340
	AS2(	pmuludq		xmm5, xmm7)		\
1341
	AS2(	movdqa		xmm4, [ebx])\
1342
	AS2(	movdqa		xmm6, xmm4)		\
1343
	AS2(	pand		xmm4, xmm5)		\
1344
	AS2(	psrld		xmm5, 16)		\
1345
	AS2(	pmuludq		xmm7, [edx-(i)*16])\
1346
	AS2(	pand		xmm6, xmm7)		\
1347
	AS2(	psrld		xmm7, 16)
1348
1349
#define Squ_Begin(n)							\
1350
	SquPrologue									\
1351
	AS2(	mov		esi, esp)\
1352
	AS2(	and		esp, 0xfffffff0)\
1353
	AS2(	lea		edi, [esp-32*n])\
1354
	AS2(	sub		esp, 32*n+16)\
1355
	AS1(	push	esi)\
1356
	AS2(	mov		esi, edi)					\
1357
	AS2(	xor		edx, edx)					\
1358
	ASL(1)										\
1359
	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
1360
	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
1361
	AS2(	movdqa	[edi+2*edx], xmm0)		\
1362
	AS2(	psrlq	xmm0, 32)					\
1363
	AS2(	movdqa	[edi+2*edx+16], xmm0)	\
1364
	AS2(	movdqa	[edi+16*n+2*edx], xmm1)		\
1365
	AS2(	psrlq	xmm1, 32)					\
1366
	AS2(	movdqa	[edi+16*n+2*edx+16], xmm1)	\
1367
	AS2(	add		edx, 16)					\
1368
	AS2(	cmp		edx, 8*(n))					\
1369
	ASJ(	jne,	1, b)						\
1370
	AS2(	lea		edx, [edi+16*n])\
1371
	SSE2_FirstMultiply(0)							\
1372
1373
#define Squ_Acc(i)								\
1374
	ASL(LSqu##i)								\
1375
	AS2(	movdqa		xmm1, [esi+(i)*16])	\
1376
	AS2(	movdqa		xmm0, [edi-(i)*16])	\
1377
	AS2(	movdqa		xmm2, [ebx])	\
1378
	AS2(	pmuludq		xmm0, xmm1)				\
1379
	AS2(	pmuludq		xmm1, [edx-(i)*16])	\
1380
	AS2(	movdqa		xmm3, xmm2)			\
1381
	AS2(	pand		xmm2, xmm0)			\
1382
	AS2(	psrld		xmm0, 16)			\
1383
	AS2(	paddd		xmm4, xmm2)			\
1384
	AS2(	paddd		xmm5, xmm0)			\
1385
	AS2(	pand		xmm3, xmm1)			\
1386
	AS2(	psrld		xmm1, 16)			\
1387
	AS2(	paddd		xmm6, xmm3)			\
1388
	AS2(	paddd		xmm7, xmm1)		\
1389
1390
#define Squ_Acc1(i)		
1391
#define Squ_Acc2(i)		ASC(call, LSqu##i)
1392
#define Squ_Acc3(i)		Squ_Acc2(i)
1393
#define Squ_Acc4(i)		Squ_Acc2(i)
1394
#define Squ_Acc5(i)		Squ_Acc2(i)
1395
#define Squ_Acc6(i)		Squ_Acc2(i)
1396
#define Squ_Acc7(i)		Squ_Acc2(i)
1397
#define Squ_Acc8(i)		Squ_Acc2(i)
1398
1399
#define SSE2_End(E, n)					\
1400
	SSE2_SaveShift(2*(n)-3)			\
1401
	AS2(	movdqa		xmm7, [esi+16])	\
1402
	AS2(	movdqa		xmm0, [edi])	\
1403
	AS2(	pmuludq		xmm0, xmm7)				\
1404
	AS2(	movdqa		xmm2, [ebx])		\
1405
	AS2(	pmuludq		xmm7, [edx])	\
1406
	AS2(	movdqa		xmm6, xmm2)				\
1407
	AS2(	pand		xmm2, xmm0)				\
1408
	AS2(	psrld		xmm0, 16)				\
1409
	AS2(	paddd		xmm4, xmm2)				\
1410
	AS2(	paddd		xmm5, xmm0)				\
1411
	AS2(	pand		xmm6, xmm7)				\
1412
	AS2(	psrld		xmm7, 16)	\
1413
	SSE2_SaveShift(2*(n)-2)			\
1414
	SSE2_FinalSave(2*(n)-1)			\
1415
	AS1(	pop		esp)\
1416
	E
1417
1418
#define Squ_End(n)		SSE2_End(SquEpilogue, n)
1419
#define Mul_End(n)		SSE2_End(MulEpilogue, n)
1420
#define Top_End(n)		SSE2_End(TopEpilogue, n)
1421
1422
#define Squ_Column1(k, i)	\
1423
	Squ_SSE2_SaveShift(k)					\
1424
	AS2(	add			esi, 16)	\
1425
	SSE2_FirstMultiply(1)\
1426
	Squ_Acc##i(i)	\
1427
	AS2(	paddd		xmm4, xmm4)		\
1428
	AS2(	paddd		xmm5, xmm5)		\
1429
	AS2(	movdqa		xmm3, [esi])				\
1430
	AS2(	movq		xmm1, QWORD PTR [esi+8])	\
1431
	AS2(	pmuludq		xmm1, xmm3)		\
1432
	AS2(	pmuludq		xmm3, xmm3)		\
1433
	AS2(	movdqa		xmm0, [ebx])\
1434
	AS2(	movdqa		xmm2, xmm0)		\
1435
	AS2(	pand		xmm0, xmm1)		\
1436
	AS2(	psrld		xmm1, 16)		\
1437
	AS2(	paddd		xmm6, xmm0)		\
1438
	AS2(	paddd		xmm7, xmm1)		\
1439
	AS2(	pand		xmm2, xmm3)		\
1440
	AS2(	psrld		xmm3, 16)		\
1441
	AS2(	paddd		xmm6, xmm6)		\
1442
	AS2(	paddd		xmm7, xmm7)		\
1443
	AS2(	paddd		xmm4, xmm2)		\
1444
	AS2(	paddd		xmm5, xmm3)		\
1445
	AS2(	movq		xmm0, QWORD PTR [esp+4])\
1446
	AS2(	movq		xmm1, QWORD PTR [esp+12])\
1447
	AS2(	paddd		xmm4, xmm0)\
1448
	AS2(	paddd		xmm5, xmm1)\
1449
1450
#define Squ_Column0(k, i)	\
1451
	Squ_SSE2_SaveShift(k)					\
1452
	AS2(	add			edi, 16)	\
1453
	AS2(	add			edx, 16)	\
1454
	SSE2_FirstMultiply(1)\
1455
	Squ_Acc##i(i)	\
1456
	AS2(	paddd		xmm6, xmm6)		\
1457
	AS2(	paddd		xmm7, xmm7)		\
1458
	AS2(	paddd		xmm4, xmm4)		\
1459
	AS2(	paddd		xmm5, xmm5)		\
1460
	AS2(	movq		xmm0, QWORD PTR [esp+4])\
1461
	AS2(	movq		xmm1, QWORD PTR [esp+12])\
1462
	AS2(	paddd		xmm4, xmm0)\
1463
	AS2(	paddd		xmm5, xmm1)\
1464
1465
#define SSE2_MulAdd45						\
1466
	AS2(	movdqa		xmm7, [esi])	\
1467
	AS2(	movdqa		xmm0, [edi])	\
1468
	AS2(	pmuludq		xmm0, xmm7)				\
1469
	AS2(	movdqa		xmm2, [ebx])		\
1470
	AS2(	pmuludq		xmm7, [edx])	\
1471
	AS2(	movdqa		xmm6, xmm2)				\
1472
	AS2(	pand		xmm2, xmm0)				\
1473
	AS2(	psrld		xmm0, 16)				\
1474
	AS2(	paddd		xmm4, xmm2)				\
1475
	AS2(	paddd		xmm5, xmm0)				\
1476
	AS2(	pand		xmm6, xmm7)				\
1477
	AS2(	psrld		xmm7, 16)
1478
1479
#define Mul_Begin(n)							\
1480
	MulPrologue									\
1481
	AS2(	mov		esi, esp)\
1482
	AS2(	and		esp, 0xfffffff0)\
1483
	AS2(	sub		esp, 48*n+16)\
1484
	AS1(	push	esi)\
1485
	AS2(	xor		edx, edx)					\
1486
	ASL(1)										\
1487
	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
1488
	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
1489
	ASS(	pshufd	xmm2, [edi+edx], 3,1,2,0)	\
1490
	AS2(	movdqa	[esp+20+2*edx], xmm0)		\
1491
	AS2(	psrlq	xmm0, 32)					\
1492
	AS2(	movdqa	[esp+20+2*edx+16], xmm0)	\
1493
	AS2(	movdqa	[esp+20+16*n+2*edx], xmm1)		\
1494
	AS2(	psrlq	xmm1, 32)					\
1495
	AS2(	movdqa	[esp+20+16*n+2*edx+16], xmm1)	\
1496
	AS2(	movdqa	[esp+20+32*n+2*edx], xmm2)		\
1497
	AS2(	psrlq	xmm2, 32)					\
1498
	AS2(	movdqa	[esp+20+32*n+2*edx+16], xmm2)	\
1499
	AS2(	add		edx, 16)					\
1500
	AS2(	cmp		edx, 8*(n))					\
1501
	ASJ(	jne,	1, b)						\
1502
	AS2(	lea		edi, [esp+20])\
1503
	AS2(	lea		edx, [esp+20+16*n])\
1504
	AS2(	lea		esi, [esp+20+32*n])\
1505
	SSE2_FirstMultiply(0)							\
1506
1507
#define Mul_Acc(i)								\
1508
	ASL(LMul##i)										\
1509
	AS2(	movdqa		xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])	\
1510
	AS2(	movdqa		xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])	\
1511
	AS2(	movdqa		xmm2, [ebx])	\
1512
	AS2(	pmuludq		xmm0, xmm1)				\
1513
	AS2(	pmuludq		xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])	\
1514
	AS2(	movdqa		xmm3, xmm2)			\
1515
	AS2(	pand		xmm2, xmm0)			\
1516
	AS2(	psrld		xmm0, 16)			\
1517
	AS2(	paddd		xmm4, xmm2)			\
1518
	AS2(	paddd		xmm5, xmm0)			\
1519
	AS2(	pand		xmm3, xmm1)			\
1520
	AS2(	psrld		xmm1, 16)			\
1521
	AS2(	paddd		xmm6, xmm3)			\
1522
	AS2(	paddd		xmm7, xmm1)		\
1523
1524
#define Mul_Acc1(i)		
1525
#define Mul_Acc2(i)		ASC(call, LMul##i)
1526
#define Mul_Acc3(i)		Mul_Acc2(i)
1527
#define Mul_Acc4(i)		Mul_Acc2(i)
1528
#define Mul_Acc5(i)		Mul_Acc2(i)
1529
#define Mul_Acc6(i)		Mul_Acc2(i)
1530
#define Mul_Acc7(i)		Mul_Acc2(i)
1531
#define Mul_Acc8(i)		Mul_Acc2(i)
1532
#define Mul_Acc9(i)		Mul_Acc2(i)
1533
#define Mul_Acc10(i)	Mul_Acc2(i)
1534
#define Mul_Acc11(i)	Mul_Acc2(i)
1535
#define Mul_Acc12(i)	Mul_Acc2(i)
1536
#define Mul_Acc13(i)	Mul_Acc2(i)
1537
#define Mul_Acc14(i)	Mul_Acc2(i)
1538
#define Mul_Acc15(i)	Mul_Acc2(i)
1539
#define Mul_Acc16(i)	Mul_Acc2(i)
1540
1541
#define Mul_Column1(k, i)	\
1542
	SSE2_SaveShift(k)					\
1543
	AS2(	add			esi, 16)	\
1544
	SSE2_MulAdd45\
1545
	Mul_Acc##i(i)	\
1546
1547
#define Mul_Column0(k, i)	\
1548
	SSE2_SaveShift(k)					\
1549
	AS2(	add			edi, 16)	\
1550
	AS2(	add			edx, 16)	\
1551
	SSE2_MulAdd45\
1552
	Mul_Acc##i(i)	\
1553
1554
#define Bot_Acc(i)							\
1555
	AS2(	movdqa		xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])	\
1556
	AS2(	movdqa		xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])	\
1557
	AS2(	pmuludq		xmm0, xmm1)				\
1558
	AS2(	pmuludq		xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])		\
1559
	AS2(	paddq		xmm4, xmm0)				\
1560
	AS2(	paddd		xmm6, xmm1)
1561
1562
#define Bot_SaveAcc(k)					\
1563
	SSE2_SaveShift(k)							\
1564
	AS2(	add			edi, 16)	\
1565
	AS2(	add			edx, 16)	\
1566
	AS2(	movdqa		xmm6, [esi])	\
1567
	AS2(	movdqa		xmm0, [edi])	\
1568
	AS2(	pmuludq		xmm0, xmm6)				\
1569
	AS2(	paddq		xmm4, xmm0)				\
1570
	AS2(	psllq		xmm5, 16)				\
1571
	AS2(	paddq		xmm4, xmm5)				\
1572
	AS2(	pmuludq		xmm6, [edx])
1573
1574
#define Bot_End(n)							\
1575
	AS2(	movhlps		xmm7, xmm6)			\
1576
	AS2(	paddd		xmm6, xmm7)			\
1577
	AS2(	psllq		xmm6, 32)			\
1578
	AS2(	paddd		xmm4, xmm6)			\
1579
	AS2(	movq		QWORD PTR [ecx+8*((n)-1)], xmm4)	\
1580
	AS1(	pop		esp)\
1581
	MulEpilogue
1582
1583
#define Top_Begin(n)							\
1584
	TopPrologue									\
1585
	AS2(	mov		edx, esp)\
1586
	AS2(	and		esp, 0xfffffff0)\
1587
	AS2(	sub		esp, 48*n+16)\
1588
	AS1(	push	edx)\
1589
	AS2(	xor		edx, edx)					\
1590
	ASL(1)										\
1591
	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
1592
	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
1593
	ASS(	pshufd	xmm2, [edi+edx], 3,1,2,0)	\
1594
	AS2(	movdqa	[esp+20+2*edx], xmm0)		\
1595
	AS2(	psrlq	xmm0, 32)					\
1596
	AS2(	movdqa	[esp+20+2*edx+16], xmm0)	\
1597
	AS2(	movdqa	[esp+20+16*n+2*edx], xmm1)		\
1598
	AS2(	psrlq	xmm1, 32)					\
1599
	AS2(	movdqa	[esp+20+16*n+2*edx+16], xmm1)	\
1600
	AS2(	movdqa	[esp+20+32*n+2*edx], xmm2)		\
1601
	AS2(	psrlq	xmm2, 32)					\
1602
	AS2(	movdqa	[esp+20+32*n+2*edx+16], xmm2)	\
1603
	AS2(	add		edx, 16)					\
1604
	AS2(	cmp		edx, 8*(n))					\
1605
	ASJ(	jne,	1, b)						\
1606
	AS2(	mov		eax, esi)					\
1607
	AS2(	lea		edi, [esp+20+00*n+16*(n/2-1)])\
1608
	AS2(	lea		edx, [esp+20+16*n+16*(n/2-1)])\
1609
	AS2(	lea		esi, [esp+20+32*n+16*(n/2-1)])\
1610
	AS2(	pxor	xmm4, xmm4)\
1611
	AS2(	pxor	xmm5, xmm5)
1612
1613
#define Top_Acc(i)							\
1614
	AS2(	movq		xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8])	\
1615
	AS2(	pmuludq		xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16])	\
1616
	AS2(	psrlq		xmm0, 48)				\
1617
	AS2(	paddd		xmm5, xmm0)\
1618
1619
#define Top_Column0(i)	\
1620
	AS2(	psllq		xmm5, 32)				\
1621
	AS2(	add			edi, 16)	\
1622
	AS2(	add			edx, 16)	\
1623
	SSE2_MulAdd45\
1624
	Mul_Acc##i(i)	\
1625
1626
#define Top_Column1(i)	\
1627
	SSE2_SaveShift(0)					\
1628
	AS2(	add			esi, 16)	\
1629
	SSE2_MulAdd45\
1630
	Mul_Acc##i(i)	\
1631
	AS2(	shr			eax, 16)	\
1632
	AS2(	movd		xmm0, eax)\
1633
	AS2(	movd		xmm1, [ecx+4])\
1634
	AS2(	psrld		xmm1, 16)\
1635
	AS2(	pcmpgtd		xmm1, xmm0)\
1636
	AS2(	psrld		xmm1, 31)\
1637
	AS2(	paddd		xmm4, xmm1)\
1638
1639
void SSE2_Square4(word *C, const word *A)
1640
{
1641
	Squ_Begin(2)
1642
	Squ_Column0(0, 1)
1643
	Squ_End(2)
1644
}
1645
1646
void SSE2_Square8(word *C, const word *A)
1647
{
1648
	Squ_Begin(4)
1649
#ifndef __GNUC__
1650
	ASJ(	jmp,	0, f)
1651
	Squ_Acc(2)
1652
	AS1(	ret) ASL(0)
1653
#endif
1654
	Squ_Column0(0, 1)
1655
	Squ_Column1(1, 1)
1656
	Squ_Column0(2, 2)
1657
	Squ_Column1(3, 1)
1658
	Squ_Column0(4, 1)
1659
	Squ_End(4)
1660
}
1661
1662
void SSE2_Square16(word *C, const word *A)
1663
{
1664
	Squ_Begin(8)
1665
#ifndef __GNUC__
1666
	ASJ(	jmp,	0, f)
1667
	Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1668
	AS1(	ret) ASL(0)
1669
#endif
1670
	Squ_Column0(0, 1)
1671
	Squ_Column1(1, 1)
1672
	Squ_Column0(2, 2)
1673
	Squ_Column1(3, 2)
1674
	Squ_Column0(4, 3)
1675
	Squ_Column1(5, 3)
1676
	Squ_Column0(6, 4)
1677
	Squ_Column1(7, 3)
1678
	Squ_Column0(8, 3)
1679
	Squ_Column1(9, 2)
1680
	Squ_Column0(10, 2)
1681
	Squ_Column1(11, 1)
1682
	Squ_Column0(12, 1)
1683
	Squ_End(8)
1684
}
1685
1686
void SSE2_Square32(word *C, const word *A)
1687
{
1688
	Squ_Begin(16)
1689
	ASJ(	jmp,	0, f)
1690
	Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1691
	AS1(	ret) ASL(0)
1692
	Squ_Column0(0, 1)
1693
	Squ_Column1(1, 1)
1694
	Squ_Column0(2, 2)
1695
	Squ_Column1(3, 2)
1696
	Squ_Column0(4, 3)
1697
	Squ_Column1(5, 3)
1698
	Squ_Column0(6, 4)
1699
	Squ_Column1(7, 4)
1700
	Squ_Column0(8, 5)
1701
	Squ_Column1(9, 5)
1702
	Squ_Column0(10, 6)
1703
	Squ_Column1(11, 6)
1704
	Squ_Column0(12, 7)
1705
	Squ_Column1(13, 7)
1706
	Squ_Column0(14, 8)
1707
	Squ_Column1(15, 7)
1708
	Squ_Column0(16, 7)
1709
	Squ_Column1(17, 6)
1710
	Squ_Column0(18, 6)
1711
	Squ_Column1(19, 5)
1712
	Squ_Column0(20, 5)
1713
	Squ_Column1(21, 4)
1714
	Squ_Column0(22, 4)
1715
	Squ_Column1(23, 3)
1716
	Squ_Column0(24, 3)
1717
	Squ_Column1(25, 2)
1718
	Squ_Column0(26, 2)
1719
	Squ_Column1(27, 1)
1720
	Squ_Column0(28, 1)
1721
	Squ_End(16)
1722
}
1723
1724
void SSE2_Multiply4(word *C, const word *A, const word *B)
1725
{
1726
	Mul_Begin(2)
1727
#ifndef __GNUC__
1728
	ASJ(	jmp,	0, f)
1729
	Mul_Acc(2)
1730
	AS1(	ret) ASL(0)
1731
#endif
1732
	Mul_Column0(0, 2)
1733
	Mul_End(2)
1734
}
1735
1736
void SSE2_Multiply8(word *C, const word *A, const word *B)
1737
{
1738
	Mul_Begin(4)
1739
#ifndef __GNUC__
1740
	ASJ(	jmp,	0, f)
1741
	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1742
	AS1(	ret) ASL(0)
1743
#endif
1744
	Mul_Column0(0, 2)
1745
	Mul_Column1(1, 3)
1746
	Mul_Column0(2, 4)
1747
	Mul_Column1(3, 3)
1748
	Mul_Column0(4, 2)
1749
	Mul_End(4)
1750
}
1751
1752
void SSE2_Multiply16(word *C, const word *A, const word *B)
1753
{
1754
	Mul_Begin(8)
1755
#ifndef __GNUC__
1756
	ASJ(	jmp,	0, f)
1757
	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1758
	AS1(	ret) ASL(0)
1759
#endif
1760
	Mul_Column0(0, 2)
1761
	Mul_Column1(1, 3)
1762
	Mul_Column0(2, 4)
1763
	Mul_Column1(3, 5)
1764
	Mul_Column0(4, 6)
1765
	Mul_Column1(5, 7)
1766
	Mul_Column0(6, 8)
1767
	Mul_Column1(7, 7)
1768
	Mul_Column0(8, 6)
1769
	Mul_Column1(9, 5)
1770
	Mul_Column0(10, 4)
1771
	Mul_Column1(11, 3)
1772
	Mul_Column0(12, 2)
1773
	Mul_End(8)
1774
}
1775
1776
void SSE2_Multiply32(word *C, const word *A, const word *B)
1777
{
1778
	Mul_Begin(16)
1779
	ASJ(	jmp,	0, f)
1780
	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1781
	AS1(	ret) ASL(0)
1782
	Mul_Column0(0, 2)
1783
	Mul_Column1(1, 3)
1784
	Mul_Column0(2, 4)
1785
	Mul_Column1(3, 5)
1786
	Mul_Column0(4, 6)
1787
	Mul_Column1(5, 7)
1788
	Mul_Column0(6, 8)
1789
	Mul_Column1(7, 9)
1790
	Mul_Column0(8, 10)
1791
	Mul_Column1(9, 11)
1792
	Mul_Column0(10, 12)
1793
	Mul_Column1(11, 13)
1794
	Mul_Column0(12, 14)
1795
	Mul_Column1(13, 15)
1796
	Mul_Column0(14, 16)
1797
	Mul_Column1(15, 15)
1798
	Mul_Column0(16, 14)
1799
	Mul_Column1(17, 13)
1800
	Mul_Column0(18, 12)
1801
	Mul_Column1(19, 11)
1802
	Mul_Column0(20, 10)
1803
	Mul_Column1(21, 9)
1804
	Mul_Column0(22, 8)
1805
	Mul_Column1(23, 7)
1806
	Mul_Column0(24, 6)
1807
	Mul_Column1(25, 5)
1808
	Mul_Column0(26, 4)
1809
	Mul_Column1(27, 3)
1810
	Mul_Column0(28, 2)
1811
	Mul_End(16)
1812
}
1813
1814
void SSE2_MultiplyBottom4(word *C, const word *A, const word *B)
1815
{
1816
	Mul_Begin(2)
1817
	Bot_SaveAcc(0) Bot_Acc(2)
1818
	Bot_End(2)
1819
}
1820
1821
void SSE2_MultiplyBottom8(word *C, const word *A, const word *B)
1822
{
1823
	Mul_Begin(4)
1824
#ifndef __GNUC__
1825
	ASJ(	jmp,	0, f)
1826
	Mul_Acc(3) Mul_Acc(2)
1827
	AS1(	ret) ASL(0)
1828
#endif
1829
	Mul_Column0(0, 2)
1830
	Mul_Column1(1, 3)
1831
	Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1832
	Bot_End(4)
1833
}
1834
1835
void SSE2_MultiplyBottom16(word *C, const word *A, const word *B)
1836
{
1837
	Mul_Begin(8)
1838
#ifndef __GNUC__
1839
	ASJ(	jmp,	0, f)
1840
	Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1841
	AS1(	ret) ASL(0)
1842
#endif
1843
	Mul_Column0(0, 2)
1844
	Mul_Column1(1, 3)
1845
	Mul_Column0(2, 4)
1846
	Mul_Column1(3, 5)
1847
	Mul_Column0(4, 6)
1848
	Mul_Column1(5, 7)
1849
	Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1850
	Bot_End(8)
1851
}
1852
1853
void SSE2_MultiplyBottom32(word *C, const word *A, const word *B)
1854
{
1855
	Mul_Begin(16)
1856
#ifndef __GNUC__
1857
	ASJ(	jmp,	0, f)
1858
	Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1859
	AS1(	ret) ASL(0)
1860
#endif
1861
	Mul_Column0(0, 2)
1862
	Mul_Column1(1, 3)
1863
	Mul_Column0(2, 4)
1864
	Mul_Column1(3, 5)
1865
	Mul_Column0(4, 6)
1866
	Mul_Column1(5, 7)
1867
	Mul_Column0(6, 8)
1868
	Mul_Column1(7, 9)
1869
	Mul_Column0(8, 10)
1870
	Mul_Column1(9, 11)
1871
	Mul_Column0(10, 12)
1872
	Mul_Column1(11, 13)
1873
	Mul_Column0(12, 14)
1874
	Mul_Column1(13, 15)
1875
	Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1876
	Bot_End(16)
1877
}
1878
1879
void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L)
1880
{
1881
	Top_Begin(4)
1882
	Top_Acc(3) Top_Acc(2) Top_Acc(1)
1883
#ifndef __GNUC__
1884
	ASJ(	jmp,	0, f)
1885
	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1886
	AS1(	ret) ASL(0)
1887
#endif
1888
	Top_Column0(4)
1889
	Top_Column1(3)
1890
	Mul_Column0(0, 2)
1891
	Top_End(2)
1892
}
1893
1894
void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L)
1895
{
1896
	Top_Begin(8)
1897
	Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1898
#ifndef __GNUC__
1899
	ASJ(	jmp,	0, f)
1900
	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1901
	AS1(	ret) ASL(0)
1902
#endif
1903
	Top_Column0(8)
1904
	Top_Column1(7)
1905
	Mul_Column0(0, 6)
1906
	Mul_Column1(1, 5)
1907
	Mul_Column0(2, 4)
1908
	Mul_Column1(3, 3)
1909
	Mul_Column0(4, 2)
1910
	Top_End(4)
1911
}
1912
1913
void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
1914
{
1915
	Top_Begin(16)
1916
	Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1917
#ifndef __GNUC__
1918
	ASJ(	jmp,	0, f)
1919
	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1920
	AS1(	ret) ASL(0)
1921
#endif
1922
	Top_Column0(16)
1923
	Top_Column1(15)
1924
	Mul_Column0(0, 14)
1925
	Mul_Column1(1, 13)
1926
	Mul_Column0(2, 12)
1927
	Mul_Column1(3, 11)
1928
	Mul_Column0(4, 10)
1929
	Mul_Column1(5, 9)
1930
	Mul_Column0(6, 8)
1931
	Mul_Column1(7, 7)
1932
	Mul_Column0(8, 6)
1933
	Mul_Column1(9, 5)
1934
	Mul_Column0(10, 4)
1935
	Mul_Column1(11, 3)
1936
	Mul_Column0(12, 2)
1937
	Top_End(8)
1938
}
1939
315 by weidai
fix compile for x64, DLL and VC 6
1940
#endif	// #if CRYPTOPP_INTEGER_SSE2
270 by weidai
MMX/SSE2 optimizations
1941
1942
// ********************************************************
1943
351 by weidai
revert to int return value for Add and Sub
1944
typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
270 by weidai
MMX/SSE2 optimizations
1945
typedef void (* PMul)(word *C, const word *A, const word *B);
1946
typedef void (* PSqu)(word *C, const word *A);
1947
typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
1948
315 by weidai
fix compile for x64, DLL and VC 6
1949
#if CRYPTOPP_INTEGER_SSE2
270 by weidai
MMX/SSE2 optimizations
1950
static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
1951
static size_t s_recursionLimit = 8;
1952
#else
315 by weidai
fix compile for x64, DLL and VC 6
1953
static const size_t s_recursionLimit = 16;
270 by weidai
MMX/SSE2 optimizations
1954
#endif
1955
1956
static PMul s_pMul[9], s_pBot[9];
1957
static PSqu s_pSqu[9];
315 by weidai
fix compile for x64, DLL and VC 6
1958
static PMulTop s_pTop[9];
270 by weidai
MMX/SSE2 optimizations
1959
1960
static void SetFunctionPointers()
1961
{
1962
	s_pMul[0] = &Baseline_Multiply2;
1963
	s_pBot[0] = &Baseline_MultiplyBottom2;
1964
	s_pSqu[0] = &Baseline_Square2;
315 by weidai
fix compile for x64, DLL and VC 6
1965
	s_pTop[0] = &Baseline_MultiplyTop2;
1966
	s_pTop[1] = &Baseline_MultiplyTop4;
270 by weidai
MMX/SSE2 optimizations
1967
315 by weidai
fix compile for x64, DLL and VC 6
1968
#if CRYPTOPP_INTEGER_SSE2
270 by weidai
MMX/SSE2 optimizations
1969
	if (HasSSE2())
1970
	{
359 by weidai
fix crash in SSE2_Add on P4 when compiled with MSVC 6.0 with Processor Pack
1971
#if _MSC_VER != 1200 || defined(NDEBUG)
270 by weidai
MMX/SSE2 optimizations
1972
		if (IsP4())
1973
		{
1974
			s_pAdd = &SSE2_Add;
1975
			s_pSub = &SSE2_Sub;
1976
		}
359 by weidai
fix crash in SSE2_Add on P4 when compiled with MSVC 6.0 with Processor Pack
1977
#endif
270 by weidai
MMX/SSE2 optimizations
1978
1979
		s_recursionLimit = 32;
1980
1981
		s_pMul[1] = &SSE2_Multiply4;
1982
		s_pMul[2] = &SSE2_Multiply8;
1983
		s_pMul[4] = &SSE2_Multiply16;
1984
		s_pMul[8] = &SSE2_Multiply32;
1985
1986
		s_pBot[1] = &SSE2_MultiplyBottom4;
1987
		s_pBot[2] = &SSE2_MultiplyBottom8;
1988
		s_pBot[4] = &SSE2_MultiplyBottom16;
1989
		s_pBot[8] = &SSE2_MultiplyBottom32;
1990
1991
		s_pSqu[1] = &SSE2_Square4;
1992
		s_pSqu[2] = &SSE2_Square8;
1993
		s_pSqu[4] = &SSE2_Square16;
1994
		s_pSqu[8] = &SSE2_Square32;
1995
315 by weidai
fix compile for x64, DLL and VC 6
1996
		s_pTop[2] = &SSE2_MultiplyTop8;
1997
		s_pTop[4] = &SSE2_MultiplyTop16;
1998
		s_pTop[8] = &SSE2_MultiplyTop32;
270 by weidai
MMX/SSE2 optimizations
1999
	}
2000
	else
2001
#endif
2002
	{
2003
		s_pMul[1] = &Baseline_Multiply4;
2004
		s_pMul[2] = &Baseline_Multiply8;
2005
2006
		s_pBot[1] = &Baseline_MultiplyBottom4;
2007
		s_pBot[2] = &Baseline_MultiplyBottom8;
2008
2009
		s_pSqu[1] = &Baseline_Square4;
2010
		s_pSqu[2] = &Baseline_Square8;
315 by weidai
fix compile for x64, DLL and VC 6
2011
2012
		s_pTop[2] = &Baseline_MultiplyTop8;
2013
2014
#if	!CRYPTOPP_INTEGER_SSE2
2015
		s_pMul[4] = &Baseline_Multiply16;
2016
		s_pBot[4] = &Baseline_MultiplyBottom16;
2017
		s_pSqu[4] = &Baseline_Square16;
2018
		s_pTop[4] = &Baseline_MultiplyTop16;
2019
#endif
270 by weidai
MMX/SSE2 optimizations
2020
	}
2021
}
2022
351 by weidai
revert to int return value for Add and Sub
2023
inline int Add(word *C, const word *A, const word *B, size_t N)
270 by weidai
MMX/SSE2 optimizations
2024
{
315 by weidai
fix compile for x64, DLL and VC 6
2025
#if CRYPTOPP_INTEGER_SSE2
270 by weidai
MMX/SSE2 optimizations
2026
	return s_pAdd(N, C, A, B);
2027
#else
2028
	return Baseline_Add(N, C, A, B);
2029
#endif
2030
}
2031
351 by weidai
revert to int return value for Add and Sub
2032
inline int Subtract(word *C, const word *A, const word *B, size_t N)
270 by weidai
MMX/SSE2 optimizations
2033
{
315 by weidai
fix compile for x64, DLL and VC 6
2034
#if CRYPTOPP_INTEGER_SSE2
270 by weidai
MMX/SSE2 optimizations
2035
	return s_pSub(N, C, A, B);
2036
#else
2037
	return Baseline_Sub(N, C, A, B);
2038
#endif
2039
}
2040
2041
// ********************************************************
2042
1 by weidai
Initial revision
2043
2044
#define A0		A
2045
#define A1		(A+N2)
2046
#define B0		B
2047
#define B1		(B+N2)
2048
2049
#define T0		T
2050
#define T1		(T+N2)
2051
#define T2		(T+N)
2052
#define T3		(T+N+N2)
2053
2054
#define R0		R
2055
#define R1		(R+N2)
2056
#define R2		(R+N)
2057
#define R3		(R+N+N2)
2058
2059
// R[2*N] - result = A*B
2060
// T[2*N] - temporary work space
2061
// A[N] --- multiplier
2062
// B[N] --- multiplicant
2063
184 by weidai
port to MSVC .NET 2005 beta 2
2064
void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)
1 by weidai
Initial revision
2065
{
2066
	assert(N>=2 && N%2==0);
2067
270 by weidai
MMX/SSE2 optimizations
2068
	if (N <= s_recursionLimit)
2069
		s_pMul[N/4](R, A, B);
1 by weidai
Initial revision
2070
	else
2071
	{
184 by weidai
port to MSVC .NET 2005 beta 2
2072
		const size_t N2 = N/2;
270 by weidai
MMX/SSE2 optimizations
2073
2074
		size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;
2075
		Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
2076
2077
		size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;
2078
		Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
2079
2080
		RecursiveMultiply(R2, T2, A1, B1, N2);
2081
		RecursiveMultiply(T0, T2, R0, R1, N2);
113 by weidai
unify GCC and MSVC multiplication code
2082
		RecursiveMultiply(R0, T2, A0, B0, N2);
2083
2084
		// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
2085
270 by weidai
MMX/SSE2 optimizations
2086
		int c2 = Add(R2, R2, R1, N2);
2087
		int c3 = c2;
2088
		c2 += Add(R1, R2, R0, N2);
2089
		c3 += Add(R2, R2, R3, N2);
2090
2091
		if (AN2 == BN2)
2092
			c3 -= Subtract(R1, R1, T0, N);
2093
		else
2094
			c3 += Add(R1, R1, T0, N);
2095
2096
		c3 += Increment(R2, N2, c2);
2097
		assert (c3 >= 0 && c3 <= 2);
2098
		Increment(R3, N2, c3);
1 by weidai
Initial revision
2099
	}
2100
}
2101
2102
// R[2*N] - result = A*A
2103
// T[2*N] - temporary work space
2104
// A[N] --- number to be squared
2105
184 by weidai
port to MSVC .NET 2005 beta 2
2106
void RecursiveSquare(word *R, word *T, const word *A, size_t N)
1 by weidai
Initial revision
2107
{
2108
	assert(N && N%2==0);
270 by weidai
MMX/SSE2 optimizations
2109
2110
	if (N <= s_recursionLimit)
2111
		s_pSqu[N/4](R, A);
1 by weidai
Initial revision
2112
	else
113 by weidai
unify GCC and MSVC multiplication code
2113
	{
184 by weidai
port to MSVC .NET 2005 beta 2
2114
		const size_t N2 = N/2;
113 by weidai
unify GCC and MSVC multiplication code
2115
2116
		RecursiveSquare(R0, T2, A0, N2);
2117
		RecursiveSquare(R2, T2, A1, N2);
2118
		RecursiveMultiply(T0, T2, A0, A1, N2);
2119
270 by weidai
MMX/SSE2 optimizations
2120
		int carry = Add(R1, R1, T0, N);
2121
		carry += Add(R1, R1, T0, N);
113 by weidai
unify GCC and MSVC multiplication code
2122
		Increment(R3, N2, carry);
2123
	}
1 by weidai
Initial revision
2124
}
2125
2126
// R[N] - bottom half of A*B
270 by weidai
MMX/SSE2 optimizations
2127
// T[3*N/2] - temporary work space
1 by weidai
Initial revision
2128
// A[N] - multiplier
2129
// B[N] - multiplicant
2130
184 by weidai
port to MSVC .NET 2005 beta 2
2131
void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
1 by weidai
Initial revision
2132
{
2133
	assert(N>=2 && N%2==0);
270 by weidai
MMX/SSE2 optimizations
2134
2135
	if (N <= s_recursionLimit)
2136
		s_pBot[N/4](R, A, B);
1 by weidai
Initial revision
2137
	else
113 by weidai
unify GCC and MSVC multiplication code
2138
	{
184 by weidai
port to MSVC .NET 2005 beta 2
2139
		const size_t N2 = N/2;
113 by weidai
unify GCC and MSVC multiplication code
2140
2141
		RecursiveMultiply(R, T, A0, B0, N2);
2142
		RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
270 by weidai
MMX/SSE2 optimizations
2143
		Add(R1, R1, T0, N2);
113 by weidai
unify GCC and MSVC multiplication code
2144
		RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
270 by weidai
MMX/SSE2 optimizations
2145
		Add(R1, R1, T0, N2);
113 by weidai
unify GCC and MSVC multiplication code
2146
	}
1 by weidai
Initial revision
2147
}
2148
2149
// R[N] --- upper half of A*B
2150
// T[2*N] - temporary work space
2151
// L[N] --- lower half of A*B
2152
// A[N] --- multiplier
2153
// B[N] --- multiplicant
2154
270 by weidai
MMX/SSE2 optimizations
2155
void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
1 by weidai
Initial revision
2156
{
2157
	assert(N>=2 && N%2==0);
2158
315 by weidai
fix compile for x64, DLL and VC 6
2159
	if (N <= s_recursionLimit)
2160
		s_pTop[N/4](R, A, B, L[N-1]);
1 by weidai
Initial revision
2161
	else
2162
	{
184 by weidai
port to MSVC .NET 2005 beta 2
2163
		const size_t N2 = N/2;
270 by weidai
MMX/SSE2 optimizations
2164
2165
		size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;
2166
		Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
2167
2168
		size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;
2169
		Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
2170
2171
		RecursiveMultiply(T0, T2, R0, R1, N2);
2172
		RecursiveMultiply(R0, T2, A1, B1, N2);
2173
2174
		// now T[01] holds (A1-A0)*(B0-B1) = A1*B0+A0*B1-A1*B1-A0*B0, R[01] holds A1*B1
2175
2176
		int t, c3;
2177
		int c2 = Subtract(T2, L+N2, L, N2);
2178
2179
		if (AN2 == BN2)
2180
		{
2181
			c2 -= Add(T2, T2, T0, N2);
2182
			t = (Compare(T2, R0, N2) == -1);
2183
			c3 = t - Subtract(T2, T2, T1, N2);
2184
		}
2185
		else
2186
		{
2187
			c2 += Subtract(T2, T2, T0, N2);
2188
			t = (Compare(T2, R0, N2) == -1);
2189
			c3 = t + Add(T2, T2, T1, N2);
2190
		}
2191
2192
		c2 += t;
2193
		if (c2 >= 0)
2194
			c3 += Increment(T2, N2, c2);
2195
		else
2196
			c3 -= Decrement(T2, N2, -c2);
2197
		c3 += Add(R0, T2, R1, N2);
2198
2199
		assert (c3 >= 0 && c3 <= 2);
2200
		Increment(R1, N2, c3);
1 by weidai
Initial revision
2201
	}
2202
}
2203
184 by weidai
port to MSVC .NET 2005 beta 2
2204
inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N)
1 by weidai
Initial revision
2205
{
113 by weidai
unify GCC and MSVC multiplication code
2206
	RecursiveMultiply(R, T, A, B, N);
1 by weidai
Initial revision
2207
}
2208
184 by weidai
port to MSVC .NET 2005 beta 2
2209
inline void Square(word *R, word *T, const word *A, size_t N)
1 by weidai
Initial revision
2210
{
113 by weidai
unify GCC and MSVC multiplication code
2211
	RecursiveSquare(R, T, A, N);
1 by weidai
Initial revision
2212
}
2213
184 by weidai
port to MSVC .NET 2005 beta 2
2214
inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
1 by weidai
Initial revision
2215
{
113 by weidai
unify GCC and MSVC multiplication code
2216
	RecursiveMultiplyBottom(R, T, A, B, N);
1 by weidai
Initial revision
2217
}
2218
2219
// R[NA+NB] - result = A*B
2220
// T[NA+NB] - temporary work space
2221
// A[NA] ---- multiplier
2222
// B[NB] ---- multiplicant
2223
184 by weidai
port to MSVC .NET 2005 beta 2
2224
void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word *B, size_t NB)
1 by weidai
Initial revision
2225
{
2226
	if (NA == NB)
2227
	{
2228
		if (A == B)
2229
			Square(R, T, A, NA);
2230
		else
2231
			Multiply(R, T, A, B, NA);
2232
2233
		return;
2234
	}
2235
2236
	if (NA > NB)
2237
	{
2238
		std::swap(A, B);
2239
		std::swap(NA, NB);
2240
	}
2241
2242
	assert(NB % NA == 0);
2243
2244
	if (NA==2 && !A[1])
2245
	{
2246
		switch (A[0])
2247
		{
2248
		case 0:
2249
			SetWords(R, 0, NB+2);
2250
			return;
2251
		case 1:
2252
			CopyWords(R, B, NB);
2253
			R[NB] = R[NB+1] = 0;
2254
			return;
2255
		default:
2256
			R[NB] = LinearMultiply(R, B, A[0], NB);
2257
			R[NB+1] = 0;
2258
			return;
2259
		}
2260
	}
2261
184 by weidai
port to MSVC .NET 2005 beta 2
2262
	size_t i;
270 by weidai
MMX/SSE2 optimizations
2263
	if ((NB/NA)%2 == 0)
2264
	{
2265
		Multiply(R, T, A, B, NA);
2266
		CopyWords(T+2*NA, R+NA, NA);
1 by weidai
Initial revision
2267
270 by weidai
MMX/SSE2 optimizations
2268
		for (i=2*NA; i<NB; i+=2*NA)
2269
			Multiply(T+NA+i, T, A, B+i, NA);
2270
		for (i=NA; i<NB; i+=2*NA)
2271
			Multiply(R+i, T, A, B+i, NA);
2272
	}
2273
	else
2274
	{
2275
		for (i=0; i<NB; i+=2*NA)
2276
			Multiply(R+i, T, A, B+i, NA);
2277
		for (i=NA; i<NB; i+=2*NA)
2278
			Multiply(T+NA+i, T, A, B+i, NA);
2279
	}
1 by weidai
Initial revision
2280
2281
	if (Add(R+NA, R+NA, T+2*NA, NB-NA))
2282
		Increment(R+NB, NA);
2283
}
2284
2285
// R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
2286
// T[3*N/2] - temporary work space
2287
// A[N] ----- an odd number as input
2288
184 by weidai
port to MSVC .NET 2005 beta 2
2289
void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N)
1 by weidai
Initial revision
2290
{
2291
	if (N==2)
100 by weidai
fix bugs in 64-bit CPU support
2292
	{
2293
		T[0] = AtomicInverseModPower2(A[0]);
2294
		T[1] = 0;
270 by weidai
MMX/SSE2 optimizations
2295
		s_pBot[0](T+2, T, A);
100 by weidai
fix bugs in 64-bit CPU support
2296
		TwosComplement(T+2, 2);
2297
		Increment(T+2, 2, 2);
270 by weidai
MMX/SSE2 optimizations
2298
		s_pBot[0](R, T, T+2);
100 by weidai
fix bugs in 64-bit CPU support
2299
	}
1 by weidai
Initial revision
2300
	else
2301
	{
184 by weidai
port to MSVC .NET 2005 beta 2
2302
		const size_t N2 = N/2;
1 by weidai
Initial revision
2303
		RecursiveInverseModPower2(R0, T0, A0, N2);
2304
		T0[0] = 1;
2305
		SetWords(T0+1, 0, N2-1);
2306
		MultiplyTop(R1, T1, T0, R0, A0, N2);
2307
		MultiplyBottom(T0, T1, R0, A1, N2);
2308
		Add(T0, R1, T0, N2);
2309
		TwosComplement(T0, N2);
2310
		MultiplyBottom(R1, T1, R0, T0, N2);
2311
	}
2312
}
2313
2314
// R[N] --- result = X/(2**(WORD_BITS*N)) mod M
2315
// T[3*N] - temporary work space
2316
// X[2*N] - number to be reduced
2317
// M[N] --- modulus
2318
// U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
2319
270 by weidai
MMX/SSE2 optimizations
2320
void MontgomeryReduce(word *R, word *T, word *X, const word *M, const word *U, size_t N)
1 by weidai
Initial revision
2321
{
270 by weidai
MMX/SSE2 optimizations
2322
#if 1
1 by weidai
Initial revision
2323
	MultiplyBottom(R, T, X, U, N);
2324
	MultiplyTop(T, T+N, X, R, M, N);
25 by weidai
increase resistance against timing attacks
2325
	word borrow = Subtract(T, X+N, T, N);
2326
	// defend against timing attack by doing this Add even when not needed
2327
	word carry = Add(T+N, T, M, N);
377 by weidai
remove branch in assert
2328
	assert(carry | !borrow);
363 by weidai
fix possible branch prediction analysis (BPA) vulnerability
2329
	CopyWords(R, T + ((0-borrow) & N), N);
270 by weidai
MMX/SSE2 optimizations
2330
#elif 0
2331
	const word u = 0-U[0];
2332
	Declare2Words(p)
2333
	for (size_t i=0; i<N; i++)
2334
	{
2335
		const word t = u * X[i];
2336
		word c = 0;
2337
		for (size_t j=0; j<N; j+=2)
2338
		{
2339
			MultiplyWords(p, t, M[j]);
2340
			Acc2WordsBy1(p, X[i+j]);
2341
			Acc2WordsBy1(p, c);
2342
			X[i+j] = LowWord(p);
2343
			c = HighWord(p);
2344
			MultiplyWords(p, t, M[j+1]);
2345
			Acc2WordsBy1(p, X[i+j+1]);
2346
			Acc2WordsBy1(p, c);
2347
			X[i+j+1] = LowWord(p);
2348
			c = HighWord(p);
2349
		}
2350
2351
		if (Increment(X+N+i, N-i, c))
2352
			while (!Subtract(X+N, X+N, M, N)) {}
2353
	}
2354
2355
	memcpy(R, X+N, N*WORD_SIZE);
2356
#else
2357
	__m64 u = _mm_cvtsi32_si64(0-U[0]), p;
2358
	for (size_t i=0; i<N; i++)
2359
	{
2360
		__m64 t = _mm_cvtsi32_si64(X[i]);
2361
		t = _mm_mul_su32(t, u);
2362
		__m64 c = _mm_setzero_si64();
2363
		for (size_t j=0; j<N; j+=2)
2364
		{
2365
			p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j]));
2366
			p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j]));
2367
			c = _mm_add_si64(c, p);
2368
			X[i+j] = _mm_cvtsi64_si32(c);
2369
			c = _mm_srli_si64(c, 32);
2370
			p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j+1]));
2371
			p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j+1]));
2372
			c = _mm_add_si64(c, p);
2373
			X[i+j+1] = _mm_cvtsi64_si32(c);
2374
			c = _mm_srli_si64(c, 32);
2375
		}
2376
2377
		if (Increment(X+N+i, N-i, _mm_cvtsi64_si32(c)))
2378
			while (!Subtract(X+N, X+N, M, N)) {}
2379
	}
2380
2381
	memcpy(R, X+N, N*WORD_SIZE);
2382
	_mm_empty();
2383
#endif
1 by weidai
Initial revision
2384
}
2385
2386
// R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M
2387
// T[2*N] - temporary work space
2388
// X[2*N] - number to be reduced
2389
// M[N] --- modulus
2390
// U[N/2] - multiplicative inverse of M mod 2**(WORD_BITS*N/2)
2391
// V[N] --- 2**(WORD_BITS*3*N/2) mod M
2392
184 by weidai
port to MSVC .NET 2005 beta 2
2393
void HalfMontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, const word *V, size_t N)
1 by weidai
Initial revision
2394
{
2395
	assert(N%2==0 && N>=4);
2396
2397
#define M0		M
2398
#define M1		(M+N2)
2399
#define V0		V
2400
#define V1		(V+N2)
2401
2402
#define X0		X
2403
#define X1		(X+N2)
2404
#define X2		(X+N)
2405
#define X3		(X+N+N2)
2406
184 by weidai
port to MSVC .NET 2005 beta 2
2407
	const size_t N2 = N/2;
1 by weidai
Initial revision
2408
	Multiply(T0, T2, V0, X3, N2);
2409
	int c2 = Add(T0, T0, X0, N);
2410
	MultiplyBottom(T3, T2, T0, U, N2);
2411
	MultiplyTop(T2, R, T0, T3, M0, N2);
2412
	c2 -= Subtract(T2, T1, T2, N2);
2413
	Multiply(T0, R, T3, M1, N2);
2414
	c2 -= Subtract(T0, T2, T0, N2);
2415
	int c3 = -(int)Subtract(T1, X2, T1, N2);
2416
	Multiply(R0, T2, V1, X3, N2);
2417
	c3 += Add(R, R, T, N);
2418
2419
	if (c2>0)
2420
		c3 += Increment(R1, N2);
2421
	else if (c2<0)
2422
		c3 -= Decrement(R1, N2, -c2);
2423
2424
	assert(c3>=-1 && c3<=1);
2425
	if (c3>0)
2426
		Subtract(R, R, M, N);
2427
	else if (c3<0)
2428
		Add(R, R, M, N);
2429
2430
#undef M0
2431
#undef M1
2432
#undef V0
2433
#undef V1
2434
2435
#undef X0
2436
#undef X1
2437
#undef X2
2438
#undef X3
2439
}
2440
2441
#undef A0
2442
#undef A1
2443
#undef B0
2444
#undef B1
2445
2446
#undef T0
2447
#undef T1
2448
#undef T2
2449
#undef T3
2450
2451
#undef R0
2452
#undef R1
2453
#undef R2
2454
#undef R3
2455
100 by weidai
fix bugs in 64-bit CPU support
2456
/*
1 by weidai
Initial revision
2457
// do a 3 word by 2 word divide, returns quotient and leaves remainder in A
2458
static word SubatomicDivide(word *A, word B0, word B1)
2459
{
2460
	// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
2461
	assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
2462
100 by weidai
fix bugs in 64-bit CPU support
2463
	// estimate the quotient: do a 2 word by 1 word divide
1 by weidai
Initial revision
2464
	word Q;
2465
	if (B1+1 == 0)
2466
		Q = A[2];
2467
	else
100 by weidai
fix bugs in 64-bit CPU support
2468
		Q = DWord(A[1], A[2]).DividedBy(B1+1);
1 by weidai
Initial revision
2469
2470
	// now subtract Q*B from A
100 by weidai
fix bugs in 64-bit CPU support
2471
	DWord p = DWord::Multiply(B0, Q);
2472
	DWord u = (DWord) A[0] - p.GetLowHalf();
2473
	A[0] = u.GetLowHalf();
2474
	u = (DWord) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - DWord::Multiply(B1, Q);
2475
	A[1] = u.GetLowHalf();
2476
	A[2] += u.GetHighHalf();
1 by weidai
Initial revision
2477
2478
	// Q <= actual quotient, so fix it
2479
	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
2480
	{
100 by weidai
fix bugs in 64-bit CPU support
2481
		u = (DWord) A[0] - B0;
2482
		A[0] = u.GetLowHalf();
2483
		u = (DWord) A[1] - B1 - u.GetHighHalfAsBorrow();
2484
		A[1] = u.GetLowHalf();
2485
		A[2] += u.GetHighHalf();
1 by weidai
Initial revision
2486
		Q++;
2487
		assert(Q);	// shouldn't overflow
2488
	}
2489
2490
	return Q;
2491
}
2492
2493
// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
2494
static inline void AtomicDivide(word *Q, const word *A, const word *B)
2495
{
2496
	if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
2497
	{
2498
		Q[0] = A[2];
2499
		Q[1] = A[3];
2500
	}
2501
	else
2502
	{
2503
		word T[4];
2504
		T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
2505
		Q[1] = SubatomicDivide(T+1, B[0], B[1]);
2506
		Q[0] = SubatomicDivide(T, B[0], B[1]);
2507
2508
#ifndef NDEBUG
2509
		// multiply quotient and divisor and add remainder, make sure it equals dividend
2510
		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
2511
		word P[4];
2512
		LowLevel::Multiply2(P, Q, B);
2513
		Add(P, P, T, 4);
2514
		assert(memcmp(P, A, 4*WORD_SIZE)==0);
2515
#endif
2516
	}
2517
}
100 by weidai
fix bugs in 64-bit CPU support
2518
*/
2519
2520
static inline void AtomicDivide(word *Q, const word *A, const word *B)
2521
{
2522
	word T[4];
2523
	DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1]));
2524
	Q[0] = q.GetLowHalf();
2525
	Q[1] = q.GetHighHalf();
2526
2527
#ifndef NDEBUG
2528
	if (B[0] || B[1])
2529
	{
2530
		// multiply quotient and divisor and add remainder, make sure it equals dividend
2531
		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
2532
		word P[4];
270 by weidai
MMX/SSE2 optimizations
2533
		s_pMul[0](P, Q, B);
100 by weidai
fix bugs in 64-bit CPU support
2534
		Add(P, P, T, 4);
2535
		assert(memcmp(P, A, 4*WORD_SIZE)==0);
2536
	}
2537
#endif
2538
}
1 by weidai
Initial revision
2539
2540
// for use by Divide(), corrects the underestimated quotient {Q1,Q0}
184 by weidai
port to MSVC .NET 2005 beta 2
2541
static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, size_t N)
1 by weidai
Initial revision
2542
{
2543
	assert(N && N%2==0);
2544
270 by weidai
MMX/SSE2 optimizations
2545
	AsymmetricMultiply(T, T+N+2, Q, 2, B, N);
1 by weidai
Initial revision
2546
2547
	word borrow = Subtract(R, R, T, N+2);
2548
	assert(!borrow && !R[N+1]);
2549
2550
	while (R[N] || Compare(R, B, N) >= 0)
2551
	{
2552
		R[N] -= Subtract(R, R, B, N);
2553
		Q[1] += (++Q[0]==0);
2554
		assert(Q[0] || Q[1]); // no overflow
2555
	}
2556
}
2557
2558
// R[NB] -------- remainder = A%B
2559
// Q[NA-NB+2] --- quotient	= A/B
270 by weidai
MMX/SSE2 optimizations
2560
// T[NA+3*(NB+2)] - temp work space
1 by weidai
Initial revision
2561
// A[NA] -------- dividend
2562
// B[NB] -------- divisor
2563
184 by weidai
port to MSVC .NET 2005 beta 2
2564
void Divide(word *R, word *Q, word *T, const word *A, size_t NA, const word *B, size_t NB)
1 by weidai
Initial revision
2565
{
2566
	assert(NA && NB && NA%2==0 && NB%2==0);
2567
	assert(B[NB-1] || B[NB-2]);
2568
	assert(NB <= NA);
2569
2570
	// set up temporary work space
2571
	word *const TA=T;
2572
	word *const TB=T+NA+2;
2573
	word *const TP=T+NA+2+NB;
2574
2575
	// copy B into TB and normalize it so that TB has highest bit set to 1
2576
	unsigned shiftWords = (B[NB-1]==0);
2577
	TB[0] = TB[NB-1] = 0;
2578
	CopyWords(TB+shiftWords, B, NB-shiftWords);
2579
	unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
2580
	assert(shiftBits < WORD_BITS);
2581
	ShiftWordsLeftByBits(TB, NB, shiftBits);
2582
2583
	// copy A into TA and normalize it
2584
	TA[0] = TA[NA] = TA[NA+1] = 0;
2585
	CopyWords(TA+shiftWords, A, NA);
2586
	ShiftWordsLeftByBits(TA, NA+2, shiftBits);
2587
2588
	if (TA[NA+1]==0 && TA[NA] <= 1)
2589
	{
2590
		Q[NA-NB+1] = Q[NA-NB] = 0;
2591
		while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
2592
		{
2593
			TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
2594
			++Q[NA-NB];
2595
		}
2596
	}
2597
	else
2598
	{
2599
		NA+=2;
2600
		assert(Compare(TA+NA-NB, TB, NB) < 0);
2601
	}
2602
2603
	word BT[2];
2604
	BT[0] = TB[NB-2] + 1;
2605
	BT[1] = TB[NB-1] + (BT[0]==0);
2606
2607
	// start reducing TA mod TB, 2 words at a time
184 by weidai
port to MSVC .NET 2005 beta 2
2608
	for (size_t i=NA-2; i>=NB; i-=2)
1 by weidai
Initial revision
2609
	{
2610
		AtomicDivide(Q+i-NB, TA+i-2, BT);
2611
		CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
2612
	}
2613
2614
	// copy TA into R, and denormalize it
2615
	CopyWords(R, TA+shiftWords, NB);
2616
	ShiftWordsRightByBits(R, NB, shiftBits);
2617
}
2618
184 by weidai
port to MSVC .NET 2005 beta 2
2619
static inline size_t EvenWordCount(const word *X, size_t N)
1 by weidai
Initial revision
2620
{
2621
	while (N && X[N-2]==0 && X[N-1]==0)
2622
		N-=2;
2623
	return N;
2624
}
2625
2626
// return k
2627
// R[N] --- result = A^(-1) * 2^k mod M
2628
// T[4*N] - temporary work space
2629
// A[NA] -- number to take inverse of
2630
// M[N] --- modulus
2631
184 by weidai
port to MSVC .NET 2005 beta 2
2632
unsigned int AlmostInverse(word *R, word *T, const word *A, size_t NA, const word *M, size_t N)
1 by weidai
Initial revision
2633
{
2634
	assert(NA<=N && N && N%2==0);
2635
2636
	word *b = T;
2637
	word *c = T+N;
2638
	word *f = T+2*N;
2639
	word *g = T+3*N;
184 by weidai
port to MSVC .NET 2005 beta 2
2640
	size_t bcLen=2, fgLen=EvenWordCount(M, N);
468 by weidai
switch to non-branching code in AlmostInverse()
2641
	unsigned int k=0;
2642
	bool s=false;
1 by weidai
Initial revision
2643
2644
	SetWords(T, 0, 3*N);
2645
	b[0]=1;
2646
	CopyWords(f, A, NA);
2647
	CopyWords(g, M, N);
2648
2649
	while (1)
2650
	{
2651
		word t=f[0];
2652
		while (!t)
2653
		{
2654
			if (EvenWordCount(f, fgLen)==0)
2655
			{
2656
				SetWords(R, 0, N);
2657
				return 0;
2658
			}
2659
2660
			ShiftWordsRightByWords(f, fgLen, 1);
468 by weidai
switch to non-branching code in AlmostInverse()
2661
			bcLen += 2 * (c[bcLen-1] != 0);
1 by weidai
Initial revision
2662
			assert(bcLen <= N);
2663
			ShiftWordsLeftByWords(c, bcLen, 1);
2664
			k+=WORD_BITS;
2665
			t=f[0];
2666
		}
2667
468 by weidai
switch to non-branching code in AlmostInverse()
2668
		unsigned int i = TrailingZeros(t);
2669
		t >>= i;
2670
		k += i;
1 by weidai
Initial revision
2671
468 by weidai
switch to non-branching code in AlmostInverse()
2672
		if (t==1 && f[1]==0 && EvenWordCount(f+2, fgLen-2)==0)
1 by weidai
Initial revision
2673
		{
468 by weidai
switch to non-branching code in AlmostInverse()
2674
			if (s)
2675
				Subtract(R, M, b, N);
2676
			else
1 by weidai
Initial revision
2677
				CopyWords(R, b, N);
2678
			return k;
2679
		}
2680
2681
		ShiftWordsRightByBits(f, fgLen, i);
468 by weidai
switch to non-branching code in AlmostInverse()
2682
		t = ShiftWordsLeftByBits(c, bcLen, i);
2683
		c[bcLen] += t;
2684
		bcLen += 2 * (t!=0);
2685
		assert(bcLen <= N);
2686
2687
		bool swap = Compare(f, g, fgLen)==-1;
2688
		ConditionalSwapPointers(swap, f, g);
2689
		ConditionalSwapPointers(swap, b, c);
2690
		s ^= swap;
2691
2692
		fgLen -= 2 * !(f[fgLen-2] | f[fgLen-1]);
1 by weidai
Initial revision
2693
2694
		Subtract(f, f, g, fgLen);
468 by weidai
switch to non-branching code in AlmostInverse()
2695
		t = Add(b, b, c, bcLen);
2696
		b[bcLen] += t;
2697
		bcLen += 2*t;
2698
		assert(bcLen <= N);
1 by weidai
Initial revision
2699
	}
2700
}
2701
2702
// R[N] - result = A/(2^k) mod M
2703
// A[N] - input
2704
// M[N] - modulus
2705
184 by weidai
port to MSVC .NET 2005 beta 2
2706
void DivideByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
1 by weidai
Initial revision
2707
{
2708
	CopyWords(R, A, N);
2709
2710
	while (k--)
2711
	{
2712
		if (R[0]%2==0)
2713
			ShiftWordsRightByBits(R, N, 1);
2714
		else
2715
		{
2716
			word carry = Add(R, R, M, N);
2717
			ShiftWordsRightByBits(R, N, 1);
2718
			R[N-1] += carry<<(WORD_BITS-1);
2719
		}
2720
	}
2721
}
2722
2723
// R[N] - result = A*(2^k) mod M
2724
// A[N] - input
2725
// M[N] - modulus
2726
184 by weidai
port to MSVC .NET 2005 beta 2
2727
void MultiplyByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
1 by weidai
Initial revision
2728
{
2729
	CopyWords(R, A, N);
2730
2731
	while (k--)
2732
		if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
2733
			Subtract(R, R, M, N);
2734
}
2735
2736
// ******************************************************************
2737
207 by weidai
improve Integer initialization
2738
InitializeInteger::InitializeInteger()
2739
{
2740
	if (!g_pAssignIntToInteger)
2741
	{
270 by weidai
MMX/SSE2 optimizations
2742
		SetFunctionPointers();
207 by weidai
improve Integer initialization
2743
		g_pAssignIntToInteger = AssignIntToInteger;
2744
	}
2745
}
2746
1 by weidai
Initial revision
2747
static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
2748
184 by weidai
port to MSVC .NET 2005 beta 2
2749
static inline size_t RoundupSize(size_t n)
1 by weidai
Initial revision
2750
{
2751
	if (n<=8)
2752
		return RoundupSizeTable[n];
2753
	else if (n<=16)
2754
		return 16;
2755
	else if (n<=32)
2756
		return 32;
2757
	else if (n<=64)
2758
		return 64;
202 by weidai
fix MSVC 2005 warnings
2759
	else return size_t(1) << BitPrecision(n-1);
1 by weidai
Initial revision
2760
}
2761
2762
Integer::Integer()
2763
	: reg(2), sign(POSITIVE)
2764
{
2765
	reg[0] = reg[1] = 0;
2766
}
2767
2768
Integer::Integer(const Integer& t)
2769
	: reg(RoundupSize(t.WordCount())), sign(t.sign)
2770
{
2771
	CopyWords(reg, t.reg, reg.size());
2772
}
2773
100 by weidai
fix bugs in 64-bit CPU support
2774
Integer::Integer(Sign s, lword value)
2775
	: reg(2), sign(s)
2776
{
2777
	reg[0] = word(value);
2778
	reg[1] = word(SafeRightShift<WORD_BITS>(value));
2779
}
2780
1 by weidai
Initial revision
2781
Integer::Integer(signed long value)
2782
	: reg(2)
2783
{
2784
	if (value >= 0)
2785
		sign = POSITIVE;
2786
	else
2787
	{
2788
		sign = NEGATIVE;
2789
		value = -value;
2790
	}
2791
	reg[0] = word(value);
100 by weidai
fix bugs in 64-bit CPU support
2792
	reg[1] = word(SafeRightShift<WORD_BITS>((unsigned long)value));
1 by weidai
Initial revision
2793
}
2794
32 by weidai
fix warnings for VC7 and GCC
2795
Integer::Integer(Sign s, word high, word low)
2796
	: reg(2), sign(s)
2797
{
2798
	reg[0] = low;
2799
	reg[1] = high;
2800
}
2801
1 by weidai
Initial revision
2802
bool Integer::IsConvertableToLong() const
2803
{
2804
	if (ByteCount() > sizeof(long))
2805
		return false;
2806
202 by weidai
fix MSVC 2005 warnings
2807
	unsigned long value = (unsigned long)reg[0];
2808
	value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
1 by weidai
Initial revision
2809
2810
	if (sign==POSITIVE)
2811
		return (signed long)value >= 0;
2812
	else
2813
		return -(signed long)value < 0;
2814
}
2815
2816
signed long Integer::ConvertToLong() const
2817
{
2818
	assert(IsConvertableToLong());
2819
202 by weidai
fix MSVC 2005 warnings
2820
	unsigned long value = (unsigned long)reg[0];
2821
	value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
1 by weidai
Initial revision
2822
	return sign==POSITIVE ? value : -(signed long)value;
2823
}
2824
184 by weidai
port to MSVC .NET 2005 beta 2
2825
Integer::Integer(BufferedTransformation &encodedInteger, size_t byteCount, Signedness s)
1 by weidai
Initial revision
2826
{
2827
	Decode(encodedInteger, byteCount, s);
2828
}
2829
184 by weidai
port to MSVC .NET 2005 beta 2
2830
Integer::Integer(const byte *encodedInteger, size_t byteCount, Signedness s)
1 by weidai
Initial revision
2831
{
2832
	Decode(encodedInteger, byteCount, s);
2833
}
2834
2835
Integer::Integer(BufferedTransformation &bt)
2836
{
2837
	BERDecode(bt);
2838
}
2839
184 by weidai
port to MSVC .NET 2005 beta 2
2840
Integer::Integer(RandomNumberGenerator &rng, size_t bitcount)
1 by weidai
Initial revision
2841
{
2842
	Randomize(rng, bitcount);
2843
}
2844
2845
Integer::Integer(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
2846
{
2847
	if (!Randomize(rng, min, max, rnType, equiv, mod))
2848
		throw Integer::RandomNumberNotFound();
2849
}
2850
184 by weidai
port to MSVC .NET 2005 beta 2
2851
Integer Integer::Power2(size_t e)
1 by weidai
Initial revision
2852
{
2853
	Integer r((word)0, BitsToWords(e+1));
2854
	r.SetBit(e);
2855
	return r;
2856
}
2857
106 by weidai
fix potential threading problem with initialization of static objects
2858
template <long i>
2859
struct NewInteger
2860
{
2861
	Integer * operator()() const
2862
	{
2863
		return new Integer(i);
2864
	}
2865
};
2866
1 by weidai
Initial revision
2867
const Integer &Integer::Zero()
2868
{
106 by weidai
fix potential threading problem with initialization of static objects
2869
	return Singleton<Integer>().Ref();
1 by weidai
Initial revision
2870
}
2871
2872
const Integer &Integer::One()
2873
{
106 by weidai
fix potential threading problem with initialization of static objects
2874
	return Singleton<Integer, NewInteger<1> >().Ref();
1 by weidai
Initial revision
2875
}
2876
2877
const Integer &Integer::Two()
2878
{
106 by weidai
fix potential threading problem with initialization of static objects
2879
	return Singleton<Integer, NewInteger<2> >().Ref();
1 by weidai
Initial revision
2880
}
2881
2882
bool Integer::operator!() const
2883
{
2884
	return IsNegative() ? false : (reg[0]==0 && WordCount()==0);
2885
}
2886
2887
Integer& Integer::operator=(const Integer& t)
2888
{
2889
	if (this != &t)
2890
	{
270 by weidai
MMX/SSE2 optimizations
2891
		if (reg.size() != t.reg.size() || t.reg[t.reg.size()/2] == 0)
2892
			reg.New(RoundupSize(t.WordCount()));
1 by weidai
Initial revision
2893
		CopyWords(reg, t.reg, reg.size());
2894
		sign = t.sign;
2895
	}
2896
	return *this;
2897
}
2898
184 by weidai
port to MSVC .NET 2005 beta 2
2899
bool Integer::GetBit(size_t n) const
1 by weidai
Initial revision
2900
{
2901
	if (n/WORD_BITS >= reg.size())
2902
		return 0;
2903
	else
2904
		return bool((reg[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
2905
}
2906
184 by weidai
port to MSVC .NET 2005 beta 2
2907
void Integer::SetBit(size_t n, bool value)
1 by weidai
Initial revision
2908
{
2909
	if (value)
2910
	{
2911
		reg.CleanGrow(RoundupSize(BitsToWords(n+1)));
2912
		reg[n/WORD_BITS] |= (word(1) << (n%WORD_BITS));
2913
	}
2914
	else
2915
	{
2916
		if (n/WORD_BITS < reg.size())
2917
			reg[n/WORD_BITS] &= ~(word(1) << (n%WORD_BITS));
2918
	}
2919
}
2920
184 by weidai
port to MSVC .NET 2005 beta 2
2921
byte Integer::GetByte(size_t n) const
1 by weidai
Initial revision
2922
{
2923
	if (n/WORD_SIZE >= reg.size())
2924
		return 0;
2925
	else
2926
		return byte(reg[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
2927
}
2928
184 by weidai
port to MSVC .NET 2005 beta 2
2929
void Integer::SetByte(size_t n, byte value)
1 by weidai
Initial revision
2930
{
2931
	reg.CleanGrow(RoundupSize(BytesToWords(n+1)));
2932
	reg[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
2933
	reg[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
2934
}
2935
184 by weidai
port to MSVC .NET 2005 beta 2
2936
lword Integer::GetBits(size_t i, size_t n) const
1 by weidai
Initial revision
2937
{
184 by weidai
port to MSVC .NET 2005 beta 2
2938
	lword v = 0;
2939
	assert(n <= sizeof(v)*8);
1 by weidai
Initial revision
2940
	for (unsigned int j=0; j<n; j++)
184 by weidai
port to MSVC .NET 2005 beta 2
2941
		v |= lword(GetBit(i+j)) << j;
1 by weidai
Initial revision
2942
	return v;
2943
}
2944
2945
Integer Integer::operator-() const
2946
{
2947
	Integer result(*this);
2948
	result.Negate();
2949
	return result;
2950
}
2951
2952
Integer Integer::AbsoluteValue() const
2953
{
2954
	Integer result(*this);
2955
	result.sign = POSITIVE;
2956
	return result;
2957
}
2958
2959
void Integer::swap(Integer &a)
2960
{
2961
	reg.swap(a.reg);
2962
	std::swap(sign, a.sign);
2963
}
2964
184 by weidai
port to MSVC .NET 2005 beta 2
2965
Integer::Integer(word value, size_t length)
1 by weidai
Initial revision
2966
	: reg(RoundupSize(length)), sign(POSITIVE)
2967
{
2968
	reg[0] = value;
2969
	SetWords(reg+1, 0, reg.size()-1);
2970
}
2971
2972
template <class T>
2973
static Integer StringToInteger(const T *str)
2974
{
202 by weidai
fix MSVC 2005 warnings
2975
	int radix;
38 by weidai
STLport workaround
2976
	// GCC workaround
2977
	// std::char_traits<wchar_t>::length() not defined in GCC 3.2 and STLport 4.5.3
1 by weidai
Initial revision
2978
	unsigned int length;
2979
	for (length = 0; str[length] != 0; length++) {}
2980
2981
	Integer v;
2982
2983
	if (length == 0)
2984
		return v;
2985
2986
	switch (str[length-1])
2987
	{
2988
	case 'h':
2989
	case 'H':
2990
		radix=16;
2991
		break;
2992
	case 'o':
2993
	case 'O':
2994
		radix=8;
2995
		break;
2996
	case 'b':
2997
	case 'B':
2998
		radix=2;
2999
		break;
3000
	default:
3001
		radix=10;
3002
	}
3003
3004
	if (length > 2 && str[0] == '0' && str[1] == 'x')
3005
		radix = 16;
3006
3007
	for (unsigned i=0; i<length; i++)
3008
	{
202 by weidai
fix MSVC 2005 warnings
3009
		int digit;
1 by weidai
Initial revision
3010
3011
		if (str[i] >= '0' && str[i] <= '9')
3012
			digit = str[i] - '0';
3013
		else if (str[i] >= 'A' && str[i] <= 'F')
3014
			digit = str[i] - 'A' + 10;
3015
		else if (str[i] >= 'a' && str[i] <= 'f')
3016
			digit = str[i] - 'a' + 10;
3017
		else
3018
			digit = radix;
3019
3020
		if (digit < radix)
3021
		{
3022
			v *= radix;
3023
			v += digit;
3024
		}
3025
	}
3026
3027
	if (str[0] == '-')
3028
		v.Negate();
3029
3030
	return v;
3031
}
3032
3033
Integer::Integer(const char *str)
3034
	: reg(2), sign(POSITIVE)
3035
{
3036
	*this = StringToInteger(str);
3037
}
3038
3039
Integer::Integer(const wchar_t *str)
3040
	: reg(2), sign(POSITIVE)
3041
{
3042
	*this = StringToInteger(str);
3043
}
3044
3045
unsigned int Integer::WordCount() const
3046
{
184 by weidai
port to MSVC .NET 2005 beta 2
3047
	return (unsigned int)CountWords(reg, reg.size());
1 by weidai
Initial revision
3048
}
3049
3050
unsigned int Integer::ByteCount() const
3051
{
3052
	unsigned wordCount = WordCount();
3053
	if (wordCount)
3054
		return (wordCount-1)*WORD_SIZE + BytePrecision(reg[wordCount-1]);
3055
	else
3056
		return 0;
3057
}
3058
3059
unsigned int Integer::BitCount() const
3060
{
3061
	unsigned wordCount = WordCount();
3062
	if (wordCount)
3063
		return (wordCount-1)*WORD_BITS + BitPrecision(reg[wordCount-1]);
3064
	else
3065
		return 0;
3066
}
3067
184 by weidai
port to MSVC .NET 2005 beta 2
3068
void Integer::Decode(const byte *input, size_t inputLen, Signedness s)
1 by weidai
Initial revision
3069
{
3070
	StringStore store(input, inputLen);
3071
	Decode(store, inputLen, s);
3072
}
3073
184 by weidai
port to MSVC .NET 2005 beta 2
3074
void Integer::Decode(BufferedTransformation &bt, size_t inputLen, Signedness s)
1 by weidai
Initial revision
3075
{
3076
	assert(bt.MaxRetrievable() >= inputLen);
3077
3078
	byte b;
3079
	bt.Peek(b);
3080
	sign = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
3081
3082
	while (inputLen>0 && (sign==POSITIVE ? b==0 : b==0xff))
3083
	{
3084
		bt.Skip(1);
3085
		inputLen--;
3086
		bt.Peek(b);
3087
	}
3088
3089
	reg.CleanNew(RoundupSize(BytesToWords(inputLen)));
3090
184 by weidai
port to MSVC .NET 2005 beta 2
3091
	for (size_t i=inputLen; i > 0; i--)
1 by weidai
Initial revision
3092
	{
3093
		bt.Get(b);
100 by weidai
fix bugs in 64-bit CPU support
3094
		reg[(i-1)/WORD_SIZE] |= word(b) << ((i-1)%WORD_SIZE)*8;
1 by weidai
Initial revision
3095
	}
3096
3097
	if (sign == NEGATIVE)
3098
	{
184 by weidai
port to MSVC .NET 2005 beta 2
3099
		for (size_t i=inputLen; i<reg.size()*WORD_SIZE; i++)
100 by weidai
fix bugs in 64-bit CPU support
3100
			reg[i/WORD_SIZE] |= word(0xff) << (i%WORD_SIZE)*8;
1 by weidai
Initial revision
3101
		TwosComplement(reg, reg.size());
3102
	}
3103
}
3104
184 by weidai
port to MSVC .NET 2005 beta 2
3105
size_t Integer::MinEncodedSize(Signedness signedness) const
1 by weidai
Initial revision
3106
{
3107
	unsigned int outputLen = STDMAX(1U, ByteCount());
3108
	if (signedness == UNSIGNED)
3109
		return outputLen;
3110
	if (NotNegative() && (GetByte(outputLen-1) & 0x80))
3111
		outputLen++;
3112
	if (IsNegative() && *this < -Power2(outputLen*8-1))
3113
		outputLen++;
3114
	return outputLen;
3115
}
3116
184 by weidai
port to MSVC .NET 2005 beta 2
3117
void Integer::Encode(byte *output, size_t outputLen, Signedness signedness) const
1 by weidai
Initial revision
3118
{
3119
	ArraySink sink(output, outputLen);
184 by weidai
port to MSVC .NET 2005 beta 2
3120
	Encode(sink, outputLen, signedness);
1 by weidai
Initial revision
3121
}
3122
184 by weidai
port to MSVC .NET 2005 beta 2
3123
void Integer::Encode(BufferedTransformation &bt, size_t outputLen, Signedness signedness) const
1 by weidai
Initial revision
3124
{
3125
	if (signedness == UNSIGNED || NotNegative())
3126
	{
184 by weidai
port to MSVC .NET 2005 beta 2
3127
		for (size_t i=outputLen; i > 0; i--)
1 by weidai
Initial revision
3128
			bt.Put(GetByte(i-1));
3129
	}
3130
	else
3131
	{
3132
		// take two's complement of *this
203 by weidai
fix Integer::Encode
3133
		Integer temp = Integer::Power2(8*STDMAX((size_t)ByteCount(), outputLen)) + *this;
184 by weidai
port to MSVC .NET 2005 beta 2
3134
		temp.Encode(bt, outputLen, UNSIGNED);
1 by weidai
Initial revision
3135
	}
3136
}
3137
3138
void Integer::DEREncode(BufferedTransformation &bt) const
3139
{
3140
	DERGeneralEncoder enc(bt, INTEGER);
3141
	Encode(enc, MinEncodedSize(SIGNED), SIGNED);
3142
	enc.MessageEnd();
3143
}
3144
184 by weidai
port to MSVC .NET 2005 beta 2
3145
void Integer::BERDecode(const byte *input, size_t len)
1 by weidai
Initial revision
3146
{
3147
	StringStore store(input, len);
3148
	BERDecode(store);
3149
}
3150
3151
void Integer::BERDecode(BufferedTransformation &bt)
3152
{
3153
	BERGeneralDecoder dec(bt, INTEGER);
3154
	if (!dec.IsDefiniteLength() || dec.MaxRetrievable() < dec.RemainingLength())
3155
		BERDecodeError();
184 by weidai
port to MSVC .NET 2005 beta 2
3156
	Decode(dec, (size_t)dec.RemainingLength(), SIGNED);
1 by weidai
Initial revision
3157
	dec.MessageEnd();
3158
}
3159
184 by weidai
port to MSVC .NET 2005 beta 2
3160
void Integer::DEREncodeAsOctetString(BufferedTransformation &bt, size_t length) const
1 by weidai
Initial revision
3161
{
3162
	DERGeneralEncoder enc(bt, OCTET_STRING);
3163
	Encode(enc, length);
3164
	enc.MessageEnd();
3165
}
3166
184 by weidai
port to MSVC .NET 2005 beta 2
3167
void Integer::BERDecodeAsOctetString(BufferedTransformation &bt, size_t length)
1 by weidai
Initial revision
3168
{
3169
	BERGeneralDecoder dec(bt, OCTET_STRING);
3170
	if (!dec.IsDefiniteLength() || dec.RemainingLength() != length)
3171
		BERDecodeError();
3172
	Decode(dec, length);
3173
	dec.MessageEnd();
3174
}
3175
184 by weidai
port to MSVC .NET 2005 beta 2
3176
size_t Integer::OpenPGPEncode(byte *output, size_t len) const
1 by weidai
Initial revision
3177
{
3178
	ArraySink sink(output, len);
3179
	return OpenPGPEncode(sink);
3180
}
3181
184 by weidai
port to MSVC .NET 2005 beta 2
3182
size_t Integer::OpenPGPEncode(BufferedTransformation &bt) const
1 by weidai
Initial revision
3183
{
3184
	word16 bitCount = BitCount();
3185
	bt.PutWord16(bitCount);
184 by weidai
port to MSVC .NET 2005 beta 2
3186
	size_t byteCount = BitsToBytes(bitCount);
3187
	Encode(bt, byteCount);
3188
	return 2 + byteCount;
1 by weidai
Initial revision
3189
}
3190
184 by weidai
port to MSVC .NET 2005 beta 2
3191
void Integer::OpenPGPDecode(const byte *input, size_t len)
1 by weidai
Initial revision
3192
{
3193
	StringStore store(input, len);
3194
	OpenPGPDecode(store);
3195
}
3196
3197
void Integer::OpenPGPDecode(BufferedTransformation &bt)
3198
{
3199
	word16 bitCount;
3200
	if (bt.GetWord16(bitCount) != 2 || bt.MaxRetrievable() < BitsToBytes(bitCount))
3201
		throw OpenPGPDecodeErr();
3202
	Decode(bt, BitsToBytes(bitCount));
3203
}
3204
184 by weidai
port to MSVC .NET 2005 beta 2
3205
void Integer::Randomize(RandomNumberGenerator &rng, size_t nbits)
1 by weidai
Initial revision
3206
{
184 by weidai
port to MSVC .NET 2005 beta 2
3207
	const size_t nbytes = nbits/8 + 1;
1 by weidai
Initial revision
3208
	SecByteBlock buf(nbytes);
3209
	rng.GenerateBlock(buf, nbytes);
3210
	if (nbytes)
3211
		buf[0] = (byte)Crop(buf[0], nbits % 8);
3212
	Decode(buf, nbytes, UNSIGNED);
3213
}
3214
3215
void Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max)
3216
{
3217
	if (min > max)
3218
		throw InvalidArgument("Integer: Min must be no greater than Max");
3219
3220
	Integer range = max - min;
3221
	const unsigned int nbits = range.BitCount();
3222
3223
	do
3224
	{
3225
		Randomize(rng, nbits);
3226
	}
3227
	while (*this > range);
3228
3229
	*this += min;
3230
}
3231
3232
bool Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
3233
{
3234
	return GenerateRandomNoThrow(rng, MakeParameters("Min", min)("Max", max)("RandomNumberType", rnType)("EquivalentTo", equiv)("Mod", mod));
3235
}
3236
3237
class KDF2_RNG : public RandomNumberGenerator
3238
{
3239
public:
184 by weidai
port to MSVC .NET 2005 beta 2
3240
	KDF2_RNG(const byte *seed, size_t seedSize)
1 by weidai
Initial revision
3241
		: m_counter(0), m_counterAndSeed(seedSize + 4)
3242
	{
3243
		memcpy(m_counterAndSeed + 4, seed, seedSize);
3244
	}
3245
250 by weidai
fix compile with Sun CC 64-bit
3246
	void GenerateBlock(byte *output, size_t size)
1 by weidai
Initial revision
3247
	{
270 by weidai
MMX/SSE2 optimizations
3248
		PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
1 by weidai
Initial revision
3249
		++m_counter;
86 by weidai
added support for using encoding parameters and key derivation parameters
3250
		P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULL, 0);
1 by weidai
Initial revision
3251
	}
3252
3253
private:
3254
	word32 m_counter;
3255
	SecByteBlock m_counterAndSeed;
3256
};
3257
3258
bool Integer::GenerateRandomNoThrow(RandomNumberGenerator &i_rng, const NameValuePairs &params)
3259
{
3260
	Integer min = params.GetValueWithDefault("Min", Integer::Zero());
3261
	Integer max;
3262
	if (!params.GetValue("Max", max))
3263
	{
3264
		int bitLength;
3265
		if (params.GetIntValue("BitLength", bitLength))
3266
			max = Integer::Power2(bitLength);
3267
		else
3268
			throw InvalidArgument("Integer: missing Max argument");
3269
	}
3270
	if (min > max)
3271
		throw InvalidArgument("Integer: Min must be no greater than Max");
3272
3273
	Integer equiv = params.GetValueWithDefault("EquivalentTo", Integer::Zero());
3274
	Integer mod = params.GetValueWithDefault("Mod", Integer::One());
3275
3276
	if (equiv.IsNegative() || equiv >= mod)
3277
		throw InvalidArgument("Integer: invalid EquivalentTo and/or Mod argument");
3278
3279
	Integer::RandomNumberType rnType = params.GetValueWithDefault("RandomNumberType", Integer::ANY);
3280
3281
	member_ptr<KDF2_RNG> kdf2Rng;
3282
	ConstByteArrayParameter seed;
412 by weidai
changes for 5.6:
3283
	if (params.GetValue(Name::Seed(), seed))
1 by weidai
Initial revision
3284
	{
3285
		ByteQueue bq;
3286
		DERSequenceEncoder seq(bq);
3287
		min.DEREncode(seq);
3288
		max.DEREncode(seq);
3289
		equiv.DEREncode(seq);
3290
		mod.DEREncode(seq);
3291
		DEREncodeUnsigned(seq, rnType);
3292
		DEREncodeOctetString(seq, seed.begin(), seed.size());
3293
		seq.MessageEnd();
3294
184 by weidai
port to MSVC .NET 2005 beta 2
3295
		SecByteBlock finalSeed((size_t)bq.MaxRetrievable());
1 by weidai
Initial revision
3296
		bq.Get(finalSeed, finalSeed.size());
3297
		kdf2Rng.reset(new KDF2_RNG(finalSeed.begin(), finalSeed.size()));
3298
	}
3299
	RandomNumberGenerator &rng = kdf2Rng.get() ? (RandomNumberGenerator &)*kdf2Rng : i_rng;
3300
3301
	switch (rnType)
3302
	{
3303
		case ANY:
3304
			if (mod == One())
3305
				Randomize(rng, min, max);
3306
			else
3307
			{
3308
				Integer min1 = min + (equiv-min)%mod;
3309
				if (max < min1)
3310
					return false;
3311
				Randomize(rng, Zero(), (max - min1) / mod);
3312
				*this *= mod;
3313
				*this += min1;
3314
			}
3315
			return true;
3316
3317
		case PRIME:
3318
		{
86 by weidai
added support for using encoding parameters and key derivation parameters
3319
			const PrimeSelector *pSelector = params.GetValueWithDefault(Name::PointerToPrimeSelector(), (const PrimeSelector *)NULL);
1 by weidai
Initial revision
3320
3321
			int i;
3322
			i = 0;
3323
			while (1)
3324
			{
3325
				if (++i==16)
3326
				{
3327
					// check if there are any suitable primes in [min, max]
3328
					Integer first = min;
3329
					if (FirstPrime(first, max, equiv, mod, pSelector))
3330
					{
3331
						// if there is only one suitable prime, we're done
3332
						*this = first;
3333
						if (!FirstPrime(first, max, equiv, mod, pSelector))
3334
							return true;
3335
					}
3336
					else
3337
						return false;
3338
				}
3339
3340
				Randomize(rng, min, max);
3341
				if (FirstPrime(*this, STDMIN(*this+mod*PrimeSearchInterval(max), max), equiv, mod, pSelector))
3342
					return true;
3343
			}
3344
		}
3345
3346
		default:
3347
			throw InvalidArgument("Integer: invalid RandomNumberType argument");
3348
	}
3349
}
3350
3351
std::istream& operator>>(std::istream& in, Integer &a)
3352
{
3353
	char c;
3354
	unsigned int length = 0;
3355
	SecBlock<char> str(length + 16);
3356
3357
	std::ws(in);
3358
3359
	do
3360
	{
3361
		in.read(&c, 1);
3362
		str[length++] = c;
3363
		if (length >= str.size())
3364
			str.Grow(length + 16);
3365
	}
3366
	while (in && (c=='-' || c=='x' || (c>='0' && c<='9') || (c>='a' && c<='f') || (c>='A' && c<='F') || c=='h' || c=='H' || c=='o' || c=='O' || c==',' || c=='.'));
3367
3368
	if (in.gcount())
3369
		in.putback(c);
3370
	str[length-1] = '\0';
3371
	a = Integer(str);
3372
3373
	return in;
3374
}
3375
3376
std::ostream& operator<<(std::ostream& out, const Integer &a)
3377
{
3378
	// Get relevant conversion specifications from ostream.
3379
	long f = out.flags() & std::ios::basefield; // Get base digits.
3380
	int base, block;
3381
	char suffix;
3382
	switch(f)
3383
	{
3384
	case std::ios::oct :
3385
		base = 8;
3386
		block = 8;
3387
		suffix = 'o';
3388
		break;
3389
	case std::ios::hex :
3390
		base = 16;
3391
		block = 4;
3392
		suffix = 'h';
3393
		break;
3394
	default :
3395
		base = 10;
3396
		block = 3;
3397
		suffix = '.';
3398
	}
3399
3400
	Integer temp1=a, temp2;
421 by weidai
from Jeffery Walton: move *.dat files into TestData, make Integer operator<< respect ios::uppercase flag
3401
    
1 by weidai
Initial revision
3402
	if (a.IsNegative())
3403
	{
3404
		out << '-';
3405
		temp1.Negate();
3406
	}
3407
3408
	if (!a)
3409
		out << '0';
3410
421 by weidai
from Jeffery Walton: move *.dat files into TestData, make Integer operator<< respect ios::uppercase flag
3411
	static const char upper[]="0123456789ABCDEF";
3412
	static const char lower[]="0123456789abcdef";
3413
3414
	const char* vec = (out.flags() & std::ios::uppercase) ? upper : lower;
3415
	unsigned i=0;
3416
	SecBlock<char> s(a.BitCount() / (BitPrecision(base)-1) + 1);
3417
1 by weidai
Initial revision
3418
	while (!!temp1)
3419
	{
3420
		word digit;
3421
		Integer::Divide(digit, temp2, temp1, base);
3422
		s[i++]=vec[digit];
421 by weidai
from Jeffery Walton: move *.dat files into TestData, make Integer operator<< respect ios::uppercase flag
3423
		temp1.swap(temp2);
1 by weidai
Initial revision
3424
	}
3425
3426
	while (i--)
3427
	{
3428
		out << s[i];
3429
//		if (i && !(i%block))
3430
//			out << ",";
3431
	}
3432
	return out << suffix;
3433
}
3434
3435
Integer& Integer::operator++()
3436
{
3437
	if (NotNegative())
3438
	{
3439
		if (Increment(reg, reg.size()))
3440
		{
3441
			reg.CleanGrow(2*reg.size());
3442
			reg[reg.size()/2]=1;
3443
		}
3444
	}
3445
	else
3446
	{
3447
		word borrow = Decrement(reg, reg.size());
3448
		assert(!borrow);
3449
		if (WordCount()==0)
3450
			*this = Zero();
3451
	}
3452
	return *this;
3453
}
3454
3455
Integer& Integer::operator--()
3456
{
3457
	if (IsNegative())
3458
	{
3459
		if (Increment(reg, reg.size()))
3460
		{
3461
			reg.CleanGrow(2*reg.size());
3462
			reg[reg.size()/2]=1;
3463
		}
3464
	}
3465
	else
3466
	{
3467
		if (Decrement(reg, reg.size()))
3468
			*this = -One();
3469
	}
3470
	return *this;
3471
}
3472
3473
void PositiveAdd(Integer &sum, const Integer &a, const Integer& b)
3474
{
202 by weidai
fix MSVC 2005 warnings
3475
	int carry;
1 by weidai
Initial revision
3476
	if (a.reg.size() == b.reg.size())
3477
		carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
3478
	else if (a.reg.size() > b.reg.size())
3479
	{
3480
		carry = Add(sum.reg, a.reg, b.reg, b.reg.size());
3481
		CopyWords(sum.reg+b.reg.size(), a.reg+b.reg.size(), a.reg.size()-b.reg.size());
3482
		carry = Increment(sum.reg+b.reg.size(), a.reg.size()-b.reg.size(), carry);
3483
	}
3484
	else
3485
	{
3486
		carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
3487
		CopyWords(sum.reg+a.reg.size(), b.reg+a.reg.size(), b.reg.size()-a.reg.size());
3488
		carry = Increment(sum.reg+a.reg.size(), b.reg.size()-a.reg.size(), carry);
3489
	}
3490
3491
	if (carry)
3492
	{
3493
		sum.reg.CleanGrow(2*sum.reg.size());
3494
		sum.reg[sum.reg.size()/2] = 1;
3495
	}
3496
	sum.sign = Integer::POSITIVE;
3497
}
3498
3499
void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
3500
{
3501
	unsigned aSize = a.WordCount();
3502
	aSize += aSize%2;
3503
	unsigned bSize = b.WordCount();
3504
	bSize += bSize%2;
3505
3506
	if (aSize == bSize)
3507
	{
3508
		if (Compare(a.reg, b.reg, aSize) >= 0)
3509
		{
3510
			Subtract(diff.reg, a.reg, b.reg, aSize);
3511
			diff.sign = Integer::POSITIVE;
3512
		}
3513
		else
3514
		{
3515
			Subtract(diff.reg, b.reg, a.reg, aSize);
3516
			diff.sign = Integer::NEGATIVE;
3517
		}
3518
	}
3519
	else if (aSize > bSize)
3520
	{
3521
		word borrow = Subtract(diff.reg, a.reg, b.reg, bSize);
3522
		CopyWords(diff.reg+bSize, a.reg+bSize, aSize-bSize);
3523
		borrow = Decrement(diff.reg+bSize, aSize-bSize, borrow);
3524
		assert(!borrow);
3525
		diff.sign = Integer::POSITIVE;
3526
	}
3527
	else
3528
	{
3529
		word borrow = Subtract(diff.reg, b.reg, a.reg, aSize);
3530
		CopyWords(diff.reg+aSize, b.reg+aSize, bSize-aSize);
3531
		borrow = Decrement(diff.reg+aSize, bSize-aSize, borrow);
3532
		assert(!borrow);
3533
		diff.sign = Integer::NEGATIVE;
3534
	}
3535
}
3536
184 by weidai
port to MSVC .NET 2005 beta 2
3537
// MSVC .NET 2003 workaround
3538
template <class T> inline const T& STDMAX2(const T& a, const T& b)
3539
{
3540
	return a < b ? b : a;
3541
}
3542
1 by weidai
Initial revision
3543
Integer Integer::Plus(const Integer& b) const
3544
{
184 by weidai
port to MSVC .NET 2005 beta 2
3545
	Integer sum((word)0, STDMAX2(reg.size(), b.reg.size()));
1 by weidai
Initial revision
3546
	if (NotNegative())
3547
	{
3548
		if (b.NotNegative())
3549
			PositiveAdd(sum, *this, b);
3550
		else
3551
			PositiveSubtract(sum, *this, b);
3552
	}
3553
	else
3554
	{
3555
		if (b.NotNegative())
3556
			PositiveSubtract(sum, b, *this);
3557
		else
3558
		{
3559
			PositiveAdd(sum, *this, b);
3560
			sum.sign = Integer::NEGATIVE;
3561
		}
3562
	}
3563
	return sum;
3564
}
3565
3566
Integer& Integer::operator+=(const Integer& t)
3567
{
3568
	reg.CleanGrow(t.reg.size());
3569
	if (NotNegative())
3570
	{
3571
		if (t.NotNegative())
3572
			PositiveAdd(*this, *this, t);
3573
		else
3574
			PositiveSubtract(*this, *this, t);
3575
	}
3576
	else
3577
	{
3578
		if (t.NotNegative())
3579
			PositiveSubtract(*this, t, *this);
3580
		else
3581
		{
3582
			PositiveAdd(*this, *this, t);
3583
			sign = Integer::NEGATIVE;
3584
		}
3585
	}
3586
	return *this;
3587
}
3588
3589
Integer Integer::Minus(const Integer& b) const
3590
{
184 by weidai
port to MSVC .NET 2005 beta 2
3591
	Integer diff((word)0, STDMAX2(reg.size(), b.reg.size()));
1 by weidai
Initial revision
3592
	if (NotNegative())
3593
	{
3594
		if (b.NotNegative())
3595
			PositiveSubtract(diff, *this, b);
3596
		else
3597
			PositiveAdd(diff, *this, b);
3598
	}
3599
	else
3600
	{
3601
		if (b.NotNegative())
3602
		{
3603
			PositiveAdd(diff, *this, b);
3604
			diff.sign = Integer::NEGATIVE;
3605
		}
3606
		else
3607
			PositiveSubtract(diff, b, *this);
3608
	}
3609
	return diff;
3610
}
3611
3612
Integer& Integer::operator-=(const Integer& t)
3613
{
3614
	reg.CleanGrow(t.reg.size());
3615
	if (NotNegative())
3616
	{
3617
		if (t.NotNegative())
3618
			PositiveSubtract(*this, *this, t);
3619
		else
3620
			PositiveAdd(*this, *this, t);
3621
	}
3622
	else
3623
	{
3624
		if (t.NotNegative())
3625
		{
3626
			PositiveAdd(*this, *this, t);
3627
			sign = Integer::NEGATIVE;
3628
		}
3629
		else
3630
			PositiveSubtract(*this, t, *this);
3631
	}
3632
	return *this;
3633
}
3634
184 by weidai
port to MSVC .NET 2005 beta 2
3635
Integer& Integer::operator<<=(size_t n)
1 by weidai
Initial revision
3636
{
184 by weidai
port to MSVC .NET 2005 beta 2
3637
	const size_t wordCount = WordCount();
3638
	const size_t shiftWords = n / WORD_BITS;
3639
	const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
1 by weidai
Initial revision
3640
3641
	reg.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
3642
	ShiftWordsLeftByWords(reg, wordCount + shiftWords, shiftWords);
3643
	ShiftWordsLeftByBits(reg+shiftWords, wordCount+BitsToWords(shiftBits), shiftBits);
3644
	return *this;
3645
}
3646
184 by weidai
port to MSVC .NET 2005 beta 2
3647
Integer& Integer::operator>>=(size_t n)
1 by weidai
Initial revision
3648
{
184 by weidai
port to MSVC .NET 2005 beta 2
3649
	const size_t wordCount = WordCount();
3650
	const size_t shiftWords = n / WORD_BITS;
3651
	const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
1 by weidai
Initial revision
3652
3653
	ShiftWordsRightByWords(reg, wordCount, shiftWords);
3654
	if (wordCount > shiftWords)
3655
		ShiftWordsRightByBits(reg, wordCount-shiftWords, shiftBits);
3656
	if (IsNegative() && WordCount()==0)   // avoid -0
3657
		*this = Zero();
3658
	return *this;
3659
}
3660
3661
void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)
3662
{
184 by weidai
port to MSVC .NET 2005 beta 2
3663
	size_t aSize = RoundupSize(a.WordCount());
3664
	size_t bSize = RoundupSize(b.WordCount());
1 by weidai
Initial revision
3665
3666
	product.reg.CleanNew(RoundupSize(aSize+bSize));
3667
	product.sign = Integer::POSITIVE;
3668
270 by weidai
MMX/SSE2 optimizations
3669
	IntegerSecBlock workspace(aSize + bSize);
1 by weidai
Initial revision
3670
	AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize);
3671
}
3672
3673
void Multiply(Integer &product, const Integer &a, const Integer &b)
3674
{
3675
	PositiveMultiply(product, a, b);
3676
3677
	if (a.NotNegative() != b.NotNegative())
3678
		product.Negate();
3679
}
3680
3681
Integer Integer::Times(const Integer &b) const
3682
{
3683
	Integer product;
3684
	Multiply(product, *this, b);
3685
	return product;
3686
}
3687
3688
/*
3689
void PositiveDivide(Integer &remainder, Integer &quotient,
3690
				   const Integer &dividend, const Integer &divisor)
3691
{
3692
	remainder.reg.CleanNew(divisor.reg.size());
3693
	remainder.sign = Integer::POSITIVE;
3694
	quotient.reg.New(0);
3695
	quotient.sign = Integer::POSITIVE;
3696
	unsigned i=dividend.BitCount();
3697
	while (i--)
3698
	{
3699
		word overflow = ShiftWordsLeftByBits(remainder.reg, remainder.reg.size(), 1);
3700
		remainder.reg[0] |= dividend[i];
3701
		if (overflow || remainder >= divisor)
3702
		{
3703
			Subtract(remainder.reg, remainder.reg, divisor.reg, remainder.reg.size());
3704
			quotient.SetBit(i);
3705
		}
3706
	}
3707
}
3708
*/
3709
3710
void PositiveDivide(Integer &remainder, Integer &quotient,
3711
				   const Integer &a, const Integer &b)
3712
{
3713
	unsigned aSize = a.WordCount();
3714
	unsigned bSize = b.WordCount();
3715
3716
	if (!bSize)
3717
		throw Integer::DivideByZero();
3718
412 by weidai
changes for 5.6:
3719
	if (aSize < bSize)
1 by weidai
Initial revision
3720
	{
3721
		remainder = a;
3722
		remainder.sign = Integer::POSITIVE;
3723
		quotient = Integer::Zero();
3724
		return;
3725
	}
3726
3727
	aSize += aSize%2;	// round up to next even number
3728
	bSize += bSize%2;
3729
3730
	remainder.reg.CleanNew(RoundupSize(bSize));
3731
	remainder.sign = Integer::POSITIVE;
3732
	quotient.reg.CleanNew(RoundupSize(aSize-bSize+2));
3733
	quotient.sign = Integer::POSITIVE;
3734
270 by weidai
MMX/SSE2 optimizations
3735
	IntegerSecBlock T(aSize+3*(bSize+2));
1 by weidai
Initial revision
3736
	Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize);
3737
}
3738
3739
void Integer::Divide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor)
3740
{
3741
	PositiveDivide(remainder, quotient, dividend, divisor);
3742
3743
	if (dividend.IsNegative())
3744
	{
3745
		quotient.Negate();
3746
		if (remainder.NotZero())
3747
		{
3748
			--quotient;
3749
			remainder = divisor.AbsoluteValue() - remainder;
3750
		}
3751
	}
3752
3753
	if (divisor.IsNegative())
3754
		quotient.Negate();
3755
}
3756
3757
void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a, unsigned int n)
3758
{
3759
	q = a;
3760
	q >>= n;
3761
184 by weidai
port to MSVC .NET 2005 beta 2
3762
	const size_t wordCount = BitsToWords(n);
1 by weidai
Initial revision
3763
	if (wordCount <= a.WordCount())
3764
	{
3765
		r.reg.resize(RoundupSize(wordCount));
3766
		CopyWords(r.reg, a.reg, wordCount);
3767
		SetWords(r.reg+wordCount, 0, r.reg.size()-wordCount);
3768
		if (n % WORD_BITS != 0)
202 by weidai
fix MSVC 2005 warnings
3769
			r.reg[wordCount-1] %= (word(1) << (n % WORD_BITS));
1 by weidai
Initial revision
3770
	}
3771
	else
3772
	{
3773
		r.reg.resize(RoundupSize(a.WordCount()));
3774
		CopyWords(r.reg, a.reg, r.reg.size());
3775
	}
3776
	r.sign = POSITIVE;
3777
3778
	if (a.IsNegative() && r.NotZero())
3779
	{
3780
		--q;
3781
		r = Power2(n) - r;
3782
	}
3783
}
3784
3785
Integer Integer::DividedBy(const Integer &b) const
3786
{
3787
	Integer remainder, quotient;
3788
	Integer::Divide(remainder, quotient, *this, b);
3789
	return quotient;
3790
}
3791
3792
Integer Integer::Modulo(const Integer &b) const
3793
{
3794
	Integer remainder, quotient;
3795
	Integer::Divide(remainder, quotient, *this, b);
3796
	return remainder;
3797
}
3798
3799
void Integer::Divide(word &remainder, Integer &quotient, const Integer &dividend, word divisor)
3800
{
3801
	if (!divisor)
3802
		throw Integer::DivideByZero();
3803
3804
	assert(divisor);
3805
3806
	if ((divisor & (divisor-1)) == 0)	// divisor is a power of 2
3807
	{
3808
		quotient = dividend >> (BitPrecision(divisor)-1);
3809
		remainder = dividend.reg[0] & (divisor-1);
3810
		return;
3811
	}
3812
3813
	unsigned int i = dividend.WordCount();
3814
	quotient.reg.CleanNew(RoundupSize(i));
3815
	remainder = 0;
3816
	while (i--)
3817
	{
100 by weidai
fix bugs in 64-bit CPU support
3818
		quotient.reg[i] = DWord(dividend.reg[i], remainder) / divisor;
3819
		remainder = DWord(dividend.reg[i], remainder) % divisor;
1 by weidai
Initial revision
3820
	}
3821
3822
	if (dividend.NotNegative())
3823
		quotient.sign = POSITIVE;
3824
	else
3825
	{
3826
		quotient.sign = NEGATIVE;
3827
		if (remainder)
3828
		{
3829
			--quotient;
3830
			remainder = divisor - remainder;
3831
		}
3832
	}
3833
}
3834
3835
Integer Integer::DividedBy(word b) const
3836
{
3837
	word remainder;
3838
	Integer quotient;
3839
	Integer::Divide(remainder, quotient, *this, b);
3840
	return quotient;
3841
}
3842
3843
word Integer::Modulo(word divisor) const
3844
{
3845
	if (!divisor)
3846
		throw Integer::DivideByZero();
3847
3848
	assert(divisor);
3849
3850
	word remainder;
3851
3852
	if ((divisor & (divisor-1)) == 0)	// divisor is a power of 2
3853
		remainder = reg[0] & (divisor-1);
3854
	else
3855
	{
3856
		unsigned int i = WordCount();
3857
3858
		if (divisor <= 5)
3859
		{
100 by weidai
fix bugs in 64-bit CPU support
3860
			DWord sum(0, 0);
1 by weidai
Initial revision
3861
			while (i--)
3862
				sum += reg[i];
100 by weidai
fix bugs in 64-bit CPU support
3863
			remainder = sum % divisor;
1 by weidai
Initial revision
3864
		}
3865
		else
3866
		{
3867
			remainder = 0;
3868
			while (i--)
100 by weidai
fix bugs in 64-bit CPU support
3869
				remainder = DWord(reg[i], remainder) % divisor;
1 by weidai
Initial revision
3870
		}
3871
	}
3872
3873
	if (IsNegative() && remainder)
3874
		remainder = divisor - remainder;
3875
3876
	return remainder;
3877
}
3878
3879
void Integer::Negate()
3880
{
3881
	if (!!(*this))	// don't flip sign if *this==0
3882
		sign = Sign(1-sign);
3883
}
3884
3885
int Integer::PositiveCompare(const Integer& t) const
3886
{
3887
	unsigned size = WordCount(), tSize = t.WordCount();
3888
3889
	if (size == tSize)
3890
		return CryptoPP::Compare(reg, t.reg, size);
3891
	else
3892
		return size > tSize ? 1 : -1;
3893
}
3894
3895
int Integer::Compare(const Integer& t) const
3896
{
3897
	if (NotNegative())
3898
	{
3899
		if (t.NotNegative())
3900
			return PositiveCompare(t);
3901
		else
3902
			return 1;
3903
	}
3904
	else
3905
	{
3906
		if (t.NotNegative())
3907
			return -1;
3908
		else
3909
			return -PositiveCompare(t);
3910
	}
3911
}
3912
3913
Integer Integer::SquareRoot() const
3914
{
3915
	if (!IsPositive())
3916
		return Zero();
3917
3918
	// overestimate square root
3919
	Integer x, y = Power2((BitCount()+1)/2);
3920
	assert(y*y >= *this);
3921
3922
	do
3923
	{
3924
		x = y;
3925
		y = (x + *this/x) >> 1;
3926
	} while (y<x);
3927
3928
	return x;
3929
}
3930
3931
bool Integer::IsSquare() const
3932
{
3933
	Integer r = SquareRoot();
3934
	return *this == r.Squared();
3935
}
3936
3937
bool Integer::IsUnit() const
3938
{
3939
	return (WordCount() == 1) && (reg[0] == 1);
3940
}
3941
3942
Integer Integer::MultiplicativeInverse() const
3943
{
3944
	return IsUnit() ? *this : Zero();
3945
}
3946
3947
Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
3948
{
3949
	return x*y%m;
3950
}
3951
3952
Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
3953
{
3954
	ModularArithmetic mr(m);
3955
	return mr.Exponentiate(x, e);
3956
}
3957
3958
Integer Integer::Gcd(const Integer &a, const Integer &b)
3959
{
3960
	return EuclideanDomainOf<Integer>().Gcd(a, b);
3961
}
3962
3963
Integer Integer::InverseMod(const Integer &m) const
3964
{
3965
	assert(m.NotNegative());
3966
412 by weidai
changes for 5.6:
3967
	if (IsNegative())
3968
		return Modulo(m).InverseMod(m);
1 by weidai
Initial revision
3969
3970
	if (m.IsEven())
3971
	{
3972
		if (!m || IsEven())
3973
			return Zero();	// no inverse
3974
		if (*this == One())
3975
			return One();
3976
412 by weidai
changes for 5.6:
3977
		Integer u = m.Modulo(*this).InverseMod(*this);
1 by weidai
Initial revision
3978
		return !u ? Zero() : (m*(*this-u)+1)/(*this);
3979
	}
3980
3981
	SecBlock<word> T(m.reg.size() * 4);
3982
	Integer r((word)0, m.reg.size());
3983
	unsigned k = AlmostInverse(r.reg, T, reg, reg.size(), m.reg, m.reg.size());
3984
	DivideByPower2Mod(r.reg, r.reg, k, m.reg, m.reg.size());
3985
	return r;
3986
}
3987
249 by weidai
update version number, port to Sun C++ 5.8
3988
word Integer::InverseMod(word mod) const
1 by weidai
Initial revision
3989
{
3990
	word g0 = mod, g1 = *this % mod;
3991
	word v0 = 0, v1 = 1;
3992
	word y;
3993
3994
	while (g1)
3995
	{
3996
		if (g1 == 1)
3997
			return v1;
3998
		y = g0 / g1;
3999
		g0 = g0 % g1;
4000
		v0 += y * v1;
4001
4002
		if (!g0)
4003
			break;
4004
		if (g0 == 1)
4005
			return mod-v0;
4006
		y = g1 / g0;
4007
		g1 = g1 % g0;
4008
		v1 += y * v0;
4009
	}
4010
	return 0;
4011
}
4012
4013
// ********************************************************
4014
4015
ModularArithmetic::ModularArithmetic(BufferedTransformation &bt)
4016
{
4017
	BERSequenceDecoder seq(bt);
4018
	OID oid(seq);
4019
	if (oid != ASN1::prime_field())
4020
		BERDecodeError();
181 by weidai
changes done for FIPS-140 lab code drop
4021
	m_modulus.BERDecode(seq);
1 by weidai
Initial revision
4022
	seq.MessageEnd();
181 by weidai
changes done for FIPS-140 lab code drop
4023
	m_result.reg.resize(m_modulus.reg.size());
1 by weidai
Initial revision
4024
}
4025
4026
void ModularArithmetic::DEREncode(BufferedTransformation &bt) const
4027
{
4028
	DERSequenceEncoder seq(bt);
4029
	ASN1::prime_field().DEREncode(seq);
181 by weidai
changes done for FIPS-140 lab code drop
4030
	m_modulus.DEREncode(seq);
1 by weidai
Initial revision
4031
	seq.MessageEnd();
4032
}
4033
4034
void ModularArithmetic::DEREncodeElement(BufferedTransformation &out, const Element &a) const
4035
{
4036
	a.DEREncodeAsOctetString(out, MaxElementByteLength());
4037
}
4038
4039
void ModularArithmetic::BERDecodeElement(BufferedTransformation &in, Element &a) const
4040
{
4041
	a.BERDecodeAsOctetString(in, MaxElementByteLength());
4042
}
4043
4044
const Integer& ModularArithmetic::Half(const Integer &a) const
4045
{
181 by weidai
changes done for FIPS-140 lab code drop
4046
	if (a.reg.size()==m_modulus.reg.size())
1 by weidai
Initial revision
4047
	{
181 by weidai
changes done for FIPS-140 lab code drop
4048
		CryptoPP::DivideByPower2Mod(m_result.reg.begin(), a.reg, 1, m_modulus.reg, a.reg.size());
4049
		return m_result;
1 by weidai
Initial revision
4050
	}
4051
	else
181 by weidai
changes done for FIPS-140 lab code drop
4052
		return m_result1 = (a.IsEven() ? (a >> 1) : ((a+m_modulus) >> 1));
1 by weidai
Initial revision
4053
}
4054
4055
const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
4056
{
181 by weidai
changes done for FIPS-140 lab code drop
4057
	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
1 by weidai
Initial revision
4058
	{
181 by weidai
changes done for FIPS-140 lab code drop
4059
		if (CryptoPP::Add(m_result.reg.begin(), a.reg, b.reg, a.reg.size())
4060
			|| Compare(m_result.reg, m_modulus.reg, a.reg.size()) >= 0)
1 by weidai
Initial revision
4061
		{
181 by weidai
changes done for FIPS-140 lab code drop
4062
			CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
1 by weidai
Initial revision
4063
		}
181 by weidai
changes done for FIPS-140 lab code drop
4064
		return m_result;
1 by weidai
Initial revision
4065
	}
4066
	else
4067
	{
181 by weidai
changes done for FIPS-140 lab code drop
4068
		m_result1 = a+b;
4069
		if (m_result1 >= m_modulus)
4070
			m_result1 -= m_modulus;
4071
		return m_result1;
1 by weidai
Initial revision
4072
	}
4073
}
4074
4075
Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
4076
{
181 by weidai
changes done for FIPS-140 lab code drop
4077
	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
1 by weidai
Initial revision
4078
	{
4079
		if (CryptoPP::Add(a.reg, a.reg, b.reg, a.reg.size())
181 by weidai
changes done for FIPS-140 lab code drop
4080
			|| Compare(a.reg, m_modulus.reg, a.reg.size()) >= 0)
1 by weidai
Initial revision
4081
		{
181 by weidai
changes done for FIPS-140 lab code drop
4082
			CryptoPP::Subtract(a.reg, a.reg, m_modulus.reg, a.reg.size());
1 by weidai
Initial revision
4083
		}
4084
	}
4085
	else
4086
	{
4087
		a+=b;
181 by weidai
changes done for FIPS-140 lab code drop
4088
		if (a>=m_modulus)
4089
			a-=m_modulus;
1 by weidai
Initial revision
4090
	}
4091
4092
	return a;
4093
}
4094
4095
const Integer& ModularArithmetic::Subtract(const Integer &a, const Integer &b) const
4096
{
181 by weidai
changes done for FIPS-140 lab code drop
4097
	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
1 by weidai
Initial revision
4098
	{
181 by weidai
changes done for FIPS-140 lab code drop
4099
		if (CryptoPP::Subtract(m_result.reg.begin(), a.reg, b.reg, a.reg.size()))
4100
			CryptoPP::Add(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
4101
		return m_result;
1 by weidai
Initial revision
4102
	}
4103
	else
4104
	{
181 by weidai
changes done for FIPS-140 lab code drop
4105
		m_result1 = a-b;
4106
		if (m_result1.IsNegative())
4107
			m_result1 += m_modulus;
4108
		return m_result1;
1 by weidai
Initial revision
4109
	}
4110
}
4111
4112
Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
4113
{
181 by weidai
changes done for FIPS-140 lab code drop
4114
	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
1 by weidai
Initial revision
4115
	{
4116
		if (CryptoPP::Subtract(a.reg, a.reg, b.reg, a.reg.size()))
181 by weidai
changes done for FIPS-140 lab code drop
4117
			CryptoPP::Add(a.reg, a.reg, m_modulus.reg, a.reg.size());
1 by weidai
Initial revision
4118
	}
4119
	else
4120
	{
4121
		a-=b;
4122
		if (a.IsNegative())
181 by weidai
changes done for FIPS-140 lab code drop
4123
			a+=m_modulus;
1 by weidai
Initial revision
4124
	}
4125
4126
	return a;
4127
}
4128
4129
const Integer& ModularArithmetic::Inverse(const Integer &a) const
4130
{
4131
	if (!a)
4132
		return a;
4133
181 by weidai
changes done for FIPS-140 lab code drop
4134
	CopyWords(m_result.reg.begin(), m_modulus.reg, m_modulus.reg.size());
4135
	if (CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, a.reg, a.reg.size()))
184 by weidai
port to MSVC .NET 2005 beta 2
4136
		Decrement(m_result.reg.begin()+a.reg.size(), m_modulus.reg.size()-a.reg.size());
1 by weidai
Initial revision
4137
181 by weidai
changes done for FIPS-140 lab code drop
4138
	return m_result;
1 by weidai
Initial revision
4139
}
4140
4141
Integer ModularArithmetic::CascadeExponentiate(const Integer &x, const Integer &e1, const Integer &y, const Integer &e2) const
4142
{
181 by weidai
changes done for FIPS-140 lab code drop
4143
	if (m_modulus.IsOdd())
1 by weidai
Initial revision
4144
	{
181 by weidai
changes done for FIPS-140 lab code drop
4145
		MontgomeryRepresentation dr(m_modulus);
1 by weidai
Initial revision
4146
		return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1, dr.ConvertIn(y), e2));
4147
	}
4148
	else
4149
		return AbstractRing<Integer>::CascadeExponentiate(x, e1, y, e2);
4150
}
4151
4152
void ModularArithmetic::SimultaneousExponentiate(Integer *results, const Integer &base, const Integer *exponents, unsigned int exponentsCount) const
4153
{
181 by weidai
changes done for FIPS-140 lab code drop
4154
	if (m_modulus.IsOdd())
1 by weidai
Initial revision
4155
	{
181 by weidai
changes done for FIPS-140 lab code drop
4156
		MontgomeryRepresentation dr(m_modulus);
1 by weidai
Initial revision
4157
		dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents, exponentsCount);
4158
		for (unsigned int i=0; i<exponentsCount; i++)
4159
			results[i] = dr.ConvertOut(results[i]);
4160
	}
4161
	else
4162
		AbstractRing<Integer>::SimultaneousExponentiate(results, base, exponents, exponentsCount);
4163
}
4164
4165
MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m)	// modulus must be odd
4166
	: ModularArithmetic(m),
181 by weidai
changes done for FIPS-140 lab code drop
4167
	  m_u((word)0, m_modulus.reg.size()),
4168
	  m_workspace(5*m_modulus.reg.size())
1 by weidai
Initial revision
4169
{
181 by weidai
changes done for FIPS-140 lab code drop
4170
	if (!m_modulus.IsOdd())
1 by weidai
Initial revision
4171
		throw InvalidArgument("MontgomeryRepresentation: Montgomery representation requires an odd modulus");
4172
181 by weidai
changes done for FIPS-140 lab code drop
4173
	RecursiveInverseModPower2(m_u.reg, m_workspace, m_modulus.reg, m_modulus.reg.size());
1 by weidai
Initial revision
4174
}
4175
4176
const Integer& MontgomeryRepresentation::Multiply(const Integer &a, const Integer &b) const
4177
{
181 by weidai
changes done for FIPS-140 lab code drop
4178
	word *const T = m_workspace.begin();
4179
	word *const R = m_result.reg.begin();
184 by weidai
port to MSVC .NET 2005 beta 2
4180
	const size_t N = m_modulus.reg.size();
1 by weidai
Initial revision
4181
	assert(a.reg.size()<=N && b.reg.size()<=N);
4182
4183
	AsymmetricMultiply(T, T+2*N, a.reg, a.reg.size(), b.reg, b.reg.size());
4184
	SetWords(T+a.reg.size()+b.reg.size(), 0, 2*N-a.reg.size()-b.reg.size());
181 by weidai
changes done for FIPS-140 lab code drop
4185
	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4186
	return m_result;
1 by weidai
Initial revision
4187
}
4188
4189
const Integer& MontgomeryRepresentation::Square(const Integer &a) const
4190
{
181 by weidai
changes done for FIPS-140 lab code drop
4191
	word *const T = m_workspace.begin();
4192
	word *const R = m_result.reg.begin();
184 by weidai
port to MSVC .NET 2005 beta 2
4193
	const size_t N = m_modulus.reg.size();
1 by weidai
Initial revision
4194
	assert(a.reg.size()<=N);
4195
4196
	CryptoPP::Square(T, T+2*N, a.reg, a.reg.size());
4197
	SetWords(T+2*a.reg.size(), 0, 2*N-2*a.reg.size());
181 by weidai
changes done for FIPS-140 lab code drop
4198
	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4199
	return m_result;
1 by weidai
Initial revision
4200
}
4201
4202
Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
4203
{
181 by weidai
changes done for FIPS-140 lab code drop
4204
	word *const T = m_workspace.begin();
4205
	word *const R = m_result.reg.begin();
184 by weidai
port to MSVC .NET 2005 beta 2
4206
	const size_t N = m_modulus.reg.size();
1 by weidai
Initial revision
4207
	assert(a.reg.size()<=N);
4208
4209
	CopyWords(T, a.reg, a.reg.size());
4210
	SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
181 by weidai
changes done for FIPS-140 lab code drop
4211
	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4212
	return m_result;
1 by weidai
Initial revision
4213
}
4214
4215
const Integer& MontgomeryRepresentation::MultiplicativeInverse(const Integer &a) const
4216
{
4217
//	  return (EuclideanMultiplicativeInverse(a, modulus)<<(2*WORD_BITS*modulus.reg.size()))%modulus;
181 by weidai
changes done for FIPS-140 lab code drop
4218
	word *const T = m_workspace.begin();
4219
	word *const R = m_result.reg.begin();
184 by weidai
port to MSVC .NET 2005 beta 2
4220
	const size_t N = m_modulus.reg.size();
1 by weidai
Initial revision
4221
	assert(a.reg.size()<=N);
4222
4223
	CopyWords(T, a.reg, a.reg.size());
4224
	SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
181 by weidai
changes done for FIPS-140 lab code drop
4225
	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4226
	unsigned k = AlmostInverse(R, T, R, N, m_modulus.reg, N);
1 by weidai
Initial revision
4227
4228
//	cout << "k=" << k << " N*32=" << 32*N << endl;
4229
4230
	if (k>N*WORD_BITS)
181 by weidai
changes done for FIPS-140 lab code drop
4231
		DivideByPower2Mod(R, R, k-N*WORD_BITS, m_modulus.reg, N);
1 by weidai
Initial revision
4232
	else
181 by weidai
changes done for FIPS-140 lab code drop
4233
		MultiplyByPower2Mod(R, R, N*WORD_BITS-k, m_modulus.reg, N);
1 by weidai
Initial revision
4234
181 by weidai
changes done for FIPS-140 lab code drop
4235
	return m_result;
1 by weidai
Initial revision
4236
}
4237
4238
NAMESPACE_END
75 by weidai
create DLL version, fix GetNextIV() bug in CTR and OFB modes
4239
4240
#endif