~zooko/cryptopp/trunk : contents of integer.cpp at revision 554

~zooko/cryptopp/trunk : (revision 554)

1 by weidai Initial revision	1	// integer.cpp - written and placed in the public domain by Wei Dai
	2	// contains public domain code contributed by Alister Lee and Leonard Janke
	3
	4	#include "pch.h"
75 by weidai create DLL version, fix GetNextIV() bug in CTR and OFB modes	5
	6	#ifndef CRYPTOPP_IMPORTS
	7
1 by weidai Initial revision	8	#include "integer.h"
	9	#include "modarith.h"
	10	#include "nbtheory.h"
	11	#include "asn.h"
	12	#include "oids.h"
	13	#include "words.h"
	14	#include "algparam.h"
	15	#include "pubkey.h" // for P1363_KDF2
	16	#include "sha.h"
270 by weidai MMX/SSE2 optimizations	17	#include "cpu.h"
1 by weidai Initial revision	18
	19	#include <iostream>
	20
315 by weidai fix compile for x64, DLL and VC 6	21	#if _MSC_VER >= 1400
270 by weidai MMX/SSE2 optimizations	22	#include <intrin.h>
	23	#endif
	24
	25	#ifdef __DECCXX
	26	#include <c_asm.h>
	27	#endif
	28
	29	#ifdef CRYPTOPP_MSVC6_NO_PP
	30	#pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.")
1 by weidai Initial revision	31	#endif
1 by weidai Initial revision	32
532 by noloader Added GCC_DIAGNOSTIC_AWARE to help suppress some warnings on contemporary compilers. The macro was needed to help with managing old compilers, like GCC 4.2.1, present on OpenBSD	33	#if GCC_DIAGNOSTIC_AWARE
	34	# pragma GCC diagnostic ignored "-Wunused-value"
	35	# pragma GCC diagnostic ignored "-Wunused-variable"
	36	#endif
	37
315 by weidai fix compile for x64, DLL and VC 6	38	#define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86)
315 by weidai fix compile for x64, DLL and VC 6	39
1 by weidai Initial revision	40	NAMESPACE_BEGIN(CryptoPP)
1 by weidai Initial revision	41
181 by weidai changes done for FIPS-140 lab code drop	42	bool AssignIntToInteger(const std::type_info &valueType, void pInteger, const void pInt)
27 by weidai various changes for 5.1	43	{
	44	if (valueType != typeid(Integer))
	45	return false;
	46	reinterpret_cast<Integer >(pInteger) = reinterpret_cast<const int >(pInt);
	47	return true;
	48	}
	49
270 by weidai MMX/SSE2 optimizations	50	inline static int Compare(const word A, const word B, size_t N)
1 by weidai Initial revision	51	{
	52	while (N--)
	53	if (A[N] > B[N])
	54	return 1;
	55	else if (A[N] < B[N])
	56	return -1;
	57
	58	return 0;
	59	}
	60
270 by weidai MMX/SSE2 optimizations	61	inline static int Increment(word *A, size_t N, word B=1)
1 by weidai Initial revision	62	{
	63	assert(N);
	64	word t = A[0];
	65	A[0] = t+B;
	66	if (A[0] >= t)
	67	return 0;
	68	for (unsigned i=1; i<N; i++)
	69	if (++A[i])
	70	return 0;
	71	return 1;
	72	}
	73
270 by weidai MMX/SSE2 optimizations	74	inline static int Decrement(word *A, size_t N, word B=1)
1 by weidai Initial revision	75	{
	76	assert(N);
	77	word t = A[0];
	78	A[0] = t-B;
	79	if (A[0] <= t)
	80	return 0;
	81	for (unsigned i=1; i<N; i++)
	82	if (A[i]--)
	83	return 0;
	84	return 1;
	85	}
	86
184 by weidai port to MSVC .NET 2005 beta 2	87	static void TwosComplement(word *A, size_t N)
1 by weidai Initial revision	88	{
	89	Decrement(A, N);
	90	for (unsigned i=0; i<N; i++)
	91	A[i] = ~A[i];
	92	}
	93
100 by weidai fix bugs in 64-bit CPU support	94	static word AtomicInverseModPower2(word A)
	95	{
	96	assert(A%2==1);
	97
	98	word R=A%8;
	99
	100	for (unsigned i=3; i<WORD_BITS; i*=2)
1 by weidai Initial revision	101	R = R(2-RA);
	102
	103	assert(R*A==1);
100 by weidai fix bugs in 64-bit CPU support	104	return R;
	105	}
	106
	107	// ********************************************************
	108
// Two-word ("double word") arithmetic helpers used by the portable code below.
// First branch: a double word named x is kept as two separate word variables
// x0 (low) and x1 (high). Second branch: it is a single native dword.
// Both branches provide the same macro names so callers are written once.
#if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || (defined(__x86_64__) && defined(CRYPTOPP_WORD128_AVAILABLE))
	#define Declare2Words(x) word x##0, x##1;
	#define AssignWord(a, b) a##0 = b; a##1 = 0;
	#define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
	#define LowWord(a) a##0
	#define HighWord(a) a##1
	#ifdef _MSC_VER
		#define MultiplyWordsLoHi(p0, p1, a, b) p0 = _umul128(a, b, &p1);
		#ifndef __INTEL_COMPILER
			#define Double3Words(c, d) d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2;
		#endif
	#elif defined(__DECCXX)
		#define MultiplyWordsLoHi(p0, p1, a, b) p0 = a*b; p1 = asm("umulh %a0, %a1, %v0", a, b);
	#elif defined(__x86_64__)
		#if defined(__SUNPRO_CC) && __SUNPRO_CC < 0x5100
			// Sun Studio's gcc-style inline assembly is heavily bugged as of version 5.9 Patch 124864-09 2008/12/16, but this one works
			#define MultiplyWordsLoHi(p0, p1, a, b) asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "r"(b) : "cc");
		#else
			// Note: this MulAcc expects variables named p0 and p1 to be in scope at the point of use.
			#define MultiplyWordsLoHi(p0, p1, a, b) asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
			#define MulAcc(c, d, a, b) asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
			#define Double3Words(c, d) asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
			#define Acc2WordsBy1(a, b) asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc");
			#define Acc2WordsBy2(a, b) asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
			#define Acc3WordsBy2(c, d, e) asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc");
		#endif
	#endif
	#define MultiplyWords(p, a, b) MultiplyWordsLoHi(p##0, p##1, a, b)
	// Portable fallbacks for the operations no compiler-specific branch defined above.
	#ifndef Double3Words
		// doubles the three-word value {d1, d0, c}, shifting each word's top bit into the next
		#define Double3Words(c, d) d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2;
	#endif
	#ifndef Acc2WordsBy2
		#define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
	#endif
	// u##1 carries the carry/borrow in and out; u##0 receives the result word
	#define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
	#define SubtractWithBorrow(u, a, b) {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
	#define GetCarry(u) u##1
	#define GetBorrow(u) u##1
#else
	#define Declare2Words(x) dword x;
	#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
		#define MultiplyWords(p, a, b) p = __emulu(a, b);
	#else
		#define MultiplyWords(p, a, b) p = (dword)a*b;
	#endif
	#define AssignWord(a, b) a = b;
	#define Add2WordsBy1(a, b, c) a = b + c;
	#define Acc2WordsBy2(a, b) a += b;
	#define LowWord(a) word(a)
	#define HighWord(a) word(a>>WORD_BITS)
	// doubles the three-word value {d, c}; the top bit of c moves into d
	#define Double3Words(c, d) d = 2*d + (c>>(WORD_BITS-1)); c *= 2;
	#define AddWithCarry(u, a, b) u = dword(a) + b + GetCarry(u);
	#define SubtractWithBorrow(u, a, b) u = dword(a) - b - GetBorrow(u);
	#define GetCarry(u) HighWord(u)
	// the borrow is the top bit of the dword after a wrapped subtraction
	#define GetBorrow(u) word(u>>(WORD_BITS*2-1))
#endif
	164	#ifndef MulAcc
	165	#define MulAcc(c, d, a, b) MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p));
	166	#endif
	167	#ifndef Acc2WordsBy1
270 by weidai MMX/SSE2 optimizations	168	#define Acc2WordsBy1(a, b) Add2WordsBy1(a, a, b)
315 by weidai fix compile for x64, DLL and VC 6	169	#endif
	170	#ifndef Acc3WordsBy2
	171	#define Acc3WordsBy2(c, d, e) Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e));
270 by weidai MMX/SSE2 optimizations	172	#endif
270 by weidai MMX/SSE2 optimizations	173
100 by weidai fix bugs in 64-bit CPU support	174	class DWord
	175	{
	176	public:
	177	DWord() {}
	178
	179	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
	180	explicit DWord(word low)
	181	{
	182	m_whole = low;
	183	}
	184	#else
	185	explicit DWord(word low)
	186	{
	187	m_halfs.low = low;
	188	m_halfs.high = 0;
	189	}
	190	#endif
	191
	192	DWord(word low, word high)
	193	{
	194	m_halfs.low = low;
	195	m_halfs.high = high;
	196	}
	197
	198	static DWord Multiply(word a, word b)
	199	{
	200	DWord r;
	201	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
	202	r.m_whole = (dword)a * b;
411 by weidai port to Sun Studio 12's 64-bit C++ Compiler 5.9 Patch 124864-09 2008/12/16	203	#elif defined(MultiplyWordsLoHi)
	204	MultiplyWordsLoHi(r.m_halfs.low, r.m_halfs.high, a, b);
100 by weidai fix bugs in 64-bit CPU support	205	#endif
	206	return r;
	207	}
	208
	209	static DWord MultiplyAndAdd(word a, word b, word c)
	210	{
	211	DWord r = Multiply(a, b);
	212	return r += c;
	213	}
	214
	215	DWord & operator+=(word a)
	216	{
	217	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
	218	m_whole = m_whole + a;
	219	#else
	220	m_halfs.low += a;
	221	m_halfs.high += (m_halfs.low < a);
	222	#endif
	223	return *this;
	224	}
	225
	226	DWord operator+(word a)
	227	{
	228	DWord r;
	229	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
	230	r.m_whole = m_whole + a;
	231	#else
	232	r.m_halfs.low = m_halfs.low + a;
	233	r.m_halfs.high = m_halfs.high + (r.m_halfs.low < a);
	234	#endif
	235	return r;
	236	}
	237
	238	DWord operator-(DWord a)
	239	{
	240	DWord r;
	241	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
	242	r.m_whole = m_whole - a.m_whole;
	243	#else
	244	r.m_halfs.low = m_halfs.low - a.m_halfs.low;
	245	r.m_halfs.high = m_halfs.high - a.m_halfs.high - (r.m_halfs.low > m_halfs.low);
	246	#endif
	247	return r;
	248	}
	249
	250	DWord operator-(word a)
	251	{
	252	DWord r;
	253	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
	254	r.m_whole = m_whole - a;
	255	#else
	256	r.m_halfs.low = m_halfs.low - a;
	257	r.m_halfs.high = m_halfs.high - (r.m_halfs.low > m_halfs.low);
	258	#endif
	259	return r;
	260	}
	261
	262	// returns quotient, which must fit in a word
	263	word operator/(word divisor);
	264
	265	word operator%(word a);
	266
	267	bool operator!() const
	268	{
269	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
270	return !m_whole;
271	#else
272	return !m_halfs.high && !m_halfs.low;
273	#endif
274	}
275
276	word GetLowHalf() const {return m_halfs.low;}
277	word GetHighHalf() const {return m_halfs.high;}
278	word GetHighHalfAsBorrow() const {return 0-m_halfs.high;}
279
280	private:
281	union
282	{
283	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
284	dword m_whole;
285	#endif
286	struct
287	{
288	#ifdef IS_LITTLE_ENDIAN
289	word low;
290	word high;
291	#else
292	word high;
293	word low;
294	#endif
295	} m_halfs;
296	};
297	};
298
299	class Word
300	{
301	public:
302	Word() {}
303
304	Word(word value)
305	{
306	m_whole = value;
307	}
308
309	Word(hword low, hword high)
310	{
311	m_whole = low \| (word(high) << (WORD_BITS/2));
312	}
313
314	static Word Multiply(hword a, hword b)
315	{
316	Word r;
317	r.m_whole = (word)a * b;
318	return r;
319	}
320
321	Word operator-(Word a)
322	{
323	Word r;
324	r.m_whole = m_whole - a.m_whole;
325	return r;
326	}
327
328	Word operator-(hword a)
329	{
330	Word r;
331	r.m_whole = m_whole - a;
332	return r;
333	}
334
335	// returns quotient, which must fit in a word
336	hword operator/(hword divisor)
337	{
338	return hword(m_whole / divisor);
339	}
340
341	bool operator!() const
342	{
343	return !m_whole;
344	}
345
346	word GetWhole() const {return m_whole;}
347	hword GetLowHalf() const {return hword(m_whole);}
348	hword GetHighHalf() const {return hword(m_whole>>(WORD_BITS/2));}
349	hword GetHighHalfAsBorrow() const {return 0-hword(m_whole>>(WORD_BITS/2));}
350
351	private:
352	word m_whole;
353	};
354
// Do a 3 word by 2 word divide.
//   A      - dividend, three S-sized words, least significant first; on
//            return A[0..1] hold the remainder and A[2] is zero
//   B0, B1 - low and high words of the divisor
//   dummy  - unused; lets callers select D without naming it explicitly
// S is the single-word type, D the matching double-word helper class.
// Returns the quotient. Precondition: {A[2],A[1]} < {B1,B0}, so it fits in an S.
template <class S, class D>
S DivideThreeWordsByTwo(S *A, S B0, S B1, D *dummy=NULL)
{
	// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a S
	assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));

	// estimate the quotient: do a 2 S by 1 S divide
	// (dividing by B1+1 can only under-estimate; B1+1 == 0 means B1 is all ones)
	S Q;
	if (S(B1+1) == 0)
		Q = A[2];
	else if (B1 > 0)
		Q = D(A[1], A[2]) / S(B1+1);
	else
		Q = D(A[0], A[1]) / B0;

	// now subtract Q*B from A
	D p = D::Multiply(B0, Q);
	D u = (D) A[0] - p.GetLowHalf();
	A[0] = u.GetLowHalf();
	u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - D::Multiply(B1, Q);
	A[1] = u.GetLowHalf();
	A[2] += u.GetHighHalf();

	// Q <= actual quotient, so fix it
	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
	{
		u = (D) A[0] - B0;
		A[0] = u.GetLowHalf();
		u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
		A[1] = u.GetLowHalf();
		A[2] += u.GetHighHalf();
		Q++;
		assert(Q);	// shouldn't overflow
	}

	return Q;
}
	393
	394	// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
	395	template <class S, class D>
	396	inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
	397	{
	398	if (!B) // if divisor is 0, we assume divisor==2*(2WORD_BITS)
	399	return D(Ah.GetLowHalf(), Ah.GetHighHalf());
	400	else
	401	{
	402	S Q[2];
	403	T[0] = Al.GetLowHalf();
	404	T[1] = Al.GetHighHalf();
	405	T[2] = Ah.GetLowHalf();
	406	T[3] = Ah.GetHighHalf();
	407	Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(), B.GetHighHalf());
	408	Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
	409	return D(Q[0], Q[1]);
	410	}
	411	}
	412
	413	// returns quotient, which must fit in a word
	414	inline word DWord::operator/(word a)
	415	{
	416	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
	417	return word(m_whole / a);
	418	#else
	419	hword r[4];
	420	return DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a).GetWhole();
	421	#endif
	422	}
	423
	424	inline word DWord::operator%(word a)
	425	{
	426	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
	427	return word(m_whole % a);
	428	#else
	429	if (a < (word(1) << (WORD_BITS/2)))
	430	{
	431	hword h = hword(a);
	432	word r = m_halfs.high % h;
	433	r = ((m_halfs.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
434	return hword((hword(m_halfs.low) + (r << (WORD_BITS/2))) % h);
435	}
436	else
437	{
438	hword r[4];
439	DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a);
440	return Word(r[0], r[1]).GetWhole();
441	}
442	#endif
1 by weidai Initial revision	443	}
	444
	445	// ********************************************************
	446
// use some tricks to share assembly code between MSVC and GCC
//
// Each Xxx{Prologue,Epilogue} pair brackets an assembly body written with the
// AS1/AS2/ASJ/ASL macros (defined elsewhere). For GCC the prologue opens an
// __asm__ statement in Intel syntax and the epilogue closes it, binding the
// C++ variables A, B, C, N, L and s_maskLow16 to registers through the operand
// constraints. For other compilers the registers are loaded by explicit
// instructions instead.
#if defined(__GNUC__)
	// Add/Sub: the body leaves its result in eax, which the epilogue returns.
	#define AddPrologue \
		int result; \
		__asm__ __volatile__ \
		( \
			".intel_syntax noprefix;"
	#define AddEpilogue \
			".att_syntax prefix;" \
			: "=a" (result)\
			: "d" (C), "a" (A), "D" (B), "c" (N) \
			: "%esi", "memory", "cc" \
		);\
		return result;
	// Mul/Squ/Top: ebx is saved by hand and loaded from edx (s_maskLow16).
	#define MulPrologue \
		__asm__ __volatile__ \
		( \
			".intel_syntax noprefix;" \
			AS1( push ebx) \
			AS2( mov ebx, edx)
	#define MulEpilogue \
			AS1( pop ebx) \
			".att_syntax prefix;" \
			: \
			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \
			: "%esi", "memory", "cc" \
		);
	#define SquPrologue MulPrologue
	#define SquEpilogue \
			AS1( pop ebx) \
			".att_syntax prefix;" \
			: \
			: "d" (s_maskLow16), "c" (C), "a" (A) \
			: "%esi", "%edi", "memory", "cc" \
		);
	#define TopPrologue MulPrologue
	#define TopEpilogue \
			AS1( pop ebx) \
			".att_syntax prefix;" \
			: \
			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \
			: "memory", "cc" \
		);
#else
	// Add/Sub: save edi/esi, then load two values from the stack into eax and
	// edi; the epilogue restores the registers and returns, popping 8 bytes.
	#define AddPrologue \
		__asm push edi \
		__asm push esi \
		__asm mov eax, [esp+12] \
		__asm mov edi, [esp+16]
	#define AddEpilogue \
		__asm pop esi \
		__asm pop edi \
		__asm ret 8
#if _MSC_VER < 1300
	// compilers older than _MSC_VER 1300 get an explicit ebx save/restore
	#define SaveEBX __asm push ebx
	#define RestoreEBX __asm pop ebx
#else
	#define SaveEBX
	#define RestoreEBX
#endif
	#define SquPrologue \
		AS2( mov eax, A) \
		AS2( mov ecx, C) \
		SaveEBX \
		AS2( lea ebx, s_maskLow16)
	#define MulPrologue \
		AS2( mov eax, A) \
		AS2( mov edi, B) \
		AS2( mov ecx, C) \
		SaveEBX \
		AS2( lea ebx, s_maskLow16)
	#define TopPrologue \
		AS2( mov eax, A) \
		AS2( mov edi, B) \
		AS2( mov ecx, C) \
		AS2( mov esi, L) \
		SaveEBX \
		AS2( lea ebx, s_maskLow16)
	#define SquEpilogue RestoreEBX
	#define MulEpilogue RestoreEBX
	#define TopEpilogue RestoreEBX
#endif
270 by weidai MMX/SSE2 optimizations	529
315 by weidai fix compile for x64, DLL and VC 6	530	#ifdef CRYPTOPP_X64_MASM_AVAILABLE
270 by weidai MMX/SSE2 optimizations	531	extern "C" {
351 by weidai revert to int return value for Add and Sub	532	int Baseline_Add(size_t N, word C, const word A, const word *B);
351 by weidai revert to int return value for Add and Sub	533	int Baseline_Sub(size_t N, word C, const word A, const word *B);
315 by weidai fix compile for x64, DLL and VC 6	534	}
432 by weidai fix compile on OpenBSD 4.4	535	#elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__) && defined(CRYPTOPP_WORD128_AVAILABLE)
351 by weidai revert to int return value for Add and Sub	536	int Baseline_Add(size_t N, word C, const word A, const word *B)
315 by weidai fix compile for x64, DLL and VC 6	537	{
	538	word result;
	539	__asm__ __volatile__
	540	(
	541	".intel_syntax;"
	542	AS1( neg %1)
	543	ASJ( jz, 1, f)
	544	AS2( mov %0,[%3+8*%1])
	545	AS2( add %0,[%4+8*%1])
	546	AS2( mov [%2+8*%1],%0)
	547	ASL(0)
	548	AS2( mov %0,[%3+8*%1+8])
	549	AS2( adc %0,[%4+8*%1+8])
	550	AS2( mov [%2+8*%1+8],%0)
	551	AS2( lea %1,[%1+2])
	552	ASJ( jrcxz, 1, f)
	553	AS2( mov %0,[%3+8*%1])
	554	AS2( adc %0,[%4+8*%1])
	555	AS2( mov [%2+8*%1],%0)
	556	ASJ( jmp, 0, b)
	557	ASL(1)
	558	AS2( mov %0, 0)
	559	AS2( adc %0, %0)
	560	".att_syntax;"
402 by weidai fixes for GCC 4.3.2 (reports from Chris Morgan and DiegoT)	561	: "=&r" (result), "+c" (N)
	562	: "r" (C+N), "r" (A+N), "r" (B+N)
315 by weidai fix compile for x64, DLL and VC 6	563	: "memory", "cc"
315 by weidai fix compile for x64, DLL and VC 6	564	);
351 by weidai revert to int return value for Add and Sub	565	return (int)result;
315 by weidai fix compile for x64, DLL and VC 6	566	}
315 by weidai fix compile for x64, DLL and VC 6	567
351 by weidai revert to int return value for Add and Sub	568	int Baseline_Sub(size_t N, word C, const word A, const word *B)
315 by weidai fix compile for x64, DLL and VC 6	569	{
	570	word result;
	571	__asm__ __volatile__
	572	(
	573	".intel_syntax;"
	574	AS1( neg %1)
	575	ASJ( jz, 1, f)
	576	AS2( mov %0,[%3+8*%1])
	577	AS2( sub %0,[%4+8*%1])
	578	AS2( mov [%2+8*%1],%0)
	579	ASL(0)
	580	AS2( mov %0,[%3+8*%1+8])
	581	AS2( sbb %0,[%4+8*%1+8])
	582	AS2( mov [%2+8*%1+8],%0)
	583	AS2( lea %1,[%1+2])
	584	ASJ( jrcxz, 1, f)
	585	AS2( mov %0,[%3+8*%1])
	586	AS2( sbb %0,[%4+8*%1])
	587	AS2( mov [%2+8*%1],%0)
	588	ASJ( jmp, 0, b)
	589	ASL(1)
	590	AS2( mov %0, 0)
	591	AS2( adc %0, %0)
	592	".att_syntax;"
402 by weidai fixes for GCC 4.3.2 (reports from Chris Morgan and DiegoT)	593	: "=&r" (result), "+c" (N)
	594	: "r" (C+N), "r" (A+N), "r" (B+N)
315 by weidai fix compile for x64, DLL and VC 6	595	: "memory", "cc"
315 by weidai fix compile for x64, DLL and VC 6	596	);
351 by weidai revert to int return value for Add and Sub	597	return (int)result;
315 by weidai fix compile for x64, DLL and VC 6	598	}
315 by weidai fix compile for x64, DLL and VC 6	599	#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
351 by weidai revert to int return value for Add and Sub	600	CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word C, const word A, const word *B)
270 by weidai MMX/SSE2 optimizations	601	{
	602	AddPrologue
	603
307 by weidai fix compile with Intel compiler	604	// now: eax = A, edi = B, edx = C, ecx = N
270 by weidai MMX/SSE2 optimizations	605	AS2( lea eax, [eax+4*ecx])
307 by weidai fix compile with Intel compiler	606	AS2( lea edi, [edi+4*ecx])
270 by weidai MMX/SSE2 optimizations	607	AS2( lea edx, [edx+4*ecx])
	608
	609	AS1( neg ecx) // ecx is negative index
	610	AS2( test ecx, 2) // this clears carry flag
	611	ASJ( jz, 0, f)
	612	AS2( sub ecx, 2)
	613	ASJ( jmp, 1, f)
	614
	615	ASL(0)
	616	ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero
	617	AS2( mov esi,[eax+4*ecx])
307 by weidai fix compile with Intel compiler	618	AS2( adc esi,[edi+4*ecx])
270 by weidai MMX/SSE2 optimizations	619	AS2( mov [edx+4*ecx],esi)
270 by weidai MMX/SSE2 optimizations	620	AS2( mov esi,[eax+4*ecx+4])
307 by weidai fix compile with Intel compiler	621	AS2( adc esi,[edi+4*ecx+4])
270 by weidai MMX/SSE2 optimizations	622	AS2( mov [edx+4*ecx+4],esi)
	623	ASL(1)
	624	AS2( mov esi,[eax+4*ecx+8])
307 by weidai fix compile with Intel compiler	625	AS2( adc esi,[edi+4*ecx+8])
270 by weidai MMX/SSE2 optimizations	626	AS2( mov [edx+4*ecx+8],esi)
270 by weidai MMX/SSE2 optimizations	627	AS2( mov esi,[eax+4*ecx+12])
307 by weidai fix compile with Intel compiler	628	AS2( adc esi,[edi+4*ecx+12])
270 by weidai MMX/SSE2 optimizations	629	AS2( mov [edx+4*ecx+12],esi)
	630
	631	AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2
	632	ASJ( jmp, 0, b)
	633
	634	ASL(2)
	635	AS2( mov eax, 0)
	636	AS1( setc al) // store carry into eax (return result register)
	637
	638	AddEpilogue
	639	}
	640
351 by weidai revert to int return value for Add and Sub	641	CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word C, const word A, const word *B)
270 by weidai MMX/SSE2 optimizations	642	{
	643	AddPrologue
	644
307 by weidai fix compile with Intel compiler	645	// now: eax = A, edi = B, edx = C, ecx = N
270 by weidai MMX/SSE2 optimizations	646	AS2( lea eax, [eax+4*ecx])
307 by weidai fix compile with Intel compiler	647	AS2( lea edi, [edi+4*ecx])
270 by weidai MMX/SSE2 optimizations	648	AS2( lea edx, [edx+4*ecx])
	649
	650	AS1( neg ecx) // ecx is negative index
	651	AS2( test ecx, 2) // this clears carry flag
	652	ASJ( jz, 0, f)
	653	AS2( sub ecx, 2)
	654	ASJ( jmp, 1, f)
	655
	656	ASL(0)
	657	ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero
	658	AS2( mov esi,[eax+4*ecx])
307 by weidai fix compile with Intel compiler	659	AS2( sbb esi,[edi+4*ecx])
270 by weidai MMX/SSE2 optimizations	660	AS2( mov [edx+4*ecx],esi)
270 by weidai MMX/SSE2 optimizations	661	AS2( mov esi,[eax+4*ecx+4])
307 by weidai fix compile with Intel compiler	662	AS2( sbb esi,[edi+4*ecx+4])
270 by weidai MMX/SSE2 optimizations	663	AS2( mov [edx+4*ecx+4],esi)
	664	ASL(1)
	665	AS2( mov esi,[eax+4*ecx+8])
307 by weidai fix compile with Intel compiler	666	AS2( sbb esi,[edi+4*ecx+8])
270 by weidai MMX/SSE2 optimizations	667	AS2( mov [edx+4*ecx+8],esi)
270 by weidai MMX/SSE2 optimizations	668	AS2( mov esi,[eax+4*ecx+12])
307 by weidai fix compile with Intel compiler	669	AS2( sbb esi,[edi+4*ecx+12])
270 by weidai MMX/SSE2 optimizations	670	AS2( mov [edx+4*ecx+12],esi)
	671
	672	AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2
	673	ASJ( jmp, 0, b)
	674
	675	ASL(2)
	676	AS2( mov eax, 0)
	677	AS1( setc al) // store carry into eax (return result register)
	678
	679	AddEpilogue
	680	}
	681
315 by weidai fix compile for x64, DLL and VC 6	682	#if CRYPTOPP_INTEGER_SSE2
351 by weidai revert to int return value for Add and Sub	683	CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word C, const word A, const word *B)
270 by weidai MMX/SSE2 optimizations	684	{
	685	AddPrologue
	686
307 by weidai fix compile with Intel compiler	687	// now: eax = A, edi = B, edx = C, ecx = N
270 by weidai MMX/SSE2 optimizations	688	AS2( lea eax, [eax+4*ecx])
307 by weidai fix compile with Intel compiler	689	AS2( lea edi, [edi+4*ecx])
270 by weidai MMX/SSE2 optimizations	690	AS2( lea edx, [edx+4*ecx])
	691
	692	AS1( neg ecx) // ecx is negative index
	693	AS2( pxor mm2, mm2)
	694	ASJ( jz, 2, f)
	695	AS2( test ecx, 2) // this clears carry flag
	696	ASJ( jz, 0, f)
	697	AS2( sub ecx, 2)
	698	ASJ( jmp, 1, f)
	699
	700	ASL(0)
	701	AS2( movd mm0, DWORD PTR [eax+4*ecx])
307 by weidai fix compile with Intel compiler	702	AS2( movd mm1, DWORD PTR [edi+4*ecx])
270 by weidai MMX/SSE2 optimizations	703	AS2( paddq mm0, mm1)
	704	AS2( paddq mm2, mm0)
	705	AS2( movd DWORD PTR [edx+4*ecx], mm2)
	706	AS2( psrlq mm2, 32)
	707
	708	AS2( movd mm0, DWORD PTR [eax+4*ecx+4])
307 by weidai fix compile with Intel compiler	709	AS2( movd mm1, DWORD PTR [edi+4*ecx+4])
270 by weidai MMX/SSE2 optimizations	710	AS2( paddq mm0, mm1)
	711	AS2( paddq mm2, mm0)
	712	AS2( movd DWORD PTR [edx+4*ecx+4], mm2)
	713	AS2( psrlq mm2, 32)
	714
	715	ASL(1)
	716	AS2( movd mm0, DWORD PTR [eax+4*ecx+8])
307 by weidai fix compile with Intel compiler	717	AS2( movd mm1, DWORD PTR [edi+4*ecx+8])
270 by weidai MMX/SSE2 optimizations	718	AS2( paddq mm0, mm1)
	719	AS2( paddq mm2, mm0)
	720	AS2( movd DWORD PTR [edx+4*ecx+8], mm2)
	721	AS2( psrlq mm2, 32)
	722
	723	AS2( movd mm0, DWORD PTR [eax+4*ecx+12])
307 by weidai fix compile with Intel compiler	724	AS2( movd mm1, DWORD PTR [edi+4*ecx+12])
270 by weidai MMX/SSE2 optimizations	725	AS2( paddq mm0, mm1)
	726	AS2( paddq mm2, mm0)
	727	AS2( movd DWORD PTR [edx+4*ecx+12], mm2)
	728	AS2( psrlq mm2, 32)
	729
	730	AS2( add ecx, 4)
	731	ASJ( jnz, 0, b)
	732
	733	ASL(2)
	734	AS2( movd eax, mm2)
	735	AS1( emms)
	736
	737	AddEpilogue
	738	}
351 by weidai revert to int return value for Add and Sub	739	CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word C, const word A, const word *B)
270 by weidai MMX/SSE2 optimizations	740	{
	741	AddPrologue
	742
307 by weidai fix compile with Intel compiler	743	// now: eax = A, edi = B, edx = C, ecx = N
270 by weidai MMX/SSE2 optimizations	744	AS2( lea eax, [eax+4*ecx])
307 by weidai fix compile with Intel compiler	745	AS2( lea edi, [edi+4*ecx])
270 by weidai MMX/SSE2 optimizations	746	AS2( lea edx, [edx+4*ecx])
	747
	748	AS1( neg ecx) // ecx is negative index
	749	AS2( pxor mm2, mm2)
	750	ASJ( jz, 2, f)
	751	AS2( test ecx, 2) // this clears carry flag
	752	ASJ( jz, 0, f)
	753	AS2( sub ecx, 2)
	754	ASJ( jmp, 1, f)
	755
	756	ASL(0)
	757	AS2( movd mm0, DWORD PTR [eax+4*ecx])
307 by weidai fix compile with Intel compiler	758	AS2( movd mm1, DWORD PTR [edi+4*ecx])
270 by weidai MMX/SSE2 optimizations	759	AS2( psubq mm0, mm1)
	760	AS2( psubq mm0, mm2)
	761	AS2( movd DWORD PTR [edx+4*ecx], mm0)
	762	AS2( psrlq mm0, 63)
	763
	764	AS2( movd mm2, DWORD PTR [eax+4*ecx+4])
307 by weidai fix compile with Intel compiler	765	AS2( movd mm1, DWORD PTR [edi+4*ecx+4])
270 by weidai MMX/SSE2 optimizations	766	AS2( psubq mm2, mm1)
	767	AS2( psubq mm2, mm0)
	768	AS2( movd DWORD PTR [edx+4*ecx+4], mm2)
	769	AS2( psrlq mm2, 63)
	770
	771	ASL(1)
	772	AS2( movd mm0, DWORD PTR [eax+4*ecx+8])
307 by weidai fix compile with Intel compiler	773	AS2( movd mm1, DWORD PTR [edi+4*ecx+8])
270 by weidai MMX/SSE2 optimizations	774	AS2( psubq mm0, mm1)
	775	AS2( psubq mm0, mm2)
	776	AS2( movd DWORD PTR [edx+4*ecx+8], mm0)
	777	AS2( psrlq mm0, 63)
	778
	779	AS2( movd mm2, DWORD PTR [eax+4*ecx+12])
307 by weidai fix compile with Intel compiler	780	AS2( movd mm1, DWORD PTR [edi+4*ecx+12])
270 by weidai MMX/SSE2 optimizations	781	AS2( psubq mm2, mm1)
	782	AS2( psubq mm2, mm0)
	783	AS2( movd DWORD PTR [edx+4*ecx+12], mm2)
	784	AS2( psrlq mm2, 63)
	785
	786	AS2( add ecx, 4)
	787	ASJ( jnz, 0, b)
	788
	789	ASL(2)
	790	AS2( movd eax, mm2)
	791	AS1( emms)
	792
	793	AddEpilogue
	794	}
307 by weidai fix compile with Intel compiler	795	#endif // #if CRYPTOPP_INTEGER_SSE2
270 by weidai MMX/SSE2 optimizations	796	#else
351 by weidai revert to int return value for Add and Sub	797	int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word C, const word A, const word *B)
270 by weidai MMX/SSE2 optimizations	798	{
	799	assert (N%2 == 0);
	800
	801	Declare2Words(u);
308 by weidai fix compile on Sun CC	802	AssignWord(u, 0);
270 by weidai MMX/SSE2 optimizations	803	for (size_t i=0; i<N; i+=2)
	804	{
	805	AddWithCarry(u, A[i], B[i]);
	806	C[i] = LowWord(u);
	807	AddWithCarry(u, A[i+1], B[i+1]);
	808	C[i+1] = LowWord(u);
	809	}
	810	return int(GetCarry(u));
	811	}
	812
351 by weidai revert to int return value for Add and Sub	813	int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word C, const word A, const word *B)
270 by weidai MMX/SSE2 optimizations	814	{
	815	assert (N%2 == 0);
	816
	817	Declare2Words(u);
308 by weidai fix compile on Sun CC	818	AssignWord(u, 0);
270 by weidai MMX/SSE2 optimizations	819	for (size_t i=0; i<N; i+=2)
	820	{
	821	SubtractWithBorrow(u, A[i], B[i]);
	822	C[i] = LowWord(u);
	823	SubtractWithBorrow(u, A[i+1], B[i+1]);
	824	C[i+1] = LowWord(u);
	825	}
	826	return int(GetBorrow(u));
	827	}
	828	#endif
	829
	830	static word LinearMultiply(word C, const word A, word B, size_t N)
	831	{
	832	word carry=0;
	833	for(unsigned i=0; i<N; i++)
	834	{
	835	Declare2Words(p);
	836	MultiplyWords(p, A[i], B);
	837	Acc2WordsBy1(p, carry);
	838	C[i] = LowWord(p);
	839	carry = HighWord(p);
	840	}
	841	return carry;
	842	}
	843
363 by weidai fix possible branch prediction analysis (BPA) vulnerability	844	#ifndef CRYPTOPP_DOXYGEN_PROCESSING
	845
// Step sequence for a 2-word multiplication, written in terms of
// Mul_Begin / Mul_SaveAcc / Mul_Acc / Mul_End, which are defined elsewhere.
// Within each row the two index arguments of every step sum to the same value.
#define Mul_2 \
	Mul_Begin(2) \
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
	Mul_End(1, 1)
270 by weidai MMX/SSE2 optimizations	850
	851	#define Mul_4 \
	852	Mul_Begin(4) \
	853	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
	854	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
	855	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
	856	Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
	857	Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
315 by weidai fix compile for x64, DLL and VC 6	858	Mul_End(5, 3)
270 by weidai MMX/SSE2 optimizations	859
	// Mul_8: straight-line 8-word by 8-word product. Each row opens with
	// Mul_SaveAcc(k, i, j) and lists every A[i]*B[j] pair with i + j == k + 1;
	// Mul_End(13, 7) closes the table with the A[7]*B[7] term.
	860	#define Mul_8 \
	861	Mul_Begin(8) \
	862	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
	863	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
	864	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
	865	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
	866	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
	867	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
	868	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
	869	Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
	870	Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
	871	Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
	872	Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
	873	Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
	874	Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
315 by weidai fix compile for x64, DLL and VC 6	875	Mul_End(13, 7)
270 by weidai MMX/SSE2 optimizations	876
	// Mul_16: straight-line 16-word by 16-word product. Each row opens with
	// Mul_SaveAcc(k, i, j) and lists every A[i]*B[j] pair with i + j == k + 1;
	// Mul_End(29, 15) closes the table with the A[15]*B[15] term.
	877	#define Mul_16 \
	878	Mul_Begin(16) \
	879	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
	880	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
	881	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
	882	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
	883	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
	884	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
	885	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
	886	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
	887	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
	888	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
	889	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
	890	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
	891	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
	892	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
	893	Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
	894	Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
	895	Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
	896	Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
	897	Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
	898	Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
	899	Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
	900	Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
	901	Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
	902	Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
	903	Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
	904	Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
	905	Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
	906	Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
	907	Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
315 by weidai fix compile for x64, DLL and VC 6	908	Mul_End(29, 15)
270 by weidai MMX/SSE2 optimizations	909
	// Squ_2: straight-line square of a 2-word operand; consists only of the
	// Squ_Begin and Squ_End step macros.
	910	#define Squ_2 \
	911	Squ_Begin(2) \
	912	Squ_End(2)
	913
	// Squ_4: straight-line square of a 4-word operand. Each row opens with
	// Squ_SaveAcc(k, i, j) and lists the A[i]*A[j] pairs with i < j and
	// i + j == k + 1. A row ends with Squ_Diag(m) when the column has a square
	// term A[m]*A[m] (2*m == k + 1), otherwise with Squ_NonDiag.
	914	#define Squ_4 \
	915	Squ_Begin(4) \
	916	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
	917	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
	918	Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \
	919	Squ_SaveAcc(4, 2, 3) Squ_NonDiag \
	920	Squ_End(4)
	921
	// Squ_8: straight-line square of an 8-word operand. Each row opens with
	// Squ_SaveAcc(k, i, j) and lists the A[i]*A[j] pairs with i < j and
	// i + j == k + 1. A row ends with Squ_Diag(m) when the column has a square
	// term A[m]*A[m] (2*m == k + 1), otherwise with Squ_NonDiag.
	922	#define Squ_8 \
	923	Squ_Begin(8) \
	924	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
	925	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
	926	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
	927	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
	928	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
	929	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
	930	Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
	931	Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
	932	Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
	933	Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \
	934	Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \
	935	Squ_SaveAcc(12, 6, 7) Squ_NonDiag \
	936	Squ_End(8)
	937
	// Squ_16: straight-line square of a 16-word operand. Each row opens with
	// Squ_SaveAcc(k, i, j) and lists the A[i]*A[j] pairs with i < j and
	// i + j == k + 1. A row ends with Squ_Diag(m) when the column has a square
	// term A[m]*A[m] (2*m == k + 1), otherwise with Squ_NonDiag.
	938	#define Squ_16 \
	939	Squ_Begin(16) \
	940	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
	941	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
	942	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
	943	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
	944	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
	945	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
	946	Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
	947	Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
	948	Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
	949	Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \
	950	Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \
	951	Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \
	952	Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \
	953	Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \
	954	Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \
	955	Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \
	956	Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \
	957	Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \
	958	Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \
	959	Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \
	960	Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \
	961	Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \
	962	Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \
	963	Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \
	964	Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \
	965	Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \
	966	Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \
	967	Squ_SaveAcc(28, 14, 15) Squ_NonDiag \
	968	Squ_End(16)
	969
	// Bot_2: low 2 words of a 2-word by 2-word product. The final column uses
	// the Bot_* steps, which keep only a single-word sum; Bot_End(2) stores it
	// in R[1].
	970	#define Bot_2 \
	971	Mul_Begin(2) \
	972	Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \
973	Bot_End(2)
974
// Bot_4: low 4 words of a 4-word by 4-word product. Columns 1 and 2 use the
// full Mul_* steps; the last column uses the single-word Bot_* steps and
// Bot_End(4) stores it in R[3]. The second row lists its pairs in the order
// (2,0) (1,1) (0,2); all three still satisfy i + j == 2.
975	#define Bot_4 \
976	Mul_Begin(4) \
977	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
978	Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2) \
979	Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0) \
980	Bot_End(4)
981
// Bot_8: low 8 words of an 8-word by 8-word product. Columns 1..6 use the
// full Mul_* steps (pairs with i + j == k + 1); the last column uses the
// single-word Bot_* steps and Bot_End(8) stores it in R[7].
982	#define Bot_8 \
983	Mul_Begin(8) \
984	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
985	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
986	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
987	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
988	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
989	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
990	Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \
991	Bot_End(8)
992
// Bot_16: low 16 words of a 16-word by 16-word product. Columns 1..14 use the
// full Mul_* steps (pairs with i + j == k + 1); the last column uses the
// single-word Bot_* steps and Bot_End(16) stores it in R[15].
993	#define Bot_16 \
994	Mul_Begin(16) \
995	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
996	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
997	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
998	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
999	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
1000	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
1001	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1002	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
1003	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
1004	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
1005	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
1006	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
1007	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
1008	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
1009	Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
1010	Bot_End(16)
363 by weidai fix possible branch prediction analysis (BPA) vulnerability	1011
	1012	#endif
270 by weidai MMX/SSE2 optimizations	1013
// Step macros expanded by the Mul_N / Bot_N tables. They read operands A and B
// and write result words to R, all taken from the enclosing function.
// The "#if 0" branch is compiled out. Its Mul_End takes one argument while the
// tables invoke Mul_End(k, i), so only the "#else" branch matches the tables.
315 by weidai fix compile for x64, DLL and VC 6	1014	#if 0
270 by weidai MMX/SSE2 optimizations	1015	#define Mul_Begin(n) \
	1016	Declare2Words(p) \
	1017	Declare2Words(c) \
	1018	Declare2Words(d) \
	1019	MultiplyWords(p, A[0], B[0]) \
	1020	AssignWord(c, LowWord(p)) \
	1021	AssignWord(d, HighWord(p))
	1022
	1023	#define Mul_Acc(i, j) \
	1024	MultiplyWords(p, A[i], B[j]) \
	1025	Acc2WordsBy1(c, LowWord(p)) \
	1026	Acc2WordsBy1(d, HighWord(p))
	1027
	1028	#define Mul_SaveAcc(k, i, j) \
	1029	R[k] = LowWord(c); \
	1030	Add2WordsBy1(c, d, HighWord(c)) \
	1031	MultiplyWords(p, A[i], B[j]) \
	1032	AssignWord(d, HighWord(p)) \
	1033	Acc2WordsBy1(c, LowWord(p))
	1034
	1035	#define Mul_End(n) \
	1036	R[2*n-3] = LowWord(c); \
	1037	Acc2WordsBy1(d, HighWord(c)) \
	1038	MultiplyWords(p, A[n-1], B[n-1])\
	1039	Acc2WordsBy2(d, p) \
	1040	R[2*n-2] = LowWord(d); \
	1041	R[2*n-1] = HighWord(d);
	1042
	1043	#define Bot_SaveAcc(k, i, j) \
	1044	R[k] = LowWord(c); \
	1045	word e = LowWord(d) + HighWord(c); \
	1046	e += A[i] * B[j];
	1047
	1048	#define Bot_Acc(i, j) \
	1049	e += A[i] * B[j];
	1050
	1051	#define Bot_End(n) \
	1052	R[n-1] = e;
315 by weidai fix compile for x64, DLL and VC 6	1053	#else
// Mul_Begin(n): declares the double-word temporaries p and d and the single
// word c, then seeds them with A[0]*B[0] (c = low word, d = high word).
// The argument n is not used in this branch.
270 by weidai MMX/SSE2 optimizations	1054	#define Mul_Begin(n) \
	1055	Declare2Words(p) \
	1056	word c; \
	1057	Declare2Words(d) \
	1058	MultiplyWords(p, A[0], B[0]) \
	1059	c = LowWord(p); \
	1060	AssignWord(d, HighWord(p))
	1061
// Mul_Acc(i, j): hands A[i] and B[j] to MulAcc together with the (c, d)
// accumulator. MulAcc is defined elsewhere; presumably it adds the product
// into the accumulator.
	1062	#define Mul_Acc(i, j) \
315 by weidai fix compile for x64, DLL and VC 6	1063	MulAcc(c, d, A[i], B[j])
270 by weidai MMX/SSE2 optimizations	1064
// Mul_SaveAcc(k, i, j): stores the finished word c in R[k], shifts the
// accumulator down one word (c = low word of d, d = high word of d), then
// starts the next column with A[i]*B[j].
	1065	#define Mul_SaveAcc(k, i, j) \
	1066	R[k] = c; \
315 by weidai fix compile for x64, DLL and VC 6	1067	c = LowWord(d); \
270 by weidai MMX/SSE2 optimizations	1068	AssignWord(d, HighWord(d)) \
315 by weidai fix compile for x64, DLL and VC 6	1069	MulAcc(c, d, A[i], B[j])
270 by weidai MMX/SSE2 optimizations	1070
// Mul_End(k, i): stores c in R[k], forms A[i]*B[i] in p, combines it with d
// through Acc2WordsBy2, and stores the two words of p in R[k+1] and R[k+2].
315 by weidai fix compile for x64, DLL and VC 6	1071	#define Mul_End(k, i) \
	1072	R[k] = c; \
	1073	MultiplyWords(p, A[i], B[i]) \
	1074	Acc2WordsBy2(p, d) \
	1075	R[k+1] = LowWord(p); \
	1076	R[k+2] = HighWord(p);
270 by weidai MMX/SSE2 optimizations	1077
// Bot_SaveAcc(k, i, j): stores c in R[k], then restarts c from the low word
// of d plus the single-word product A[i] * B[j]. No high word is kept.
	1078	#define Bot_SaveAcc(k, i, j) \
	1079	R[k] = c; \
	1080	c = LowWord(d); \
	1081	c += A[i] * B[j];
	1082
// Bot_Acc(i, j): adds the single-word product A[i] * B[j] to c.
	1083	#define Bot_Acc(i, j) \
	1084	c += A[i] * B[j];
	1085
// Bot_End(n): stores the final word c in R[n-1].
	1086	#define Bot_End(n) \
	1087	R[n-1] = c;
315 by weidai fix compile for x64, DLL and VC 6	1088	#endif
270 by weidai MMX/SSE2 optimizations	1089
	1090	#define Squ_Begin(n) \
	1091	Declare2Words(p) \
315 by weidai fix compile for x64, DLL and VC 6	1092	word c; \
270 by weidai MMX/SSE2 optimizations	1093	Declare2Words(d) \
	1094	Declare2Words(e) \
	1095	MultiplyWords(p, A[0], A[0]) \
	1096	R[0] = LowWord(p); \
	1097	AssignWord(e, HighWord(p)) \
	1098	MultiplyWords(p, A[0], A[1]) \
315 by weidai fix compile for x64, DLL and VC 6	1099	c = LowWord(p); \
270 by weidai MMX/SSE2 optimizations	1100	AssignWord(d, HighWord(p)) \
	1101	Squ_NonDiag \
	1102
	1103	#define Squ_NonDiag \
315 by weidai fix compile for x64, DLL and VC 6	1104	Double3Words(c, d)
270 by weidai MMX/SSE2 optimizations	1105
270 by weidai MMX/SSE2 optimizations	1106	#define Squ_SaveAcc(k, i, j) \
315 by weidai fix compile for x64, DLL and VC 6	1107	Acc3WordsBy2(c, d, e) \
315 by weidai fix compile for x64, DLL and VC 6	1108	R[k] = c; \
270 by weidai MMX/SSE2 optimizations	1109	MultiplyWords(p, A[i], A[j]) \
315 by weidai fix compile for x64, DLL and VC 6	1110	c = LowWord(p); \
270 by weidai MMX/SSE2 optimizations	1111	AssignWord(d, HighWord(p)) \
	1112
	1113	#define Squ_Acc(i, j) \
315 by weidai fix compile for x64, DLL and VC 6	1114	MulAcc(c, d, A[i], A[j])
270 by weidai MMX/SSE2 optimizations	1115
	1116	#define Squ_Diag(i) \
	1117	Squ_NonDiag \
315 by weidai fix compile for x64, DLL and VC 6	1118	MulAcc(c, d, A[i], A[i])
270 by weidai MMX/SSE2 optimizations	1119
270 by weidai MMX/SSE2 optimizations	1120	#define Squ_End(n) \
315 by weidai fix compile for x64, DLL and VC 6	1121	Acc3WordsBy2(c, d, e) \
315 by weidai fix compile for x64, DLL and VC 6	1122	R[2*n-3] = c; \
270 by weidai MMX/SSE2 optimizations	1123	MultiplyWords(p, A[n-1], A[n-1])\
315 by weidai fix compile for x64, DLL and VC 6	1124	Acc2WordsBy2(p, e) \
	1125	R[2*n-2] = LowWord(p); \
	1126	R[2*n-1] = HighWord(p);
270 by weidai MMX/SSE2 optimizations	1127
	1128	void Baseline_Multiply2(word R, const word A, const word *B)
	1129	{
	1130	Mul_2
	1131	}
	1132
	1133	void Baseline_Multiply4(word R, const word A, const word *B)
	1134	{
	1135	Mul_4
	1136	}
	1137
	1138	void Baseline_Multiply8(word R, const word A, const word *B)
	1139	{
	1140	Mul_8
	1141	}
	1142
	1143	void Baseline_Square2(word R, const word A)
	1144	{
	1145	Squ_2
	1146	}
	1147
	1148	void Baseline_Square4(word R, const word A)
	1149	{
	1150	Squ_4
	1151	}
	1152
	1153	void Baseline_Square8(word R, const word A)
	1154	{
	1155	Squ_8
	1156	}
	1157
	1158	void Baseline_MultiplyBottom2(word R, const word A, const word *B)
	1159	{
	1160	Bot_2
	1161	}
	1162
	1163	void Baseline_MultiplyBottom4(word R, const word A, const word *B)
	1164	{
	1165	Bot_4
	1166	}
	1167
	1168	void Baseline_MultiplyBottom8(word R, const word A, const word *B)
	1169	{
	1170	Bot_8
	1171	}
	1172
// Step macros used by the Baseline_MultiplyTopN functions, together with the
// Mul_* steps. They read A, B and L from the enclosing function.
// Top_Begin(n): declares p, c, d; forms A[0]*B[n-2] and keeps only its high
// word in d.
315 by weidai fix compile for x64, DLL and VC 6	1173	#define Top_Begin(n) \
	1174	Declare2Words(p) \
	1175	word c; \
	1176	Declare2Words(d) \
	1177	MultiplyWords(p, A[0], B[n-2]);\
	1178	AssignWord(d, HighWord(p));
	1179
// Top_Acc(i, j): forms A[i]*B[j] and adds only its high word to d.
	1180	#define Top_Acc(i, j) \
	1181	MultiplyWords(p, A[i], B[j]);\
	1182	Acc2WordsBy1(d, HighWord(p));
	1183
// Top_SaveAcc0(i, j): shifts the accumulator down one word (c = low word of
// d, d = high word of d) without storing anything, then passes A[i]*B[j] to
// MulAcc.
	1184	#define Top_SaveAcc0(i, j) \
	1185	c = LowWord(d); \
	1186	AssignWord(d, HighWord(d)) \
	1187	MulAcc(c, d, A[i], B[j])
	1188
// Top_SaveAcc1(i, j): replaces c by the comparison result L<c (0 or 1), adds
// that to d, shifts the accumulator down one word, then passes A[i]*B[j] to
// MulAcc. NOTE(review): L looks like a caller-supplied low-half word used to
// recover a carry -- confirm against the callers.
	1189	#define Top_SaveAcc1(i, j) \
	1190	c = L<c; \
	1191	Acc2WordsBy1(d, c); \
	1192	c = LowWord(d); \
	1193	AssignWord(d, HighWord(d)) \
	1194	MulAcc(c, d, A[i], B[j])
	1195
	1196	void Baseline_MultiplyTop2(word R, const word A, const word *B, word L)
	1197	{
	1198	word T[4];
	1199	Baseline_Multiply2(T, A, B);
	1200	R[0] = T[2];
	1201	R[1] = T[3];
	1202	}
	1203
	1204	void Baseline_MultiplyTop4(word R, const word A, const word *B, word L)
	1205	{
	1206	Top_Begin(4)
	1207	Top_Acc(1, 1) Top_Acc(2, 0) \
	1208	Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
	1209	Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
	1210	Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
	1211	Mul_End(1, 3)
	1212	}
	1213
	1214	void Baseline_MultiplyTop8(word R, const word A, const word *B, word L)
	1215	{
	1216	Top_Begin(8)
	1217	Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
	1218	Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
	1219	Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
	1220	Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
	1221	Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
	1222	Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
	1223	Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
	1224	Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
	1225	Mul_End(5, 7)
	1226	}
	1227
	1228	#if !CRYPTOPP_INTEGER_SSE2 // save memory by not compiling these functions when SSE2 is available
270 by weidai MMX/SSE2 optimizations	1229	void Baseline_Multiply16(word R, const word A, const word *B)
	1230	{
	1231	Mul_16
	1232	}
	1233
	1234	void Baseline_Square16(word R, const word A)
	1235	{
	1236	Squ_16
	1237	}
	1238
	1239	void Baseline_MultiplyBottom16(word R, const word A, const word *B)
	1240	{
	1241	Bot_16
	1242	}
315 by weidai fix compile for x64, DLL and VC 6	1243
	1244	void Baseline_MultiplyTop16(word R, const word A, const word *B, word L)
	1245	{
	1246	Top_Begin(16)
	1247	Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
	1248	Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
	1249	Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
	1250	Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
	1251	Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
	1252	Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
	1253	Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
	1254	Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
	1255	Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
	1256	Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
	1257	Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
	1258	Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
	1259	Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
	1260	Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
	1261	Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
	1262	Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
	1263	Mul_End(13, 15)
	1264	}
	1265	#endif
270 by weidai MMX/SSE2 optimizations	1266
	1267	// ********************************************************
	1268
315 by weidai fix compile for x64, DLL and VC 6	1269	#if CRYPTOPP_INTEGER_SSE2
270 by weidai MMX/SSE2 optimizations	1270
	// Four 32-bit lanes, each with only its low 16 bits set, declared with
	// 16-byte alignment. NOTE(review): presumably the mask the SSE2 macros load
	// through [ebx] -- confirm in the prologue macros.
	1271	CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
	1272
	// Remove the portable C++ step macros so that the same names can be
	// redefined below as SSE2 inline-assembly versions.
	1273	#undef Mul_Begin
	1274	#undef Mul_Acc
315 by weidai fix compile for x64, DLL and VC 6	1275	#undef Top_Begin
315 by weidai fix compile for x64, DLL and VC 6	1276	#undef Top_Acc
270 by weidai MMX/SSE2 optimizations	1277	#undef Squ_Acc
	1278	#undef Squ_NonDiag
	1279	#undef Squ_Diag
	1280	#undef Squ_SaveAcc
	1281	#undef Squ_Begin
	1282	#undef Mul_SaveAcc
	1283	#undef Bot_Acc
	1284	#undef Bot_SaveAcc
	1285	#undef Bot_End
	1286	#undef Squ_End
	1287	#undef Mul_End
	1288
	// SSE2_FinalSave(k): shifts each 64-bit lane of xmm5 left by 16 bits, adds it
	// to xmm4 as 64-bit lanes, and stores the low 64 bits of xmm4 at [ecx+8*(k)].
	1289	#define SSE2_FinalSave(k) \
	1290	AS2( psllq xmm5, 16) \
	1291	AS2( paddq xmm4, xmm5) \
	1292	AS2( movq QWORD PTR [ecx+8*(k)], xmm4)
	1293
	// SSE2_SaveShift(k): finishes one column. It folds the upper and lower
	// 64-bit halves of xmm6 and xmm7 together, shifts them up by 4 bytes and adds
	// them into xmm4/xmm5, stores a 32-bit value at [ecx+8*(k)] and a 64-bit
	// value at [ecx+8*(k)+2], and leaves the carried-over part in xmm4/xmm5 for
	// the next column. xmm0 and xmm1 are used as scratch.
	1294	#define SSE2_SaveShift(k) \
	1295	AS2( movq xmm0, xmm6) \
	1296	AS2( punpckhqdq xmm6, xmm0) \
	1297	AS2( movq xmm1, xmm7) \
	1298	AS2( punpckhqdq xmm7, xmm1) \
	1299	AS2( paddd xmm6, xmm0) \
	1300	AS2( pslldq xmm6, 4) \
	1301	AS2( paddd xmm7, xmm1) \
	1302	AS2( paddd xmm4, xmm6) \
	1303	AS2( pslldq xmm7, 4) \
	1304	AS2( movq xmm6, xmm4) \
	1305	AS2( paddd xmm5, xmm7) \
	1306	AS2( movq xmm7, xmm5) \
	1307	AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \
	1308	AS2( psrlq xmm6, 16) \
	1309	AS2( paddq xmm6, xmm7) \
	1310	AS2( punpckhqdq xmm4, xmm0) \
	1311	AS2( punpckhqdq xmm5, xmm0) \
	1312	AS2( movq QWORD PTR [ecx+8*(k)+2], xmm6) \
	1313	AS2( psrlq xmm6, 3*16) \
	1314	AS2( paddd xmm4, xmm6) \
	1315
	// Squ_SSE2_SaveShift(k): squaring variant of the column save. It stores a
	// 32-bit value at [ecx+8*(k)] and a 64-bit value at [ecx+8*(k)+2], and
	// parks the carried-over state on the stack: 8 bytes at [esp+4] (from xmm4)
	// and the upper half of xmm5 at [esp+12]. The Squ_Column macros reload both.
	1316	#define Squ_SSE2_SaveShift(k) \
	1317	AS2( movq xmm0, xmm6) \
	1318	AS2( punpckhqdq xmm6, xmm0) \
	1319	AS2( movq xmm1, xmm7) \
	1320	AS2( punpckhqdq xmm7, xmm1) \
	1321	AS2( paddd xmm6, xmm0) \
	1322	AS2( pslldq xmm6, 4) \
	1323	AS2( paddd xmm7, xmm1) \
	1324	AS2( paddd xmm4, xmm6) \
	1325	AS2( pslldq xmm7, 4) \
	1326	AS2( movhlps xmm6, xmm4) \
	1327	AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \
	1328	AS2( paddd xmm5, xmm7) \
	1329	AS2( movhps QWORD PTR [esp+12], xmm5)\
	1330	AS2( psrlq xmm4, 16) \
	1331	AS2( paddq xmm4, xmm5) \
	1332	AS2( movq QWORD PTR [ecx+8*(k)+2], xmm4) \
	1333	AS2( psrlq xmm4, 3*16) \
	1334	AS2( paddd xmm4, xmm6) \
	1335	AS2( movq QWORD PTR [esp+4], xmm4)\
	1336
	// SSE2_FirstMultiply(i): initialises all four accumulators. It multiplies
	// [esi+(i)*16] by [edi-(i)*16] and by [edx-(i)*16] with pmuludq, then splits
	// each product with the mask loaded from [ebx]: the masked part goes to
	// xmm4 / xmm6 and the part shifted right by 16 goes to xmm5 / xmm7.
	1337	#define SSE2_FirstMultiply(i) \
	1338	AS2( movdqa xmm7, [esi+(i)*16])\
	1339	AS2( movdqa xmm5, [edi-(i)*16])\
	1340	AS2( pmuludq xmm5, xmm7) \
1341	AS2( movdqa xmm4, [ebx])\
1342	AS2( movdqa xmm6, xmm4) \
1343	AS2( pand xmm4, xmm5) \
1344	AS2( psrld xmm5, 16) \
1345	AS2( pmuludq xmm7, [edx-(i)*16])\
1346	AS2( pand xmm6, xmm7) \
1347	AS2( psrld xmm7, 16)
1348
// Squ_Begin(n) (SSE2): stack setup and operand shuffling for squaring.
// - Saves esp in esi, aligns esp down to 16 bytes, reserves 32*n+16 bytes and
//   pushes the saved esp (SSE2_End restores it with "pop esp").
// - Loop at label 1: for each 16-byte chunk of the operand at [eax], writes two
//   pshufd-permuted copies, each followed by its psrlq-by-32 version, into the
//   scratch arrays at edi and edi+16*n. edx advances by 16 until it equals
//   8*(n).
// - Leaves esi = edi = start of the first array and edx = edi+16*n, then runs
//   SSE2_FirstMultiply(0).
// The address expressions [edi+16*n+2*edx] are written with their
// multiplication operators; without them the operands are not valid.
#define Squ_Begin(n) \
	SquPrologue \
	AS2( mov esi, esp)\
	AS2( and esp, 0xfffffff0)\
	AS2( lea edi, [esp-32*n])\
	AS2( sub esp, 32*n+16)\
	AS1( push esi)\
	AS2( mov esi, edi) \
	AS2( xor edx, edx) \
	ASL(1) \
	ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
	ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
	AS2( movdqa [edi+2*edx], xmm0) \
	AS2( psrlq xmm0, 32) \
	AS2( movdqa [edi+2*edx+16], xmm0) \
	AS2( movdqa [edi+16*n+2*edx], xmm1) \
	AS2( psrlq xmm1, 32) \
	AS2( movdqa [edi+16*n+2*edx+16], xmm1) \
	AS2( add edx, 16) \
	AS2( cmp edx, 8*(n)) \
	ASJ( jne, 1, b) \
	AS2( lea edx, [edi+16*n])\
	SSE2_FirstMultiply(0)
1372
// Squ_Acc(i) (SSE2): defines the label LSqu<i>, then multiplies [esi+(i)*16]
// by [edi-(i)*16] and by [edx-(i)*16]. Each product is split with the mask
// from [ebx]; the masked parts are added to xmm4 / xmm6 and the parts shifted
// right by 16 are added to xmm5 / xmm7.
1373	#define Squ_Acc(i) \
1374	ASL(LSqu##i) \
1375	AS2( movdqa xmm1, [esi+(i)*16]) \
1376	AS2( movdqa xmm0, [edi-(i)*16]) \
1377	AS2( movdqa xmm2, [ebx]) \
1378	AS2( pmuludq xmm0, xmm1) \
1379	AS2( pmuludq xmm1, [edx-(i)*16]) \
1380	AS2( movdqa xmm3, xmm2) \
1381	AS2( pand xmm2, xmm0) \
1382	AS2( psrld xmm0, 16) \
1383	AS2( paddd xmm4, xmm2) \
1384	AS2( paddd xmm5, xmm0) \
1385	AS2( pand xmm3, xmm1) \
1386	AS2( psrld xmm1, 16) \
1387	AS2( paddd xmm6, xmm3) \
1388	AS2( paddd xmm7, xmm1) \
1389
// Squ_Acc<N>(i) helpers, selected by token pasting in the Squ_Column macros.
// Squ_Acc1 expands to nothing; Squ_Acc2 .. Squ_Acc8 all expand to a call to
// the label LSqu<i> that Squ_Acc(i) defines.
1390	#define Squ_Acc1(i)
1391	#define Squ_Acc2(i) ASC(call, LSqu##i)
1392	#define Squ_Acc3(i) Squ_Acc2(i)
1393	#define Squ_Acc4(i) Squ_Acc2(i)
1394	#define Squ_Acc5(i) Squ_Acc2(i)
1395	#define Squ_Acc6(i) Squ_Acc2(i)
1396	#define Squ_Acc7(i) Squ_Acc2(i)
1397	#define Squ_Acc8(i) Squ_Acc2(i)
1398
// SSE2_End(E, n): common tail of the SSE2 routines. It saves column 2*(n)-3,
// multiplies [esi+16] by [edi] (added into xmm4 / xmm5) and by [edx] (loaded
// into xmm6 / xmm7), saves column 2*(n)-2, writes the last column with
// SSE2_FinalSave(2*(n)-1), restores the stack pointer pushed by the *_Begin
// macro, and finally expands the epilogue macro E.
1399	#define SSE2_End(E, n) \
1400	SSE2_SaveShift(2*(n)-3) \
1401	AS2( movdqa xmm7, [esi+16]) \
1402	AS2( movdqa xmm0, [edi]) \
1403	AS2( pmuludq xmm0, xmm7) \
1404	AS2( movdqa xmm2, [ebx]) \
1405	AS2( pmuludq xmm7, [edx]) \
1406	AS2( movdqa xmm6, xmm2) \
1407	AS2( pand xmm2, xmm0) \
1408	AS2( psrld xmm0, 16) \
1409	AS2( paddd xmm4, xmm2) \
1410	AS2( paddd xmm5, xmm0) \
1411	AS2( pand xmm6, xmm7) \
1412	AS2( psrld xmm7, 16) \
1413	SSE2_SaveShift(2*(n)-2) \
1414	SSE2_FinalSave(2*(n)-1) \
1415	AS1( pop esp)\
1416	E
1417
// The three SSE2 end macros share SSE2_End and differ only in the epilogue
// macro they pass to it.
1418	#define Squ_End(n) SSE2_End(SquEpilogue, n)
1419	#define Mul_End(n) SSE2_End(MulEpilogue, n)
1420	#define Top_End(n) SSE2_End(TopEpilogue, n)
1421
// Squ_Column1(k, i): one squaring column that advances esi by 16.
// After Squ_SSE2_SaveShift(k) it runs SSE2_FirstMultiply(1) and the selected
// Squ_Acc<i> helper, doubles xmm4 / xmm5, then multiplies [esi] by the qword
// at [esi+8] (added to xmm6 / xmm7, which are then doubled) and by itself
// (added to xmm4 / xmm5 after the doubling, so that product is not doubled).
// Finally it adds back the state parked at [esp+4] and [esp+12].
1422	#define Squ_Column1(k, i) \
1423	Squ_SSE2_SaveShift(k) \
1424	AS2( add esi, 16) \
1425	SSE2_FirstMultiply(1)\
1426	Squ_Acc##i(i) \
1427	AS2( paddd xmm4, xmm4) \
1428	AS2( paddd xmm5, xmm5) \
1429	AS2( movdqa xmm3, [esi]) \
1430	AS2( movq xmm1, QWORD PTR [esi+8]) \
1431	AS2( pmuludq xmm1, xmm3) \
1432	AS2( pmuludq xmm3, xmm3) \
1433	AS2( movdqa xmm0, [ebx])\
1434	AS2( movdqa xmm2, xmm0) \
1435	AS2( pand xmm0, xmm1) \
1436	AS2( psrld xmm1, 16) \
1437	AS2( paddd xmm6, xmm0) \
1438	AS2( paddd xmm7, xmm1) \
1439	AS2( pand xmm2, xmm3) \
1440	AS2( psrld xmm3, 16) \
1441	AS2( paddd xmm6, xmm6) \
1442	AS2( paddd xmm7, xmm7) \
1443	AS2( paddd xmm4, xmm2) \
1444	AS2( paddd xmm5, xmm3) \
1445	AS2( movq xmm0, QWORD PTR [esp+4])\
1446	AS2( movq xmm1, QWORD PTR [esp+12])\
1447	AS2( paddd xmm4, xmm0)\
1448	AS2( paddd xmm5, xmm1)\
1449
// Squ_Column0(k, i): one squaring column that advances edi and edx by 16.
// After Squ_SSE2_SaveShift(k) it runs SSE2_FirstMultiply(1) and the selected
// Squ_Acc<i> helper, doubles all four accumulators, and adds back the state
// parked at [esp+4] and [esp+12].
1450	#define Squ_Column0(k, i) \
1451	Squ_SSE2_SaveShift(k) \
1452	AS2( add edi, 16) \
1453	AS2( add edx, 16) \
1454	SSE2_FirstMultiply(1)\
1455	Squ_Acc##i(i) \
1456	AS2( paddd xmm6, xmm6) \
1457	AS2( paddd xmm7, xmm7) \
1458	AS2( paddd xmm4, xmm4) \
1459	AS2( paddd xmm5, xmm5) \
1460	AS2( movq xmm0, QWORD PTR [esp+4])\
1461	AS2( movq xmm1, QWORD PTR [esp+12])\
1462	AS2( paddd xmm4, xmm0)\
1463	AS2( paddd xmm5, xmm1)\
1464
// SSE2_MulAdd45: multiplies [esi] by [edi] and by [edx]. The [edi] product is
// split with the mask from [ebx] and ADDED to xmm4 (masked part) and xmm5
// (part shifted right by 16); the [edx] product is split the same way but
// OVERWRITES xmm6 and xmm7.
1465	#define SSE2_MulAdd45 \
1466	AS2( movdqa xmm7, [esi]) \
1467	AS2( movdqa xmm0, [edi]) \
1468	AS2( pmuludq xmm0, xmm7) \
1469	AS2( movdqa xmm2, [ebx]) \
1470	AS2( pmuludq xmm7, [edx]) \
1471	AS2( movdqa xmm6, xmm2) \
1472	AS2( pand xmm2, xmm0) \
1473	AS2( psrld xmm0, 16) \
1474	AS2( paddd xmm4, xmm2) \
1475	AS2( paddd xmm5, xmm0) \
1476	AS2( pand xmm6, xmm7) \
1477	AS2( psrld xmm7, 16)
1478
// Mul_Begin(n) (SSE2): stack setup and operand shuffling for multiplication.
// - Saves esp in esi, aligns esp down to 16 bytes, reserves 48*n+16 bytes and
//   pushes the saved esp (restored later with "pop esp"). After the push,
//   esp+20 is 16-byte aligned, which the movdqa stores rely on.
// - Loop at label 1: for each 16-byte chunk, writes two pshufd-permuted copies
//   of the operand at [eax] and one permuted copy of the operand at [edi],
//   each followed by its psrlq-by-32 version, into three scratch arrays at
//   esp+20, esp+20+16*n and esp+20+32*n. edx advances by 16 until it equals
//   8*(n).
// - Leaves edi, edx and esi pointing at the three arrays, then runs
//   SSE2_FirstMultiply(0).
// The address expressions are written with their multiplication operators
// (16*n, 32*n, 2*edx); without them the operands are not valid.
#define Mul_Begin(n) \
	MulPrologue \
	AS2( mov esi, esp)\
	AS2( and esp, 0xfffffff0)\
	AS2( sub esp, 48*n+16)\
	AS1( push esi)\
	AS2( xor edx, edx) \
	ASL(1) \
	ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
	ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
	ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
	AS2( movdqa [esp+20+2*edx], xmm0) \
	AS2( psrlq xmm0, 32) \
	AS2( movdqa [esp+20+2*edx+16], xmm0) \
	AS2( movdqa [esp+20+16*n+2*edx], xmm1) \
	AS2( psrlq xmm1, 32) \
	AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \
	AS2( movdqa [esp+20+32*n+2*edx], xmm2) \
	AS2( psrlq xmm2, 32) \
	AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \
	AS2( add edx, 16) \
	AS2( cmp edx, 8*(n)) \
	ASJ( jne, 1, b) \
	AS2( lea edi, [esp+20])\
	AS2( lea edx, [esp+20+16*n])\
	AS2( lea esi, [esp+20+32*n])\
	SSE2_FirstMultiply(0)
1506
// Mul_Acc(i) (SSE2): defines the label LMul<i>, then accumulates one pair of
// products. The offset i/2*(1-(i-2*(i/2))*2)*16 evaluates to +(i/2)*16 when i
// is even and -(i/2)*16 when i is odd; it is added to esi and subtracted from
// edi and edx. Each product is split with the mask from [ebx]: masked parts
// are added to xmm4 / xmm6, parts shifted right by 16 to xmm5 / xmm7.
// The offset is written with its multiplication operators; without them the
// operand is not a valid expression.
#define Mul_Acc(i) \
	ASL(LMul##i) \
	AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \
	AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \
	AS2( movdqa xmm2, [ebx]) \
	AS2( pmuludq xmm0, xmm1) \
	AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
	AS2( movdqa xmm3, xmm2) \
	AS2( pand xmm2, xmm0) \
	AS2( psrld xmm0, 16) \
	AS2( paddd xmm4, xmm2) \
	AS2( paddd xmm5, xmm0) \
	AS2( pand xmm3, xmm1) \
	AS2( psrld xmm1, 16) \
	AS2( paddd xmm6, xmm3) \
	AS2( paddd xmm7, xmm1)
1523
// Mul_Acc<N>(i) helpers, selected by token pasting in the *_Column macros.
// Mul_Acc1 expands to nothing; Mul_Acc2 .. Mul_Acc16 all expand to a call to
// the label LMul<i> that Mul_Acc(i) defines.
1524	#define Mul_Acc1(i)
1525	#define Mul_Acc2(i) ASC(call, LMul##i)
1526	#define Mul_Acc3(i) Mul_Acc2(i)
1527	#define Mul_Acc4(i) Mul_Acc2(i)
1528	#define Mul_Acc5(i) Mul_Acc2(i)
1529	#define Mul_Acc6(i) Mul_Acc2(i)
1530	#define Mul_Acc7(i) Mul_Acc2(i)
1531	#define Mul_Acc8(i) Mul_Acc2(i)
1532	#define Mul_Acc9(i) Mul_Acc2(i)
1533	#define Mul_Acc10(i) Mul_Acc2(i)
1534	#define Mul_Acc11(i) Mul_Acc2(i)
1535	#define Mul_Acc12(i) Mul_Acc2(i)
1536	#define Mul_Acc13(i) Mul_Acc2(i)
1537	#define Mul_Acc14(i) Mul_Acc2(i)
1538	#define Mul_Acc15(i) Mul_Acc2(i)
1539	#define Mul_Acc16(i) Mul_Acc2(i)
1540
// Mul_Column1(k, i): saves column k, advances esi by 16, then starts the next
// column with SSE2_MulAdd45 followed by the selected Mul_Acc<i> helper.
1541	#define Mul_Column1(k, i) \
1542	SSE2_SaveShift(k) \
1543	AS2( add esi, 16) \
1544	SSE2_MulAdd45\
1545	Mul_Acc##i(i) \
1546
// Mul_Column0(k, i): saves column k, advances edi and edx by 16, then starts
// the next column with SSE2_MulAdd45 followed by the selected Mul_Acc<i>
// helper.
1547	#define Mul_Column0(k, i) \
1548	SSE2_SaveShift(k) \
1549	AS2( add edi, 16) \
1550	AS2( add edx, 16) \
1551	SSE2_MulAdd45\
1552	Mul_Acc##i(i) \
1553
// Bot_Acc(i) (SSE2): accumulates one pair of products for the last column of
// a bottom-half multiply, without the mask/shift split. The offset
// i/2*(1-(i-2*(i/2))*2)*16 is +(i/2)*16 for even i and -(i/2)*16 for odd i.
// The [edi] product is added to xmm4 with paddq, the [edx] product to xmm6
// with paddd.
// The offset is written with its multiplication operators; without them the
// operand is not a valid expression.
#define Bot_Acc(i) \
	AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \
	AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \
	AS2( pmuludq xmm0, xmm1) \
	AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
	AS2( paddq xmm4, xmm0) \
	AS2( paddd xmm6, xmm1)
1561
// Bot_SaveAcc(k) (SSE2): saves column k, advances edi and edx by 16, adds the
// product [esi]*[edi] to xmm4 (64-bit lanes), merges xmm5 shifted left by 16
// into xmm4, and leaves the product [esi]*[edx] in xmm6.
1562	#define Bot_SaveAcc(k) \
1563	SSE2_SaveShift(k) \
1564	AS2( add edi, 16) \
1565	AS2( add edx, 16) \
1566	AS2( movdqa xmm6, [esi]) \
1567	AS2( movdqa xmm0, [edi]) \
1568	AS2( pmuludq xmm0, xmm6) \
1569	AS2( paddq xmm4, xmm0) \
1570	AS2( psllq xmm5, 16) \
1571	AS2( paddq xmm4, xmm5) \
1572	AS2( pmuludq xmm6, [edx])
1573
// Bot_End(n): closes a MultiplyBottom kernel. It adds the high half of xmm6
// onto its low half, shifts that sum left by 32 bits and adds it into xmm4,
// stores the low quadword of xmm4 at [ecx+8*((n)-1)], then restores the stack
// pointer with pop esp (the value pushed by the Begin macro) and runs
// MulEpilogue.
#define Bot_End(n) \
	AS2( movhlps xmm7, xmm6) \
	AS2( paddd xmm6, xmm7) \
	AS2( psllq xmm6, 32) \
	AS2( paddd xmm4, xmm6) \
	AS2( movq QWORD PTR [ecx+8*((n)-1)], xmm4) \
	AS1( pop esp) \
	MulEpilogue
1582
// Top_Begin(n): prologue of the SSE2_MultiplyTop* kernels.
//  - saves esp in edx, aligns esp down to 16 bytes, reserves 48*n+16 bytes and
//    pushes the saved esp (so the scratch area starts at esp+20);
//  - loop at label 1: for each 16-byte chunk (edx = 0, 16, ... up to 8*(n))
//    stores pshufd-shuffled copies of [eax+edx] (two different shuffles) and
//    [edi+edx], each followed by a copy shifted right by 32 bits, into three
//    scratch regions at offsets 0, 16*n and 32*n;
//  - copies esi into eax, points edi/edx/esi at slot (n/2-1) of the three
//    regions, and clears the accumulators xmm4 and xmm5.
// NOTE(review): which arguments eax/edi/esi hold on entry is decided by
// TopPrologue, defined elsewhere in the file.
#define Top_Begin(n) \
	TopPrologue \
	AS2( mov edx, esp) \
	AS2( and esp, 0xfffffff0) \
	AS2( sub esp, 48*n+16) \
	AS1( push edx) \
	AS2( xor edx, edx) \
	ASL(1) \
	ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
	ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
	ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
	AS2( movdqa [esp+20+2*edx], xmm0) \
	AS2( psrlq xmm0, 32) \
	AS2( movdqa [esp+20+2*edx+16], xmm0) \
	AS2( movdqa [esp+20+16*n+2*edx], xmm1) \
	AS2( psrlq xmm1, 32) \
	AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \
	AS2( movdqa [esp+20+32*n+2*edx], xmm2) \
	AS2( psrlq xmm2, 32) \
	AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \
	AS2( add edx, 16) \
	AS2( cmp edx, 8*(n)) \
	ASJ( jne, 1, b) \
	AS2( mov eax, esi) \
	AS2( lea edi, [esp+20+00*n+16*(n/2-1)]) \
	AS2( lea edx, [esp+20+16*n+16*(n/2-1)]) \
	AS2( lea esi, [esp+20+32*n+16*(n/2-1)]) \
	AS2( pxor xmm4, xmm4) \
	AS2( pxor xmm5, xmm5)
1612
// Top_Acc(i): multiplies the quadword at [esi + off + 8] by the data at
// [edx - off], where off = i/2*(1-(i-2*(i/2))*2)*16 is +(i/2)*16 for even i and
// -(i/2)*16 for odd i, keeps only the bits above bit 47 of each 64-bit product
// (psrlq 48) and adds them into xmm5.
#define Top_Acc(i) \
	AS2( movq xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8]) \
	AS2( pmuludq xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
	AS2( psrlq xmm0, 48) \
	AS2( paddd xmm5, xmm0)
// Top_Column0(i): first full column of a MultiplyTop kernel. Shifts xmm5 left
// by 32 bits, advances edi and edx by 16 bytes, then runs SSE2_MulAdd45 and
// the i-way accumulation selected by Mul_Acc##i.
#define Top_Column0(i) \
	AS2( psllq xmm5, 32) \
	AS2( add edi, 16) \
	AS2( add edx, 16) \
	SSE2_MulAdd45 \
	Mul_Acc##i(i)
// Top_Column1(i): second column of a MultiplyTop kernel. After the usual
// SaveShift / advance esi / MulAdd45 / accumulate sequence it compares the
// upper 16 bits of the dword at [ecx+4] with eax >> 16 (signed pcmpgtd) and
// adds 1 to xmm4 when the former is greater.
// NOTE(review): eax was loaded from esi in Top_Begin; presumably it carries the
// L argument of the MultiplyTop kernels, so this looks like a carry correction
// against the known low half - confirm against TopPrologue.
#define Top_Column1(i) \
	SSE2_SaveShift(0) \
	AS2( add esi, 16) \
	SSE2_MulAdd45 \
	Mul_Acc##i(i) \
	AS2( shr eax, 16) \
	AS2( movd xmm0, eax) \
	AS2( movd xmm1, [ecx+4]) \
	AS2( psrld xmm1, 16) \
	AS2( pcmpgtd xmm1, xmm0) \
	AS2( psrld xmm1, 31) \
	AS2( paddd xmm4, xmm1)
1639	void SSE2_Square4(word C, const word A)
1640	{
1641	Squ_Begin(2)
1642	Squ_Column0(0, 1)
1643	Squ_End(2)
1644	}
1645
1646	void SSE2_Square8(word C, const word A)
1647	{
1648	Squ_Begin(4)
1649	#ifndef __GNUC__
1650	ASJ( jmp, 0, f)
1651	Squ_Acc(2)
1652	AS1( ret) ASL(0)
1653	#endif
1654	Squ_Column0(0, 1)
1655	Squ_Column1(1, 1)
1656	Squ_Column0(2, 2)
1657	Squ_Column1(3, 1)
1658	Squ_Column0(4, 1)
1659	Squ_End(4)
1660	}
1661
1662	void SSE2_Square16(word C, const word A)
1663	{
1664	Squ_Begin(8)
1665	#ifndef __GNUC__
1666	ASJ( jmp, 0, f)
1667	Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1668	AS1( ret) ASL(0)
1669	#endif
1670	Squ_Column0(0, 1)
1671	Squ_Column1(1, 1)
1672	Squ_Column0(2, 2)
1673	Squ_Column1(3, 2)
1674	Squ_Column0(4, 3)
1675	Squ_Column1(5, 3)
1676	Squ_Column0(6, 4)
1677	Squ_Column1(7, 3)
1678	Squ_Column0(8, 3)
1679	Squ_Column1(9, 2)
1680	Squ_Column0(10, 2)
1681	Squ_Column1(11, 1)
1682	Squ_Column0(12, 1)
1683	Squ_End(8)
1684	}
1685
1686	void SSE2_Square32(word C, const word A)
1687	{
1688	Squ_Begin(16)
1689	ASJ( jmp, 0, f)
1690	Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1691	AS1( ret) ASL(0)
1692	Squ_Column0(0, 1)
1693	Squ_Column1(1, 1)
1694	Squ_Column0(2, 2)
1695	Squ_Column1(3, 2)
1696	Squ_Column0(4, 3)
1697	Squ_Column1(5, 3)
1698	Squ_Column0(6, 4)
1699	Squ_Column1(7, 4)
1700	Squ_Column0(8, 5)
1701	Squ_Column1(9, 5)
1702	Squ_Column0(10, 6)
1703	Squ_Column1(11, 6)
1704	Squ_Column0(12, 7)
1705	Squ_Column1(13, 7)
1706	Squ_Column0(14, 8)
1707	Squ_Column1(15, 7)
1708	Squ_Column0(16, 7)
1709	Squ_Column1(17, 6)
1710	Squ_Column0(18, 6)
1711	Squ_Column1(19, 5)
1712	Squ_Column0(20, 5)
1713	Squ_Column1(21, 4)
1714	Squ_Column0(22, 4)
1715	Squ_Column1(23, 3)
1716	Squ_Column0(24, 3)
1717	Squ_Column1(25, 2)
1718	Squ_Column0(26, 2)
1719	Squ_Column1(27, 1)
1720	Squ_Column0(28, 1)
1721	Squ_End(16)
1722	}
1723
1724	void SSE2_Multiply4(word C, const word A, const word *B)
1725	{
1726	Mul_Begin(2)
1727	#ifndef __GNUC__
1728	ASJ( jmp, 0, f)
1729	Mul_Acc(2)
1730	AS1( ret) ASL(0)
1731	#endif
1732	Mul_Column0(0, 2)
1733	Mul_End(2)
1734	}
1735
1736	void SSE2_Multiply8(word C, const word A, const word *B)
1737	{
1738	Mul_Begin(4)
1739	#ifndef __GNUC__
1740	ASJ( jmp, 0, f)
1741	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1742	AS1( ret) ASL(0)
1743	#endif
1744	Mul_Column0(0, 2)
1745	Mul_Column1(1, 3)
1746	Mul_Column0(2, 4)
1747	Mul_Column1(3, 3)
1748	Mul_Column0(4, 2)
1749	Mul_End(4)
1750	}
1751
1752	void SSE2_Multiply16(word C, const word A, const word *B)
1753	{
1754	Mul_Begin(8)
1755	#ifndef __GNUC__
1756	ASJ( jmp, 0, f)
1757	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1758	AS1( ret) ASL(0)
1759	#endif
1760	Mul_Column0(0, 2)
1761	Mul_Column1(1, 3)
1762	Mul_Column0(2, 4)
1763	Mul_Column1(3, 5)
1764	Mul_Column0(4, 6)
1765	Mul_Column1(5, 7)
1766	Mul_Column0(6, 8)
1767	Mul_Column1(7, 7)
1768	Mul_Column0(8, 6)
1769	Mul_Column1(9, 5)
1770	Mul_Column0(10, 4)
1771	Mul_Column1(11, 3)
1772	Mul_Column0(12, 2)
1773	Mul_End(8)
1774	}
1775
1776	void SSE2_Multiply32(word C, const word A, const word *B)
1777	{
1778	Mul_Begin(16)
1779	ASJ( jmp, 0, f)
1780	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1781	AS1( ret) ASL(0)
1782	Mul_Column0(0, 2)
1783	Mul_Column1(1, 3)
1784	Mul_Column0(2, 4)
1785	Mul_Column1(3, 5)
1786	Mul_Column0(4, 6)
1787	Mul_Column1(5, 7)
1788	Mul_Column0(6, 8)
1789	Mul_Column1(7, 9)
1790	Mul_Column0(8, 10)
1791	Mul_Column1(9, 11)
1792	Mul_Column0(10, 12)
1793	Mul_Column1(11, 13)
1794	Mul_Column0(12, 14)
1795	Mul_Column1(13, 15)
1796	Mul_Column0(14, 16)
1797	Mul_Column1(15, 15)
1798	Mul_Column0(16, 14)
1799	Mul_Column1(17, 13)
1800	Mul_Column0(18, 12)
1801	Mul_Column1(19, 11)
1802	Mul_Column0(20, 10)
1803	Mul_Column1(21, 9)
1804	Mul_Column0(22, 8)
1805	Mul_Column1(23, 7)
1806	Mul_Column0(24, 6)
1807	Mul_Column1(25, 5)
1808	Mul_Column0(26, 4)
1809	Mul_Column1(27, 3)
1810	Mul_Column0(28, 2)
1811	Mul_End(16)
1812	}
1813
1814	void SSE2_MultiplyBottom4(word C, const word A, const word *B)
1815	{
1816	Mul_Begin(2)
1817	Bot_SaveAcc(0) Bot_Acc(2)
1818	Bot_End(2)
1819	}
1820
1821	void SSE2_MultiplyBottom8(word C, const word A, const word *B)
1822	{
1823	Mul_Begin(4)
1824	#ifndef __GNUC__
1825	ASJ( jmp, 0, f)
1826	Mul_Acc(3) Mul_Acc(2)
1827	AS1( ret) ASL(0)
1828	#endif
1829	Mul_Column0(0, 2)
1830	Mul_Column1(1, 3)
1831	Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1832	Bot_End(4)
1833	}
1834
1835	void SSE2_MultiplyBottom16(word C, const word A, const word *B)
1836	{
1837	Mul_Begin(8)
1838	#ifndef __GNUC__
1839	ASJ( jmp, 0, f)
1840	Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1841	AS1( ret) ASL(0)
1842	#endif
1843	Mul_Column0(0, 2)
1844	Mul_Column1(1, 3)
1845	Mul_Column0(2, 4)
1846	Mul_Column1(3, 5)
1847	Mul_Column0(4, 6)
1848	Mul_Column1(5, 7)
1849	Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1850	Bot_End(8)
1851	}
1852
1853	void SSE2_MultiplyBottom32(word C, const word A, const word *B)
1854	{
1855	Mul_Begin(16)
1856	#ifndef __GNUC__
1857	ASJ( jmp, 0, f)
1858	Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1859	AS1( ret) ASL(0)
1860	#endif
1861	Mul_Column0(0, 2)
1862	Mul_Column1(1, 3)
1863	Mul_Column0(2, 4)
1864	Mul_Column1(3, 5)
1865	Mul_Column0(4, 6)
1866	Mul_Column1(5, 7)
1867	Mul_Column0(6, 8)
1868	Mul_Column1(7, 9)
1869	Mul_Column0(8, 10)
1870	Mul_Column1(9, 11)
1871	Mul_Column0(10, 12)
1872	Mul_Column1(11, 13)
1873	Mul_Column0(12, 14)
1874	Mul_Column1(13, 15)
1875	Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1876	Bot_End(16)
1877	}
1878
1879	void SSE2_MultiplyTop8(word C, const word A, const word *B, word L)
1880	{
1881	Top_Begin(4)
1882	Top_Acc(3) Top_Acc(2) Top_Acc(1)
1883	#ifndef __GNUC__
1884	ASJ( jmp, 0, f)
1885	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1886	AS1( ret) ASL(0)
1887	#endif
1888	Top_Column0(4)
1889	Top_Column1(3)
1890	Mul_Column0(0, 2)
1891	Top_End(2)
1892	}
1893
1894	void SSE2_MultiplyTop16(word C, const word A, const word *B, word L)
1895	{
1896	Top_Begin(8)
1897	Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1898	#ifndef __GNUC__
1899	ASJ( jmp, 0, f)
1900	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1901	AS1( ret) ASL(0)
1902	#endif
1903	Top_Column0(8)
1904	Top_Column1(7)
1905	Mul_Column0(0, 6)
1906	Mul_Column1(1, 5)
1907	Mul_Column0(2, 4)
1908	Mul_Column1(3, 3)
1909	Mul_Column0(4, 2)
1910	Top_End(4)
1911	}
1912
1913	void SSE2_MultiplyTop32(word C, const word A, const word *B, word L)
1914	{
1915	Top_Begin(16)
1916	Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1917	#ifndef __GNUC__
1918	ASJ( jmp, 0, f)
1919	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1920	AS1( ret) ASL(0)
1921	#endif
1922	Top_Column0(16)
1923	Top_Column1(15)
1924	Mul_Column0(0, 14)
1925	Mul_Column1(1, 13)
1926	Mul_Column0(2, 12)
1927	Mul_Column1(3, 11)
1928	Mul_Column0(4, 10)
1929	Mul_Column1(5, 9)
1930	Mul_Column0(6, 8)
1931	Mul_Column1(7, 7)
1932	Mul_Column0(8, 6)
1933	Mul_Column1(9, 5)
1934	Mul_Column0(10, 4)
1935	Mul_Column1(11, 3)
1936	Mul_Column0(12, 2)
1937	Top_End(8)
1938	}
1939
315 by weidai fix compile for x64, DLL and VC 6	1940	#endif // #if CRYPTOPP_INTEGER_SSE2
270 by weidai MMX/SSE2 optimizations	1941
	1942	// ********************************************************
	1943
351 by weidai revert to int return value for Add and Sub	1944	typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word C, const word A, const word *B);
270 by weidai MMX/SSE2 optimizations	1945	typedef void (* PMul)(word C, const word A, const word *B);
	1946	typedef void (* PSqu)(word C, const word A);
	1947	typedef void (* PMulTop)(word C, const word A, const word *B, word L);
	1948
315 by weidai fix compile for x64, DLL and VC 6	1949	#if CRYPTOPP_INTEGER_SSE2
270 by weidai MMX/SSE2 optimizations	1950	static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
	1951	static size_t s_recursionLimit = 8;
	1952	#else
315 by weidai fix compile for x64, DLL and VC 6	1953	static const size_t s_recursionLimit = 16;
270 by weidai MMX/SSE2 optimizations	1954	#endif
	1955
	1956	static PMul s_pMul[9], s_pBot[9];
	1957	static PSqu s_pSqu[9];
315 by weidai fix compile for x64, DLL and VC 6	1958	static PMulTop s_pTop[9];
270 by weidai MMX/SSE2 optimizations	1959
	1960	static void SetFunctionPointers()
	1961	{
	1962	s_pMul[0] = &Baseline_Multiply2;
	1963	s_pBot[0] = &Baseline_MultiplyBottom2;
	1964	s_pSqu[0] = &Baseline_Square2;
315 by weidai fix compile for x64, DLL and VC 6	1965	s_pTop[0] = &Baseline_MultiplyTop2;
315 by weidai fix compile for x64, DLL and VC 6	1966	s_pTop[1] = &Baseline_MultiplyTop4;
270 by weidai MMX/SSE2 optimizations	1967
315 by weidai fix compile for x64, DLL and VC 6	1968	#if CRYPTOPP_INTEGER_SSE2
270 by weidai MMX/SSE2 optimizations	1969	if (HasSSE2())
270 by weidai MMX/SSE2 optimizations	1970	{
359 by weidai fix crash in SSE2_Add on P4 when compiled with MSVC 6.0 with Processor Pack	1971	#if _MSC_VER != 1200 \|\| defined(NDEBUG)
270 by weidai MMX/SSE2 optimizations	1972	if (IsP4())
	1973	{
	1974	s_pAdd = &SSE2_Add;
	1975	s_pSub = &SSE2_Sub;
	1976	}
359 by weidai fix crash in SSE2_Add on P4 when compiled with MSVC 6.0 with Processor Pack	1977	#endif
270 by weidai MMX/SSE2 optimizations	1978
	1979	s_recursionLimit = 32;
	1980
	1981	s_pMul[1] = &SSE2_Multiply4;
	1982	s_pMul[2] = &SSE2_Multiply8;
	1983	s_pMul[4] = &SSE2_Multiply16;
	1984	s_pMul[8] = &SSE2_Multiply32;
	1985
	1986	s_pBot[1] = &SSE2_MultiplyBottom4;
	1987	s_pBot[2] = &SSE2_MultiplyBottom8;
	1988	s_pBot[4] = &SSE2_MultiplyBottom16;
	1989	s_pBot[8] = &SSE2_MultiplyBottom32;
	1990
	1991	s_pSqu[1] = &SSE2_Square4;
	1992	s_pSqu[2] = &SSE2_Square8;
	1993	s_pSqu[4] = &SSE2_Square16;
	1994	s_pSqu[8] = &SSE2_Square32;
	1995
315 by weidai fix compile for x64, DLL and VC 6	1996	s_pTop[2] = &SSE2_MultiplyTop8;
	1997	s_pTop[4] = &SSE2_MultiplyTop16;
	1998	s_pTop[8] = &SSE2_MultiplyTop32;
270 by weidai MMX/SSE2 optimizations	1999	}
	2000	else
	2001	#endif
	2002	{
	2003	s_pMul[1] = &Baseline_Multiply4;
	2004	s_pMul[2] = &Baseline_Multiply8;
	2005
	2006	s_pBot[1] = &Baseline_MultiplyBottom4;
	2007	s_pBot[2] = &Baseline_MultiplyBottom8;
	2008
	2009	s_pSqu[1] = &Baseline_Square4;
	2010	s_pSqu[2] = &Baseline_Square8;
315 by weidai fix compile for x64, DLL and VC 6	2011
	2012	s_pTop[2] = &Baseline_MultiplyTop8;
	2013
	2014	#if !CRYPTOPP_INTEGER_SSE2
	2015	s_pMul[4] = &Baseline_Multiply16;
	2016	s_pBot[4] = &Baseline_MultiplyBottom16;
	2017	s_pSqu[4] = &Baseline_Square16;
	2018	s_pTop[4] = &Baseline_MultiplyTop16;
	2019	#endif
270 by weidai MMX/SSE2 optimizations	2020	}
	2021	}
	2022
351 by weidai revert to int return value for Add and Sub	2023	inline int Add(word C, const word A, const word *B, size_t N)
270 by weidai MMX/SSE2 optimizations	2024	{
315 by weidai fix compile for x64, DLL and VC 6	2025	#if CRYPTOPP_INTEGER_SSE2
270 by weidai MMX/SSE2 optimizations	2026	return s_pAdd(N, C, A, B);
	2027	#else
	2028	return Baseline_Add(N, C, A, B);
	2029	#endif
	2030	}
	2031
351 by weidai revert to int return value for Add and Sub	2032	inline int Subtract(word C, const word A, const word *B, size_t N)
270 by weidai MMX/SSE2 optimizations	2033	{
315 by weidai fix compile for x64, DLL and VC 6	2034	#if CRYPTOPP_INTEGER_SSE2
270 by weidai MMX/SSE2 optimizations	2035	return s_pSub(N, C, A, B);
	2036	#else
	2037	return Baseline_Sub(N, C, A, B);
	2038	#endif
	2039	}
	2040
	2041	// ********************************************************
	2042
1 by weidai Initial revision	2043
	2044	#define A0 A
	2045	#define A1 (A+N2)
	2046	#define B0 B
	2047	#define B1 (B+N2)
	2048
	2049	#define T0 T
	2050	#define T1 (T+N2)
	2051	#define T2 (T+N)
	2052	#define T3 (T+N+N2)
	2053
	2054	#define R0 R
	2055	#define R1 (R+N2)
	2056	#define R2 (R+N)
	2057	#define R3 (R+N+N2)
	2058
	2059	// R[2N] - result = AB
	2060	// T[2*N] - temporary work space
	2061	// A[N] --- multiplier
	2062	// B[N] --- multiplicant
	2063
184 by weidai port to MSVC .NET 2005 beta 2	2064	void RecursiveMultiply(word R, word T, const word A, const word B, size_t N)
1 by weidai Initial revision	2065	{
	2066	assert(N>=2 && N%2==0);
	2067
270 by weidai MMX/SSE2 optimizations	2068	if (N <= s_recursionLimit)
270 by weidai MMX/SSE2 optimizations	2069	s_pMul[N/4](R, A, B);
1 by weidai Initial revision	2070	else
1 by weidai Initial revision	2071	{
184 by weidai port to MSVC .NET 2005 beta 2	2072	const size_t N2 = N/2;
270 by weidai MMX/SSE2 optimizations	2073
	2074	size_t AN2 = Compare(A0, A1, N2) > 0 ? 0 : N2;
	2075	Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
	2076
	2077	size_t BN2 = Compare(B0, B1, N2) > 0 ? 0 : N2;
	2078	Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
	2079
	2080	RecursiveMultiply(R2, T2, A1, B1, N2);
	2081	RecursiveMultiply(T0, T2, R0, R1, N2);
113 by weidai unify GCC and MSVC multiplication code	2082	RecursiveMultiply(R0, T2, A0, B0, N2);
	2083
	2084	// now T[01] holds (A1-A0)(B0-B1), R[01] holds A0B0, R[23] holds A1*B1
	2085
270 by weidai MMX/SSE2 optimizations	2086	int c2 = Add(R2, R2, R1, N2);
	2087	int c3 = c2;
	2088	c2 += Add(R1, R2, R0, N2);
	2089	c3 += Add(R2, R2, R3, N2);
	2090
	2091	if (AN2 == BN2)
	2092	c3 -= Subtract(R1, R1, T0, N);
	2093	else
	2094	c3 += Add(R1, R1, T0, N);
	2095
	2096	c3 += Increment(R2, N2, c2);
	2097	assert (c3 >= 0 && c3 <= 2);
	2098	Increment(R3, N2, c3);
1 by weidai Initial revision	2099	}
	2100	}
	2101
	2102	// R[2N] - result = AA
	2103	// T[2*N] - temporary work space
	2104	// A[N] --- number to be squared
	2105
184 by weidai port to MSVC .NET 2005 beta 2	2106	void RecursiveSquare(word R, word T, const word *A, size_t N)
1 by weidai Initial revision	2107	{
1 by weidai Initial revision	2108	assert(N && N%2==0);
270 by weidai MMX/SSE2 optimizations	2109
	2110	if (N <= s_recursionLimit)
	2111	s_pSqu[N/4](R, A);
1 by weidai Initial revision	2112	else
113 by weidai unify GCC and MSVC multiplication code	2113	{
184 by weidai port to MSVC .NET 2005 beta 2	2114	const size_t N2 = N/2;
113 by weidai unify GCC and MSVC multiplication code	2115
	2116	RecursiveSquare(R0, T2, A0, N2);
	2117	RecursiveSquare(R2, T2, A1, N2);
	2118	RecursiveMultiply(T0, T2, A0, A1, N2);
	2119
270 by weidai MMX/SSE2 optimizations	2120	int carry = Add(R1, R1, T0, N);
270 by weidai MMX/SSE2 optimizations	2121	carry += Add(R1, R1, T0, N);
113 by weidai unify GCC and MSVC multiplication code	2122	Increment(R3, N2, carry);
113 by weidai unify GCC and MSVC multiplication code	2123	}
1 by weidai Initial revision	2124	}
	2125
	2126	// R[N] - bottom half of A*B
270 by weidai MMX/SSE2 optimizations	2127	// T[3*N/2] - temporary work space
1 by weidai Initial revision	2128	// A[N] - multiplier
	2129	// B[N] - multiplicant
	2130
184 by weidai port to MSVC .NET 2005 beta 2	2131	void RecursiveMultiplyBottom(word R, word T, const word A, const word B, size_t N)
1 by weidai Initial revision	2132	{
1 by weidai Initial revision	2133	assert(N>=2 && N%2==0);
270 by weidai MMX/SSE2 optimizations	2134
	2135	if (N <= s_recursionLimit)
	2136	s_pBot[N/4](R, A, B);
1 by weidai Initial revision	2137	else
113 by weidai unify GCC and MSVC multiplication code	2138	{
184 by weidai port to MSVC .NET 2005 beta 2	2139	const size_t N2 = N/2;
113 by weidai unify GCC and MSVC multiplication code	2140
	2141	RecursiveMultiply(R, T, A0, B0, N2);
	2142	RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
270 by weidai MMX/SSE2 optimizations	2143	Add(R1, R1, T0, N2);
113 by weidai unify GCC and MSVC multiplication code	2144	RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
270 by weidai MMX/SSE2 optimizations	2145	Add(R1, R1, T0, N2);
113 by weidai unify GCC and MSVC multiplication code	2146	}
1 by weidai Initial revision	2147	}
	2148
	2149	// R[N] --- upper half of A*B
	2150	// T[2*N] - temporary work space
	2151	// L[N] --- lower half of A*B
	2152	// A[N] --- multiplier
	2153	// B[N] --- multiplicant
	2154
270 by weidai MMX/SSE2 optimizations	2155	void MultiplyTop(word R, word T, const word L, const word A, const word *B, size_t N)
1 by weidai Initial revision	2156	{
	2157	assert(N>=2 && N%2==0);
	2158
315 by weidai fix compile for x64, DLL and VC 6	2159	if (N <= s_recursionLimit)
315 by weidai fix compile for x64, DLL and VC 6	2160	s_pTop[N/4](R, A, B, L[N-1]);
1 by weidai Initial revision	2161	else
1 by weidai Initial revision	2162	{
184 by weidai port to MSVC .NET 2005 beta 2	2163	const size_t N2 = N/2;
270 by weidai MMX/SSE2 optimizations	2164
	2165	size_t AN2 = Compare(A0, A1, N2) > 0 ? 0 : N2;
	2166	Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
	2167
	2168	size_t BN2 = Compare(B0, B1, N2) > 0 ? 0 : N2;
	2169	Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
	2170
	2171	RecursiveMultiply(T0, T2, R0, R1, N2);
	2172	RecursiveMultiply(R0, T2, A1, B1, N2);
	2173
	2174	// now T[01] holds (A1-A0)(B0-B1) = A1B0+A0B1-A1B1-A0B0, R[01] holds A1B1
	2175
	2176	int t, c3;
	2177	int c2 = Subtract(T2, L+N2, L, N2);
	2178
	2179	if (AN2 == BN2)
	2180	{
	2181	c2 -= Add(T2, T2, T0, N2);
	2182	t = (Compare(T2, R0, N2) == -1);
	2183	c3 = t - Subtract(T2, T2, T1, N2);
	2184	}
	2185	else
	2186	{
	2187	c2 += Subtract(T2, T2, T0, N2);
	2188	t = (Compare(T2, R0, N2) == -1);
	2189	c3 = t + Add(T2, T2, T1, N2);
	2190	}
	2191
	2192	c2 += t;
	2193	if (c2 >= 0)
	2194	c3 += Increment(T2, N2, c2);
	2195	else
	2196	c3 -= Decrement(T2, N2, -c2);
	2197	c3 += Add(R0, T2, R1, N2);
	2198
	2199	assert (c3 >= 0 && c3 <= 2);
	2200	Increment(R1, N2, c3);
1 by weidai Initial revision	2201	}
	2202	}
	2203
184 by weidai port to MSVC .NET 2005 beta 2	2204	inline void Multiply(word R, word T, const word A, const word B, size_t N)
1 by weidai Initial revision	2205	{
113 by weidai unify GCC and MSVC multiplication code	2206	RecursiveMultiply(R, T, A, B, N);
1 by weidai Initial revision	2207	}
1 by weidai Initial revision	2208
184 by weidai port to MSVC .NET 2005 beta 2	2209	inline void Square(word R, word T, const word *A, size_t N)
1 by weidai Initial revision	2210	{
113 by weidai unify GCC and MSVC multiplication code	2211	RecursiveSquare(R, T, A, N);
1 by weidai Initial revision	2212	}
1 by weidai Initial revision	2213
184 by weidai port to MSVC .NET 2005 beta 2	2214	inline void MultiplyBottom(word R, word T, const word A, const word B, size_t N)
1 by weidai Initial revision	2215	{
113 by weidai unify GCC and MSVC multiplication code	2216	RecursiveMultiplyBottom(R, T, A, B, N);
1 by weidai Initial revision	2217	}
	2218
	2219	// R[NA+NB] - result = A*B
	2220	// T[NA+NB] - temporary work space
	2221	// A[NA] ---- multiplier
	2222	// B[NB] ---- multiplicant
	2223
184 by weidai port to MSVC .NET 2005 beta 2	2224	void AsymmetricMultiply(word R, word T, const word A, size_t NA, const word B, size_t NB)
1 by weidai Initial revision	2225	{
	2226	if (NA == NB)
	2227	{
	2228	if (A == B)
	2229	Square(R, T, A, NA);
	2230	else
	2231	Multiply(R, T, A, B, NA);
	2232
	2233	return;
	2234	}
	2235
	2236	if (NA > NB)
	2237	{
	2238	std::swap(A, B);
	2239	std::swap(NA, NB);
	2240	}
	2241
	2242	assert(NB % NA == 0);
	2243
	2244	if (NA==2 && !A[1])
	2245	{
	2246	switch (A[0])
	2247	{
	2248	case 0:
	2249	SetWords(R, 0, NB+2);
	2250	return;
	2251	case 1:
	2252	CopyWords(R, B, NB);
	2253	R[NB] = R[NB+1] = 0;
	2254	return;
	2255	default:
	2256	R[NB] = LinearMultiply(R, B, A[0], NB);
	2257	R[NB+1] = 0;
	2258	return;
	2259	}
	2260	}
	2261
184 by weidai port to MSVC .NET 2005 beta 2	2262	size_t i;
270 by weidai MMX/SSE2 optimizations	2263	if ((NB/NA)%2 == 0)
	2264	{
	2265	Multiply(R, T, A, B, NA);
	2266	CopyWords(T+2*NA, R+NA, NA);
1 by weidai Initial revision	2267
270 by weidai MMX/SSE2 optimizations	2268	for (i=2NA; i<NB; i+=2NA)
	2269	Multiply(T+NA+i, T, A, B+i, NA);
	2270	for (i=NA; i<NB; i+=2*NA)
	2271	Multiply(R+i, T, A, B+i, NA);
	2272	}
	2273	else
	2274	{
	2275	for (i=0; i<NB; i+=2*NA)
	2276	Multiply(R+i, T, A, B+i, NA);
	2277	for (i=NA; i<NB; i+=2*NA)
	2278	Multiply(T+NA+i, T, A, B+i, NA);
	2279	}
1 by weidai Initial revision	2280
	2281	if (Add(R+NA, R+NA, T+2*NA, NB-NA))
	2282	Increment(R+NB, NA);
	2283	}
	2284
	2285	// R[N] ----- result = A inverse mod 2*(WORD_BITSN)
	2286	// T[3*N/2] - temporary work space
	2287	// A[N] ----- an odd number as input
	2288
184 by weidai port to MSVC .NET 2005 beta 2	2289	void RecursiveInverseModPower2(word R, word T, const word *A, size_t N)
1 by weidai Initial revision	2290	{
1 by weidai Initial revision	2291	if (N==2)
100 by weidai fix bugs in 64-bit CPU support	2292	{
	2293	T[0] = AtomicInverseModPower2(A[0]);
	2294	T[1] = 0;
270 by weidai MMX/SSE2 optimizations	2295	s_pBot[0](T+2, T, A);
100 by weidai fix bugs in 64-bit CPU support	2296	TwosComplement(T+2, 2);
100 by weidai fix bugs in 64-bit CPU support	2297	Increment(T+2, 2, 2);
270 by weidai MMX/SSE2 optimizations	2298	s_pBot[0](R, T, T+2);
100 by weidai fix bugs in 64-bit CPU support	2299	}
1 by weidai Initial revision	2300	else
1 by weidai Initial revision	2301	{
184 by weidai port to MSVC .NET 2005 beta 2	2302	const size_t N2 = N/2;
1 by weidai Initial revision	2303	RecursiveInverseModPower2(R0, T0, A0, N2);
	2304	T0[0] = 1;
	2305	SetWords(T0+1, 0, N2-1);
	2306	MultiplyTop(R1, T1, T0, R0, A0, N2);
	2307	MultiplyBottom(T0, T1, R0, A1, N2);
	2308	Add(T0, R1, T0, N2);
	2309	TwosComplement(T0, N2);
	2310	MultiplyBottom(R1, T1, R0, T0, N2);
	2311	}
	2312	}
	2313
	2314	// R[N] --- result = X/(2*(WORD_BITSN)) mod M
	2315	// T[3*N] - temporary work space
	2316	// X[2*N] - number to be reduced
	2317	// M[N] --- modulus
	2318	// U[N] --- multiplicative inverse of M mod 2*(WORD_BITSN)
	2319
270 by weidai MMX/SSE2 optimizations	2320	void MontgomeryReduce(word R, word T, word X, const word M, const word *U, size_t N)
1 by weidai Initial revision	2321	{
270 by weidai MMX/SSE2 optimizations	2322	#if 1
1 by weidai Initial revision	2323	MultiplyBottom(R, T, X, U, N);
1 by weidai Initial revision	2324	MultiplyTop(T, T+N, X, R, M, N);
25 by weidai increase resistance against timing attacks	2325	word borrow = Subtract(T, X+N, T, N);
	2326	// defend against timing attack by doing this Add even when not needed
	2327	word carry = Add(T+N, T, M, N);
377 by weidai remove branch in assert	2328	assert(carry \| !borrow);
363 by weidai fix possible branch prediction analysis (BPA) vulnerability	2329	CopyWords(R, T + ((0-borrow) & N), N);
270 by weidai MMX/SSE2 optimizations	2330	#elif 0
	2331	const word u = 0-U[0];
	2332	Declare2Words(p)
	2333	for (size_t i=0; i<N; i++)
	2334	{
	2335	const word t = u * X[i];
	2336	word c = 0;
	2337	for (size_t j=0; j<N; j+=2)
	2338	{
	2339	MultiplyWords(p, t, M[j]);
	2340	Acc2WordsBy1(p, X[i+j]);
	2341	Acc2WordsBy1(p, c);
	2342	X[i+j] = LowWord(p);
	2343	c = HighWord(p);
	2344	MultiplyWords(p, t, M[j+1]);
	2345	Acc2WordsBy1(p, X[i+j+1]);
	2346	Acc2WordsBy1(p, c);
	2347	X[i+j+1] = LowWord(p);
	2348	c = HighWord(p);
	2349	}
	2350
	2351	if (Increment(X+N+i, N-i, c))
	2352	while (!Subtract(X+N, X+N, M, N)) {}
	2353	}
	2354
	2355	memcpy(R, X+N, N*WORD_SIZE);
	2356	#else
	2357	__m64 u = _mm_cvtsi32_si64(0-U[0]), p;
	2358	for (size_t i=0; i<N; i++)
	2359	{
	2360	__m64 t = _mm_cvtsi32_si64(X[i]);
	2361	t = _mm_mul_su32(t, u);
	2362	__m64 c = _mm_setzero_si64();
	2363	for (size_t j=0; j<N; j+=2)
	2364	{
	2365	p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j]));
	2366	p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j]));
	2367	c = _mm_add_si64(c, p);
	2368	X[i+j] = _mm_cvtsi64_si32(c);
	2369	c = _mm_srli_si64(c, 32);
	2370	p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j+1]));
	2371	p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j+1]));
	2372	c = _mm_add_si64(c, p);
	2373	X[i+j+1] = _mm_cvtsi64_si32(c);
	2374	c = _mm_srli_si64(c, 32);
	2375	}
	2376
	2377	if (Increment(X+N+i, N-i, _mm_cvtsi64_si32(c)))
	2378	while (!Subtract(X+N, X+N, M, N)) {}
	2379	}
	2380
	2381	memcpy(R, X+N, N*WORD_SIZE);
	2382	_mm_empty();
	2383	#endif
1 by weidai Initial revision	2384	}
	2385
	2386	// R[N] --- result = X/(2*(WORD_BITSN/2)) mod M
	2387	// T[2*N] - temporary work space
	2388	// X[2*N] - number to be reduced
	2389	// M[N] --- modulus
	2390	// U[N/2] - multiplicative inverse of M mod 2*(WORD_BITSN/2)
	2391	// V[N] --- 2*(WORD_BITS3*N/2) mod M
	2392
184 by weidai port to MSVC .NET 2005 beta 2	2393	void HalfMontgomeryReduce(word R, word T, const word X, const word M, const word U, const word V, size_t N)
1 by weidai Initial revision	2394	{
	2395	assert(N%2==0 && N>=4);
	2396
	2397	#define M0 M
	2398	#define M1 (M+N2)
	2399	#define V0 V
	2400	#define V1 (V+N2)
	2401
	2402	#define X0 X
	2403	#define X1 (X+N2)
	2404	#define X2 (X+N)
	2405	#define X3 (X+N+N2)
	2406
184 by weidai port to MSVC .NET 2005 beta 2	2407	const size_t N2 = N/2;
1 by weidai Initial revision	2408	Multiply(T0, T2, V0, X3, N2);
	2409	int c2 = Add(T0, T0, X0, N);
	2410	MultiplyBottom(T3, T2, T0, U, N2);
	2411	MultiplyTop(T2, R, T0, T3, M0, N2);
	2412	c2 -= Subtract(T2, T1, T2, N2);
	2413	Multiply(T0, R, T3, M1, N2);
	2414	c2 -= Subtract(T0, T2, T0, N2);
	2415	int c3 = -(int)Subtract(T1, X2, T1, N2);
	2416	Multiply(R0, T2, V1, X3, N2);
	2417	c3 += Add(R, R, T, N);
	2418
	2419	if (c2>0)
	2420	c3 += Increment(R1, N2);
	2421	else if (c2<0)
	2422	c3 -= Decrement(R1, N2, -c2);
	2423
	2424	assert(c3>=-1 && c3<=1);
	2425	if (c3>0)
	2426	Subtract(R, R, M, N);
	2427	else if (c3<0)
	2428	Add(R, R, M, N);
	2429
	2430	#undef M0
	2431	#undef M1
	2432	#undef V0
	2433	#undef V1
	2434
	2435	#undef X0
	2436	#undef X1
	2437	#undef X2
	2438	#undef X3
	2439	}
	2440
	2441	#undef A0
	2442	#undef A1
	2443	#undef B0
	2444	#undef B1
	2445
	2446	#undef T0
	2447	#undef T1
	2448	#undef T2
	2449	#undef T3
	2450
	2451	#undef R0
	2452	#undef R1
	2453	#undef R2
	2454	#undef R3
	2455
/*
// do a 3 word by 2 word divide, returns quotient and leaves remainder in A
static word SubatomicDivide(word *A, word B0, word B1)
{
	// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
	assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));

	// estimate the quotient: do a 2 word by 1 word divide
	word Q;
	if (B1+1 == 0)
		Q = A[2];
	else
		Q = DWord(A[1], A[2]).DividedBy(B1+1);

	// now subtract Q*B from A
	DWord p = DWord::Multiply(B0, Q);
	DWord u = (DWord) A[0] - p.GetLowHalf();
	A[0] = u.GetLowHalf();
	u = (DWord) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - DWord::Multiply(B1, Q);
	A[1] = u.GetLowHalf();
	A[2] += u.GetHighHalf();

	// Q <= actual quotient, so fix it
	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
	{
		u = (DWord) A[0] - B0;
		A[0] = u.GetLowHalf();
		u = (DWord) A[1] - B1 - u.GetHighHalfAsBorrow();
		A[1] = u.GetLowHalf();
		A[2] += u.GetHighHalf();
		Q++;
		assert(Q);	// shouldn't overflow
	}

	return Q;
}

// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
static inline void AtomicDivide(word *Q, const word *A, const word *B)
{
	if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
	{
		Q[0] = A[2];
		Q[1] = A[3];
	}
	else
	{
		word T[4];
		T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
		Q[1] = SubatomicDivide(T+1, B[0], B[1]);
		Q[0] = SubatomicDivide(T, B[0], B[1]);

#ifndef NDEBUG
		// multiply quotient and divisor and add remainder, make sure it equals dividend
		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
		word P[4];
		LowLevel::Multiply2(P, Q, B);
		Add(P, P, T, 4);
		assert(memcmp(P, A, 4*WORD_SIZE)==0);
#endif
	}
}
*/
	2519
	2520	static inline void AtomicDivide(word Q, const word A, const word *B)
	2521	{
	2522	word T[4];
	2523	DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1]));
	2524	Q[0] = q.GetLowHalf();
	2525	Q[1] = q.GetHighHalf();
	2526
	2527	#ifndef NDEBUG
	2528	if (B[0] \|\| B[1])
	2529	{
	2530	// multiply quotient and divisor and add remainder, make sure it equals dividend
	2531	assert(!T[2] && !T[3] && (T[1] < B[1] \|\| (T[1]==B[1] && T[0]<B[0])));
	2532	word P[4];
270 by weidai MMX/SSE2 optimizations	2533	s_pMul[0](P, Q, B);
100 by weidai fix bugs in 64-bit CPU support	2534	Add(P, P, T, 4);
	2535	assert(memcmp(P, A, 4*WORD_SIZE)==0);
	2536	}
	2537	#endif
	2538	}
1 by weidai Initial revision	2539
1 by weidai Initial revision	2540	// for use by Divide(), corrects the underestimated quotient {Q1,Q0}
184 by weidai port to MSVC .NET 2005 beta 2	2541	static void CorrectQuotientEstimate(word R, word T, word Q, const word B, size_t N)
1 by weidai Initial revision	2542	{
	2543	assert(N && N%2==0);
	2544
270 by weidai MMX/SSE2 optimizations	2545	AsymmetricMultiply(T, T+N+2, Q, 2, B, N);
1 by weidai Initial revision	2546
	2547	word borrow = Subtract(R, R, T, N+2);
	2548	assert(!borrow && !R[N+1]);
	2549
	2550	while (R[N] \|\| Compare(R, B, N) >= 0)
	2551	{
	2552	R[N] -= Subtract(R, R, B, N);
	2553	Q[1] += (++Q[0]==0);
	2554	assert(Q[0] \|\| Q[1]); // no overflow
	2555	}
	2556	}
	2557
	2558	// R[NB] -------- remainder = A%B
	2559	// Q[NA-NB+2] --- quotient = A/B
270 by weidai MMX/SSE2 optimizations	2560	// T[NA+3*(NB+2)] - temp work space
1 by weidai Initial revision	2561	// A[NA] -------- dividend
	2562	// B[NB] -------- divisor
	2563
184 by weidai port to MSVC .NET 2005 beta 2	2564	void Divide(word R, word Q, word T, const word A, size_t NA, const word *B, size_t NB)
1 by weidai Initial revision	2565	{
	2566	assert(NA && NB && NA%2==0 && NB%2==0);
	2567	assert(B[NB-1] \|\| B[NB-2]);
	2568	assert(NB <= NA);
	2569
	2570	// set up temporary work space
	2571	word *const TA=T;
	2572	word *const TB=T+NA+2;
	2573	word *const TP=T+NA+2+NB;
	2574
	2575	// copy B into TB and normalize it so that TB has highest bit set to 1
	2576	unsigned shiftWords = (B[NB-1]==0);
	2577	TB[0] = TB[NB-1] = 0;
	2578	CopyWords(TB+shiftWords, B, NB-shiftWords);
	2579	unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
	2580	assert(shiftBits < WORD_BITS);
	2581	ShiftWordsLeftByBits(TB, NB, shiftBits);
	2582
	2583	// copy A into TA and normalize it
	2584	TA[0] = TA[NA] = TA[NA+1] = 0;
	2585	CopyWords(TA+shiftWords, A, NA);
	2586	ShiftWordsLeftByBits(TA, NA+2, shiftBits);
	2587
	2588	if (TA[NA+1]==0 && TA[NA] <= 1)
	2589	{
	2590	Q[NA-NB+1] = Q[NA-NB] = 0;
	2591	while (TA[NA] \|\| Compare(TA+NA-NB, TB, NB) >= 0)
	2592	{
	2593	TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
	2594	++Q[NA-NB];
	2595	}
	2596	}
	2597	else
	2598	{
	2599	NA+=2;
	2600	assert(Compare(TA+NA-NB, TB, NB) < 0);
	2601	}
	2602
	2603	word BT[2];
	2604	BT[0] = TB[NB-2] + 1;
	2605	BT[1] = TB[NB-1] + (BT[0]==0);
	2606
	2607	// start reducing TA mod TB, 2 words at a time
184 by weidai port to MSVC .NET 2005 beta 2	2608	for (size_t i=NA-2; i>=NB; i-=2)
1 by weidai Initial revision	2609	{
	2610	AtomicDivide(Q+i-NB, TA+i-2, BT);
	2611	CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
	2612	}
	2613
	2614	// copy TA into R, and denormalize it
	2615	CopyWords(R, TA+shiftWords, NB);
	2616	ShiftWordsRightByBits(R, NB, shiftBits);
	2617	}
	2618
184 by weidai port to MSVC .NET 2005 beta 2	2619	static inline size_t EvenWordCount(const word *X, size_t N)
1 by weidai Initial revision	2620	{
	2621	while (N && X[N-2]==0 && X[N-1]==0)
	2622	N-=2;
	2623	return N;
	2624	}
	2625
	2626	// return k
	2627	// R[N] --- result = A^(-1) * 2^k mod M
	2628	// T[4*N] - temporary work space
	2629	// A[NA] -- number to take inverse of
	2630	// M[N] --- modulus
	2631
184 by weidai port to MSVC .NET 2005 beta 2	2632	unsigned int AlmostInverse(word R, word T, const word A, size_t NA, const word M, size_t N)
1 by weidai Initial revision	2633	{
	2634	assert(NA<=N && N && N%2==0);
	2635
	2636	word *b = T;
	2637	word *c = T+N;
	2638	word f = T+2N;
	2639	word g = T+3N;
184 by weidai port to MSVC .NET 2005 beta 2	2640	size_t bcLen=2, fgLen=EvenWordCount(M, N);
468 by weidai switch to non-branching code in AlmostInverse()	2641	unsigned int k=0;
	2642	bool s=false;
1 by weidai Initial revision	2643
	2644	SetWords(T, 0, 3*N);
	2645	b[0]=1;
	2646	CopyWords(f, A, NA);
	2647	CopyWords(g, M, N);
	2648
	2649	while (1)
	2650	{
	2651	word t=f[0];
	2652	while (!t)
	2653	{
	2654	if (EvenWordCount(f, fgLen)==0)
	2655	{
	2656	SetWords(R, 0, N);
	2657	return 0;
	2658	}
	2659
	2660	ShiftWordsRightByWords(f, fgLen, 1);
468 by weidai switch to non-branching code in AlmostInverse()	2661	bcLen += 2 * (c[bcLen-1] != 0);
1 by weidai Initial revision	2662	assert(bcLen <= N);
	2663	ShiftWordsLeftByWords(c, bcLen, 1);
	2664	k+=WORD_BITS;
	2665	t=f[0];
	2666	}
	2667
468 by weidai switch to non-branching code in AlmostInverse()	2668	unsigned int i = TrailingZeros(t);
	2669	t >>= i;
	2670	k += i;
1 by weidai Initial revision	2671
468 by weidai switch to non-branching code in AlmostInverse()	2672	if (t==1 && f[1]==0 && EvenWordCount(f+2, fgLen-2)==0)
1 by weidai Initial revision	2673	{
468 by weidai switch to non-branching code in AlmostInverse()	2674	if (s)
	2675	Subtract(R, M, b, N);
	2676	else
1 by weidai Initial revision	2677	CopyWords(R, b, N);
	2678	return k;
	2679	}
	2680
	2681	ShiftWordsRightByBits(f, fgLen, i);
468 by weidai switch to non-branching code in AlmostInverse()	2682	t = ShiftWordsLeftByBits(c, bcLen, i);
	2683	c[bcLen] += t;
	2684	bcLen += 2 * (t!=0);
	2685	assert(bcLen <= N);
	2686
	2687	bool swap = Compare(f, g, fgLen)==-1;
	2688	ConditionalSwapPointers(swap, f, g);
	2689	ConditionalSwapPointers(swap, b, c);
	2690	s ^= swap;
	2691
	2692	fgLen -= 2 * !(f[fgLen-2] \| f[fgLen-1]);
1 by weidai Initial revision	2693
1 by weidai Initial revision	2694	Subtract(f, f, g, fgLen);
468 by weidai switch to non-branching code in AlmostInverse()	2695	t = Add(b, b, c, bcLen);
	2696	b[bcLen] += t;
	2697	bcLen += 2*t;
	2698	assert(bcLen <= N);
1 by weidai Initial revision	2699	}
	2700	}
	2701
	2702	// R[N] - result = A/(2^k) mod M
	2703	// A[N] - input
	2704	// M[N] - modulus
	2705
184 by weidai port to MSVC .NET 2005 beta 2	2706	void DivideByPower2Mod(word R, const word A, size_t k, const word *M, size_t N)
1 by weidai Initial revision	2707	{
	2708	CopyWords(R, A, N);
	2709
	2710	while (k--)
	2711	{
	2712	if (R[0]%2==0)
	2713	ShiftWordsRightByBits(R, N, 1);
	2714	else
	2715	{
	2716	word carry = Add(R, R, M, N);
	2717	ShiftWordsRightByBits(R, N, 1);
	2718	R[N-1] += carry<<(WORD_BITS-1);
	2719	}
	2720	}
	2721	}
	2722
	2723	// R[N] - result = A*(2^k) mod M
	2724	// A[N] - input
	2725	// M[N] - modulus
	2726
184 by weidai port to MSVC .NET 2005 beta 2	2727	void MultiplyByPower2Mod(word R, const word A, size_t k, const word *M, size_t N)
1 by weidai Initial revision	2728	{
	2729	CopyWords(R, A, N);
	2730
	2731	while (k--)
	2732	if (ShiftWordsLeftByBits(R, N, 1) \|\| Compare(R, M, N)>=0)
	2733	Subtract(R, R, M, N);
	2734	}
	2735
	2736	// ******************************************************************
	2737
207 by weidai improve Integer initialization	2738	InitializeInteger::InitializeInteger()
	2739	{
	2740	if (!g_pAssignIntToInteger)
	2741	{
270 by weidai MMX/SSE2 optimizations	2742	SetFunctionPointers();
207 by weidai improve Integer initialization	2743	g_pAssignIntToInteger = AssignIntToInteger;
	2744	}
	2745	}
	2746
1 by weidai Initial revision	2747	static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
1 by weidai Initial revision	2748
184 by weidai port to MSVC .NET 2005 beta 2	2749	static inline size_t RoundupSize(size_t n)
1 by weidai Initial revision	2750	{
	2751	if (n<=8)
	2752	return RoundupSizeTable[n];
	2753	else if (n<=16)
	2754	return 16;
	2755	else if (n<=32)
	2756	return 32;
	2757	else if (n<=64)
	2758	return 64;
202 by weidai fix MSVC 2005 warnings	2759	else return size_t(1) << BitPrecision(n-1);
1 by weidai Initial revision	2760	}
	2761
	2762	Integer::Integer()
	2763	: reg(2), sign(POSITIVE)
	2764	{
	2765	reg[0] = reg[1] = 0;
	2766	}
	2767
	2768	Integer::Integer(const Integer& t)
	2769	: reg(RoundupSize(t.WordCount())), sign(t.sign)
	2770	{
	2771	CopyWords(reg, t.reg, reg.size());
	2772	}
	2773
100 by weidai fix bugs in 64-bit CPU support	2774	Integer::Integer(Sign s, lword value)
	2775	: reg(2), sign(s)
	2776	{
	2777	reg[0] = word(value);
	2778	reg[1] = word(SafeRightShift<WORD_BITS>(value));
	2779	}
	2780
1 by weidai Initial revision	2781	Integer::Integer(signed long value)
	2782	: reg(2)
	2783	{
	2784	if (value >= 0)
	2785	sign = POSITIVE;
	2786	else
	2787	{
	2788	sign = NEGATIVE;
	2789	value = -value;
	2790	}
	2791	reg[0] = word(value);
100 by weidai fix bugs in 64-bit CPU support	2792	reg[1] = word(SafeRightShift<WORD_BITS>((unsigned long)value));
1 by weidai Initial revision	2793	}
1 by weidai Initial revision	2794
32 by weidai fix warnings for VC7 and GCC	2795	Integer::Integer(Sign s, word high, word low)
	2796	: reg(2), sign(s)
	2797	{
	2798	reg[0] = low;
	2799	reg[1] = high;
	2800	}
	2801
1 by weidai Initial revision	2802	bool Integer::IsConvertableToLong() const
	2803	{
	2804	if (ByteCount() > sizeof(long))
	2805	return false;
	2806
202 by weidai fix MSVC 2005 warnings	2807	unsigned long value = (unsigned long)reg[0];
202 by weidai fix MSVC 2005 warnings	2808	value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
1 by weidai Initial revision	2809
	2810	if (sign==POSITIVE)
	2811	return (signed long)value >= 0;
	2812	else
	2813	return -(signed long)value < 0;
	2814	}
	2815
	2816	signed long Integer::ConvertToLong() const
	2817	{
	2818	assert(IsConvertableToLong());
	2819
202 by weidai fix MSVC 2005 warnings	2820	unsigned long value = (unsigned long)reg[0];
202 by weidai fix MSVC 2005 warnings	2821	value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
1 by weidai Initial revision	2822	return sign==POSITIVE ? value : -(signed long)value;
	2823	}
	2824
184 by weidai port to MSVC .NET 2005 beta 2	2825	Integer::Integer(BufferedTransformation &encodedInteger, size_t byteCount, Signedness s)
1 by weidai Initial revision	2826	{
	2827	Decode(encodedInteger, byteCount, s);
	2828	}
	2829
184 by weidai port to MSVC .NET 2005 beta 2	2830	Integer::Integer(const byte *encodedInteger, size_t byteCount, Signedness s)
1 by weidai Initial revision	2831	{
	2832	Decode(encodedInteger, byteCount, s);
	2833	}
	2834
	2835	Integer::Integer(BufferedTransformation &bt)
	2836	{
	2837	BERDecode(bt);
	2838	}
	2839
184 by weidai port to MSVC .NET 2005 beta 2	2840	Integer::Integer(RandomNumberGenerator &rng, size_t bitcount)
1 by weidai Initial revision	2841	{
	2842	Randomize(rng, bitcount);
	2843	}
	2844
	2845	Integer::Integer(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
	2846	{
	2847	if (!Randomize(rng, min, max, rnType, equiv, mod))
	2848	throw Integer::RandomNumberNotFound();
	2849	}
	2850
184 by weidai port to MSVC .NET 2005 beta 2	2851	Integer Integer::Power2(size_t e)
1 by weidai Initial revision	2852	{
	2853	Integer r((word)0, BitsToWords(e+1));
	2854	r.SetBit(e);
	2855	return r;
	2856	}
	2857
106 by weidai fix potential threading problem with initialization of static objects	2858	template <long i>
	2859	struct NewInteger
	2860	{
	2861	Integer * operator()() const
	2862	{
	2863	return new Integer(i);
	2864	}
	2865	};
	2866
1 by weidai Initial revision	2867	const Integer &Integer::Zero()
1 by weidai Initial revision	2868	{
106 by weidai fix potential threading problem with initialization of static objects	2869	return Singleton<Integer>().Ref();
1 by weidai Initial revision	2870	}
	2871
	2872	const Integer &Integer::One()
	2873	{
106 by weidai fix potential threading problem with initialization of static objects	2874	return Singleton<Integer, NewInteger<1> >().Ref();
1 by weidai Initial revision	2875	}
	2876
	2877	const Integer &Integer::Two()
	2878	{
106 by weidai fix potential threading problem with initialization of static objects	2879	return Singleton<Integer, NewInteger<2> >().Ref();
1 by weidai Initial revision	2880	}
	2881
	2882	bool Integer::operator!() const
	2883	{
	2884	return IsNegative() ? false : (reg[0]==0 && WordCount()==0);
	2885	}
	2886
	2887	Integer& Integer::operator=(const Integer& t)
	2888	{
	2889	if (this != &t)
	2890	{
270 by weidai MMX/SSE2 optimizations	2891	if (reg.size() != t.reg.size() \|\| t.reg[t.reg.size()/2] == 0)
270 by weidai MMX/SSE2 optimizations	2892	reg.New(RoundupSize(t.WordCount()));
1 by weidai Initial revision	2893	CopyWords(reg, t.reg, reg.size());
	2894	sign = t.sign;
	2895	}
	2896	return *this;
	2897	}
	2898
184 by weidai port to MSVC .NET 2005 beta 2	2899	bool Integer::GetBit(size_t n) const
1 by weidai Initial revision	2900	{
	2901	if (n/WORD_BITS >= reg.size())
	2902	return 0;
	2903	else
	2904	return bool((reg[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
	2905	}
	2906
184 by weidai port to MSVC .NET 2005 beta 2	2907	void Integer::SetBit(size_t n, bool value)
1 by weidai Initial revision	2908	{
	2909	if (value)
	2910	{
	2911	reg.CleanGrow(RoundupSize(BitsToWords(n+1)));
	2912	reg[n/WORD_BITS] \|= (word(1) << (n%WORD_BITS));
	2913	}
	2914	else
	2915	{
	2916	if (n/WORD_BITS < reg.size())
	2917	reg[n/WORD_BITS] &= ~(word(1) << (n%WORD_BITS));
	2918	}
	2919	}
	2920
184 by weidai port to MSVC .NET 2005 beta 2	2921	byte Integer::GetByte(size_t n) const
1 by weidai Initial revision	2922	{
	2923	if (n/WORD_SIZE >= reg.size())
	2924	return 0;
	2925	else
	2926	return byte(reg[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
	2927	}
	2928
184 by weidai port to MSVC .NET 2005 beta 2	2929	void Integer::SetByte(size_t n, byte value)
1 by weidai Initial revision	2930	{
	2931	reg.CleanGrow(RoundupSize(BytesToWords(n+1)));
	2932	reg[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
	2933	reg[n/WORD_SIZE] \|= (word(value) << 8*(n%WORD_SIZE));
	2934	}
	2935
184 by weidai port to MSVC .NET 2005 beta 2	2936	lword Integer::GetBits(size_t i, size_t n) const
1 by weidai Initial revision	2937	{
184 by weidai port to MSVC .NET 2005 beta 2	2938	lword v = 0;
184 by weidai port to MSVC .NET 2005 beta 2	2939	assert(n <= sizeof(v)*8);
1 by weidai Initial revision	2940	for (unsigned int j=0; j<n; j++)
184 by weidai port to MSVC .NET 2005 beta 2	2941	v \|= lword(GetBit(i+j)) << j;
1 by weidai Initial revision	2942	return v;
	2943	}
	2944
	2945	Integer Integer::operator-() const
	2946	{
	2947	Integer result(*this);
	2948	result.Negate();
	2949	return result;
	2950	}
	2951
	2952	Integer Integer::AbsoluteValue() const
	2953	{
	2954	Integer result(*this);
	2955	result.sign = POSITIVE;
	2956	return result;
	2957	}
	2958
	2959	void Integer::swap(Integer &a)
	2960	{
	2961	reg.swap(a.reg);
	2962	std::swap(sign, a.sign);
	2963	}
	2964
184 by weidai port to MSVC .NET 2005 beta 2	2965	Integer::Integer(word value, size_t length)
1 by weidai Initial revision	2966	: reg(RoundupSize(length)), sign(POSITIVE)
	2967	{
	2968	reg[0] = value;
	2969	SetWords(reg+1, 0, reg.size()-1);
	2970	}
	2971
	2972	template <class T>
	2973	static Integer StringToInteger(const T *str)
	2974	{
202 by weidai fix MSVC 2005 warnings	2975	int radix;
38 by weidai STLport workaround	2976	// GCC workaround
38 by weidai STLport workaround	2977	// std::char_traits<wchar_t>::length() not defined in GCC 3.2 and STLport 4.5.3
1 by weidai Initial revision	2978	unsigned int length;
	2979	for (length = 0; str[length] != 0; length++) {}
	2980
	2981	Integer v;
	2982
	2983	if (length == 0)
	2984	return v;
	2985
	2986	switch (str[length-1])
	2987	{
	2988	case 'h':
	2989	case 'H':
	2990	radix=16;
	2991	break;
	2992	case 'o':
	2993	case 'O':
	2994	radix=8;
	2995	break;
	2996	case 'b':
	2997	case 'B':
	2998	radix=2;
	2999	break;
	3000	default:
	3001	radix=10;
	3002	}
	3003
	3004	if (length > 2 && str[0] == '0' && str[1] == 'x')
	3005	radix = 16;
	3006
	3007	for (unsigned i=0; i<length; i++)
	3008	{
202 by weidai fix MSVC 2005 warnings	3009	int digit;
1 by weidai Initial revision	3010
	3011	if (str[i] >= '0' && str[i] <= '9')
	3012	digit = str[i] - '0';
	3013	else if (str[i] >= 'A' && str[i] <= 'F')
	3014	digit = str[i] - 'A' + 10;
	3015	else if (str[i] >= 'a' && str[i] <= 'f')
	3016	digit = str[i] - 'a' + 10;
	3017	else
	3018	digit = radix;
	3019
	3020	if (digit < radix)
	3021	{
	3022	v *= radix;
	3023	v += digit;
	3024	}
	3025	}
	3026
	3027	if (str[0] == '-')
	3028	v.Negate();
	3029
	3030	return v;
	3031	}
	3032
	3033	Integer::Integer(const char *str)
	3034	: reg(2), sign(POSITIVE)
	3035	{
	3036	*this = StringToInteger(str);
	3037	}
	3038
	3039	Integer::Integer(const wchar_t *str)
	3040	: reg(2), sign(POSITIVE)
	3041	{
	3042	*this = StringToInteger(str);
	3043	}
	3044
	3045	unsigned int Integer::WordCount() const
	3046	{
184 by weidai port to MSVC .NET 2005 beta 2	3047	return (unsigned int)CountWords(reg, reg.size());
1 by weidai Initial revision	3048	}
	3049
	3050	unsigned int Integer::ByteCount() const
	3051	{
	3052	unsigned wordCount = WordCount();
	3053	if (wordCount)
	3054	return (wordCount-1)*WORD_SIZE + BytePrecision(reg[wordCount-1]);
	3055	else
	3056	return 0;
	3057	}
	3058
	3059	unsigned int Integer::BitCount() const
	3060	{
	3061	unsigned wordCount = WordCount();
	3062	if (wordCount)
	3063	return (wordCount-1)*WORD_BITS + BitPrecision(reg[wordCount-1]);
	3064	else
	3065	return 0;
	3066	}
	3067
184 by weidai port to MSVC .NET 2005 beta 2	3068	void Integer::Decode(const byte *input, size_t inputLen, Signedness s)
1 by weidai Initial revision	3069	{
	3070	StringStore store(input, inputLen);
	3071	Decode(store, inputLen, s);
	3072	}
	3073
184 by weidai port to MSVC .NET 2005 beta 2	3074	void Integer::Decode(BufferedTransformation &bt, size_t inputLen, Signedness s)
1 by weidai Initial revision	3075	{
	3076	assert(bt.MaxRetrievable() >= inputLen);
	3077
	3078	byte b;
	3079	bt.Peek(b);
	3080	sign = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
	3081
	3082	while (inputLen>0 && (sign==POSITIVE ? b==0 : b==0xff))
	3083	{
	3084	bt.Skip(1);
	3085	inputLen--;
	3086	bt.Peek(b);
	3087	}
	3088
	3089	reg.CleanNew(RoundupSize(BytesToWords(inputLen)));
	3090
184 by weidai port to MSVC .NET 2005 beta 2	3091	for (size_t i=inputLen; i > 0; i--)
1 by weidai Initial revision	3092	{
1 by weidai Initial revision	3093	bt.Get(b);
100 by weidai fix bugs in 64-bit CPU support	3094	reg[(i-1)/WORD_SIZE] \|= word(b) << ((i-1)%WORD_SIZE)*8;
1 by weidai Initial revision	3095	}
	3096
	3097	if (sign == NEGATIVE)
	3098	{
184 by weidai port to MSVC .NET 2005 beta 2	3099	for (size_t i=inputLen; i<reg.size()*WORD_SIZE; i++)
100 by weidai fix bugs in 64-bit CPU support	3100	reg[i/WORD_SIZE] \|= word(0xff) << (i%WORD_SIZE)*8;
1 by weidai Initial revision	3101	TwosComplement(reg, reg.size());
	3102	}
	3103	}
	3104
184 by weidai port to MSVC .NET 2005 beta 2	3105	size_t Integer::MinEncodedSize(Signedness signedness) const
1 by weidai Initial revision	3106	{
	3107	unsigned int outputLen = STDMAX(1U, ByteCount());
	3108	if (signedness == UNSIGNED)
	3109	return outputLen;
	3110	if (NotNegative() && (GetByte(outputLen-1) & 0x80))
	3111	outputLen++;
	3112	if (IsNegative() && this < -Power2(outputLen8-1))
	3113	outputLen++;
	3114	return outputLen;
	3115	}
	3116
184 by weidai port to MSVC .NET 2005 beta 2	3117	void Integer::Encode(byte *output, size_t outputLen, Signedness signedness) const
1 by weidai Initial revision	3118	{
1 by weidai Initial revision	3119	ArraySink sink(output, outputLen);
184 by weidai port to MSVC .NET 2005 beta 2	3120	Encode(sink, outputLen, signedness);
1 by weidai Initial revision	3121	}
1 by weidai Initial revision	3122
184 by weidai port to MSVC .NET 2005 beta 2	3123	void Integer::Encode(BufferedTransformation &bt, size_t outputLen, Signedness signedness) const
1 by weidai Initial revision	3124	{
	3125	if (signedness == UNSIGNED \|\| NotNegative())
	3126	{
184 by weidai port to MSVC .NET 2005 beta 2	3127	for (size_t i=outputLen; i > 0; i--)
1 by weidai Initial revision	3128	bt.Put(GetByte(i-1));
	3129	}
	3130	else
	3131	{
	3132	// take two's complement of *this
203 by weidai fix Integer::Encode	3133	Integer temp = Integer::Power2(8STDMAX((size_t)ByteCount(), outputLen)) + this;
184 by weidai port to MSVC .NET 2005 beta 2	3134	temp.Encode(bt, outputLen, UNSIGNED);
1 by weidai Initial revision	3135	}
	3136	}
	3137
	3138	void Integer::DEREncode(BufferedTransformation &bt) const
	3139	{
	3140	DERGeneralEncoder enc(bt, INTEGER);
	3141	Encode(enc, MinEncodedSize(SIGNED), SIGNED);
	3142	enc.MessageEnd();
	3143	}
	3144
184 by weidai port to MSVC .NET 2005 beta 2	3145	void Integer::BERDecode(const byte *input, size_t len)
1 by weidai Initial revision	3146	{
	3147	StringStore store(input, len);
	3148	BERDecode(store);
	3149	}
	3150
	3151	void Integer::BERDecode(BufferedTransformation &bt)
	3152	{
	3153	BERGeneralDecoder dec(bt, INTEGER);
	3154	if (!dec.IsDefiniteLength() \|\| dec.MaxRetrievable() < dec.RemainingLength())
	3155	BERDecodeError();
184 by weidai port to MSVC .NET 2005 beta 2	3156	Decode(dec, (size_t)dec.RemainingLength(), SIGNED);
1 by weidai Initial revision	3157	dec.MessageEnd();
	3158	}
	3159
184 by weidai port to MSVC .NET 2005 beta 2	3160	void Integer::DEREncodeAsOctetString(BufferedTransformation &bt, size_t length) const
1 by weidai Initial revision	3161	{
	3162	DERGeneralEncoder enc(bt, OCTET_STRING);
	3163	Encode(enc, length);
	3164	enc.MessageEnd();
	3165	}
	3166
184 by weidai port to MSVC .NET 2005 beta 2	3167	void Integer::BERDecodeAsOctetString(BufferedTransformation &bt, size_t length)
1 by weidai Initial revision	3168	{
	3169	BERGeneralDecoder dec(bt, OCTET_STRING);
	3170	if (!dec.IsDefiniteLength() \|\| dec.RemainingLength() != length)
	3171	BERDecodeError();
	3172	Decode(dec, length);
	3173	dec.MessageEnd();
	3174	}
	3175
184 by weidai port to MSVC .NET 2005 beta 2	3176	size_t Integer::OpenPGPEncode(byte *output, size_t len) const
1 by weidai Initial revision	3177	{
	3178	ArraySink sink(output, len);
	3179	return OpenPGPEncode(sink);
	3180	}
	3181
184 by weidai port to MSVC .NET 2005 beta 2	3182	size_t Integer::OpenPGPEncode(BufferedTransformation &bt) const
1 by weidai Initial revision	3183	{
	3184	word16 bitCount = BitCount();
	3185	bt.PutWord16(bitCount);
184 by weidai port to MSVC .NET 2005 beta 2	3186	size_t byteCount = BitsToBytes(bitCount);
	3187	Encode(bt, byteCount);
	3188	return 2 + byteCount;
1 by weidai Initial revision	3189	}
1 by weidai Initial revision	3190
184 by weidai port to MSVC .NET 2005 beta 2	3191	void Integer::OpenPGPDecode(const byte *input, size_t len)
1 by weidai Initial revision	3192	{
	3193	StringStore store(input, len);
	3194	OpenPGPDecode(store);
	3195	}
	3196
	3197	void Integer::OpenPGPDecode(BufferedTransformation &bt)
	3198	{
	3199	word16 bitCount;
	3200	if (bt.GetWord16(bitCount) != 2 \|\| bt.MaxRetrievable() < BitsToBytes(bitCount))
	3201	throw OpenPGPDecodeErr();
	3202	Decode(bt, BitsToBytes(bitCount));
	3203	}
	3204
184 by weidai port to MSVC .NET 2005 beta 2	3205	void Integer::Randomize(RandomNumberGenerator &rng, size_t nbits)
1 by weidai Initial revision	3206	{
184 by weidai port to MSVC .NET 2005 beta 2	3207	const size_t nbytes = nbits/8 + 1;
1 by weidai Initial revision	3208	SecByteBlock buf(nbytes);
	3209	rng.GenerateBlock(buf, nbytes);
	3210	if (nbytes)
	3211	buf[0] = (byte)Crop(buf[0], nbits % 8);
	3212	Decode(buf, nbytes, UNSIGNED);
	3213	}
	3214
	3215	void Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max)
	3216	{
	3217	if (min > max)
	3218	throw InvalidArgument("Integer: Min must be no greater than Max");
	3219
	3220	Integer range = max - min;
	3221	const unsigned int nbits = range.BitCount();
	3222
	3223	do
	3224	{
	3225	Randomize(rng, nbits);
	3226	}
	3227	while (*this > range);
	3228
	3229	*this += min;
	3230	}
	3231
	3232	bool Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
	3233	{
	3234	return GenerateRandomNoThrow(rng, MakeParameters("Min", min)("Max", max)("RandomNumberType", rnType)("EquivalentTo", equiv)("Mod", mod));
	3235	}
	3236
	3237	class KDF2_RNG : public RandomNumberGenerator
	3238	{
	3239	public:
184 by weidai port to MSVC .NET 2005 beta 2	3240	KDF2_RNG(const byte *seed, size_t seedSize)
1 by weidai Initial revision	3241	: m_counter(0), m_counterAndSeed(seedSize + 4)
	3242	{
	3243	memcpy(m_counterAndSeed + 4, seed, seedSize);
	3244	}
	3245
250 by weidai fix compile with Sun CC 64-bit	3246	void GenerateBlock(byte *output, size_t size)
1 by weidai Initial revision	3247	{
270 by weidai MMX/SSE2 optimizations	3248	PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
1 by weidai Initial revision	3249	++m_counter;
86 by weidai added support for using encoding parameters and key derivation parameters	3250	P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULL, 0);
1 by weidai Initial revision	3251	}
	3252
	3253	private:
	3254	word32 m_counter;
	3255	SecByteBlock m_counterAndSeed;
	3256	};
	3257
	3258	bool Integer::GenerateRandomNoThrow(RandomNumberGenerator &i_rng, const NameValuePairs &params)
	3259	{
	3260	Integer min = params.GetValueWithDefault("Min", Integer::Zero());
	3261	Integer max;
	3262	if (!params.GetValue("Max", max))
	3263	{
	3264	int bitLength;
	3265	if (params.GetIntValue("BitLength", bitLength))
	3266	max = Integer::Power2(bitLength);
	3267	else
	3268	throw InvalidArgument("Integer: missing Max argument");
	3269	}
	3270	if (min > max)
	3271	throw InvalidArgument("Integer: Min must be no greater than Max");
	3272
	3273	Integer equiv = params.GetValueWithDefault("EquivalentTo", Integer::Zero());
	3274	Integer mod = params.GetValueWithDefault("Mod", Integer::One());
	3275
	3276	if (equiv.IsNegative() \|\| equiv >= mod)
	3277	throw InvalidArgument("Integer: invalid EquivalentTo and/or Mod argument");
	3278
	3279	Integer::RandomNumberType rnType = params.GetValueWithDefault("RandomNumberType", Integer::ANY);
	3280
	3281	member_ptr<KDF2_RNG> kdf2Rng;
	3282	ConstByteArrayParameter seed;
412 by weidai changes for 5.6:	3283	if (params.GetValue(Name::Seed(), seed))
1 by weidai Initial revision	3284	{
	3285	ByteQueue bq;
	3286	DERSequenceEncoder seq(bq);
	3287	min.DEREncode(seq);
	3288	max.DEREncode(seq);
	3289	equiv.DEREncode(seq);
	3290	mod.DEREncode(seq);
	3291	DEREncodeUnsigned(seq, rnType);
	3292	DEREncodeOctetString(seq, seed.begin(), seed.size());
	3293	seq.MessageEnd();
	3294
184 by weidai port to MSVC .NET 2005 beta 2	3295	SecByteBlock finalSeed((size_t)bq.MaxRetrievable());
1 by weidai Initial revision	3296	bq.Get(finalSeed, finalSeed.size());
	3297	kdf2Rng.reset(new KDF2_RNG(finalSeed.begin(), finalSeed.size()));
	3298	}
	3299	RandomNumberGenerator &rng = kdf2Rng.get() ? (RandomNumberGenerator &)*kdf2Rng : i_rng;
	3300
	3301	switch (rnType)
	3302	{
	3303	case ANY:
	3304	if (mod == One())
	3305	Randomize(rng, min, max);
	3306	else
	3307	{
	3308	Integer min1 = min + (equiv-min)%mod;
	3309	if (max < min1)
	3310	return false;
	3311	Randomize(rng, Zero(), (max - min1) / mod);
	3312	this = mod;
	3313	*this += min1;
	3314	}
	3315	return true;
	3316
	3317	case PRIME:
	3318	{
86 by weidai added support for using encoding parameters and key derivation parameters	3319	const PrimeSelector pSelector = params.GetValueWithDefault(Name::PointerToPrimeSelector(), (const PrimeSelector )NULL);
1 by weidai Initial revision	3320
	3321	int i;
	3322	i = 0;
	3323	while (1)
	3324	{
	3325	if (++i==16)
	3326	{
	3327	// check if there are any suitable primes in [min, max]
	3328	Integer first = min;
	3329	if (FirstPrime(first, max, equiv, mod, pSelector))
	3330	{
	3331	// if there is only one suitable prime, we're done
	3332	*this = first;
	3333	if (!FirstPrime(first, max, equiv, mod, pSelector))
	3334	return true;
	3335	}
	3336	else
	3337	return false;
	3338	}
	3339
	3340	Randomize(rng, min, max);
	3341	if (FirstPrime(this, STDMIN(this+mod*PrimeSearchInterval(max), max), equiv, mod, pSelector))
	3342	return true;
	3343	}
	3344	}
	3345
	3346	default:
	3347	throw InvalidArgument("Integer: invalid RandomNumberType argument");
	3348	}
	3349	}
	3350
	3351	std::istream& operator>>(std::istream& in, Integer &a)
	3352	{
	3353	char c;
	3354	unsigned int length = 0;
	3355	SecBlock<char> str(length + 16);
	3356
	3357	std::ws(in);
	3358
	3359	do
	3360	{
	3361	in.read(&c, 1);
	3362	str[length++] = c;
	3363	if (length >= str.size())
	3364	str.Grow(length + 16);
	3365	}
	3366	while (in && (c=='-' \|\| c=='x' \|\| (c>='0' && c<='9') \|\| (c>='a' && c<='f') \|\| (c>='A' && c<='F') \|\| c=='h' \|\| c=='H' \|\| c=='o' \|\| c=='O' \|\| c==',' \|\| c=='.'));
	3367
	3368	if (in.gcount())
	3369	in.putback(c);
	3370	str[length-1] = '\0';
	3371	a = Integer(str);
	3372
	3373	return in;
	3374	}
	3375
	3376	std::ostream& operator<<(std::ostream& out, const Integer &a)
	3377	{
	3378	// Get relevant conversion specifications from ostream.
	3379	long f = out.flags() & std::ios::basefield; // Get base digits.
	3380	int base, block;
	3381	char suffix;
	3382	switch(f)
	3383	{
3384	case std::ios::oct :
3385	base = 8;
3386	block = 8;
3387	suffix = 'o';
3388	break;
3389	case std::ios::hex :
3390	base = 16;
3391	block = 4;
3392	suffix = 'h';
3393	break;
3394	default :
3395	base = 10;
3396	block = 3;
3397	suffix = '.';
3398	}
3399
3400	Integer temp1=a, temp2;
421 by weidai from Jeffery Walton: move *.dat files into TestData, make Integer operator<< respect ios::uppercase flag	3401
1 by weidai Initial revision	3402	if (a.IsNegative())
	3403	{
	3404	out << '-';
	3405	temp1.Negate();
	3406	}
	3407
	3408	if (!a)
	3409	out << '0';
	3410
421 by weidai from Jeffery Walton: move *.dat files into TestData, make Integer operator<< respect ios::uppercase flag	3411	static const char upper[]="0123456789ABCDEF";
	3412	static const char lower[]="0123456789abcdef";
	3413
	3414	const char* vec = (out.flags() & std::ios::uppercase) ? upper : lower;
	3415	unsigned i=0;
	3416	SecBlock<char> s(a.BitCount() / (BitPrecision(base)-1) + 1);
	3417
1 by weidai Initial revision	3418	while (!!temp1)
	3419	{
	3420	word digit;
	3421	Integer::Divide(digit, temp2, temp1, base);
	3422	s[i++]=vec[digit];
421 by weidai from Jeffery Walton: move *.dat files into TestData, make Integer operator<< respect ios::uppercase flag	3423	temp1.swap(temp2);
1 by weidai Initial revision	3424	}
	3425
	3426	while (i--)
	3427	{
	3428	out << s[i];
	3429	// if (i && !(i%block))
	3430	// out << ",";
	3431	}
	3432	return out << suffix;
	3433	}
	3434
	3435	Integer& Integer::operator++()
	3436	{
	3437	if (NotNegative())
	3438	{
	3439	if (Increment(reg, reg.size()))
	3440	{
	3441	reg.CleanGrow(2*reg.size());
	3442	reg[reg.size()/2]=1;
	3443	}
	3444	}
	3445	else
	3446	{
	3447	word borrow = Decrement(reg, reg.size());
	3448	assert(!borrow);
	3449	if (WordCount()==0)
	3450	*this = Zero();
	3451	}
	3452	return *this;
	3453	}
	3454
	3455	Integer& Integer::operator--()
	3456	{
	3457	if (IsNegative())
	3458	{
	3459	if (Increment(reg, reg.size()))
	3460	{
	3461	reg.CleanGrow(2*reg.size());
	3462	reg[reg.size()/2]=1;
	3463	}
	3464	}
	3465	else
	3466	{
	3467	if (Decrement(reg, reg.size()))
	3468	*this = -One();
	3469	}
	3470	return *this;
	3471	}
	3472
	3473	void PositiveAdd(Integer &sum, const Integer &a, const Integer& b)
	3474	{
202 by weidai fix MSVC 2005 warnings	3475	int carry;
1 by weidai Initial revision	3476	if (a.reg.size() == b.reg.size())
	3477	carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
	3478	else if (a.reg.size() > b.reg.size())
	3479	{
	3480	carry = Add(sum.reg, a.reg, b.reg, b.reg.size());
	3481	CopyWords(sum.reg+b.reg.size(), a.reg+b.reg.size(), a.reg.size()-b.reg.size());
	3482	carry = Increment(sum.reg+b.reg.size(), a.reg.size()-b.reg.size(), carry);
	3483	}
	3484	else
	3485	{
	3486	carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
	3487	CopyWords(sum.reg+a.reg.size(), b.reg+a.reg.size(), b.reg.size()-a.reg.size());
	3488	carry = Increment(sum.reg+a.reg.size(), b.reg.size()-a.reg.size(), carry);
	3489	}
	3490
	3491	if (carry)
	3492	{
	3493	sum.reg.CleanGrow(2*sum.reg.size());
	3494	sum.reg[sum.reg.size()/2] = 1;
	3495	}
	3496	sum.sign = Integer::POSITIVE;
	3497	}
	3498
	3499	void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
	3500	{
	3501	unsigned aSize = a.WordCount();
	3502	aSize += aSize%2;
	3503	unsigned bSize = b.WordCount();
	3504	bSize += bSize%2;
	3505
	3506	if (aSize == bSize)
	3507	{
	3508	if (Compare(a.reg, b.reg, aSize) >= 0)
	3509	{
	3510	Subtract(diff.reg, a.reg, b.reg, aSize);
	3511	diff.sign = Integer::POSITIVE;
	3512	}
	3513	else
	3514	{
	3515	Subtract(diff.reg, b.reg, a.reg, aSize);
	3516	diff.sign = Integer::NEGATIVE;
	3517	}
	3518	}
	3519	else if (aSize > bSize)
	3520	{
	3521	word borrow = Subtract(diff.reg, a.reg, b.reg, bSize);
	3522	CopyWords(diff.reg+bSize, a.reg+bSize, aSize-bSize);
	3523	borrow = Decrement(diff.reg+bSize, aSize-bSize, borrow);
	3524	assert(!borrow);
	3525	diff.sign = Integer::POSITIVE;
	3526	}
	3527	else
	3528	{
	3529	word borrow = Subtract(diff.reg, b.reg, a.reg, aSize);
	3530	CopyWords(diff.reg+aSize, b.reg+aSize, bSize-aSize);
	3531	borrow = Decrement(diff.reg+aSize, bSize-aSize, borrow);
	3532	assert(!borrow);
	3533	diff.sign = Integer::NEGATIVE;
	3534	}
	3535	}
	3536
// MSVC .NET 2003 workaround: a local two-argument maximum.
// Returns b when a < b, otherwise a (so a is returned when neither is less).
template <class T> inline const T& STDMAX2(const T& a, const T& b)
{
	if (a < b)
		return b;
	return a;
}
	3542
1 by weidai Initial revision	3543	Integer Integer::Plus(const Integer& b) const
1 by weidai Initial revision	3544	{
184 by weidai port to MSVC .NET 2005 beta 2	3545	Integer sum((word)0, STDMAX2(reg.size(), b.reg.size()));
1 by weidai Initial revision	3546	if (NotNegative())
	3547	{
	3548	if (b.NotNegative())
	3549	PositiveAdd(sum, *this, b);
	3550	else
	3551	PositiveSubtract(sum, *this, b);
	3552	}
	3553	else
	3554	{
	3555	if (b.NotNegative())
	3556	PositiveSubtract(sum, b, *this);
	3557	else
	3558	{
	3559	PositiveAdd(sum, *this, b);
	3560	sum.sign = Integer::NEGATIVE;
	3561	}
	3562	}
	3563	return sum;
	3564	}
	3565
	3566	Integer& Integer::operator+=(const Integer& t)
	3567	{
	3568	reg.CleanGrow(t.reg.size());
	3569	if (NotNegative())
	3570	{
	3571	if (t.NotNegative())
	3572	PositiveAdd(this, this, t);
	3573	else
	3574	PositiveSubtract(this, this, t);
	3575	}
	3576	else
	3577	{
	3578	if (t.NotNegative())
	3579	PositiveSubtract(this, t, this);
	3580	else
	3581	{
	3582	PositiveAdd(this, this, t);
	3583	sign = Integer::NEGATIVE;
	3584	}
	3585	}
	3586	return *this;
	3587	}
	3588
	3589	Integer Integer::Minus(const Integer& b) const
	3590	{
184 by weidai port to MSVC .NET 2005 beta 2	3591	Integer diff((word)0, STDMAX2(reg.size(), b.reg.size()));
1 by weidai Initial revision	3592	if (NotNegative())
	3593	{
	3594	if (b.NotNegative())
	3595	PositiveSubtract(diff, *this, b);
	3596	else
	3597	PositiveAdd(diff, *this, b);
	3598	}
	3599	else
	3600	{
	3601	if (b.NotNegative())
	3602	{
	3603	PositiveAdd(diff, *this, b);
	3604	diff.sign = Integer::NEGATIVE;
	3605	}
	3606	else
	3607	PositiveSubtract(diff, b, *this);
	3608	}
	3609	return diff;
	3610	}
	3611
	3612	Integer& Integer::operator-=(const Integer& t)
	3613	{
	3614	reg.CleanGrow(t.reg.size());
	3615	if (NotNegative())
	3616	{
	3617	if (t.NotNegative())
	3618	PositiveSubtract(this, this, t);
	3619	else
	3620	PositiveAdd(this, this, t);
	3621	}
	3622	else
	3623	{
	3624	if (t.NotNegative())
	3625	{
	3626	PositiveAdd(this, this, t);
	3627	sign = Integer::NEGATIVE;
	3628	}
	3629	else
	3630	PositiveSubtract(this, t, this);
	3631	}
	3632	return *this;
	3633	}
	3634
184 by weidai port to MSVC .NET 2005 beta 2	3635	Integer& Integer::operator<<=(size_t n)
1 by weidai Initial revision	3636	{
184 by weidai port to MSVC .NET 2005 beta 2	3637	const size_t wordCount = WordCount();
	3638	const size_t shiftWords = n / WORD_BITS;
	3639	const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
1 by weidai Initial revision	3640
	3641	reg.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
	3642	ShiftWordsLeftByWords(reg, wordCount + shiftWords, shiftWords);
	3643	ShiftWordsLeftByBits(reg+shiftWords, wordCount+BitsToWords(shiftBits), shiftBits);
	3644	return *this;
	3645	}
	3646
184 by weidai port to MSVC .NET 2005 beta 2	3647	Integer& Integer::operator>>=(size_t n)
1 by weidai Initial revision	3648	{
184 by weidai port to MSVC .NET 2005 beta 2	3649	const size_t wordCount = WordCount();
	3650	const size_t shiftWords = n / WORD_BITS;
	3651	const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
1 by weidai Initial revision	3652
	3653	ShiftWordsRightByWords(reg, wordCount, shiftWords);
	3654	if (wordCount > shiftWords)
	3655	ShiftWordsRightByBits(reg, wordCount-shiftWords, shiftBits);
	3656	if (IsNegative() && WordCount()==0) // avoid -0
	3657	*this = Zero();
	3658	return *this;
	3659	}
	3660
	3661	void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)
	3662	{
184 by weidai port to MSVC .NET 2005 beta 2	3663	size_t aSize = RoundupSize(a.WordCount());
184 by weidai port to MSVC .NET 2005 beta 2	3664	size_t bSize = RoundupSize(b.WordCount());
1 by weidai Initial revision	3665
	3666	product.reg.CleanNew(RoundupSize(aSize+bSize));
	3667	product.sign = Integer::POSITIVE;
	3668
270 by weidai MMX/SSE2 optimizations	3669	IntegerSecBlock workspace(aSize + bSize);
1 by weidai Initial revision	3670	AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize);
	3671	}
	3672
	3673	void Multiply(Integer &product, const Integer &a, const Integer &b)
	3674	{
	3675	PositiveMultiply(product, a, b);
	3676
	3677	if (a.NotNegative() != b.NotNegative())
	3678	product.Negate();
	3679	}
	3680
	3681	Integer Integer::Times(const Integer &b) const
	3682	{
	3683	Integer product;
	3684	Multiply(product, *this, b);
	3685	return product;
	3686	}
	3687
	3688	/*
	3689	void PositiveDivide(Integer &remainder, Integer &quotient,
	3690	const Integer &dividend, const Integer &divisor)
	3691	{
	3692	remainder.reg.CleanNew(divisor.reg.size());
	3693	remainder.sign = Integer::POSITIVE;
	3694	quotient.reg.New(0);
	3695	quotient.sign = Integer::POSITIVE;
	3696	unsigned i=dividend.BitCount();
	3697	while (i--)
	3698	{
	3699	word overflow = ShiftWordsLeftByBits(remainder.reg, remainder.reg.size(), 1);
	3700	remainder.reg[0] |= dividend[i];
	3701	if (overflow || remainder >= divisor)
	3702	{
	3703	Subtract(remainder.reg, remainder.reg, divisor.reg, remainder.reg.size());
	3704	quotient.SetBit(i);
	3705	}
	3706	}
	3707	}
	3708	*/
	3709
	3710	void PositiveDivide(Integer &remainder, Integer &quotient,
	3711	const Integer &a, const Integer &b)
	3712	{
	3713	unsigned aSize = a.WordCount();
	3714	unsigned bSize = b.WordCount();
	3715
	3716	if (!bSize)
	3717	throw Integer::DivideByZero();
	3718
412 by weidai changes for 5.6:	3719	if (aSize < bSize)
1 by weidai Initial revision	3720	{
	3721	remainder = a;
	3722	remainder.sign = Integer::POSITIVE;
	3723	quotient = Integer::Zero();
	3724	return;
	3725	}
	3726
	3727	aSize += aSize%2; // round up to next even number
	3728	bSize += bSize%2;
	3729
	3730	remainder.reg.CleanNew(RoundupSize(bSize));
	3731	remainder.sign = Integer::POSITIVE;
	3732	quotient.reg.CleanNew(RoundupSize(aSize-bSize+2));
	3733	quotient.sign = Integer::POSITIVE;
	3734
270 by weidai MMX/SSE2 optimizations	3735	IntegerSecBlock T(aSize+3*(bSize+2));
1 by weidai Initial revision	3736	Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize);
	3737	}
	3738
	3739	void Integer::Divide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor)
	3740	{
	3741	PositiveDivide(remainder, quotient, dividend, divisor);
	3742
	3743	if (dividend.IsNegative())
	3744	{
	3745	quotient.Negate();
	3746	if (remainder.NotZero())
	3747	{
	3748	--quotient;
	3749	remainder = divisor.AbsoluteValue() - remainder;
	3750	}
	3751	}
	3752
	3753	if (divisor.IsNegative())
	3754	quotient.Negate();
	3755	}
	3756
	3757	void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a, unsigned int n)
	3758	{
	3759	q = a;
	3760	q >>= n;
	3761
184 by weidai port to MSVC .NET 2005 beta 2	3762	const size_t wordCount = BitsToWords(n);
1 by weidai Initial revision	3763	if (wordCount <= a.WordCount())
	3764	{
	3765	r.reg.resize(RoundupSize(wordCount));
	3766	CopyWords(r.reg, a.reg, wordCount);
	3767	SetWords(r.reg+wordCount, 0, r.reg.size()-wordCount);
	3768	if (n % WORD_BITS != 0)
202 by weidai fix MSVC 2005 warnings	3769	r.reg[wordCount-1] %= (word(1) << (n % WORD_BITS));
1 by weidai Initial revision	3770	}
	3771	else
	3772	{
	3773	r.reg.resize(RoundupSize(a.WordCount()));
	3774	CopyWords(r.reg, a.reg, r.reg.size());
	3775	}
	3776	r.sign = POSITIVE;
	3777
	3778	if (a.IsNegative() && r.NotZero())
	3779	{
	3780	--q;
	3781	r = Power2(n) - r;
	3782	}
	3783	}
	3784
	3785	Integer Integer::DividedBy(const Integer &b) const
	3786	{
	3787	Integer remainder, quotient;
	3788	Integer::Divide(remainder, quotient, *this, b);
	3789	return quotient;
	3790	}
	3791
	3792	Integer Integer::Modulo(const Integer &b) const
	3793	{
	3794	Integer remainder, quotient;
	3795	Integer::Divide(remainder, quotient, *this, b);
	3796	return remainder;
	3797	}
	3798
	3799	void Integer::Divide(word &remainder, Integer &quotient, const Integer &dividend, word divisor)
	3800	{
	3801	if (!divisor)
	3802	throw Integer::DivideByZero();
	3803
	3804	assert(divisor);
	3805
	3806	if ((divisor & (divisor-1)) == 0) // divisor is a power of 2
	3807	{
	3808	quotient = dividend >> (BitPrecision(divisor)-1);
	3809	remainder = dividend.reg[0] & (divisor-1);
	3810	return;
	3811	}
	3812
	3813	unsigned int i = dividend.WordCount();
	3814	quotient.reg.CleanNew(RoundupSize(i));
	3815	remainder = 0;
	3816	while (i--)
	3817	{
100 by weidai fix bugs in 64-bit CPU support	3818	quotient.reg[i] = DWord(dividend.reg[i], remainder) / divisor;
100 by weidai fix bugs in 64-bit CPU support	3819	remainder = DWord(dividend.reg[i], remainder) % divisor;
1 by weidai Initial revision	3820	}
	3821
	3822	if (dividend.NotNegative())
	3823	quotient.sign = POSITIVE;
	3824	else
	3825	{
	3826	quotient.sign = NEGATIVE;
	3827	if (remainder)
	3828	{
	3829	--quotient;
	3830	remainder = divisor - remainder;
	3831	}
	3832	}
	3833	}
	3834
	3835	Integer Integer::DividedBy(word b) const
	3836	{
	3837	word remainder;
	3838	Integer quotient;
	3839	Integer::Divide(remainder, quotient, *this, b);
	3840	return quotient;
	3841	}
	3842
	3843	word Integer::Modulo(word divisor) const
	3844	{
	3845	if (!divisor)
	3846	throw Integer::DivideByZero();
	3847
	3848	assert(divisor);
	3849
	3850	word remainder;
	3851
	3852	if ((divisor & (divisor-1)) == 0) // divisor is a power of 2
	3853	remainder = reg[0] & (divisor-1);
	3854	else
	3855	{
	3856	unsigned int i = WordCount();
	3857
	3858	if (divisor <= 5)
	3859	{
100 by weidai fix bugs in 64-bit CPU support	3860	DWord sum(0, 0);
1 by weidai Initial revision	3861	while (i--)
1 by weidai Initial revision	3862	sum += reg[i];
100 by weidai fix bugs in 64-bit CPU support	3863	remainder = sum % divisor;
1 by weidai Initial revision	3864	}
	3865	else
	3866	{
	3867	remainder = 0;
	3868	while (i--)
100 by weidai fix bugs in 64-bit CPU support	3869	remainder = DWord(reg[i], remainder) % divisor;
1 by weidai Initial revision	3870	}
	3871	}
	3872
	3873	if (IsNegative() && remainder)
	3874	remainder = divisor - remainder;
	3875
	3876	return remainder;
	3877	}
	3878
	3879	void Integer::Negate()
	3880	{
	3881	if (!!(this)) // don't flip sign if this==0
	3882	sign = Sign(1-sign);
	3883	}
	3884
	3885	int Integer::PositiveCompare(const Integer& t) const
	3886	{
	3887	unsigned size = WordCount(), tSize = t.WordCount();
	3888
	3889	if (size == tSize)
	3890	return CryptoPP::Compare(reg, t.reg, size);
	3891	else
	3892	return size > tSize ? 1 : -1;
	3893	}
	3894
	3895	int Integer::Compare(const Integer& t) const
	3896	{
	3897	if (NotNegative())
	3898	{
	3899	if (t.NotNegative())
	3900	return PositiveCompare(t);
	3901	else
	3902	return 1;
	3903	}
	3904	else
	3905	{
	3906	if (t.NotNegative())
	3907	return -1;
	3908	else
	3909	return -PositiveCompare(t);
	3910	}
	3911	}
	3912
	3913	Integer Integer::SquareRoot() const
	3914	{
	3915	if (!IsPositive())
	3916	return Zero();
	3917
	3918	// overestimate square root
	3919	Integer x, y = Power2((BitCount()+1)/2);
	3920	assert(yy >= this);
	3921
	3922	do
	3923	{
	3924	x = y;
	3925	y = (x + *this/x) >> 1;
	3926	} while (y<x);
	3927
	3928	return x;
	3929	}
	3930
	3931	bool Integer::IsSquare() const
	3932	{
	3933	Integer r = SquareRoot();
3934	return *this == r.Squared();
3935	}
3936
3937	bool Integer::IsUnit() const
3938	{
3939	return (WordCount() == 1) && (reg[0] == 1);
3940	}
3941
3942	Integer Integer::MultiplicativeInverse() const
3943	{
3944	return IsUnit() ? *this : Zero();
3945	}
3946
3947	Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
3948	{
3949	return x*y%m;
3950	}
3951
3952	Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
3953	{
3954	ModularArithmetic mr(m);
3955	return mr.Exponentiate(x, e);
3956	}
3957
3958	Integer Integer::Gcd(const Integer &a, const Integer &b)
3959	{
3960	return EuclideanDomainOf<Integer>().Gcd(a, b);
3961	}
3962
3963	Integer Integer::InverseMod(const Integer &m) const
3964	{
3965	assert(m.NotNegative());
3966
412 by weidai changes for 5.6:	3967	if (IsNegative())
412 by weidai changes for 5.6:	3968	return Modulo(m).InverseMod(m);
1 by weidai Initial revision	3969
	3970	if (m.IsEven())
	3971	{
	3972	if (!m \|\| IsEven())
	3973	return Zero(); // no inverse
	3974	if (*this == One())
	3975	return One();
	3976
412 by weidai changes for 5.6:	3977	Integer u = m.Modulo(this).InverseMod(this);
1 by weidai Initial revision	3978	return !u ? Zero() : (m(this-u)+1)/(*this);
	3979	}
	3980
	3981	SecBlock<word> T(m.reg.size() * 4);
	3982	Integer r((word)0, m.reg.size());
	3983	unsigned k = AlmostInverse(r.reg, T, reg, reg.size(), m.reg, m.reg.size());
	3984	DivideByPower2Mod(r.reg, r.reg, k, m.reg, m.reg.size());
	3985	return r;
	3986	}
	3987
249 by weidai update version number, port to Sun C++ 5.8	3988	word Integer::InverseMod(word mod) const
1 by weidai Initial revision	3989	{
	3990	word g0 = mod, g1 = *this % mod;
	3991	word v0 = 0, v1 = 1;
	3992	word y;
	3993
	3994	while (g1)
	3995	{
	3996	if (g1 == 1)
	3997	return v1;
	3998	y = g0 / g1;
	3999	g0 = g0 % g1;
	4000	v0 += y * v1;
	4001
	4002	if (!g0)
	4003	break;
	4004	if (g0 == 1)
	4005	return mod-v0;
	4006	y = g1 / g0;
	4007	g1 = g1 % g0;
	4008	v1 += y * v0;
	4009	}
	4010	return 0;
	4011	}
	4012
	4013	// ********************************************************
	4014
	4015	ModularArithmetic::ModularArithmetic(BufferedTransformation &bt)
	4016	{
	4017	BERSequenceDecoder seq(bt);
	4018	OID oid(seq);
	4019	if (oid != ASN1::prime_field())
	4020	BERDecodeError();
181 by weidai changes done for FIPS-140 lab code drop	4021	m_modulus.BERDecode(seq);
1 by weidai Initial revision	4022	seq.MessageEnd();
181 by weidai changes done for FIPS-140 lab code drop	4023	m_result.reg.resize(m_modulus.reg.size());
1 by weidai Initial revision	4024	}
	4025
	4026	void ModularArithmetic::DEREncode(BufferedTransformation &bt) const
	4027	{
	4028	DERSequenceEncoder seq(bt);
	4029	ASN1::prime_field().DEREncode(seq);
181 by weidai changes done for FIPS-140 lab code drop	4030	m_modulus.DEREncode(seq);
1 by weidai Initial revision	4031	seq.MessageEnd();
	4032	}
	4033
	4034	void ModularArithmetic::DEREncodeElement(BufferedTransformation &out, const Element &a) const
	4035	{
	4036	a.DEREncodeAsOctetString(out, MaxElementByteLength());
	4037	}
	4038
	4039	void ModularArithmetic::BERDecodeElement(BufferedTransformation &in, Element &a) const
	4040	{
	4041	a.BERDecodeAsOctetString(in, MaxElementByteLength());
	4042	}
	4043
	4044	const Integer& ModularArithmetic::Half(const Integer &a) const
	4045	{
181 by weidai changes done for FIPS-140 lab code drop	4046	if (a.reg.size()==m_modulus.reg.size())
1 by weidai Initial revision	4047	{
181 by weidai changes done for FIPS-140 lab code drop	4048	CryptoPP::DivideByPower2Mod(m_result.reg.begin(), a.reg, 1, m_modulus.reg, a.reg.size());
181 by weidai changes done for FIPS-140 lab code drop	4049	return m_result;
1 by weidai Initial revision	4050	}
1 by weidai Initial revision	4051	else
181 by weidai changes done for FIPS-140 lab code drop	4052	return m_result1 = (a.IsEven() ? (a >> 1) : ((a+m_modulus) >> 1));
1 by weidai Initial revision	4053	}
	4054
	4055	const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
	4056	{
181 by weidai changes done for FIPS-140 lab code drop	4057	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
1 by weidai Initial revision	4058	{
181 by weidai changes done for FIPS-140 lab code drop	4059	if (CryptoPP::Add(m_result.reg.begin(), a.reg, b.reg, a.reg.size())
181 by weidai changes done for FIPS-140 lab code drop	4060	\|\| Compare(m_result.reg, m_modulus.reg, a.reg.size()) >= 0)
1 by weidai Initial revision	4061	{
181 by weidai changes done for FIPS-140 lab code drop	4062	CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
1 by weidai Initial revision	4063	}
181 by weidai changes done for FIPS-140 lab code drop	4064	return m_result;
1 by weidai Initial revision	4065	}
	4066	else
	4067	{
181 by weidai changes done for FIPS-140 lab code drop	4068	m_result1 = a+b;
	4069	if (m_result1 >= m_modulus)
	4070	m_result1 -= m_modulus;
	4071	return m_result1;
1 by weidai Initial revision	4072	}
	4073	}
	4074
	4075	Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
	4076	{
181 by weidai changes done for FIPS-140 lab code drop	4077	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
1 by weidai Initial revision	4078	{
1 by weidai Initial revision	4079	if (CryptoPP::Add(a.reg, a.reg, b.reg, a.reg.size())
181 by weidai changes done for FIPS-140 lab code drop	4080	\|\| Compare(a.reg, m_modulus.reg, a.reg.size()) >= 0)
1 by weidai Initial revision	4081	{
181 by weidai changes done for FIPS-140 lab code drop	4082	CryptoPP::Subtract(a.reg, a.reg, m_modulus.reg, a.reg.size());
1 by weidai Initial revision	4083	}
	4084	}
	4085	else
	4086	{
	4087	a+=b;
181 by weidai changes done for FIPS-140 lab code drop	4088	if (a>=m_modulus)
181 by weidai changes done for FIPS-140 lab code drop	4089	a-=m_modulus;
1 by weidai Initial revision	4090	}
	4091
	4092	return a;
	4093	}
	4094
	4095	const Integer& ModularArithmetic::Subtract(const Integer &a, const Integer &b) const
	4096	{
181 by weidai changes done for FIPS-140 lab code drop	4097	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
1 by weidai Initial revision	4098	{
181 by weidai changes done for FIPS-140 lab code drop	4099	if (CryptoPP::Subtract(m_result.reg.begin(), a.reg, b.reg, a.reg.size()))
	4100	CryptoPP::Add(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
	4101	return m_result;
1 by weidai Initial revision	4102	}
	4103	else
	4104	{
181 by weidai changes done for FIPS-140 lab code drop	4105	m_result1 = a-b;
	4106	if (m_result1.IsNegative())
	4107	m_result1 += m_modulus;
	4108	return m_result1;
1 by weidai Initial revision	4109	}
	4110	}
	4111
	4112	Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
	4113	{
181 by weidai changes done for FIPS-140 lab code drop	4114	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
1 by weidai Initial revision	4115	{
1 by weidai Initial revision	4116	if (CryptoPP::Subtract(a.reg, a.reg, b.reg, a.reg.size()))
181 by weidai changes done for FIPS-140 lab code drop	4117	CryptoPP::Add(a.reg, a.reg, m_modulus.reg, a.reg.size());
1 by weidai Initial revision	4118	}
	4119	else
	4120	{
	4121	a-=b;
	4122	if (a.IsNegative())
181 by weidai changes done for FIPS-140 lab code drop	4123	a+=m_modulus;
1 by weidai Initial revision	4124	}
	4125
	4126	return a;
	4127	}
	4128
	4129	const Integer& ModularArithmetic::Inverse(const Integer &a) const
	4130	{
	4131	if (!a)
	4132	return a;
	4133
181 by weidai changes done for FIPS-140 lab code drop	4134	CopyWords(m_result.reg.begin(), m_modulus.reg, m_modulus.reg.size());
181 by weidai changes done for FIPS-140 lab code drop	4135	if (CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, a.reg, a.reg.size()))
184 by weidai port to MSVC .NET 2005 beta 2	4136	Decrement(m_result.reg.begin()+a.reg.size(), m_modulus.reg.size()-a.reg.size());
1 by weidai Initial revision	4137
181 by weidai changes done for FIPS-140 lab code drop	4138	return m_result;
1 by weidai Initial revision	4139	}
	4140
	4141	Integer ModularArithmetic::CascadeExponentiate(const Integer &x, const Integer &e1, const Integer &y, const Integer &e2) const
	4142	{
181 by weidai changes done for FIPS-140 lab code drop	4143	if (m_modulus.IsOdd())
1 by weidai Initial revision	4144	{
181 by weidai changes done for FIPS-140 lab code drop	4145	MontgomeryRepresentation dr(m_modulus);
1 by weidai Initial revision	4146	return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1, dr.ConvertIn(y), e2));
	4147	}
	4148	else
	4149	return AbstractRing<Integer>::CascadeExponentiate(x, e1, y, e2);
	4150	}
	4151
	4152	void ModularArithmetic::SimultaneousExponentiate(Integer results, const Integer &base, const Integer exponents, unsigned int exponentsCount) const
	4153	{
181 by weidai changes done for FIPS-140 lab code drop	4154	if (m_modulus.IsOdd())
1 by weidai Initial revision	4155	{
181 by weidai changes done for FIPS-140 lab code drop	4156	MontgomeryRepresentation dr(m_modulus);
1 by weidai Initial revision	4157	dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents, exponentsCount);
	4158	for (unsigned int i=0; i<exponentsCount; i++)
	4159	results[i] = dr.ConvertOut(results[i]);
	4160	}
	4161	else
	4162	AbstractRing<Integer>::SimultaneousExponentiate(results, base, exponents, exponentsCount);
	4163	}
	4164
	4165	MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m) // modulus must be odd
	4166	: ModularArithmetic(m),
181 by weidai changes done for FIPS-140 lab code drop	4167	m_u((word)0, m_modulus.reg.size()),
181 by weidai changes done for FIPS-140 lab code drop	4168	m_workspace(5*m_modulus.reg.size())
1 by weidai Initial revision	4169	{
181 by weidai changes done for FIPS-140 lab code drop	4170	if (!m_modulus.IsOdd())
1 by weidai Initial revision	4171	throw InvalidArgument("MontgomeryRepresentation: Montgomery representation requires an odd modulus");
1 by weidai Initial revision	4172
181 by weidai changes done for FIPS-140 lab code drop	4173	RecursiveInverseModPower2(m_u.reg, m_workspace, m_modulus.reg, m_modulus.reg.size());
1 by weidai Initial revision	4174	}
	4175
	4176	const Integer& MontgomeryRepresentation::Multiply(const Integer &a, const Integer &b) const
	4177	{
181 by weidai changes done for FIPS-140 lab code drop	4178	word *const T = m_workspace.begin();
181 by weidai changes done for FIPS-140 lab code drop	4179	word *const R = m_result.reg.begin();
184 by weidai port to MSVC .NET 2005 beta 2	4180	const size_t N = m_modulus.reg.size();
1 by weidai Initial revision	4181	assert(a.reg.size()<=N && b.reg.size()<=N);
	4182
	4183	AsymmetricMultiply(T, T+2*N, a.reg, a.reg.size(), b.reg, b.reg.size());
	4184	SetWords(T+a.reg.size()+b.reg.size(), 0, 2*N-a.reg.size()-b.reg.size());
181 by weidai changes done for FIPS-140 lab code drop	4185	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
181 by weidai changes done for FIPS-140 lab code drop	4186	return m_result;
1 by weidai Initial revision	4187	}
	4188
	4189	const Integer& MontgomeryRepresentation::Square(const Integer &a) const
	4190	{
181 by weidai changes done for FIPS-140 lab code drop	4191	word *const T = m_workspace.begin();
181 by weidai changes done for FIPS-140 lab code drop	4192	word *const R = m_result.reg.begin();
184 by weidai port to MSVC .NET 2005 beta 2	4193	const size_t N = m_modulus.reg.size();
1 by weidai Initial revision	4194	assert(a.reg.size()<=N);
	4195
	4196	CryptoPP::Square(T, T+2*N, a.reg, a.reg.size());
	4197	SetWords(T+2a.reg.size(), 0, 2N-2*a.reg.size());
181 by weidai changes done for FIPS-140 lab code drop	4198	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
181 by weidai changes done for FIPS-140 lab code drop	4199	return m_result;
1 by weidai Initial revision	4200	}
	4201
	4202	Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
	4203	{
181 by weidai changes done for FIPS-140 lab code drop	4204	word *const T = m_workspace.begin();
181 by weidai changes done for FIPS-140 lab code drop	4205	word *const R = m_result.reg.begin();
184 by weidai port to MSVC .NET 2005 beta 2	4206	const size_t N = m_modulus.reg.size();
1 by weidai Initial revision	4207	assert(a.reg.size()<=N);
	4208
	4209	CopyWords(T, a.reg, a.reg.size());
	4210	SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
181 by weidai changes done for FIPS-140 lab code drop	4211	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
181 by weidai changes done for FIPS-140 lab code drop	4212	return m_result;
1 by weidai Initial revision	4213	}
	4214
	4215	const Integer& MontgomeryRepresentation::MultiplicativeInverse(const Integer &a) const
	4216	{
	4217	// return (EuclideanMultiplicativeInverse(a, modulus)<<(2WORD_BITSmodulus.reg.size()))%modulus;
181 by weidai changes done for FIPS-140 lab code drop	4218	word *const T = m_workspace.begin();
181 by weidai changes done for FIPS-140 lab code drop	4219	word *const R = m_result.reg.begin();
184 by weidai port to MSVC .NET 2005 beta 2	4220	const size_t N = m_modulus.reg.size();
1 by weidai Initial revision	4221	assert(a.reg.size()<=N);
	4222
	4223	CopyWords(T, a.reg, a.reg.size());
	4224	SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
181 by weidai changes done for FIPS-140 lab code drop	4225	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
181 by weidai changes done for FIPS-140 lab code drop	4226	unsigned k = AlmostInverse(R, T, R, N, m_modulus.reg, N);
1 by weidai Initial revision	4227
	4228	// cout << "k=" << k << " N32=" << 32N << endl;
	4229
	4230	if (k>N*WORD_BITS)
181 by weidai changes done for FIPS-140 lab code drop	4231	DivideByPower2Mod(R, R, k-N*WORD_BITS, m_modulus.reg, N);
1 by weidai Initial revision	4232	else
181 by weidai changes done for FIPS-140 lab code drop	4233	MultiplyByPower2Mod(R, R, N*WORD_BITS-k, m_modulus.reg, N);
1 by weidai Initial revision	4234
181 by weidai changes done for FIPS-140 lab code drop	4235	return m_result;
1 by weidai Initial revision	4236	}
	4237
	4238	NAMESPACE_END
75 by weidai create DLL version, fix GetNextIV() bug in CTR and OFB modes	4239
	4240	#endif