/* TomsFastMath, a fast ISO C bignum library.
 *
 * This project is meant to fill in where LibTomMath
 * falls short. That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
#include "bignum_fast.h"
12
/******************************************************************/
13
#if defined(TFM_X86) && !defined(TFM_SSE2)
24
"movl %5,%%eax \n\t" \
26
"addl %1,%%eax \n\t" \
27
"adcl $0,%%edx \n\t" \
28
"addl %%eax,%0 \n\t" \
29
"adcl $0,%%edx \n\t" \
30
"movl %%edx,%1 \n\t" \
31
:"=g"(_c[LO]), "=r"(cy) \
32
:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
33
: "%eax", "%edx", "cc")
39
"movzbl %%al,%1 \n\t" \
40
:"=g"(_c[LO]), "=r"(cy) \
41
:"0"(_c[LO]), "1"(cy) \
44
/******************************************************************/
45
#elif defined(TFM_X86_64)
56
"movq %5,%%rax \n\t" \
58
"addq %1,%%rax \n\t" \
59
"adcq $0,%%rdx \n\t" \
60
"addq %%rax,%0 \n\t" \
61
"adcq $0,%%rdx \n\t" \
62
"movq %%rdx,%1 \n\t" \
63
:"=g"(_c[LO]), "=r"(cy) \
64
:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
65
: "%rax", "%rdx", "cc")
69
"movq 0(%5),%%rax \n\t" \
70
"movq 0(%2),%%r10 \n\t" \
71
"movq 0x8(%5),%%r11 \n\t" \
73
"addq %%r10,%%rax \n\t" \
74
"adcq $0,%%rdx \n\t" \
75
"movq 0x8(%2),%%r10 \n\t" \
76
"addq %3,%%rax \n\t" \
77
"adcq $0,%%rdx \n\t" \
78
"movq %%rax,0(%0) \n\t" \
79
"movq %%rdx,%1 \n\t" \
81
"movq %%r11,%%rax \n\t" \
82
"movq 0x10(%5),%%r11 \n\t" \
84
"addq %%r10,%%rax \n\t" \
85
"adcq $0,%%rdx \n\t" \
86
"movq 0x10(%2),%%r10 \n\t" \
87
"addq %3,%%rax \n\t" \
88
"adcq $0,%%rdx \n\t" \
89
"movq %%rax,0x8(%0) \n\t" \
90
"movq %%rdx,%1 \n\t" \
92
"movq %%r11,%%rax \n\t" \
93
"movq 0x18(%5),%%r11 \n\t" \
95
"addq %%r10,%%rax \n\t" \
96
"adcq $0,%%rdx \n\t" \
97
"movq 0x18(%2),%%r10 \n\t" \
98
"addq %3,%%rax \n\t" \
99
"adcq $0,%%rdx \n\t" \
100
"movq %%rax,0x10(%0) \n\t" \
101
"movq %%rdx,%1 \n\t" \
103
"movq %%r11,%%rax \n\t" \
104
"movq 0x20(%5),%%r11 \n\t" \
106
"addq %%r10,%%rax \n\t" \
107
"adcq $0,%%rdx \n\t" \
108
"movq 0x20(%2),%%r10 \n\t" \
109
"addq %3,%%rax \n\t" \
110
"adcq $0,%%rdx \n\t" \
111
"movq %%rax,0x18(%0) \n\t" \
112
"movq %%rdx,%1 \n\t" \
114
"movq %%r11,%%rax \n\t" \
115
"movq 0x28(%5),%%r11 \n\t" \
117
"addq %%r10,%%rax \n\t" \
118
"adcq $0,%%rdx \n\t" \
119
"movq 0x28(%2),%%r10 \n\t" \
120
"addq %3,%%rax \n\t" \
121
"adcq $0,%%rdx \n\t" \
122
"movq %%rax,0x20(%0) \n\t" \
123
"movq %%rdx,%1 \n\t" \
125
"movq %%r11,%%rax \n\t" \
126
"movq 0x30(%5),%%r11 \n\t" \
128
"addq %%r10,%%rax \n\t" \
129
"adcq $0,%%rdx \n\t" \
130
"movq 0x30(%2),%%r10 \n\t" \
131
"addq %3,%%rax \n\t" \
132
"adcq $0,%%rdx \n\t" \
133
"movq %%rax,0x28(%0) \n\t" \
134
"movq %%rdx,%1 \n\t" \
136
"movq %%r11,%%rax \n\t" \
137
"movq 0x38(%5),%%r11 \n\t" \
139
"addq %%r10,%%rax \n\t" \
140
"adcq $0,%%rdx \n\t" \
141
"movq 0x38(%2),%%r10 \n\t" \
142
"addq %3,%%rax \n\t" \
143
"adcq $0,%%rdx \n\t" \
144
"movq %%rax,0x30(%0) \n\t" \
145
"movq %%rdx,%1 \n\t" \
147
"movq %%r11,%%rax \n\t" \
149
"addq %%r10,%%rax \n\t" \
150
"adcq $0,%%rdx \n\t" \
151
"addq %3,%%rax \n\t" \
152
"adcq $0,%%rdx \n\t" \
153
"movq %%rax,0x38(%0) \n\t" \
154
"movq %%rdx,%1 \n\t" \
156
:"=r"(_c), "=r"(cy) \
157
: "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
158
: "%rax", "%rdx", "%r10", "%r11", "cc")
165
"movzbq %%al,%1 \n\t" \
166
:"=g"(_c[LO]), "=r"(cy) \
167
:"0"(_c[LO]), "1"(cy) \
170
/******************************************************************/
171
#elif defined(TFM_SSE2)
172
/* SSE2 code (assumes 32-bit fp_digits) */
173
/* XMM register assignments:
174
* xmm0 *tmpm++, then Mu * (*tmpm++)
182
asm("movd %0,%%mm2"::"g"(mp))
189
"movd %0,%%mm1 \n\t" \
190
"pxor %%mm3,%%mm3 \n\t" \
191
"pmuludq %%mm2,%%mm1 \n\t" \
194
/* pmuludq on mmx registers does a 32x32->64 multiply. */
197
"movd %1,%%mm4 \n\t" \
198
"movd %2,%%mm0 \n\t" \
199
"paddq %%mm4,%%mm3 \n\t" \
200
"pmuludq %%mm1,%%mm0 \n\t" \
201
"paddq %%mm0,%%mm3 \n\t" \
202
"movd %%mm3,%0 \n\t" \
203
"psrlq $32, %%mm3 \n\t" \
204
:"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
208
"movd 0(%1),%%mm4 \n\t" \
209
"movd 0(%2),%%mm0 \n\t" \
210
"paddq %%mm4,%%mm3 \n\t" \
211
"pmuludq %%mm1,%%mm0 \n\t" \
212
"movd 4(%2),%%mm5 \n\t" \
213
"paddq %%mm0,%%mm3 \n\t" \
214
"movd 4(%1),%%mm6 \n\t" \
215
"movd %%mm3,0(%0) \n\t" \
216
"psrlq $32, %%mm3 \n\t" \
218
"paddq %%mm6,%%mm3 \n\t" \
219
"pmuludq %%mm1,%%mm5 \n\t" \
220
"movd 8(%2),%%mm6 \n\t" \
221
"paddq %%mm5,%%mm3 \n\t" \
222
"movd 8(%1),%%mm7 \n\t" \
223
"movd %%mm3,4(%0) \n\t" \
224
"psrlq $32, %%mm3 \n\t" \
226
"paddq %%mm7,%%mm3 \n\t" \
227
"pmuludq %%mm1,%%mm6 \n\t" \
228
"movd 12(%2),%%mm7 \n\t" \
229
"paddq %%mm6,%%mm3 \n\t" \
230
"movd 12(%1),%%mm5 \n\t" \
231
"movd %%mm3,8(%0) \n\t" \
232
"psrlq $32, %%mm3 \n\t" \
234
"paddq %%mm5,%%mm3 \n\t" \
235
"pmuludq %%mm1,%%mm7 \n\t" \
236
"movd 16(%2),%%mm5 \n\t" \
237
"paddq %%mm7,%%mm3 \n\t" \
238
"movd 16(%1),%%mm6 \n\t" \
239
"movd %%mm3,12(%0) \n\t" \
240
"psrlq $32, %%mm3 \n\t" \
242
"paddq %%mm6,%%mm3 \n\t" \
243
"pmuludq %%mm1,%%mm5 \n\t" \
244
"movd 20(%2),%%mm6 \n\t" \
245
"paddq %%mm5,%%mm3 \n\t" \
246
"movd 20(%1),%%mm7 \n\t" \
247
"movd %%mm3,16(%0) \n\t" \
248
"psrlq $32, %%mm3 \n\t" \
250
"paddq %%mm7,%%mm3 \n\t" \
251
"pmuludq %%mm1,%%mm6 \n\t" \
252
"movd 24(%2),%%mm7 \n\t" \
253
"paddq %%mm6,%%mm3 \n\t" \
254
"movd 24(%1),%%mm5 \n\t" \
255
"movd %%mm3,20(%0) \n\t" \
256
"psrlq $32, %%mm3 \n\t" \
258
"paddq %%mm5,%%mm3 \n\t" \
259
"pmuludq %%mm1,%%mm7 \n\t" \
260
"movd 28(%2),%%mm5 \n\t" \
261
"paddq %%mm7,%%mm3 \n\t" \
262
"movd 28(%1),%%mm6 \n\t" \
263
"movd %%mm3,24(%0) \n\t" \
264
"psrlq $32, %%mm3 \n\t" \
266
"paddq %%mm6,%%mm3 \n\t" \
267
"pmuludq %%mm1,%%mm5 \n\t" \
268
"paddq %%mm5,%%mm3 \n\t" \
269
"movd %%mm3,28(%0) \n\t" \
270
"psrlq $32, %%mm3 \n\t" \
271
:"=r"(_c) : "0"(_c), "g"(tmpm) );
274
asm( "movd %%mm3,%0 \n" :"=r"(cy))
280
"movzbl %%al,%1 \n\t" \
281
:"=g"(_c[LO]), "=r"(cy) \
282
:"0"(_c[LO]), "1"(cy) \
285
/******************************************************************/
286
#elif defined(TFM_ARM)
298
" ADDS r0,r0,%0 \n\t" \
299
" MOVCS %0,#1 \n\t" \
300
" MOVCC %0,#0 \n\t" \
301
" UMLAL r0,%0,%3,%4 \n\t" \
303
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");
308
" ADDS r0,r0,%0 \n\t" \
310
" MOVCS %0,#1 \n\t" \
311
" MOVCC %0,#0 \n\t" \
312
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");
314
/******************************************************************/
315
#elif defined(TFM_PPC32)
326
" mullw 16,%3,%4 \n\t" \
327
" mulhwu 17,%3,%4 \n\t" \
328
" addc 16,16,%0 \n\t" \
329
" addze 17,17 \n\t" \
331
" addc 16,16,18 \n\t" \
332
" addze %0,17 \n\t" \
334
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;
339
" addc 16,16,%0 \n\t" \
341
" xor %0,%0,%0 \n\t" \
342
" addze %0,%0 \n\t" \
343
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
345
/******************************************************************/
346
#elif defined(TFM_PPC64)
357
" mulld r16,%3,%4 \n\t" \
358
" mulhdu r17,%3,%4 \n\t" \
359
" addc r16,16,%0 \n\t" \
360
" addze r17,r17 \n\t" \
361
" ldx r18,0,%1 \n\t" \
362
" addc r16,r16,r18 \n\t" \
363
" addze %0,r17 \n\t" \
364
" sdx r16,0,%1 \n\t" \
365
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","cc"); ++tmpm;
369
" ldx r16,0,%1 \n\t" \
370
" addc r16,r16,%0 \n\t" \
371
" sdx r16,0,%1 \n\t" \
372
" xor %0,%0,%0 \n\t" \
373
" addze %0,%0 \n\t" \
374
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","cc");
376
/******************************************************************/
377
#elif defined(TFM_AVR32)
392
" macu.d r2,%3,%4 \n\t" \
395
:"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");
404
:"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");
406
/******************************************************************/
407
#elif defined(TFM_MIPS)
418
" multu %3,%4 \n\t" \
421
" addu $12,$12,%0 \n\t" \
422
" sltu $10,$12,%0 \n\t" \
423
" addu $13,$13,$10 \n\t" \
425
" addu $12,$12,$10 \n\t" \
426
" sltu $10,$12,$10 \n\t" \
427
" addu %0,$13,$10 \n\t" \
429
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"$10","$12","$13"); ++tmpm;
434
" addu $10,$10,%0 \n\t" \
436
" sltu %0,$10,%0 \n\t" \
437
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"$10");
439
/******************************************************************/
451
_c[0] = t = ((fp_word)_c[0] + (fp_word)cy) + \
452
(((fp_word)mu) * ((fp_word)*tmpm++)); \
453
cy = (t >> DIGIT_BIT); \
457
do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)
460
/******************************************************************/
465
#ifdef TFM_SMALL_MONT_SET
466
#include "fp_mont_small.i"
469
/* computes x/R == x (mod N) via Montgomery Reduction */
470
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
472
fp_digit c[FP_SIZE], *_c, *tmpm, mu;
473
int oldused, x, y, pa;
475
/* bail if too large */
476
if (m->used > (FP_SIZE/2)) {
480
#ifdef TFM_SMALL_MONT_SET
482
fp_montgomery_reduce_small(a, m, mp);
487
#if defined(USE_MEMSET)
488
/* now zero the buff */
489
memset(c, 0, sizeof c);
495
for (x = 0; x < oldused; x++) {
498
#if !defined(USE_MEMSET)
499
for (; x < 2*pa+1; x++) {
505
for (x = 0; x < pa; x++) {
507
/* get Mu for this round */
512
#if (defined(TFM_SSE2) || defined(TFM_X86_64))
513
for (; y < (pa & ~7); y += 8) {
520
for (; y < pa; y++) {
534
for (x = 0; x < pa+1; x++) {
538
for (; x < oldused; x++) {
547
/* if A >= m then A = A - m */
548
if (fp_cmp_mag (a, m) != FP_LT) {
/* $Source: /cvs/libtom/tomsfastmath/src/mont/fp_montgomery_reduce.c,v $ */
/* $Revision: 1.2 $ */
/* $Date: 2007/03/14 23:47:42 $ */