/* An implementation in GMP of Scho"nhage's fast multiplication algorithm
2
modulo 2^N+1, by Paul Zimmermann, INRIA Lorraine, February 1998.
4
THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND THE FUNCTIONS HAVE
5
MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
6
INTERFACES. IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN
7
A FUTURE GNU MP RELEASE.
9
Copyright 1998, 1999, 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
11
This file is part of the GNU MP Library.
13
The GNU MP Library is free software; you can redistribute it and/or modify
14
it under the terms of the GNU Lesser General Public License as published by
15
the Free Software Foundation; either version 2.1 of the License, or (at your
16
option) any later version.
18
The GNU MP Library is distributed in the hope that it will be useful, but
19
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
21
License for more details.
23
You should have received a copy of the GNU Lesser General Public License
24
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
25
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
26
MA 02111-1307, USA. */
/* References:

   Schnelle Multiplikation grosser Zahlen, by Arnold Scho"nhage and Volker
   Strassen, Computing 7, p. 281-292, 1971.

   Asymptotically fast algorithms for the numerical multiplication
   and division of polynomials with complex coefficients, by Arnold Scho"nhage,
   Computer Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982.

   Tapes versus Pointers, a study in implementing fast algorithms,
   by Arnold Scho"nhage, Bulletin of the EATCS, 30, p. 23-32, 1986.

   See also http://www.loria.fr/~zimmerma/bignum

   Future:

   It might be possible to avoid a small number of MPN_COPYs by using a
   rotating temporary or two.

   Multiplications of unequal sized operands can be done with this code, but
   it needs a tighter test for identifying squaring (same sizes as well as
   same pointers).  */
/* Change this to "#define TRACE(x) x" for some traces. */
64
/* NOTE(review): garbled extraction -- the initializer entries of this table
   (presumably MUL_FFT_TABLE and SQR_FFT_TABLE rows) and the closing "};"
   are missing from this chunk; the stray numeric lines are artifacts of the
   extraction, not code.  mpn_fft_table[0] is consulted for multiplies and
   mpn_fft_table[1] for squares (see mpn_fft_best_k below). */
FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = {
70
/* Forward declaration of the recursive FFT multiply engine defined below;
   needed because mpn_fft_mul_modF_K calls it recursively. */
static void mpn_mul_fft_internal
71
_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, int, int, mp_ptr *, mp_ptr *,
72
mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_size_t, int **, mp_ptr,int));
75
/* Find the best k to use for a mod 2^(m*BITS_PER_MP_LIMB)+1 FFT
77
sqr==0 if for a multiply, sqr==1 for a square.
80
mpn_fft_best_k (mp_size_t n, int sqr)
84
for (i = 0; mpn_fft_table[sqr][i] != 0; i++)
85
if (n < mpn_fft_table[sqr][i])
86
return i + FFT_FIRST_K;
88
/* treat 4*last as one further entry */
89
if (i == 0 || n < 4 * mpn_fft_table[sqr][i - 1])
90
return i + FFT_FIRST_K;
92
return i + FFT_FIRST_K + 1;
96
/* Returns smallest possible number of limbs >= pl for a fft of size 2^k,
97
i.e. smallest multiple of 2^k >= pl. */
100
mpn_fft_next_size (mp_size_t pl, int k)
105
pl = 1 + (pl - 1) / K; /* ceil(pl/K) */
/* Initialize the bit-reversal tables: after the call, l[i][j] holds the
   i-bit bit-reversal of j, for 0 <= i <= k and 0 <= j < 2^i.
   Built level by level: each entry of level i-1 is doubled (appending a 0
   bit), and the doubled entry plus one (appending a 1 bit) fills the upper
   half of level i.
   NOTE(review): reconstructed from a garbled fragment -- the function header,
   braces, local declarations and the l[0][0] = 0 base case were missing;
   the two visible recurrence lines were kept verbatim. */
static void
mpn_fft_initl (int **l, int k)
{
  int i, j, K;

  l[0][0] = 0;
  for (i = 1, K = 2; i <= k; i++, K *= 2)
    {
      for (j = 0; j < K / 2; j++)
	{
	  l[i][j] = 2 * l[i - 1][j];
	  l[i][K / 2 + j] = 1 + l[i][j];
	}
    }
}
127
/* a <- a*2^e mod 2^(n*BITS_PER_MP_LIMB)+1 */
129
mpn_fft_mul_2exp_modF (mp_ptr ap, int e, mp_size_t n, mp_ptr tp)
134
d = e % (n * BITS_PER_MP_LIMB); /* 2^e = (+/-) 2^d */
135
sh = d % BITS_PER_MP_LIMB;
137
mpn_lshift (tp, ap, n + 1, sh); /* no carry here */
139
MPN_COPY (tp, ap, n + 1);
140
d /= BITS_PER_MP_LIMB; /* now shift of d limbs to the left */
143
/* ap[d..n-1] = tp[0..n-d-1], ap[0..d-1] = -tp[n-d..n-1] */
144
/* mpn_xor would be more efficient here */
145
for (i = d - 1; i >= 0; i--)
146
ap[i] = ~tp[n - d + i];
147
cc = 1 - mpn_add_1 (ap, ap, d, CNST_LIMB(1));
149
cc = mpn_sub_1 (ap + d, tp, n - d, CNST_LIMB(1));
151
MPN_COPY (ap + d, tp, n - d);
152
cc += mpn_sub_1 (ap + d, ap + d, n - d, tp[n]);
154
ap[n] = mpn_add_1 (ap, ap, n, cc);
158
else if ((ap[n] = mpn_sub_1 (ap, tp, n, tp[n])))
160
ap[n] = mpn_add_1 (ap, ap, n, CNST_LIMB(1));
162
if ((e / (n * BITS_PER_MP_LIMB)) % 2)
165
mpn_com_n (ap, ap, n);
173
/* a <- a+b mod 2^(n*BITS_PER_MP_LIMB)+1 */
175
mpn_fft_add_modF (mp_ptr ap, mp_ptr bp, int n)
179
c = ap[n] + bp[n] + mpn_add_n (ap, ap, bp, n);
180
if (c > 1) /* subtract c-1 to both ap[0] and ap[n] */
183
mpn_decr_u (ap, c - 1);
190
/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
192
2^omega is a primitive root mod 2^N+1
193
output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */
196
/* NOTE(review): garbled extraction -- the function header (static void,
   braces), the K == 2 base-case structure, the #else/#endif lines pairing
   with the #if below, the recursion guard and the loop-body braces are
   missing from this chunk.  Stray numeric lines are extraction artifacts.
   This is the forward FFT specialized for squaring (one operand array). */
mpn_fft_fft_sqr (mp_ptr *Ap, mp_size_t K, int **ll,
197
mp_size_t omega, mp_size_t n, mp_size_t inc, mp_ptr tp)
202
#if HAVE_NATIVE_mpn_addsub_n
203
/* K == 2 butterfly: (A0, A1) <- (A0+A1, A0-A1) in one fused pass */
cy = mpn_addsub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1;
205
/* generic K == 2 butterfly via a temporary copy of A0 */
MPN_COPY (tp, Ap[0], n + 1);
206
mpn_add_n (Ap[0], Ap[0], Ap[inc], n + 1);
207
cy = mpn_sub_n (Ap[inc], tp, Ap[inc], n + 1);
209
if (Ap[0][n] > CNST_LIMB(1)) /* can be 2 or 3 */
210
Ap[0][n] = CNST_LIMB(1) - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - CNST_LIMB(1));
211
if (cy) /* Ap[inc][n] can be -1 or -2 */
212
Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, ~Ap[inc][n] + CNST_LIMB(1));
216
/* K > 2: split into even/odd sub-transforms with doubled stride */
int j, inc2 = 2 * inc;
222
tmp = TMP_ALLOC_LIMBS (n + 1);
223
mpn_fft_fft_sqr (Ap, K/2,ll-1,2 * omega,n,inc2, tp);
224
mpn_fft_fft_sqr (Ap+inc, K/2,ll-1,2 * omega,n,inc2, tp);
225
/* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
226
A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
227
for (j = 0; j < K / 2; j++,lk += 2,Ap += 2 * inc)
229
MPN_COPY (tp, Ap[inc], n + 1);
230
mpn_fft_mul_2exp_modF (Ap[inc], lk[1] * omega, n, tmp);
231
mpn_fft_add_modF (Ap[inc], Ap[0], n);
232
mpn_fft_mul_2exp_modF (tp, lk[0] * omega, n, tmp);
233
mpn_fft_add_modF (Ap[0], tp, n);
240
/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
242
2^omega is a primitive root mod 2^N+1
243
output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */
246
/* NOTE(review): garbled extraction -- the function header (static void,
   braces), the K == 2 base-case structure, the #else/#endif lines pairing
   with the #if below, the declarations of ca/cb/j/inc2/tmp and the
   loop-body braces are missing from this chunk.  Stray numeric lines are
   extraction artifacts.  Same forward FFT as mpn_fft_fft_sqr but applied
   to the two operand arrays Ap and Bp in lockstep. */
mpn_fft_fft (mp_ptr *Ap, mp_ptr *Bp, mp_size_t K, int **ll,
247
mp_size_t omega, mp_size_t n, mp_size_t inc, mp_ptr tp)
252
#if HAVE_NATIVE_mpn_addsub_n
253
ca = mpn_addsub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1;
254
cb = mpn_addsub_n (Bp[0], Bp[inc], Bp[0], Bp[inc], n + 1) & 1;
256
/* generic K == 2 butterflies for A and B via a temporary */
MPN_COPY (tp, Ap[0], n + 1);
257
mpn_add_n (Ap[0], Ap[0], Ap[inc], n + 1);
258
ca = mpn_sub_n (Ap[inc], tp, Ap[inc], n + 1);
259
MPN_COPY (tp, Bp[0], n + 1);
260
mpn_add_n (Bp[0], Bp[0], Bp[inc], n + 1);
261
cb = mpn_sub_n (Bp[inc], tp, Bp[inc], n + 1);
263
if (Ap[0][n] > CNST_LIMB(1)) /* can be 2 or 3 */
264
Ap[0][n] = CNST_LIMB(1) - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - CNST_LIMB(1));
265
if (ca) /* Ap[inc][n] can be -1 or -2 */
266
Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, ~Ap[inc][n] + CNST_LIMB(1));
267
if (Bp[0][n] > CNST_LIMB(1)) /* can be 2 or 3 */
268
Bp[0][n] = CNST_LIMB(1) - mpn_sub_1 (Bp[0], Bp[0], n, Bp[0][n] - CNST_LIMB(1));
269
if (cb) /* Bp[inc][n] can be -1 or -2 */
270
Bp[inc][n] = mpn_add_1 (Bp[inc], Bp[inc], n, ~Bp[inc][n] + CNST_LIMB(1));
280
/* K > 2: recurse on even/odd halves with doubled stride */
tmp = TMP_ALLOC_LIMBS (n + 1);
281
mpn_fft_fft (Ap, Bp, K/2,ll-1,2 * omega,n,inc2, tp);
282
mpn_fft_fft (Ap+inc, Bp+inc, K/2,ll-1,2 * omega,n,inc2, tp);
283
/* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
284
A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
285
for (j = 0; j < K / 2; j++,lk += 2,Ap += 2 * inc,Bp += 2 * inc)
287
MPN_COPY (tp, Ap[inc], n + 1);
288
mpn_fft_mul_2exp_modF (Ap[inc], lk[1] * omega, n, tmp);
289
mpn_fft_add_modF (Ap[inc], Ap[0], n);
290
mpn_fft_mul_2exp_modF (tp, lk[0] * omega, n, tmp);
291
mpn_fft_add_modF (Ap[0], tp, n);
292
MPN_COPY (tp, Bp[inc], n + 1);
293
mpn_fft_mul_2exp_modF (Bp[inc], lk[1] * omega, n, tmp);
294
mpn_fft_add_modF (Bp[inc], Bp[0], n);
295
mpn_fft_mul_2exp_modF (tp, lk[0] * omega, n, tmp);
296
mpn_fft_add_modF (Bp[0], tp, n);
303
/* Given ap[0..n] with ap[n]<=1, reduce it modulo 2^(n*BITS_PER_MP_LIMB)+1,
304
by subtracting that modulus if necessary.
306
If ap[0..n] is exactly 2^(n*BITS_PER_MP_LIMB) then the sub_1 produces a
307
borrow and the limbs must be zeroed out again. This will occur very
311
mpn_fft_norm (mp_ptr ap, mp_size_t n)
317
if ((ap[n] = mpn_sub_1 (ap, ap, n, CNST_LIMB(1))))
323
/* a[i] <- a[i]*b[i] mod 2^(n*BITS_PER_MP_LIMB)+1 for 0 <= i < K */
325
/* NOTE(review): garbled extraction -- the function header (static void,
   braces), TMP marker/alloc boilerplate, the declarations of i, K3, K2
   assignment (K2 = 1 << k), n2, cc, and the loop/branch braces of both the
   recursive branch (n above the FFT threshold: recurse via
   mpn_mul_fft_internal) and the base case (plain mpn_mul_n/mpn_sqr_n
   followed by reduction mod 2^(n*B)+1) are missing from this chunk.
   Stray numeric lines are extraction artifacts. */
mpn_fft_mul_modF_K (mp_ptr *ap, mp_ptr *bp, mp_size_t n, int K)
328
int sqr = (ap == bp);
333
if (n >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
335
int k, K2,nprime2,Nprime2,M2,maxLK,l,Mp2;
337
mp_ptr *Ap,*Bp,A,B,T;
339
k = mpn_fft_best_k (n, sqr);
341
ASSERT_ALWAYS(n % K2 == 0);
342
maxLK = (K2>BITS_PER_MP_LIMB) ? K2 : BITS_PER_MP_LIMB;
343
M2 = n*BITS_PER_MP_LIMB/K2;
345
Nprime2 = ((2 * M2+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M2+k+3)/maxLK)*maxLK*/
346
nprime2 = Nprime2 / BITS_PER_MP_LIMB;
348
/* we should ensure that nprime2 is a multiple of the next K */
349
if (nprime2 >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
352
while (nprime2 % (K3 = 1 << mpn_fft_best_k (nprime2, sqr)))
354
nprime2 = ((nprime2 + K3 - 1) / K3) * K3;
355
Nprime2 = nprime2 * BITS_PER_MP_LIMB;
356
/* warning: since nprime2 changed, K3 may change too! */
358
ASSERT(nprime2 % K3 == 0);
360
ASSERT_ALWAYS(nprime2 < n); /* otherwise we'll loop */
364
Ap = TMP_ALLOC_MP_PTRS (K2);
365
Bp = TMP_ALLOC_MP_PTRS (K2);
366
A = TMP_ALLOC_LIMBS (2 * K2 * (nprime2 + 1));
367
T = TMP_ALLOC_LIMBS (nprime2 + 1);
368
B = A + K2 * (nprime2 + 1);
369
_fft_l = TMP_ALLOC_TYPE (k + 1, int*);
370
for (i = 0; i <= k; i++)
371
_fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
372
mpn_fft_initl (_fft_l, k);
374
TRACE (printf ("recurse: %dx%d limbs -> %d times %dx%d (%1.2f)\n", n,
375
n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));
376
for (i = 0; i < K; i++,ap++,bp++)
378
mpn_fft_norm (*ap, n);
379
if (!sqr) mpn_fft_norm (*bp, n);
380
mpn_mul_fft_internal (*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2,
381
l, Mp2, _fft_l, T, 1);
386
/* base case below the FFT threshold: schoolbook/Toom product of 2n limbs
   then reduction mod 2^(n*B)+1 (high half subtracted from low half) */
mp_ptr a, b, tp, tpn;
389
tp = TMP_ALLOC_LIMBS (n2);
391
TRACE (printf (" mpn_mul_n %d of %d limbs\n", K, n));
392
for (i = 0; i < K; i++)
397
mpn_sqr_n (tp, a, n);
399
mpn_mul_n (tp, b, a, n);
401
/* corrections for the pseudo-normalized top limbs a[n], b[n] */
cc = mpn_add_n (tpn, tpn, b, n);
405
cc += mpn_add_n (tpn, tpn, a, n) + a[n];
408
cc = mpn_add_1 (tp, tp, n2, cc);
409
ASSERT_NOCARRY (mpn_add_1 (tp, tp, n2, cc));
411
a[n] = mpn_sub_n (a, tp, tpn, n) && mpn_add_1 (a, a, n, CNST_LIMB(1));
418
/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]]
419
output: K*A[0] K*A[K-1] ... K*A[1].
420
Assumes the Ap[] are pseudo-normalized, i.e. 0 <= Ap[][n] <= 1.
421
This condition is also fulfilled at exit.
425
/* NOTE(review): garbled extraction -- the closing of the comment above,
   the function header (static void, braces), the K == 2 base-case
   structure, the #else/#endif lines pairing with the #if below, the
   declarations of cy/j/K2 (K2 = K/2) and the loop braces are missing from
   this chunk.  Stray numeric lines are extraction artifacts.  This is the
   inverse FFT: undoes the bit-reversed forward transform. */
mpn_fft_fftinv (mp_ptr *Ap, int K, mp_size_t omega, mp_size_t n, mp_ptr tp)
430
#if HAVE_NATIVE_mpn_addsub_n
431
cy = mpn_addsub_n (Ap[0], Ap[1], Ap[0], Ap[1], n + 1) & 1;
433
/* generic K == 2 butterfly via a temporary copy of A0 */
MPN_COPY (tp, Ap[0], n + 1);
434
mpn_add_n (Ap[0], Ap[0], Ap[1], n + 1);
435
cy = mpn_sub_n (Ap[1], tp, Ap[1], n + 1);
437
if (Ap[0][n] > CNST_LIMB(1)) /* can be 2 or 3 */
438
Ap[0][n] = CNST_LIMB(1) - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - CNST_LIMB(1));
439
if (cy) /* Ap[1][n] can be -1 or -2 */
440
Ap[1][n] = mpn_add_1 (Ap[1], Ap[1], n, ~Ap[1][n] + CNST_LIMB(1));
445
/* K > 2: Bp points at the upper half, recurse on both halves */
mp_ptr *Bp = Ap + K2, tmp;
449
tmp = TMP_ALLOC_LIMBS (n + 1);
450
mpn_fft_fftinv (Ap, K2, 2 * omega, n, tp);
451
mpn_fft_fftinv (Bp, K2, 2 * omega, n, tp);
452
/* A[j] <- A[j] + omega^j A[j+K/2]
453
A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */
454
for (j = 0; j < K2; j++,Ap++,Bp++)
456
MPN_COPY (tp, Bp[0], n + 1);
457
mpn_fft_mul_2exp_modF (Bp[0], (j + K2) * omega, n, tmp);
458
mpn_fft_add_modF (Bp[0], Ap[0], n);
459
mpn_fft_mul_2exp_modF (tp, j * omega, n, tmp);
460
mpn_fft_add_modF (Ap[0], tp, n);
467
/* A <- A/2^k mod 2^(n*BITS_PER_MP_LIMB)+1 */
469
mpn_fft_div_2exp_modF (mp_ptr ap, int k, mp_size_t n, mp_ptr tp)
473
i = 2 * n * BITS_PER_MP_LIMB;
475
mpn_fft_mul_2exp_modF (ap, i, n, tp);
476
/* 1/2^k = 2^(2nL-k) mod 2^(n*BITS_PER_MP_LIMB)+1 */
477
/* normalize so that A < 2^(n*BITS_PER_MP_LIMB)+1 */
478
mpn_fft_norm (ap, n);
482
/* R <- A mod 2^(n*BITS_PER_MP_LIMB)+1, n <= an <= 3*n */
484
/* NOTE(review): garbled extraction -- the function header (static void,
   braces), the declaration of l and the branch structure selecting between
   the an > 2n case (wrap the third n-limb chunk back in, since
   2^(2nB) == 1) and the an <= 2n case (plain copy then subtract the high
   part, since 2^(nB) == -1) are missing from this chunk.  Stray numeric
   lines are extraction artifacts. */
mpn_fft_norm_modF (mp_ptr rp, mp_ptr ap, mp_size_t n, mp_size_t an)
488
ASSERT (n <= an && an <= 3 * n);
492
/* an > 2n branch: fold ap[2n..an-1] back onto ap[0..an-2n-1] */
rp[n] = mpn_add_1 (rp + an - 2 * n, ap + an - 2 * n, 3 * n - an,
493
mpn_add_n (rp, ap, ap + 2 * n, an - 2 * n));
498
MPN_COPY (rp, ap, n);
501
/* subtract the high l limbs (they carry weight -1 mod 2^(nB)+1) */
if (mpn_sub_n (rp, rp, ap + n, l))
503
if (mpn_sub_1 (rp + l, rp + l, n + 1 - l, CNST_LIMB(1)))
504
rp[n] = mpn_add_1 (rp, rp, n, CNST_LIMB(1));
510
/* NOTE(review): garbled extraction -- the function header line (static
   void), several parameters of the signature (k, K, A, B, _fft_l, T, rec
   appear in the body but parts of the parameter list are missing), the
   declarations of p/cc/Nprime and many braces/guards are missing from this
   chunk.  Some identifiers also look mangled by the extraction: "sqr" is
   reused below as the carry accumulator and "n" (an mp_srcptr parameter)
   appears as the destination of mpn_add_n/mpn_sub_1 -- presumably these
   were distinct variables (cc and p + sh) in the original; TODO confirm
   against upstream.  Stray numeric lines are extraction artifacts.
   Core engine: decompose inputs into K residues, forward FFT, pointwise
   multiply, inverse FFT, divide by K*2^..., then recombine with carries. */
mpn_mul_fft_internal (mp_ptr op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
512
mp_ptr *Ap, mp_ptr *Bp,
514
mp_size_t nprime, mp_size_t l, mp_size_t Mp,
518
int i, sqr, pla, lo, sh, j;
523
TRACE (printf ("pl=%d k=%d K=%d np=%d l=%d Mp=%d rec=%d sqr=%d\n",
524
pl,k,K,nprime,l,Mp,rec,sqr));
526
/* decomposition of inputs into arrays Ap[i] and Bp[i] */
528
for (i = 0; i < K; i++)
530
Ap[i] = A + i * (nprime + 1); Bp[i] = B + i * (nprime + 1);
531
/* store the next M bits of n into A[i] */
532
/* supposes that M is a multiple of BITS_PER_MP_LIMB */
533
MPN_COPY (Ap[i], n, l); n += l;
534
MPN_ZERO (Ap[i]+l, nprime + 1 - l);
535
/* set most significant bits of n and m (important in recursive calls) */
538
/* weight the residue: multiply by 2^(i*Mp) for the negacyclic transform */
mpn_fft_mul_2exp_modF (Ap[i], i * Mp, nprime, T);
541
MPN_COPY (Bp[i], m, l); m += l;
542
MPN_ZERO (Bp[i] + l, nprime + 1 - l);
545
mpn_fft_mul_2exp_modF (Bp[i], i * Mp, nprime, T);
551
/* direct fft's */
mpn_fft_fft_sqr (Ap, K, _fft_l + k, 2 * Mp, nprime, 1, T);
553
mpn_fft_fft (Ap, Bp, K, _fft_l + k, 2 * Mp, nprime, 1, T);
555
/* term to term multiplications */
556
mpn_fft_mul_modF_K (Ap, (sqr) ? Ap : Bp, nprime, K);
559
/* inverse fft's */
mpn_fft_fftinv (Ap, K, 2 * Mp, nprime, T);
561
/* division of terms after inverse fft */
562
for (i = 0; i < K; i++)
563
mpn_fft_div_2exp_modF (Ap[i], k + ((K - i) % K) * Mp, nprime, T);
565
/* addition of terms in result p */
566
MPN_ZERO (T, nprime + 1);
567
pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */
568
p = B; /* B has K*(n' + 1) limbs, which is >= pla, i.e. enough */
570
sqr = 0; /* will accumulate the (signed) carry at p[pla] */
571
for (i = K - 1,lo = l * i + nprime,sh = l * i; i >= 0; i--,lo -= l,sh -= l)
575
if (mpn_add_n (n, n, Ap[j], nprime + 1))
576
sqr += mpn_add_1 (n + nprime + 1, n + nprime + 1, pla - sh - nprime - 1, CNST_LIMB(1));
577
T[2 * l]=i + 1; /* T = (i + 1)*2^(2*M) */
578
if (mpn_cmp (Ap[j],T,nprime + 1)>0)
579
{ /* subtract 2^N'+1 */
580
sqr -= mpn_sub_1 (n, n, pla - sh, CNST_LIMB(1));
581
sqr -= mpn_sub_1 (p + lo, p + lo, pla - lo, CNST_LIMB(1));
586
/* resolve the accumulated carry/borrow at p[pla] */
if ((sqr = mpn_add_1 (p + pla - pl,p + pla - pl,pl, CNST_LIMB(1))))
588
/* p[pla-pl]...p[pla-1] are all zero */
589
mpn_sub_1 (p + pla - pl - 1, p + pla - pl - 1, pl + 1, CNST_LIMB(1));
590
mpn_sub_1 (p + pla - 1, p + pla - 1, 1, CNST_LIMB(1));
597
while ((sqr = mpn_add_1 (p + pla - 2 * pl, p + pla - 2 * pl, 2 * pl, sqr)))
602
sqr = mpn_sub_1 (p + pla - pl, p + pla - pl, pl, sqr);
609
/* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
610
< K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
611
< K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
612
mpn_fft_norm_modF (op, p, pl, pla);
616
/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*BITS_PER_MP_LIMB
617
n and m have respectively nl and ml limbs
618
op must have space for pl+1 limbs
619
Assumes pl is multiple of 2^k.
623
/* NOTE(review): garbled extraction -- the closing of the comment above,
   the return type line, the final "int k" parameter, the opening brace,
   TMP marker boilerplate, the declarations of K/i/j/maxLK/K2/_fft_l and
   many braces/guards (including the nl/ml exhaustion guards around the
   decomposition loop) are missing from this chunk.  Stray numeric lines
   are extraction artifacts.  Public entry point: sets up transform
   parameters, decomposes operands, then delegates to
   mpn_mul_fft_internal. */
mpn_mul_fft (mp_ptr op, mp_size_t pl,
624
mp_srcptr n, mp_size_t nl,
625
mp_srcptr m, mp_size_t ml,
629
mp_size_t N, Nprime, nprime, M, Mp, l;
630
mp_ptr *Ap,*Bp, A, T, B;
632
int sqr = (n == m && nl == ml);
635
TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n", pl, nl, ml, k));
638
N = pl * BITS_PER_MP_LIMB;
639
_fft_l = TMP_ALLOC_TYPE (k + 1, int*);
640
for (i = 0; i <= k; i++)
641
_fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
642
mpn_fft_initl (_fft_l, k);
644
ASSERT_ALWAYS (pl % K == 0);
645
M = N/K; /* exact: N = 2^k M */
646
l = M / BITS_PER_MP_LIMB; /* l = pl / K also */
647
maxLK = (K>BITS_PER_MP_LIMB) ? K : BITS_PER_MP_LIMB;
649
Nprime = ((2 * M + k + 2 + maxLK) / maxLK) * maxLK; /* ceil((2*M+k+3)/maxLK)*maxLK; */
650
nprime = Nprime / BITS_PER_MP_LIMB;
651
/* with B := BITS_PER_MP_LIMB, nprime >= 2*M/B = 2*N/(K*B) = 2*pl/K = 2*l */
652
TRACE (printf ("N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, np=%d\n",
653
N, K, M, l, maxLK, Nprime, nprime));
654
/* we should ensure that recursively, nprime is a multiple of the next K */
655
if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
658
while (nprime % (K2 = 1 << mpn_fft_best_k (nprime, sqr)))
660
nprime = ((nprime + K2 - 1) / K2) * K2;
661
Nprime = nprime * BITS_PER_MP_LIMB;
662
/* warning: since nprime changed, K2 may change too! */
664
TRACE (printf ("new maxLK=%d, Np=%d, np=%d\n", maxLK, Nprime, nprime));
665
ASSERT(nprime % K2 == 0);
667
ASSERT_ALWAYS (nprime < pl); /* otherwise we'll loop */
669
T = TMP_ALLOC_LIMBS (nprime + 1);
672
TRACE (printf ("%dx%d limbs -> %d times %dx%d limbs (%1.2f)\n",
673
pl,pl,K,nprime,nprime,2.0*(double)N/Nprime/K);
674
printf (" temp space %ld\n", 2 * K * (nprime + 1)));
676
A = __GMP_ALLOCATE_FUNC_LIMBS (2 * K * (nprime + 1));
677
B = A + K * (nprime + 1);
678
Ap = TMP_ALLOC_MP_PTRS (K);
679
Bp = TMP_ALLOC_MP_PTRS (K);
680
/* special decomposition for main call */
681
for (i = 0; i < K; i++)
683
Ap[i] = A + i * (nprime + 1); Bp[i] = B + i * (nprime + 1);
684
/* store the next M bits of n into A[i] */
685
/* supposes that M is a multiple of BITS_PER_MP_LIMB */
688
/* unlike the recursive path, operands may be shorter than pl limbs here,
   so each chunk copies at most the limbs remaining (nl/ml decremented in
   guards missing from this chunk) */
j = (nl>=l) ? l : nl; /* limbs to store in Ap[i] */
689
MPN_COPY (Ap[i], n, j); n += l;
690
MPN_ZERO (Ap[i] + j, nprime + 1 - j);
691
mpn_fft_mul_2exp_modF (Ap[i], i * Mp, nprime, T);
693
else MPN_ZERO (Ap[i], nprime + 1);
699
j = (ml>=l) ? l : ml; /* limbs to store in Bp[i] */
700
MPN_COPY (Bp[i], m, j); m += l;
701
MPN_ZERO (Bp[i] + j, nprime + 1 - j);
702
mpn_fft_mul_2exp_modF (Bp[i], i * Mp, nprime, T);
705
MPN_ZERO (Bp[i], nprime + 1);
709
mpn_mul_fft_internal (op, n, m, pl, k, K, Ap, Bp, A, B, nprime, l, Mp, _fft_l, T, 0);
711
__GMP_FREE_FUNC_LIMBS (A, 2 * K * (nprime + 1));
715
/* Multiply {n,nl}*{m,ml} and write the result to {op,nl+ml}.
717
FIXME: Duplicating the result like this is wasteful, do something better
718
perhaps at the norm_modF stage above. */
721
/* NOTE(review): garbled extraction -- the return type line, opening brace,
   the declarations of pad_op/pl/k and the closing brace (the definition
   runs past the end of this chunk) are missing.  Stray numeric lines are
   extraction artifacts.  Full product via mod-2^N+1 multiply: pick pl so
   that pl*BITS_PER_MP_LIMB >= bits of the exact product, so the modular
   result equals the plain product; then copy the nl+ml significant limbs
   out of the padded buffer. */
mpn_mul_fft_full (mp_ptr op,
722
mp_srcptr n, mp_size_t nl,
723
mp_srcptr m, mp_size_t ml)
728
int sqr = (n == m && nl == ml);
730
k = mpn_fft_best_k (nl + ml, sqr);
731
pl = mpn_fft_next_size (nl + ml, k);
733
TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl=%ld k=%d\n", nl, ml, pl, k));
735
pad_op = __GMP_ALLOCATE_FUNC_LIMBS (pl + 1);
736
mpn_mul_fft (pad_op, pl, n, nl, m, ml, k);
738
/* since pl >= nl+ml, the product did not wrap: the top limbs are zero */
ASSERT_MPN_ZERO_P (pad_op + nl + ml, pl + 1 - (nl + ml));
739
MPN_COPY (op, pad_op, nl + ml);
741
__GMP_FREE_FUNC_LIMBS (pad_op, pl + 1);