1
/* mpn_mul_n and helper function -- Multiply/square natural numbers.
3
THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n)
4
ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH
5
THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
6
THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9
Copyright 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000, 2001 Free Software
12
This file is part of the GNU MP Library.
14
The GNU MP Library is free software; you can redistribute it and/or modify
15
it under the terms of the GNU Lesser General Public License as published by
16
the Free Software Foundation; either version 2.1 of the License, or (at your
17
option) any later version.
19
The GNU MP Library is distributed in the hope that it will be useful, but
20
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
21
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
22
License for more details.
24
You should have received a copy of the GNU Lesser General Public License
25
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
26
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
27
MA 02111-1307, USA. */
35
#if !defined (__alpha) && !defined (__mips)
36
/* For all other machines, we want to call mpn functions for the compund
37
operations instead of open-coding them. */
38
#define USE_MORE_MPN 1
42
/*== Function declarations =================================================*/
44
static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr,
45
mp_ptr, mp_ptr, mp_ptr,
46
mp_srcptr, mp_srcptr, mp_srcptr,
47
mp_size_t, mp_size_t));
48
static void interpolate3 _PROTO ((mp_srcptr,
49
mp_ptr, mp_ptr, mp_ptr,
51
mp_ptr, mp_ptr, mp_ptr,
52
mp_size_t, mp_size_t));
53
static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
56
/*-- mpn_kara_mul_n ---------------------------------------------------------------*/
58
/* Multiplies using 3 half-sized mults and so on recursively.
59
* p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
60
* No overlap of p[...] with a[...] or b[...].
65
mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
79
mp_size_t n1, n3, nm1;
86
w -= mpn_sub_n (p, a, a + n3, n2);
96
while (w0 == w1 && i != 0);
108
mpn_sub_n (p, x, y, n2);
114
w -= mpn_sub_n (p + n3, b, b + n3, n2);
124
while (w0 == w1 && i != 0);
136
mpn_sub_n (p + n3, x, y, n2);
141
if (n2 < KARATSUBA_MUL_THRESHOLD)
143
if (n3 < KARATSUBA_MUL_THRESHOLD)
145
mpn_mul_basecase (ws, p, n3, p + n3, n3);
146
mpn_mul_basecase (p, a, n3, b, n3);
150
mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
151
mpn_kara_mul_n (p, a, b, n3, ws + n1);
153
mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
157
mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
158
mpn_kara_mul_n (p, a, b, n3, ws + n1);
159
mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
163
mpn_add_n (ws, p, ws, n1);
165
mpn_sub_n (ws, p, ws, n1);
168
if (mpn_add_n (ws, p + n1, ws, nm1))
170
mp_limb_t x = ws[nm1] + 1;
175
if (mpn_add_n (p + n3, p + n3, ws, n1))
196
while (w0 == w1 && i != 0);
209
mpn_sub_n (p, x, y, n2);
218
while (w0 == w1 && i != 0);
230
mpn_sub_n (p + n2, x, y, n2);
232
/* Pointwise products. */
233
if (n2 < KARATSUBA_MUL_THRESHOLD)
235
mpn_mul_basecase (ws, p, n2, p + n2, n2);
236
mpn_mul_basecase (p, a, n2, b, n2);
237
mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
241
mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
242
mpn_kara_mul_n (p, a, b, n2, ws + n);
243
mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
248
w = mpn_add_n (ws, p, ws, n);
250
w = -mpn_sub_n (ws, p, ws, n);
251
w += mpn_add_n (ws, p + n, ws, n);
252
w += mpn_add_n (p + n2, p + n2, ws, n);
253
MPN_INCR_U (p + n2 + n, 2*n - (n2 + n), w);
258
mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
271
mp_size_t n1, n3, nm1;
277
w -= mpn_sub_n (p, a, a + n3, n2);
287
while (w0 == w1 && i != 0);
298
mpn_sub_n (p, x, y, n2);
304
/* n2 is always either n3 or n3-1 so maybe the two sets of tests here
305
could be combined. But that's not important, since the tests will
306
take a miniscule amount of time compared to the function calls. */
307
if (BELOW_THRESHOLD (n3, BASECASE_SQR_THRESHOLD))
309
mpn_mul_basecase (ws, p, n3, p, n3);
310
mpn_mul_basecase (p, a, n3, a, n3);
312
else if (BELOW_THRESHOLD (n3, KARATSUBA_SQR_THRESHOLD))
314
mpn_sqr_basecase (ws, p, n3);
315
mpn_sqr_basecase (p, a, n3);
319
mpn_kara_sqr_n (ws, p, n3, ws + n1); /* (x-y)^2 */
320
mpn_kara_sqr_n (p, a, n3, ws + n1); /* x^2 */
322
if (BELOW_THRESHOLD (n2, BASECASE_SQR_THRESHOLD))
323
mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2);
324
else if (BELOW_THRESHOLD (n2, KARATSUBA_SQR_THRESHOLD))
325
mpn_sqr_basecase (p + n1, a + n3, n2);
327
mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); /* y^2 */
330
/* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the
331
borrow from mpn_sub_n. If it occurs then it'll be cancelled by a
332
carry from ws[n]. Further, since 2xy fits in n1 limbs there won't
333
be any carry out of ws[n] other than cancelling that borrow. */
335
mpn_sub_n (ws, p, ws, n1); /* x^2-(x-y)^2 */
338
if (mpn_add_n (ws, p + n1, ws, nm1)) /* x^2+y^2-(x-y)^2 = 2xy */
340
mp_limb_t x = ws[nm1] + 1;
345
if (mpn_add_n (p + n3, p + n3, ws, n1))
366
while (w0 == w1 && i != 0);
377
mpn_sub_n (p, x, y, n2);
379
/* Pointwise products. */
380
if (BELOW_THRESHOLD (n2, BASECASE_SQR_THRESHOLD))
382
mpn_mul_basecase (ws, p, n2, p, n2);
383
mpn_mul_basecase (p, a, n2, a, n2);
384
mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2);
386
else if (BELOW_THRESHOLD (n2, KARATSUBA_SQR_THRESHOLD))
388
mpn_sqr_basecase (ws, p, n2);
389
mpn_sqr_basecase (p, a, n2);
390
mpn_sqr_basecase (p + n, a + n2, n2);
394
mpn_kara_sqr_n (ws, p, n2, ws + n);
395
mpn_kara_sqr_n (p, a, n2, ws + n);
396
mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
400
w = -mpn_sub_n (ws, p, ws, n);
401
w += mpn_add_n (ws, p + n, ws, n);
402
w += mpn_add_n (p + n2, p + n2, ws, n);
403
MPN_INCR_U (p + n2 + n, 2*n - (n2 + n), w);
407
/*-- add2Times -------------------------------------------------------------*/
409
/* z[] = x[] + 2 * y[]
410
Note that z and x might point to the same vectors.
411
FIXME: gcc won't inline this because it uses alloca. */
413
static inline mp_limb_t
414
add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
420
t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
421
c = mpn_lshift (t, y, n, 1);
422
c += mpn_add_n (z, x, t, n);
429
add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
435
c = w >> (BITS_PER_MP_LIMB - 1);
447
c += w >> (BITS_PER_MP_LIMB - 1);
459
/*-- evaluate3 -------------------------------------------------------------*/
466
* ph[], p1[], p2[], A[] and B[] all have length len,
467
* C[] has length len2 with len-len2 = 0, 1 or 2.
468
* Returns top words (overflow) at pth, pt1 and pt2 respectively.
472
evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
473
mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len,mp_size_t len2)
477
ASSERT (len - len2 <= 2);
479
e = mpn_lshift (p1, B, len, 1);
481
c = mpn_lshift (ph, A, len, 2);
482
c += e + mpn_add_n (ph, ph, p1, len);
483
d = mpn_add_n (ph, ph, C, len2);
484
if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d);
488
c = mpn_lshift (p2, C, len2, 2);
490
if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; }
491
c += e + mpn_add_n (p2, p2, p1, len);
493
d = mpn_add_n (p2, p2, p1, len2);
495
if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c);
498
c += mpn_add_n (p2, p2, A, len);
502
c = mpn_add_n (p1, A, B, len);
503
d = mpn_add_n (p1, p1, C, len2);
504
if (len2 == len) c += d;
505
else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d);
514
evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
515
mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls)
517
mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2;
519
ASSERT (l - ls <= 2);
522
for (i = 0; i < l; ++i)
528
/* TO DO: choose one of the following alternatives. */
533
th += a >> (BITS_PER_MP_LIMB - 2);
537
th += b >> (BITS_PER_MP_LIMB - 1);
546
th += b >> (BITS_PER_MP_LIMB - 1);
550
th += a >> (BITS_PER_MP_LIMB - 2);
565
t2 += b >> (BITS_PER_MP_LIMB - 1);
569
t2 += c >> (BITS_PER_MP_LIMB - 2);
590
/*-- interpolate3 ----------------------------------------------------------*/
592
/* Interpolates B, C, D (in-place) from:
597
* A[], B[], C[] and D[] all have length l,
598
* E[] has length ls with l-ls = 0, 2 or 4.
600
* Reads top words (from earlier overflow) from ptb, ptc and ptd,
601
* and returns new top words there.
606
interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
607
mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len,mp_size_t len2)
610
mp_limb_t t, tb,tc,td;
614
ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4);
616
/* Let x1, x2, x3 be the values to interpolate. We have:
617
* b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
618
* c = a + x1 + x2 + x3 + e
619
* d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
622
ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB);
624
tb = *ptb; tc = *ptc; td = *ptd;
632
t = mpn_lshift (ws, A, len, 4);
633
tb -= t + mpn_sub_n (B, B, ws, len);
634
t = mpn_sub_n (B, B, E, len2);
635
if (len2 == len) tb -= t;
636
else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t);
638
tc -= mpn_sub_n (C, C, A, len);
639
t = mpn_sub_n (C, C, E, len2);
640
if (len2 == len) tc -= t;
641
else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t);
643
t = mpn_lshift (ws, E, len2, 4);
644
t += mpn_add_n (ws, ws, A, len2);
646
if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t);
647
td -= t + mpn_sub_n (D, D, ws, len);
649
t += mpn_sub_n (D, D, ws, len2);
651
t = mpn_sub_1 (D+len2, D+len2, len-len2, t);
652
t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2);
658
/* b, d := b + d, b - d */
660
#ifdef HAVE_MPN_ADD_SUB_N
661
/* #error TO DO ... */
663
t = tb + td + mpn_add_n (ws, B, D, len);
664
td = tb - td - mpn_sub_n (D, B, D, len);
666
MPN_COPY (B, ws, len);
670
t = 8 * tc + mpn_lshift (ws, C, len, 3);
671
tb -= t + mpn_sub_n (B, B, ws, len);
674
tc = 2 * tc + mpn_lshift (C, C, len, 1);
675
tc -= tb + mpn_sub_n (C, C, B, len);
678
td = (td - mpn_divexact_by3 (D, D, len)) * MODLIMB_INVERSE_3;
680
/* b, d := b + d, b - d */
681
#ifdef HAVE_MPN_ADD_SUB_N
682
/* #error TO DO ... */
684
t = tb + td + mpn_add_n (ws, B, D, len);
685
td = tb - td - mpn_sub_n (D, B, D, len);
687
MPN_COPY (B, ws, len);
697
mpn_rshift (B, B, len, 2);
698
B[len-1] |= tb<<(BITS_PER_MP_LIMB-2);
699
ASSERT((mp_limb_signed_t)tb >= 0);
703
mpn_rshift (C, C, len, 1);
704
C[len-1] |= tc<<(BITS_PER_MP_LIMB-1);
705
ASSERT((mp_limb_signed_t)tc >= 0);
709
mpn_rshift (D, D, len, 2);
710
D[len-1] |= td<<(BITS_PER_MP_LIMB-2);
711
ASSERT((mp_limb_signed_t)td >= 0);
738
interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
739
mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls)
741
mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od;
742
const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);
746
ASSERT (t == 0 || t == 2 || t == 4);
750
for (i = 0; i < l; ++i)
752
mp_limb_t tb, tc, td, tt;
760
/* Let x1, x2, x3 be the values to interpolate. We have:
761
* b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
762
* c = a + x1 + x2 + x3 + e
763
* d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
771
tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t);
782
td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t);
785
/* b, d := b + d, b - d */
787
tt = tb + td + (t < b);
788
td = tb - td - (b < d);
795
tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t);
800
tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b);
804
d *= MODLIMB_INVERSE_3;
805
td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d);
806
td *= MODLIMB_INVERSE_3;
808
/* b, d := b + d, b - d */
810
tt = tb + td + (t < b);
811
td = tb - td - (b < d);
822
/* sb has period 2. */
826
sb |= sb >> (BITS_PER_MP_LIMB >> 1);
829
/* sc has period 1. */
832
/* TO DO: choose one of the following alternatives. */
834
sc = (mp_limb_signed_t) sc >> (BITS_PER_MP_LIMB - 1);
837
sc = tc - ((mp_limb_signed_t) sc < 0L);
840
/* sd has period 2. */
844
sd |= sd >> (BITS_PER_MP_LIMB >> 1);
849
B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
850
C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
851
D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
857
++A; ++B; ++C; ++D; ++E;
860
/* Handle top words. */
870
d *= MODLIMB_INVERSE_3;
879
B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
880
C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
881
D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
908
/*-- mpn_toom3_mul_n --------------------------------------------------------------*/
910
/* Multiplies using 5 mults of one third size and so on recursively.
911
* p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
912
* No overlap of p[...] with a[...] or b[...].
916
/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the
917
* recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n()
918
* because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false.
921
#define TOOM3_MUL_REC(p, a, b, n, ws) \
923
if (n < KARATSUBA_MUL_THRESHOLD) \
924
mpn_mul_basecase (p, a, n, b, n); \
925
else if (n < TOOM3_MUL_THRESHOLD) \
926
mpn_kara_mul_n (p, a, b, n, ws); \
928
mpn_toom3_mul_n (p, a, b, n, ws); \
932
mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
934
mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD;
935
mp_limb_t *A,*B,*C,*D,*E, *W;
936
mp_size_t l,l2,l3,l4,l5,ls;
938
/* Break n words into chunks of size l, l and ls.
939
* n = 3*k => l = k, ls = k
940
* n = 3*k+1 => l = k+1, ls = k-1
941
* n = 3*k+2 => l = k+1, ls = k
946
/* this is probably unnecessarily strict */
947
ASSERT (n >= TOOM3_MUL_THRESHOLD);
971
/** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
972
evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
973
evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls);
975
/** Second stage: pointwise multiplies. **/
976
TOOM3_MUL_REC(D, C, C + l, l, W);
978
if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD);
979
if (dD) tD += mpn_addmul_1 (D + l, C, l, dD);
981
TOOM3_MUL_REC(C, B, B + l, l, W);
983
/* TO DO: choose one of the following alternatives. */
985
if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC);
986
if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);
990
if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l);
991
else tC += add2Times (C + l, C + l, B + l, l);
995
if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l);
996
else tC += add2Times (C + l, C + l, B, l);
1000
TOOM3_MUL_REC(B, A, A + l, l, W);
1002
if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB);
1003
if (dB) tB += mpn_addmul_1 (B + l, A, l, dB);
1005
TOOM3_MUL_REC(A, a, b, l, W);
1006
TOOM3_MUL_REC(E, a + l2, b + l2, ls, W);
1008
/** Third stage: interpolation. **/
1009
interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
1011
/** Final stage: add up the coefficients. **/
1012
tB += mpn_add_n (p + l, p + l, B, l2);
1013
tD += mpn_add_n (p + l3, p + l3, D, l2);
1014
MPN_INCR_U (p + l3, 2*n - l3, tB);
1015
MPN_INCR_U (p + l4, 2*n - l4, tC);
1016
MPN_INCR_U (p + l5, 2*n - l5, tD);
1019
/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/
1021
/* Like previous function but for squaring */
1023
/* FIXME: If TOOM3_SQR_THRESHOLD is big enough it might never get into the
1024
basecase range. Try to arrange those conditonals go dead. */
1025
#define TOOM3_SQR_REC(p, a, n, ws) \
1027
if (BELOW_THRESHOLD (n, BASECASE_SQR_THRESHOLD)) \
1028
mpn_mul_basecase (p, a, n, a, n); \
1029
else if (BELOW_THRESHOLD (n, KARATSUBA_SQR_THRESHOLD)) \
1030
mpn_sqr_basecase (p, a, n); \
1031
else if (BELOW_THRESHOLD (n, TOOM3_SQR_THRESHOLD)) \
1032
mpn_kara_sqr_n (p, a, n, ws); \
1034
mpn_toom3_sqr_n (p, a, n, ws); \
1038
mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
1040
mp_limb_t cB,cC,cD, tB,tC,tD;
1041
mp_limb_t *A,*B,*C,*D,*E, *W;
1042
mp_size_t l,l2,l3,l4,l5,ls;
1044
/* Break n words into chunks of size l, l and ls.
1045
* n = 3*k => l = k, ls = k
1046
* n = 3*k+1 => l = k+1, ls = k-1
1047
* n = 3*k+2 => l = k+1, ls = k
1052
/* this is probably unnecessarily strict */
1053
ASSERT (n >= TOOM3_SQR_THRESHOLD);
1077
/** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
1078
evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
1080
/** Second stage: pointwise multiplies. **/
1081
TOOM3_SQR_REC(D, C, l, W);
1083
if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD);
1085
TOOM3_SQR_REC(C, B, l, W);
1087
/* TO DO: choose one of the following alternatives. */
1089
if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);
1093
tC += add2Times (C + l, C + l, B, l);
1095
tC += add2Times (C + l, C + l, B, l);
1099
TOOM3_SQR_REC(B, A, l, W);
1101
if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB);
1103
TOOM3_SQR_REC(A, a, l, W);
1104
TOOM3_SQR_REC(E, a + l2, ls, W);
1106
/** Third stage: interpolation. **/
1107
interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
1109
/** Final stage: add up the coefficients. **/
1110
tB += mpn_add_n (p + l, p + l, B, l2);
1111
tD += mpn_add_n (p + l3, p + l3, D, l2);
1112
MPN_INCR_U (p + l3, 2*n - l3, tB);
1113
MPN_INCR_U (p + l4, 2*n - l4, tC);
1114
MPN_INCR_U (p + l5, 2*n - l5, tD);
1118
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
1121
ASSERT (! MPN_OVERLAP_P (p, 2*n, a, n));
1122
ASSERT (! MPN_OVERLAP_P (p, 2*n, b, n));
1124
if (n < KARATSUBA_MUL_THRESHOLD)
1125
mpn_mul_basecase (p, a, n, b, n);
1126
else if (n < TOOM3_MUL_THRESHOLD)
1128
/* Allocate workspace of fixed size on stack: fast! */
1129
#if TUNE_PROGRAM_BUILD
1130
mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (TOOM3_MUL_THRESHOLD_LIMIT-1)];
1132
mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (TOOM3_MUL_THRESHOLD-1)];
1134
mpn_kara_mul_n (p, a, b, n, ws);
1136
#if WANT_FFT || TUNE_PROGRAM_BUILD
1137
else if (n < FFT_MUL_THRESHOLD)
1142
/* Use workspace of unknown size in heap, as stack space may
1143
* be limited. Since n is at least TOOM3_MUL_THRESHOLD, the
1144
* multiplication will take much longer than malloc()/free(). */
1145
mp_limb_t wsLen, *ws;
1146
wsLen = MPN_TOOM3_MUL_N_TSIZE (n);
1148
ws = __GMP_ALLOCATE_FUNC_LIMBS ((size_t) wsLen);
1150
ws = TMP_ALLOC ((size_t) wsLen * sizeof(mp_limb_t));
1152
mpn_toom3_mul_n (p, a, b, n, ws);
1154
__GMP_FREE_FUNC_LIMBS (ws, (size_t) wsLen);
1157
#if WANT_FFT || TUNE_PROGRAM_BUILD
1160
mpn_mul_fft_full (p, a, n, b, n);