/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright (C) 1996, 1998, 2000 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA. */
#ifndef GCDEXT_THRESHOLD
#define GCDEXT_THRESHOLD 17
#endif

int arr[BITS_PER_MP_LIMB];
/* mpn_gcdext (GP, SP, SSIZE, UP, USIZE, VP, VSIZE)

   Compute the extended GCD of {UP,USIZE} and {VP,VSIZE} and store the
   greatest common divisor at GP (unless GP is a null pointer), and the first
   cofactor at SP.  Write the size of the cofactor through the pointer SSIZE.
   Return the size of the value at GP.  Note that the cofactor at SP might be
   negative; this is denoted by storing the negative of its size through
   SSIZE.

   {UP,USIZE} and {VP,VSIZE} are both clobbered.

   The space allocation for all four areas needs to be USIZE+1.

   Preconditions: 1) U >= V.  */
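/* Usage sketch (illustrative only, not taken from the library sources):
   extended GCD of two 2-limb operands.  All four areas are sized USIZE+1
   limbs, U >= V, and both inputs are clobbered by the call.

     mp_limb_t up[3] = { 4, 7, 0 };      U = 7*B + 4, with B the limb base
     mp_limb_t vp[3] = { 9, 2, 0 };      V = 2*B + 9
     mp_limb_t gp[3], sp[3];
     mp_size_t gsize, ssize;

     gsize = mpn_gcdext (gp, sp, &ssize, up, 2, vp, 2);

   On return {gp,gsize} holds gcd(U,V) and {sp,ABS(ssize)} the first
   cofactor, which is negative exactly when ssize < 0.  */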
/* We use Lehmer's algorithm.  The idea is to extract the most significant
   bits of the operands, and compute the continued fraction for them.  We then
   apply the gathered cofactors to the full operands.

   Idea 1: After we have performed a full division, don't shift operands back,
           but instead account for the extra factors-of-2 thus introduced.
   Idea 2: A simple generalization to use divide-and-conquer would give us an
           algorithm that runs faster than O(n^2).
   Idea 3: The input numbers need less space as the computation progresses,
           while the s0 and s1 variables need more space.  To save memory, we
           could make them share space, and have the latter variables grow
           into the former.
   Idea 4: We should not do double-limb arithmetic from the start.  Instead,
           do things in single-limb arithmetic until the quotients differ,
           and then switch to double-limb arithmetic.  */
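/* Sketch of the single-limb phase referred to in Idea 4 (an illustration
   in the classic signed formulation, essentially Knuth's Algorithm L; the
   actual loop below keeps A, B, C, D unsigned and alternates the signs of
   the corrections instead).  uh and vh are the top limbs of U and V:

     A = 1; B = 0; C = 0; D = 1;
     while (vh + C != 0 && vh + D != 0)
       {
         q = (uh + A) / (vh + C);
         if (q != (uh + B) / (vh + D))
           break;                            the two trial quotients differ
         T = A - q * C;  A = C;  C = T;      update the 2x2 cofactor matrix
         T = B - q * D;  B = D;  D = T;
         T = uh - q * vh;  uh = vh;  vh = T;
       }

   As long as the two trial quotients agree, the true multi-precision
   quotient is known to equal q, so only the single-limb approximations and
   the cofactor matrix need updating; the full operands are touched only
   when the loop stops.  */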
/* Division optimized for small quotients.  If the quotient is more than
   one limb, store 1 in *qh and return 0.  */
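/* Illustration (not from the original sources): with numerator
   N = n1*B + n0 and denominator D = d1*B + d0, where B = 2^BITS_PER_MP_LIMB,
   div2 computes floor(N/D) when that quotient fits in a single limb.
   For example n1=5, n0=0, d1=2, d0=0 gives the quotient 2, while a case
   such as n1=5, n0=0, d1=0, d0=1 yields a quotient wider than one limb,
   so *qh is set to 1 and 0 is returned.  */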
div2 (mp_limb_t *qh, mp_limb_t n1, mp_limb_t n0, mp_limb_t d1, mp_limb_t d0)
div2 (qh, n1, n0, d1, d0)
if ((mp_limb_signed_t) n1 < 0)
for (cnt = 1; (mp_limb_signed_t) d1 >= 0; cnt++)
  d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1));
if (n1 > d1 || (n1 == d1 && n0 >= d0))
  sub_ddmmss (n1, n0, n1, n0, d1, d0);
d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1);
for (cnt = 0; n1 > d1 || (n1 == d1 && n0 >= d0); cnt++)
  d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1));
d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1);
if (n1 > d1 || (n1 == d1 && n0 >= d0))
  sub_ddmmss (n1, n0, n1, n0, d1, d0);
mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
            mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
mpn_gcdext (gp, s0p, s0size, up, size, vp, vsize)
mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
mpn_gcd (gp, up, size, vp, vsize)
mp_limb_t A, B, C, D;
mp_ptr orig_s0p = s0p;
use_double_flag = (size >= GCDEXT_THRESHOLD);
tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
s1p = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
MPN_ZERO (s0p, size);
MPN_ZERO (s1p, size);
/* Normalize V (and shift up U the same amount).  */
count_leading_zeros (cnt, vp[vsize - 1]);
mpn_lshift (vp, vp, vsize, cnt);
cy = mpn_lshift (up, up, size, cnt);
mpn_divmod (up + vsize, up, size, vp, vsize);
/* This is really what it boils down to in this case...  */
mpn_rshift (up, up, size, cnt);
mpn_rshift (vp, vp, size, cnt);
MP_PTR_SWAP (up, vp);
/* Figure out exact size of V.  */
MPN_NORMALIZE (vp, vsize);
mp_limb_t uh, vh, ul, vl;
/* Let UH,UL be the most significant limbs of U, and let VH,VL be
   the corresponding bits from V.  */
count_leading_zeros (cnt, uh);
uh = (uh << cnt) | (ul >> (BITS_PER_MP_LIMB - cnt));
vh = (vh << cnt) | (vl >> (BITS_PER_MP_LIMB - cnt));
ul |= (up[size - 3] >> (BITS_PER_MP_LIMB - cnt));
vl |= (vp[size - 3] >> (BITS_PER_MP_LIMB - cnt));
mp_limb_t qh, q1, q2;
mp_limb_t nh, nl, dh, dl;
sub_ddmmss (dh, dl, vh, vl, 0, C);
add_ssaaaa (nh, nl, uh, ul, 0, A);
q1 = div2 (&qh, nh, nl, dh, dl);
break; /* could handle this */
add_ssaaaa (dh, dl, vh, vl, 0, D);
sub_ddmmss (nh, nl, uh, ul, 0, B);
q2 = div2 (&qh, nh, nl, dh, dl);
break; /* could handle this */
umul_ppmm (t1, t0, q1, vl);
sub_ddmmss (Th, Tl, uh, ul, t1, t0);
add_ssaaaa (dh, dl, vh, vl, 0, C);
sub_ddmmss (nh, nl, uh, ul, 0, A);
q1 = div2 (&qh, nh, nl, dh, dl);
break; /* could handle this */
sub_ddmmss (dh, dl, vh, vl, 0, D);
add_ssaaaa (nh, nl, uh, ul, 0, B);
q2 = div2 (&qh, nh, nl, dh, dl);
break; /* could handle this */
umul_ppmm (t1, t0, q1, vl);
sub_ddmmss (Th, Tl, uh, ul, t1, t0);
else /* Same, but using single-limb calculations.  */
/* Make UH be the most significant limb of U, and make VH be
   the corresponding bits from V.  */
count_leading_zeros (cnt, uh);
uh = (uh << cnt) | (up[size - 2] >> (BITS_PER_MP_LIMB - cnt));
vh = (vh << cnt) | (vp[size - 2] >> (BITS_PER_MP_LIMB - cnt));
if (vh - C == 0 || vh + D == 0)
q = (uh + A) / (vh - C);
if (q != (uh - B) / (vh + D))
q = (uh - A) / (vh + C);
if (q != (uh + B) / (vh - D))
max = MAX (A, max); max = MAX (B, max);
max = MAX (C, max); max = MAX (D, max);
/* This is quite rare.  I.e., optimize something else!  */
/* Normalize V (and shift up U the same amount).  */
count_leading_zeros (cnt, vp[vsize - 1]);
mpn_lshift (vp, vp, vsize, cnt);
cy = mpn_lshift (up, up, size, cnt);
qh = mpn_divmod (up + vsize, up, size, vp, vsize);
MPN_COPY (tp, s0p, ssize);
qsize = size - vsize; /* size of stored quotient from division */
MPN_ZERO (tp + ssize, qsize - ssize);
MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
for (i = 0; i < ssize; i++)
  cy = mpn_addmul_1 (tp + i, up + vsize, qsize, s1p[i]);
cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize);
MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
for (i = 0; i < qsize; i++)
  cy = mpn_addmul_1 (tp + i, s1p, ssize, up[vsize + i]);
cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize);
tp[qsize + ssize] = cy;
s1p[qsize + ssize] = 0;
ssize -= tp[ssize - 1] == 0;
MP_PTR_SWAP (s0p, s1p);
MP_PTR_SWAP (s1p, tp);
mpn_rshift (up, up, size, cnt);
mpn_rshift (vp, vp, size, cnt);
MP_PTR_SWAP (up, vp);
mp_size_t tsize, wsize;
{ mp_limb_t x; x = A | B | C | D; count_leading_zeros (cnt, x);
  arr[BITS_PER_MP_LIMB - cnt]++; }
/* B == 1 and C == 1 (D is arbitrary) */
MPN_COPY (tp, vp, size);
MPN_COPY (wp, up, size);
mpn_submul_1 (wp, vp, size, D);
MP_PTR_SWAP (tp, up);
MP_PTR_SWAP (wp, vp);
MPN_COPY (tp, s1p, ssize);
tp[ssize] = 0; /* must zero since wp might spill below */
MPN_COPY (wp, s0p, ssize);
cy = mpn_addmul_1 (wp, s1p, ssize, D);
wsize = ssize + (cy != 0);
MP_PTR_SWAP (tp, s0p);
MP_PTR_SWAP (wp, s1p);
ssize = MAX (wsize, tsize);
mpn_mul_1 (tp, vp, size, B);
mpn_submul_1 (tp, up, size, A);
mpn_mul_1 (wp, up, size, C);
mpn_submul_1 (wp, vp, size, D);
MP_PTR_SWAP (tp, up);
MP_PTR_SWAP (wp, vp);
cy = mpn_mul_1 (tp, s1p, ssize, B);
cy += mpn_addmul_1 (tp, s0p, ssize, A);
tsize = ssize + (cy != 0);
cy = mpn_mul_1 (wp, s0p, ssize, C);
cy += mpn_addmul_1 (wp, s1p, ssize, D);
wsize = ssize + (cy != 0);
MP_PTR_SWAP (tp, s0p);
MP_PTR_SWAP (wp, s1p);
ssize = MAX (wsize, tsize);
mpn_mul_1 (tp, up, size, A);
mpn_submul_1 (tp, vp, size, B);
mpn_mul_1 (wp, vp, size, D);
mpn_submul_1 (wp, up, size, C);
MP_PTR_SWAP (tp, up);
MP_PTR_SWAP (wp, vp);
cy = mpn_mul_1 (tp, s0p, ssize, A);
cy += mpn_addmul_1 (tp, s1p, ssize, B);
tsize = ssize + (cy != 0);
cy = mpn_mul_1 (wp, s1p, ssize, D);
cy += mpn_addmul_1 (wp, s0p, ssize, C);
wsize = ssize + (cy != 0);
MP_PTR_SWAP (tp, s0p);
MP_PTR_SWAP (wp, s1p);
ssize = MAX (wsize, tsize);
size -= up[size - 1] == 0;
printf ("max: %lx\n", max);
{int i; for (i = 0; i < BITS_PER_MP_LIMB; i++) printf ("%d:%d\n", i, arr[i]);}
if (gp != up && gp != 0)
  MPN_COPY (gp, up, size);
MPN_NORMALIZE (s0p, ssize);
MPN_COPY (orig_s0p, s0p, ssize);
*s0size = sign >= 0 ? ssize : -ssize;
t = mpn_divmod_1 (wp, up, size, vl);
MPN_COPY (tp, s0p, ssize);
qsize = size - (wp[size - 1] == 0); /* size of quotient from division */
MPN_ZERO (tp + ssize, qsize - ssize);
MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
for (i = 0; i < ssize; i++)
  cy = mpn_addmul_1 (tp + i, wp, qsize, s1p[i]);
MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
for (i = 0; i < qsize; i++)
  cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]);
ssize -= tp[ssize - 1] == 0;
MP_PTR_SWAP (s0p, s1p);
MP_PTR_SWAP (s1p, tp);
t = mpn_mod_1 (up, size, vl);
MPN_COPY (tp, s0p, ssize);
MPN_ZERO (s1p + ssize, 1); /* zero s1 too */
cy = mpn_addmul_1 (tp, s1p, ssize, q);
ssize -= tp[ssize - 1] == 0;
MP_PTR_SWAP (s0p, s1p);
MP_PTR_SWAP (s1p, tp);
MPN_NORMALIZE (s0p, ssize);
MPN_COPY (orig_s0p, s0p, ssize);
*s0size = sign >= 0 ? ssize : -ssize;