/* mpz_n_pow_ui -- mpn raised to ulong.

Copyright 2001 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA. */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of
   that routine.  */
#if 0
#define mpn_mul_2              refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif

/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest
   power-of-2 power, ie. b^(2^n).  It'd be possible to choose a bigger
   power, perhaps using __mp_bases[b].big_base for small b, and thereby get
   better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's
   felt that doing so would be more complicated than it's worth, and could
   well end up being a slowdown for small e.  For big e on the other hand
   the algorithm is dominated by mpn_sqr_n so there wouldn't be much of a
   saving.  The current code can be viewed as simply doing the first few
   steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more
   trouble than it's worth.  */
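
/* A minimal standalone sketch of the binary powering used for the small
   cases below (hypothetical helper, not part of the library; it assumes
   b^e fits in a single limb, which the MP_LIMB_T_HALFMAX test below
   guarantees before each squaring).  */
#if 0
static mp_limb_t
limb_pow (mp_limb_t b, unsigned long e)
{
  mp_limb_t  r = 1;
  while (e != 0)
    {
      if ((e & 1) != 0)
        r *= b;             /* fold in b^(2^k) for each set bit k of e */
      b *= b;               /* b, b^2, b^4, b^8, ... */
      e >>= 1;
    }
  return r;
}
#endif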

/* floor(sqrt(MP_LIMB_T_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when BITS_PER_MP_LIMB is odd. */

#define MP_LIMB_T_HALFMAX  (MP_LIMB_T_MAX >> ((BITS_PER_MP_LIMB+1)/2))
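
/* For instance, with BITS_PER_MP_LIMB==32 this gives 0xFFFFFFFF >> 16 ==
   0xFFFF, and 0xFFFF * 0xFFFF == 0xFFFE0001 still fits in a limb; a
   64-bit limb gives 0xFFFFFFFF likewise.  */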

/* The following are for convenience, they update the size and check the
   result is normalized.  */

#define MPN_SQR_N(dst, alloc, src, size)        \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr_n (dst, src, size);                 \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, low, high)  \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, low, high); \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
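
/* Usage sketch for these macros (hypothetical values, not from the
   library): square a normalized 2-limb operand into tp, with xn tracking
   the normalized product size.  */
#if 0
static void
example_sqr (void)
{
  mp_limb_t  xp[2] = { 123, 456 };  /* hypothetical operand */
  mp_size_t  xn = 2;
  mp_limb_t  tp[4];
  MPN_SQR_N (tp, (mp_size_t) 4, xp, xn);  /* xn becomes 3, top limb zero */
}
#endif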

/* We're only interested in ralloc and talloc for assertions and tracing.
   gcc 2.95 recognises those variables are dead in a normal build and
   eliminates them.  Something explicit could be done to avoid touching
   them if other compilers don't manage this.  */

#define SWAP_RP_TP   MPN_PTR_SWAP (rp,ralloc, tp,talloc)

void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr         rp;
  mp_size_t      rtwos_limbs, ralloc, rsize;
  int            rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t      blimb, rl;
  unsigned long  rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t      blimb_low, rl_high;
#else
  mp_limb_t      b_twolimbs[2];
#endif
  TMP_DECL (marker);

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ABSIZ(r), bp, bsize));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);
  TMP_MARK (marker);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;

  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / BITS_PER_MP_LIMB;
  rtwos_bits %= BITS_PER_MP_LIMB;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));
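
  /* For example, with 32-bit limbs, btwos==3 low zero bits in b and e==20
     give rtwos_bits = 60: one more whole zero limb (60/32 == 1) and 28
     bits left over for the final lshift.  */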

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= MP_LIMB_T_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            break;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      if ((e & 1) != 0)
        umul_ppmm (rl_high, rl, rl, blimb);
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb);

      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?) */
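
      /* For instance (32-bit limbs, hypothetical values): rtwos_bits=4
         with rl_high:rl = 0x0:0x12345678 would merge to 0x1:0x23456780,
         changing rl_high to non-zero, so by the third rule the merge is
         skipped and those 4 bits are left to the final lshift.  */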
      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (BITS_PER_MP_LIMB-rtwos_bits)) == 0)
        {
          mp_limb_t new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (BITS_PER_MP_LIMB-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }

#else
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (BITS_PER_MP_LIMB-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (BITS_PER_MP_LIMB - btwos));
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }

#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at
     least half a limb, so in 32 bits the worst case is 2^16+1 treated as
     17 bits when it will power up as just over 16, an overestimate of
     17/16 = 6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */
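
  /* A worked instance of the calculation below (hypothetical numbers):
     with 32-bit limbs, bsize==1 and a 17-bit blimb give cnt==15, so for
     e==10 ralloc = 17*10/32 + 5 = 10 limbs, comfortably above the true
     result size of at most 170 bits = 6 limbs.  */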

  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*BITS_PER_MP_LIMB - cnt) * e / BITS_PER_MP_LIMB + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));

  MPZ_REALLOC (r, ralloc + rtwos_limbs);
  rp = PTR(r);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
     start.  */
  rp[0] = rl;
  rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
  rp[rsize] = rl_high;
  rsize += (rl_high != 0);
#endif
  ASSERT (rp[rsize-1] != 0);

  if (e != 0)
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when
         the low bit of e is zero, tp only has to hold the second last
         power step, which is half the size of the final result.  There's
         no need to round up the divide by 2, since ralloc includes a +2
         for rl which is not needed by tp.  In the mpn_mul loop when the
         low bit of e is 1, tp must hold nearly the full result, so just
         size it the same as r.  */
      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
#else
      if (bsize <= 1 || (e & 1) == 0)
#endif
        talloc = ralloc / 2;
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing
         at the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, e);
      i = BITS_PER_MP_LIMB - cnt - 2;
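
      /* For example e==13 (binary 1101): with 32-bit limbs cnt==28, so i
         starts at 32-28-2 == 2, and bits 2,1,0 of e are 1,0,1, giving
         square+multiply, square, square+multiply below, ie. b^13.  */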

#if HAVE_NATIVE_mpn_mul_2
      if (bsize == 2)
        {
          /* Any bsize==1 will have been powered above to be two limbs. */

          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, blimb_low, blimb);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            MPN_MUL_2 (rp, rsize, ralloc, rl, rl_high);
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int  parity;

          /* Arrange the final result ends up in r, not in the temp space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);

  TMP_FREE (marker);
}