/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.

Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
2004, 2005 Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this file; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/* You have to define the following before including this file:

   UWtype -- An unsigned type, default type for operations (typically a "word")
   UHWtype -- An unsigned type, at least half the size of UWtype.
   UDWtype -- An unsigned type, at least twice as large as UWtype.
   W_TYPE_SIZE -- size in bits of UWtype

   SItype, USItype -- Signed and unsigned 32 bit types.
   DItype, UDItype -- Signed and unsigned 64 bit types.

   On a 32 bit machine UWtype should typically be USItype;
   on a 64 bit machine, UWtype should typically be UDItype. */
#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
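/* For any UWtype t these satisfy the identity
     t == (UWtype) __ll_highpart (t) * __ll_B + __ll_lowpart (t).
   For instance with W_TYPE_SIZE == 32, t = 0x12345678 splits into
   __ll_highpart 0x1234 and __ll_lowpart 0x5678, and
   0x1234 * 0x10000 + 0x5678 == 0x12345678 again.  The generic umul_ppmm
   and __udiv_qrnnd_c further down are built on this half-word
   decomposition. */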
/* This is used to make sure no undesirable sharing between different libraries
   that use this file takes place. */
#define __MPN(x) __##x

/* Prototype glue: real parameter lists for ANSI C and C++ compilers,
   empty ones for pre-ANSI compilers. */
#if (__STDC__-0) || defined (__cplusplus)
#define _PROTO(x) x
#else
#define _PROTO(x) ()
#endif
/* Define auxiliary asm macros.

   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
   word product in HIGH_PROD and LOW_PROD.

   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
   UDWtype product.  This is just a variant of umul_ppmm.

   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator) divides a UDWtype, composed by the UWtype integers
   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If the macro additionally
   requires the most significant bit of DENOMINATOR to be 1, then the
   pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
   is rounded towards 0.

   5) count_leading_zeros(count, x) counts the number of zero-bits from the
   msb to the first non-zero bit in the UWtype X.  This is the number of
   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.

   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
   from the least significant end.

   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two UWtype integers, composed by
   HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.

   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
   and is lost.

   If any of these macros are left undefined for a particular CPU,
   the plain C versions below are used; a usage sketch follows this
   comment.

   For add_ssaaaa the two high and two low addends can both commute, but
   unfortunately gcc only supports one "%" commutative in each asm block.
   This has always been so but is only documented in recent versions
   (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
   compiler error in certain rare circumstances.

   Apparently it was only the last "%" that was ever actually respected, so
   the code has been updated to leave just that.  Clearly there's a free
   choice whether high or low should get it, if there's a reason to favour
   one over the other.  Also obviously when the constraints on the two
   operands are identical there's no benefit to the reloader in any "%" at
   all. */
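/* A usage sketch of the macros above (illustrative only, compiled out):
   multiply two words, then divide the double-word product back by the
   multiplier.  Variable names here are arbitrary.  Note that udiv_qrnnd
   may additionally require a normalized divisor (msb set) when
   UDIV_NEEDS_NORMALIZATION is non-zero. */
#if 0
static void
example (UWtype a, UWtype b)
{
  UWtype ph, pl, q, r, sh, sl;
  umul_ppmm (ph, pl, a, b);        /* (ph,pl) = a * b */
  if (b != 0)                      /* ph < b holds automatically here */
    udiv_qrnnd (q, r, ph, pl, b);  /* q == a and r == 0 */
  add_ssaaaa (sh, sl, ph, pl, ph, pl);  /* (sh,sl) = 2*(ph,pl), carry lost */
}
#endif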
/* The CPUs come in alphabetical order below.

   Please add support for more CPUs here, or improve the current support
   for the CPUs below! */
/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
   3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
   Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
   __builtin_ctzll.

   These builtins are only used where we've checked the code that comes
   out; on some chips they're merely libgcc calls, and there we will
   instead want an inline (either asm or generic C).

   These builtins are better than an asm block of the same insn, since an
   asm block doesn't give gcc any information about scheduling or resource
   usage.  We keep an asm block for use on prior versions of gcc though.

   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
   it's not used (for count_leading_zeros) because it generally gives extra
   code to ensure the result is 0 when the input is 0, which we don't
   need or want. */
#ifdef _LONG_LONG_LIMB
#define count_leading_zeros_gcc_clz(count,x) \
  do { ASSERT ((x) != 0); (count) = __builtin_clzll (x); } while (0)
#else
#define count_leading_zeros_gcc_clz(count,x) \
  do { ASSERT ((x) != 0); (count) = __builtin_clzl (x); } while (0)
#endif

#ifdef _LONG_LONG_LIMB
#define count_trailing_zeros_gcc_ctz(count,x) \
  do { ASSERT ((x) != 0); (count) = __builtin_ctzll (x); } while (0)
#else
#define count_trailing_zeros_gcc_ctz(count,x) \
  do { ASSERT ((x) != 0); (count) = __builtin_ctzl (x); } while (0)
#endif
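/* When UDIV_NEEDS_NORMALIZATION is non-zero the divisor must be shifted up
   until its msb is set before calling udiv_qrnnd, and the remainder shifted
   back down afterwards.  count_leading_zeros provides the shift amount.
   A sketch of the usual pattern (illustrative only, compiled out): */
#if 0
static void
divide_normalized (UWtype *q, UWtype *r, UWtype n1, UWtype n0, UWtype d)
{
  int cnt;
  count_leading_zeros (cnt, d);                   /* d != 0 required */
  if (cnt != 0)
    {
      d <<= cnt;                                  /* set the msb of d */
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;
    }
  udiv_qrnnd (*q, *r, n1, n0, d);
  *r >>= cnt;                                     /* undo the scaling */
}
#endif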
/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
   don't need to be under !NO_ASM */
#if ! defined (NO_ASM)
#if defined (__alpha) && W_TYPE_SIZE == 64
/* Most alpha-based machines, except Cray systems. */
#if defined (__GNUC__)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("umulh %r1,%2,%0" \
	     : "=r" (ph) \
	     : "%rJ" (m0), "rI" (m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#else /* ! __GNUC__ */
#include <machine/builtins.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (ph) = __UMULH (m0, m1); \
    (pl) = __m0 * __m1; \
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di; \
    __di = __MPN(invert_limb) (d); \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
  } while (0)
#define UDIV_PREINV_ALWAYS 1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
#endif /* LONGLONG_STANDALONE */
/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
   always goes into libgmp.so, even when not actually used. */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
#define count_leading_zeros(COUNT,X) \
  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
#define count_trailing_zeros(COUNT,X) \
  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
#endif /* clz/ctz using cix */
#if ! defined (count_leading_zeros) \
  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
   "$31" is written explicitly in the asm, since an "r" constraint won't
   select reg 31.  There seems no need to worry about "r31" syntax for cray,
   since gcc itself (pre-release 3.4) emits just $31 in various places. */
#define ALPHA_CMPBGE_0(dst, src) \
  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
   them, locating the highest non-zero byte.  A second __clz_tab lookup
   counts the leading zero bits in that byte, giving the result. */
#define count_leading_zeros(count, x) \
  do { \
    UWtype __clz__b, __clz__c, __clz__x = (x); \
    ALPHA_CMPBGE_0 (__clz__b, __clz__x);           /* zero bytes */ \
    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \
    __clz__b = __clz__b * 8 - 7;                   /* 57 to 1 shift */ \
    __clz__x >>= __clz__b; \
    __clz__c = __clz_tab [__clz__x];               /* 8 to 1 bit */ \
    __clz__b = 65 - __clz__b; \
    (count) = __clz__b - __clz__c; \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif /* clz using cmpbge */
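/* The same two-lookup idea in portable C, for reference (illustrative only,
   compiled out): step down a byte at a time to locate the highest non-zero
   byte, then finish with one table lookup, using the same __clz_tab
   convention as the generic count_leading_zeros near the end of this
   file. */
#if 0
static int
clz64_bytewise (UDItype x)            /* x != 0 */
{
  int a = 57;                         /* 8*7 + 1: top byte first */
  while (a > 1 && (x >> a) == 0)
    a -= 8;
  return 64 + 1 - a - __clz_tab[x >> a];
}
#endif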
#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
#if HAVE_ATTRIBUTE_CONST
long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const));
#else
long __MPN(count_leading_zeros) _PROTO ((UDItype));
#endif
#define count_leading_zeros(count, x) \
  ((count) = __MPN(count_leading_zeros) (x))
#endif /* clz using mpn */
#endif /* __alpha */
#if defined (_CRAY) && W_TYPE_SIZE == 64
#include <intrinsics.h>
#define UDIV_PREINV_ALWAYS 1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
long __MPN(count_leading_zeros) _PROTO ((UDItype));
#define count_leading_zeros(count, x) \
  ((count) = _leadz ((UWtype) (x)))
#if defined (_CRAYIEEE)		/* I.e., Cray T90/ieee, T3D, and T3E */
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (ph) = _int_mult_upper (m0, m1); \
    (pl) = __m0 * __m1; \
  } while (0)
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di; \
    __di = __MPN(invert_limb) (d); \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
  } while (0)
#endif /* LONGLONG_STANDALONE */
#endif /* _CRAYIEEE */
#endif /* _CRAY */
#if defined (__ia64) && W_TYPE_SIZE == 64
/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
   register, which takes an extra cycle. */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) - (bl); \
    if ((al) < (bl)) \
      (sh) = (ah) - (bh) - 1; \
    else \
      (sh) = (ah) - (bh); \
    (sl) = __x; \
  } while (0)
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
/* Do both product parts in assembly, since that gives better code with
   all gcc versions.  Some callers will just use the upper part, and in
   that situation we waste an instruction, but not any cycles. */
#define umul_ppmm(ph, pl, m0, m1) \
    __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \
	     : "=&f" (ph), "=f" (pl) \
	     : "f" (m0), "f" (m1))
#define count_leading_zeros(count, x) \
  do { \
    UWtype _x = (x), _y, _a, _c; \
    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \
    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \
    _c = (_a - 1) << 3; \
    _x >>= _c; \
    if (_x >= 1 << 4) \
      _x >>= 4, _c += 4; \
    if (_x >= 1 << 2) \
      _x >>= 2, _c += 2; \
    _c += _x >> 1; \
    (count) = W_TYPE_SIZE - 1 - _c; \
  } while (0)
/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
   based, and we don't need a special case for x==0 here */
#define count_trailing_zeros(count, x) \
  do { \
    UWtype __ctz_x = (x); \
    __asm__ ("popcnt %0 = %1" \
	     : "=r" (count) \
	     : "r" ((__ctz_x-1) & ~__ctz_x)); \
  } while (0)
#endif
#if defined (__INTEL_COMPILER)
#include <ia64intrin.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UWtype _m0 = (m0), _m1 = (m1); \
    ph = _m64_xmahu (_m0, _m1, 0); \
    pl = _m0 * _m1; \
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di; \
    __di = __MPN(invert_limb) (d); \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
  } while (0)
#define UDIV_PREINV_ALWAYS 1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
#endif /* LONGLONG_STANDALONE */
#endif /* __ia64 */
#if defined (__GNUC__)

/* We sometimes need to clobber "cc" with gcc2, but that would not be
   understood by gcc1.  Use cpp to avoid major code duplication. */
#if __GNUC__ < 2
#define __CLOBBER_CC
#define __AND_CLOBBER_CC
#else /* __GNUC__ >= 2 */
#define __CLOBBER_CC : "cc"
#define __AND_CLOBBER_CC , "cc"
#endif /* __GNUC__ < 2 */
#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    USItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("multiplu %0,%1,%2" \
	     : "=r" (xl) \
	     : "r" (__m0), "r" (__m1)); \
    __asm__ ("multmu %0,%1,%2" \
	     : "=r" (xh) \
	     : "r" (__m0), "r" (__m1)); \
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("dividu %0,%3,%4" \
	   : "=r" (q), "=q" (r) \
	   : "1" (n1), "r" (n0), "r" (d))
#define count_leading_zeros(count, x) \
    __asm__ ("clz %0,%1" \
	     : "=r" (count) \
	     : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#endif /* __a29k__ */
#if defined (__arc__)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \
	   : "=r" ((USItype) (sh)), \
	     "=&r" ((USItype) (sl)) \
	   : "r" ((USItype) (ah)), \
	     "rIJ" ((USItype) (bh)), \
	     "%r" ((USItype) (al)), \
	     "rIJ" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
	   : "=r" ((USItype) (sh)), \
	     "=&r" ((USItype) (sl)) \
	   : "r" ((USItype) (ah)), \
	     "rIJ" ((USItype) (bh)), \
	     "r" ((USItype) (al)), \
	     "rIJ" ((USItype) (bl)))
#endif /* __arc__ */
#if defined (__arm__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (al)) \
      { \
	if (__builtin_constant_p (ah)) \
	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
	else \
	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
		   : "=r" (sh), "=&r" (sl) \
		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      } \
    else if (__builtin_constant_p (ah)) \
      { \
	if (__builtin_constant_p (bl)) \
	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
	else \
	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      } \
    else if (__builtin_constant_p (bl)) \
      { \
	if (__builtin_constant_p (bh)) \
	  __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
		   : "=r" (sh), "=&r" (sl) \
		   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
	else \
	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
      } \
    else /* only bh might be a constant */ \
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
	       : "=r" (sh), "=&r" (sl) \
	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
  } while (0)
#if 1 || defined (__arm_m__)	/* `M' series has widening multiply support */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#define smul_ppmm(xh, xl, a, b) \
  __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di; \
    __di = __MPN(invert_limb) (d); \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
  } while (0)
#define UDIV_PREINV_ALWAYS 1
#define UDIV_NEEDS_NORMALIZATION 1
#endif /* LONGLONG_STANDALONE */
#else
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("%@ Inlined umul_ppmm\n" \
"	mov	%|r0, %2, lsr #16\n" \
"	mov	%|r2, %3, lsr #16\n" \
"	bic	%|r1, %2, %|r0, lsl #16\n" \
"	bic	%|r2, %3, %|r2, lsl #16\n" \
"	mul	%1, %|r1, %|r2\n" \
"	mul	%|r2, %|r0, %|r2\n" \
"	mul	%|r1, %0, %|r1\n" \
"	mul	%0, %|r0, %0\n" \
"	adds	%|r1, %|r2, %|r1\n" \
"	addcs	%0, %0, #65536\n" \
"	adds	%1, %1, %|r1, lsl #16\n" \
"	adc	%0, %0, %|r1, lsr #16" \
	   : "=&r" (xh), "=r" (xl) \
	   : "r" (a), "r" (b) \
	   : "r0", "r1", "r2")
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r; \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
    (r) = __r; \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
#define UDIV_TIME 200
#endif /* LONGLONG_STANDALONE */
#endif
#endif /* __arm__ */
#if defined (__clipper__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
  __asm__ ("mulwux %2,%0" \
	   : "=r" (__x.__ll) \
	   : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define smul_ppmm(w1, w0, u, v) \
  ({union {DItype __ll; \
	   struct {SItype __l, __h;} __i; \
	  } __x; \
  __asm__ ("mulwx %2,%0" \
	   : "=r" (__x.__ll) \
	   : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w; \
    __asm__ ("mulwux %2,%0" \
	     : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
    __w; })
#endif /* __clipper__ */
/* Fujitsu vector computers. */
#if defined (__uxp__) && W_TYPE_SIZE == 32
#define umul_ppmm(ph, pl, u, v) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
    (ph) = __x.__i.__h; \
    (pl) = __x.__i.__l; \
  } while (0)
#define smul_ppmm(ph, pl, u, v) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
    (ph) = __x.__i.__h; \
    (pl) = __x.__i.__l; \
  } while (0)
#endif /* __uxp__ */
#if defined (__gmicro__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.w %5,%1\n\taddx %3,%0" \
	   : "=g" (sh), "=&g" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
	   : "=g" (sh), "=&g" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(ph, pl, m0, m1) \
  __asm__ ("mulx %3,%0,%1" \
	   : "=g" (ph), "=r" (pl) \
	   : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
#define udiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("divx %4,%0,%1" \
	   : "=g" (q), "=r" (r) \
	   : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
#define count_leading_zeros(count, x) \
  __asm__ ("bsch/1 %1,%0" \
	   : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
#endif /* __gmicro__ */
#if defined (__hppa) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#if defined (_PA_RISC1_1)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
    (wh) = __x.__i.__h; \
    (wl) = __x.__i.__l; \
  } while (0)
#endif
#define count_leading_zeros(count, x) \
  do { \
    USItype __tmp; \
    __asm__ ( \
       "ldi		1,%0\n" \
"	extru,=	%1,15,16,%%r0	; Bits 31..16 zero?\n" \
"	extru,tr	%1,15,16,%1	; No.  Shift down, skip add.\n" \
"	ldo		16(%0),%0	; Yes.  Perform add.\n" \
"	extru,=	%1,23,8,%%r0	; Bits 15..8 zero?\n" \
"	extru,tr	%1,23,8,%1	; No.  Shift down, skip add.\n" \
"	ldo		8(%0),%0	; Yes.  Perform add.\n" \
"	extru,=	%1,27,4,%%r0	; Bits 7..4 zero?\n" \
"	extru,tr	%1,27,4,%1	; No.  Shift down, skip add.\n" \
"	ldo		4(%0),%0	; Yes.  Perform add.\n" \
"	extru,=	%1,29,2,%%r0	; Bits 3..2 zero?\n" \
"	extru,tr	%1,29,2,%1	; No.  Shift down, skip add.\n" \
"	ldo		2(%0),%0	; Yes.  Perform add.\n" \
"	extru	%1,30,1,%1	; Extract bit 1.\n" \
"	sub	%0,%1,%0	; Subtract it.\n" \
	: "=r" (count), "=r" (__tmp) : "1" (x)); \
  } while (0)
#endif /* hppa */
/* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
   (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
   is just a case of no direct support for 2.0n but treating it like 1.0. */
#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#endif /* hppa 64-bit */
#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
#define smul_ppmm(xh, xl, m0, m1) \
  do { \
    union {DItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __asm__ ("lr %N0,%1\n\tmr %0,%2" \
	     : "=&r" (__x.__ll) \
	     : "r" (m0), "r" (m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do { \
    union {DItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __x.__i.__h = n1; __x.__i.__l = n0; \
    __asm__ ("dr %0,%2" \
	     : "=r" (__x.__ll) \
	     : "0" (__x.__ll), "r" (d)); \
    (q) = __x.__i.__l; (r) = __x.__i.__h; \
  } while (0)
#endif /* __i370__, __s390__, __mvs__ */
#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mull %3" \
	   : "=a" (w0), "=d" (w1) \
	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divl %4"		     /* stringification in K&R C */ \
	   : "=a" (q), "=d" (r) \
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
/* Pentium bsrl takes between 10 and 72 cycles depending where the most
   significant 1 bit is, hence the use of the following alternatives.  bsfl
   is slow too, between 18 and 42 depending where the least significant 1
   bit is, so let the generic count_trailing_zeros below make use of the
   count_leading_zeros here too. */

#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
   cache miss reading from __clz_tab.  For P55 it's favoured over the float
   below so as to avoid mixing MMX and x87, since the penalty for switching
   between the two is about 100 cycles.

   The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
   16, -1 for 8, or 0 otherwise.  This could be written equivalently as
   follows, but as of gcc 2.95.2 it results in conditional jumps.

       __shift = -(__n < 0x1000000);
       __shift -= (__n < 0x10000);
       __shift -= (__n < 0x100);

   The middle two sbbl and cmpl's pair, and with luck something gcc
   generates might pair with the first cmpl and the last sbbl.  The "32+1"
   constant could be folded into __clz_tab[], but it doesn't seem worth
   making a different table just for that. */

#define count_leading_zeros(c,n) \
  do { \
    USItype __n = (n); \
    USItype __shift; \
    __asm__ ("cmpl $0x1000000, %1\n" \
	     "sbbl %0, %0\n" \
	     "cmpl $0x10000, %1\n" \
	     "sbbl $0, %0\n" \
	     "cmpl $0x100, %1\n" \
	     "sbbl $0, %0\n" \
	     : "=&r" (__shift) : "r" (__n)); \
    __shift = __shift*8 + 24 + 1; \
    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */
#else /* ! pentiummmx || LONGLONG_STANDALONE */
/* The following should be a fixed 14 cycles or so.  Some scheduling
   opportunities should be available between the float load/store too.  This
   sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
   apparently suggested by the Intel optimizing manual (don't know exactly
   where).  gcc 2.95 or up will be best for this, so the "double" is
   correctly aligned on the stack. */
#define count_leading_zeros(c,n) \
  do { \
    union { \
      double d; \
      unsigned a[2]; \
    } __u; \
    ASSERT ((n) != 0); \
    __u.d = (UWtype) (n); \
    (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
  } while (0)
#define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
#endif /* pentiummx */
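/* The same exponent trick in isolation (illustrative only, compiled out):
   converting n to double produces an IEEE exponent field of
   0x3FF + floor(log2 n), which on little-endian x86 sits in bits 20..30 of
   the high word, so the leading zero count falls out of one subtraction. */
#if 0
static int
clz32_via_double (USItype n)          /* n != 0 */
{
  union { double d; unsigned a[2]; } u;
  u.d = n;                            /* exact: n < 2^32 fits a double */
  return 0x3FF + 31 - (u.a[1] >> 20);
}
#endif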
#else /* ! pentium */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
#define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
#endif /* gcc clz */

/* On P6, gcc prior to 3.0 generates a partial register stall for
   __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
   being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
   cost of one extra instruction.  Do this for "i386" too, since that means
   generic x86. */
#if ! defined (count_leading_zeros) && __GNUC__ < 3 \
  && (HAVE_HOST_CPU_i386 \
      || HAVE_HOST_CPU_i686 \
      || HAVE_HOST_CPU_pentiumpro \
      || HAVE_HOST_CPU_pentium2 \
      || HAVE_HOST_CPU_pentium3)
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
    (count) = 31 - __cbtmp; \
  } while (0)
#endif /* gcc<3 asm bsrl */

#ifndef count_leading_zeros
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
    (count) = __cbtmp ^ 31; \
  } while (0)
#endif /* asm bsrl */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
#define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
#endif /* gcc ctz */

#ifndef count_trailing_zeros
#define count_trailing_zeros(count, x) \
  do { \
    ASSERT ((x) != 0); \
    __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x))); \
  } while (0)
#endif /* asm bsfl */

#endif /* ! pentium */
#endif /* 80x86 */
#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
	     "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
	     "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulq %3" \
	   : "=a" (w0), "=d" (w1) \
	   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divq %4"		     /* stringification in K&R C */ \
	   : "=a" (q), "=d" (r) \
	   : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
#define count_leading_zeros(count, x) \
  do { \
    UDItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
    (count) = __cbtmp ^ 63; \
  } while (0)
/* bsfq destination must be a 64-bit register, "%q0" forces this in case
   count is only an int. */
#define count_trailing_zeros(count, x) \
  do { \
    ASSERT ((x) != 0); \
    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \
  } while (0)
#endif /* __amd64__ */
#if defined (__i860__) && W_TYPE_SIZE == 32
#define rshift_rhlc(r,h,l,c) \
  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
	   : "=r" (r) : "r" (h), "r" (l), "rn" (c))
#endif /* __i860__ */
#if defined (__i960__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
  __asm__ ("emul %2,%1,%0" \
	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w; \
    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
    __w; })
#define udiv_qrnnd(q, r, nh, nl, d) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __nn, __rq; \
    __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
    __asm__ ("ediv %d,%n,%0" \
	     : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
    (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
  } while (0)
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
    (count) = __cbtmp ^ 31; \
  } while (0)
#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
#if defined (__i960mx) /* what is the proper symbol to test??? */
#define rshift_rhlc(r,h,l,c) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __nn; \
    __nn.__i.__h = (h); __nn.__i.__l = (l); \
    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
  } while (0)
#endif /* __i960mx */
#endif /* __i960__ */
#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
     || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
     || defined (__mc5307__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
	   : "=d" (sh), "=&d" (sl) \
	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
	   : "=d" (sh), "=&d" (sl) \
	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */
#if defined (__mc68020__) || defined(mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mcpu32__) || defined (mcpu32) \
     || defined (__NeXT__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0" \
	   : "=d" (w0), "=d" (w1) \
	   : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0" \
	   : "=d" (q), "=d" (r) \
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0" \
	   : "=d" (q), "=d" (r) \
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#else /* for other 68k family members use 16x16->32 multiplication */
#define umul_ppmm(xh, xl, a, b) \
  do { USItype __umul_tmp1, __umul_tmp2; \
	__asm__ ("| Inlined umul_ppmm\n" \
"	move%.l	%5,%3\n" \
"	move%.l	%2,%0\n" \
"	move%.w	%3,%1\n" \
"	swap	%3\n" \
"	swap	%0\n" \
"	mulu%.w	%2,%1\n" \
"	mulu%.w	%3,%0\n" \
"	mulu%.w	%2,%3\n" \
"	swap	%2\n" \
"	mulu%.w	%5,%2\n" \
"	add%.l	%3,%2\n" \
"	jcc	1f\n" \
"	add%.l	%#0x10000,%0\n" \
"1:	move%.l	%2,%3\n" \
"	clr%.w	%2\n" \
"	swap	%2\n" \
"	swap	%3\n" \
"	clr%.w	%3\n" \
"	add%.l	%3,%1\n" \
"	addx%.l	%2,%0\n" \
"	| End inlined umul_ppmm" \
	      : "=&d" (xh), "=&d" (xl), \
		"=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
	      : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
  } while (0)
#define UMUL_TIME 100
#define UDIV_TIME 400
#endif /* not mc68020 */
/* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
   exclude bfffo on that chip (bitfield insns not available). */
#if (defined (__mc68020__) || defined (mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mc68060__) || defined (mc68060) \
     || defined (__NeXT__)) \
  && ! defined (__mcpu32__)
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0" \
	   : "=d" (count) \
	   : "od" ((USItype) (x)), "n" (0))
#define COUNT_LEADING_ZEROS_0 32
#endif /* bitfield insns */
#endif /* m68k */
#if defined (__m88000__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
    (count) = __cbtmp ^ 31; \
  } while (0)
#define COUNT_LEADING_ZEROS_0 63 /* sic */
#if defined (__m88110__)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
    (wh) = __x.__i.__h; \
    (wl) = __x.__i.__l; \
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x, __q; \
  __x.__i.__h = (n1); __x.__i.__l = (n0); \
  __asm__ ("divu.d %0,%1,%2" \
	   : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
  (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
#define UDIV_TIME 25
#else
#define UMUL_TIME 17
#define UDIV_TIME 150
#endif /* __m88110__ */
#endif /* __m88000__ */
#if defined (__mips) && W_TYPE_SIZE == 32
#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#else
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#define UMUL_TIME 10
#define UDIV_TIME 100
#endif /* __mips */

#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#else
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#define UMUL_TIME 20
#define UDIV_TIME 140
#endif /* __mips >= 3 */
#if defined (__ns32000__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
  __asm__ ("meid %2,%0" \
	   : "=g" (__x.__ll) \
	   : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w; \
    __asm__ ("meid %2,%0" \
	     : "=g" (__w) \
	     : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
    __w; })
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
  __x.__i.__h = (n1); __x.__i.__l = (n0); \
  __asm__ ("deid %2,%0" \
	   : "=g" (__x.__ll) \
	   : "0" (__x.__ll), "g" ((USItype)(d))); \
  (r) = __x.__i.__l; (q) = __x.__i.__h; })
#define count_trailing_zeros(count,x) \
  do { \
    __asm__ ("ffsd %2,%0" \
	     : "=r" (count) \
	     : "0" ((USItype) 0), "r" ((USItype) (x))); \
  } while (0)
#endif /* __ns32000__ */
/* In the past we had a block of various #defines tested
       _ARCH_PPC    - AIX
       _ARCH_PWR    - AIX
       __powerpc__  - gcc
       __POWERPC__  - BEOS
       __ppc__      - Darwin
       PPC          - old gcc, GNU/Linux, SysV
   The plain PPC test was not good for vxWorks, since PPC is defined on all
   CPUs there (eg. m68k too), as a constant one is expected to compare
   CPU_FAMILY against.

   At any rate, this was pretty unattractive and a bit fragile.  The use of
   HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
   getting the desired effect.

   ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
   the system vendor compilers.  (Is that vendor compilers with inline asm,
   or what?) */
#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \
  && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (bh) && (bh) == 0) \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else \
      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
	     : "=r" (sh), "=&r" (sl) \
	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (ah) && (ah) == 0) \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0) \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else \
      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
	       : "=r" (sh), "=&r" (sl) \
	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
  } while (0)
#define count_leading_zeros(count, x) \
  __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#if HAVE_HOST_CPU_FAMILY_powerpc
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    USItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#define UMUL_TIME 15
#define smul_ppmm(ph, pl, m0, m1) \
  do { \
    SItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#define SMUL_TIME 14
#define UDIV_TIME 120
#else
#define smul_ppmm(xh, xl, m0, m1) \
  __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
#define sdiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
#define UDIV_TIME 100
#endif
#endif /* 32-bit POWER architecture variants. */
/* We should test _IBMR2 here when we add assembly support for the system
   vendor compilers. */
#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
#if !defined (_LONG_LONG_LIMB)
/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
   use adde etc only when not _LONG_LONG_LIMB. */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (bh) && (bh) == 0) \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else \
      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
	     : "=r" (sh), "=&r" (sl) \
	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (ah) && (ah) == 0) \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0) \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else \
      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
	       : "=r" (sh), "=&r" (sl) \
	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
  } while (0)
#endif /* ! _LONG_LONG_LIMB */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 64
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#define UMUL_TIME 15
#define smul_ppmm(ph, pl, m0, m1) \
  do { \
    DItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#define SMUL_TIME 14  /* ??? */
#define UDIV_TIME 120 /* ??? */
#endif /* 64-bit PowerPC. */
#if defined (__pyr__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addw %5,%1\n\taddwc %3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
  __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
	   : "=&r" (__x.__ll) \
	   : "g" ((USItype) (u)), "g" ((USItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#endif /* __pyr__ */
#if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("a %1,%5\n\tae %0,%3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("s %1,%5\n\tse %0,%3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
#define smul_ppmm(ph, pl, m0, m1) \
  __asm__ ( \
       "s	r2,r2\n" \
"	mts	r10,%2\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	cas	%0,r2,r0\n" \
"	mfs	r10,%1" \
	   : "=r" (ph), "=r" (pl) \
	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
	   : "r2") \
#define UMUL_TIME 20
#define UDIV_TIME 200
#define count_leading_zeros(count, x) \
  do { \
    if ((x) >= 0x10000) \
      __asm__ ("clz %0,%1" \
	       : "=r" (count) : "r" ((USItype)(x) >> 16)); \
    else \
      { \
	__asm__ ("clz %0,%1" \
		 : "=r" (count) : "r" ((USItype)(x))); \
	(count) += 16; \
      } \
  } while (0)
#endif /* RT/ROMP */
#if defined (__sh2__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
#endif /* __sh2__ */
#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \
	   __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
	   __CLOBBER_CC)
/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   doesn't define anything to indicate that to us, it only sets __sparcv8. */
#if defined (__sparc_v9__) || defined (__sparcv9)
/* Perhaps we should use floating-point operations here? */
#if 0
/* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need explicitly zero-extend the inputs? */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
#else
/* Use v8 umul until above bug is fixed. */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#endif
/* Use a plain v8 divide for v9. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    USItype __q; \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
    (r) = (n0) - __q * (d); \
    (q) = __q; \
  } while (0)
#else
#if defined (__sparc_v8__)   /* gcc normal */ \
  || defined (__sparcv8)     /* gcc solaris */ \
  || HAVE_HOST_CPU_supersparc
/* Don't match immediate range because, 1) it is not often useful,
   2) the 'I' flag thinks of the range as a 13 bit signed interval,
   while we want to match a 13 bit interval, sign extended to 32 bits,
   but INTERPRETED AS UNSIGNED. */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))

#if HAVE_HOST_CPU_supersparc
#define UDIV_TIME 60		/* SuperSPARC timing */
#else
/* Don't use this on SuperSPARC because its udiv only handles 53 bit
   dividends and will trap to the kernel for the rest. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    USItype __q; \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
    (r) = (n0) - __q * (d); \
    (q) = __q; \
  } while (0)
#define UDIV_TIME 25
#endif /* HAVE_HOST_CPU_supersparc */
#else /* ! __sparc_v8__ */
#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc. */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n" \
"	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n" \
"	tst	%%g0\n" \
"	divscc	%3,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%0\n" \
"	rd	%%y,%1\n" \
"	bl,a 1f\n" \
"	add	%1,%4,%1\n" \
"1:	! End of inline udiv_qrnnd" \
	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
	   : "%g1" __AND_CLOBBER_CC)
#define UDIV_TIME 37
#define count_leading_zeros(count, x) \
  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but they warn that future
   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   undefined. */
#endif /* __sparclite__ */
#endif /* __sparc_v8__ */
#endif /* __sparc_v9__ */
/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
#ifndef umul_ppmm
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n" \
"	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
"	sra	%3,31,%%g2	! Don't move this insn\n" \
"	and	%2,%%g2,%%g2	! Don't move this insn\n" \
"	andcc	%%g0,0,%%g1	! Don't move this insn\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,0,%%g1\n" \
"	add	%%g1,%%g2,%0\n" \
"	rd	%%y,%1\n" \
	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
	   : "%g1", "%g2" __AND_CLOBBER_CC)
#define UMUL_TIME 39		/* 39 instructions */
#endif
#ifndef udiv_qrnnd
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r; \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
    (r) = __r; \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
#ifndef UDIV_TIME
#define UDIV_TIME 140
#endif
#endif /* LONGLONG_STANDALONE */
#endif /* udiv_qrnnd */
#endif /* __sparc__ */
#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
       "addcc	%r4,%5,%1\n" \
      "	addccc	%r6,%7,%%g0\n" \
      "	addc	%r2,%3,%0" \
	  : "=r" (sh), "=&r" (sl) \
	  : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
	    "%rJ" ((al) >> 32), "rI" ((bl) >> 32) \
	   __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
       "subcc	%r4,%5,%1\n" \
      "	subccc	%r6,%7,%%g0\n" \
      "	subc	%r2,%3,%0" \
	  : "=r" (sh), "=&r" (sl) \
	  : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl), \
	    "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
	   __CLOBBER_CC)
#endif /* __sparc__ 64-bit */
#if defined (__vax__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
	   : "=g" (sh), "=&g" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
	   : "=g" (sh), "=&g" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define smul_ppmm(xh, xl, m0, m1) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
    USItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("emul %1,%2,$0,%0" \
	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do { \
    union {DItype __ll; \
	   struct {SItype __l, __h;} __i; \
	  } __x; \
    __x.__i.__h = n1; __x.__i.__l = n0; \
    __asm__ ("ediv %3,%2,%0,%1" \
	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
  } while (0)
/* FIXME: This instruction appears to be unimplemented on some systems (vax
   8800 maybe). */
#define count_trailing_zeros(count,x) \
  do { \
    __asm__ ("ffs 0, 31, %1, %0" \
	     : "=g" (count) \
	     : "g" ((USItype) (x))); \
  } while (0)
#endif /* __vax__ */
#if defined (__z8000__) && W_TYPE_SIZE == 16
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    union {long int __ll; \
	   struct {unsigned int __h, __l;} __i; \
	  } __x; \
    unsigned int __m0 = (m0), __m1 = (m1); \
    __asm__ ("mult %S0,%H3" \
	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
	     : "%1" (m0), "rQR" (m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
    (xh) += ((((signed int) __m0 >> 15) & __m1) \
	     + (((signed int) __m1 >> 15) & __m0)); \
  } while (0)
#endif /* __z8000__ */
#endif /* __GNUC__ */
#endif /* NO_ASM */

#if !defined (umul_ppmm) && defined (__umulsidi3)
#define umul_ppmm(ph, pl, m0, m1) \
  { \
    UDWtype __ll = __umulsidi3 (m0, m1); \
    ph = (UWtype) (__ll >> W_TYPE_SIZE); \
    pl = (UWtype) __ll; \
  }
#endif

#if !defined (__umulsidi3)
#define __umulsidi3(u, v) \
  ({UWtype __hi, __lo; \
    umul_ppmm (__hi, __lo, u, v); \
    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
#endif
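/* After this point a full product is available both ways round: umul_ppmm
   gives the two halves, __umulsidi3 the double-word value.  A consistency
   sketch (illustrative only, compiled out; requires a UDWtype to exist): */
#if 0
static void
cross_check (UWtype a, UWtype b)
{
  UWtype hi, lo;
  UDWtype w = __umulsidi3 (a, b);
  umul_ppmm (hi, lo, a, b);
  ASSERT (hi == (UWtype) (w >> W_TYPE_SIZE) && lo == (UWtype) w);
}
#endif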
/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   forms have "reversed" arguments, meaning the pointer is last, which
   sometimes allows better parameter passing, in particular on 64-bit
   hppa. */

#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern UWtype mpn_umul_ppmm _PROTO ((UWtype *, UWtype, UWtype));
#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    UWtype __umul_ppmm__p0; \
    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v)); \
    (wl) = __umul_ppmm__p0; \
  } while (0)
#endif

#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern UWtype mpn_umul_ppmm_r _PROTO ((UWtype, UWtype, UWtype *));
#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    UWtype __umul_ppmm__p0; \
    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0); \
    (wl) = __umul_ppmm__p0; \
  } while (0)
#endif

#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern UWtype mpn_udiv_qrnnd _PROTO ((UWtype *, UWtype, UWtype, UWtype));
#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    UWtype __udiv_qrnnd__r; \
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \
			  (UWtype) (n1), (UWtype) (n0), (UWtype) d); \
    (r) = __udiv_qrnnd__r; \
  } while (0)
#endif

#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern UWtype mpn_udiv_qrnnd_r _PROTO ((UWtype, UWtype, UWtype, UWtype *));
#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    UWtype __udiv_qrnnd__r; \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d, \
			    &__udiv_qrnnd__r); \
    (r) = __udiv_qrnnd__r; \
  } while (0)
#endif
/* If this machine has no inline assembler, use C macros. */

#if !defined (add_ssaaaa)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) + (bl); \
    (sh) = (ah) + (bh) + (__x < (al)); \
    (sl) = __x; \
  } while (0)
#endif

#if !defined (sub_ddmmss)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) - (bl); \
    (sh) = (ah) - (bh) - ((al) < (bl)); \
    (sl) = __x; \
  } while (0)
#endif
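/* Chaining these gives wider adds: for a three-word sum the carry out of
   the low word pair must be recovered by hand, since add_ssaaaa discards
   it.  A sketch (illustrative only, compiled out): */
#if 0
static void
add3 (UWtype *s2, UWtype *s1, UWtype *s0,
      UWtype a2, UWtype a1, UWtype a0,
      UWtype b2, UWtype b1, UWtype b0)
{
  UWtype c;
  add_ssaaaa (*s1, *s0, a1, a0, b1, b0);
  /* carry out iff the two-word result wrapped below (a1,a0) */
  c = (*s1 < a1 || (*s1 == a1 && *s0 < a0));
  *s2 = a2 + b2 + c;
}
#endif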
/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   smul_ppmm. */
#if !defined (umul_ppmm) && defined (smul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __w1; \
    UWtype __xm0 = (u), __xm1 = (v); \
    smul_ppmm (__w1, w0, __xm0, __xm1); \
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
	 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
  } while (0)
#endif
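/* The correction terms come from the identity
     (u * v) mod 2^(2W) == u_s * v_s + 2^W * ((u >> (W-1)) ? v : 0)
				     + 2^W * ((v >> (W-1)) ? u : 0)
   where u_s, v_s are u and v read as signed words: a negative signed
   factor is 2^W smaller than its unsigned reading, so each negative input
   owes the product one copy of the other factor in the high word.
   -(x >> (W_TYPE_SIZE - 1)) is an all-ones mask exactly when the top bit
   of x is set, making "& __xm1" that conditional add. */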
/* If we still don't have umul_ppmm, define it using plain C.

   For reference, when this code is used for squaring (ie. u and v identical
   expressions), gcc recognises __x1 and __x2 are the same and generates 3
   multiplies, not 4.  The subsequent additions could be optimized a bit,
   but the only place GMP currently uses such a square is mpn_sqr_basecase,
   and chips obliged to use this generic C umul will have plenty of worse
   performance problems than a couple of extra instructions on the diagonal
   of sqr_basecase. */

#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __x0, __x1, __x2, __x3; \
    UHWtype __ul, __vl, __uh, __vh; \
    UWtype __u = (u), __v = (v); \
\
    __ul = __ll_lowpart (__u); \
    __uh = __ll_highpart (__u); \
    __vl = __ll_lowpart (__v); \
    __vh = __ll_highpart (__v); \
\
    __x0 = (UWtype) __ul * __vl; \
    __x1 = (UWtype) __ul * __vh; \
    __x2 = (UWtype) __uh * __vl; \
    __x3 = (UWtype) __uh * __vh; \
\
    __x1 += __ll_highpart (__x0);/* this can't give carry */ \
    __x1 += __x2;		/* but this indeed can */ \
    if (__x1 < __x2)		/* did we get it? */ \
      __x3 += __ll_B;		/* yes, add it in the proper pos. */ \
\
    (w1) = __x3 + __ll_highpart (__x1); \
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
  } while (0)
#endif
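/* A worked instance (W_TYPE_SIZE == 32): u = v = 0xFFFFFFFF gives
   __ul = __uh = __vl = __vh = 0xFFFF, so __x0 = __x1 = __x2 = __x3 =
   0xFFFE0001.  Then __x1 += __ll_highpart (__x0) makes 0xFFFEFFFF, and
   __x1 += __x2 wraps to 0xFFFD0000 with the carry folded into __x3 as
   __ll_B, giving __x3 = 0xFFFF0001.  The result assembles to
   w1 = 0xFFFFFFFE, w0 = 0x00000001, i.e. (2^32 - 1)^2 = 2^64 - 2^33 + 1. */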
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another). */
#if !defined (smul_ppmm)
#define smul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __w1; \
    UWtype __xm0 = (u), __xm1 = (v); \
    umul_ppmm (__w1, w0, __xm0, __xm1); \
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
	 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
  } while (0)
#endif
/* Define this unconditionally, so it can be used for debugging. */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do { \
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
\
    ASSERT ((d) != 0); \
    ASSERT ((n1) < (d)); \
\
    __d1 = __ll_highpart (d); \
    __d0 = __ll_lowpart (d); \
\
    __q1 = (n1) / __d1; \
    __r1 = (n1) - __q1 * __d1; \
    __m = __q1 * __d0; \
    __r1 = __r1 * __ll_B | __ll_highpart (n0); \
    if (__r1 < __m) \
      { \
	__q1--, __r1 += (d); \
	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
	  if (__r1 < __m) \
	    __q1--, __r1 += (d); \
      } \
    __r1 -= __m; \
\
    __q0 = __r1 / __d1; \
    __r0 = __r1 - __q0 * __d1; \
    __m = __q0 * __d0; \
    __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
    if (__r0 < __m) \
      { \
	__q0--, __r0 += (d); \
	if (__r0 >= (d)) \
	  if (__r0 < __m) \
	    __q0--, __r0 += (d); \
      } \
    __r0 -= __m; \
\
    (q) = __q1 * __ll_B | __q0; \
    (r) = __r0; \
  } while (0)
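/* __udiv_qrnnd_c needs the same preconditions as udiv_qrnnd with
   UDIV_NEEDS_NORMALIZATION: n1 < d and the msb of d set.  A quick worked
   case with W_TYPE_SIZE == 32: n1 = 1, n0 = 0, d = 0x80000000 (dividing
   2^32 by 2^31) gives __d1 = 0x8000, __d0 = 0, __q1 = 0, then
   __r1 = 0x10000, __q0 = 2, and both remainders 0, assembling the
   expected quotient q = 2 with r = 0. */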
/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere). */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
#define udiv_qrnnd(q, r, nh, nl, d) \
  do { \
    UWtype __r; \
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
    (r) = __r; \
  } while (0)
#endif

/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */
#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif
#if !defined (count_leading_zeros)
#define count_leading_zeros(count, x) \
  do { \
    UWtype __xr = (x); \
    UWtype __a; \
\
    if (W_TYPE_SIZE == 32) \
      { \
	__a = __xr < ((UWtype) 1 << 2*__BITS4) \
	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \
	  : 3*__BITS4 + 1); \
      } \
    else \
      { \
	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
	  if (((__xr >> __a) & 0xff) != 0) \
	    break; \
	++__a; \
      } \
\
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif
/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
extern const unsigned char __GMP_DECLSPEC __clz_tab[128];
#endif
#if !defined (count_trailing_zeros)
/* Define count_trailing_zeros using count_leading_zeros.  The latter might be
   defined in asm, but if it is not, the C version above is good enough. */
#define count_trailing_zeros(count, x) \
  do { \
    UWtype __ctz_x = (x); \
    UWtype __ctz_c; \
    ASSERT (__ctz_x != 0); \
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
    (count) = W_TYPE_SIZE - 1 - __ctz_c; \
  } while (0)
#endif
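/* x & -x isolates the lowest set bit.  For example x = 0b101000 gives
   x & -x = 0b1000; count_leading_zeros of that is W_TYPE_SIZE - 4, so the
   trailing zero count comes out as W_TYPE_SIZE - 1 - (W_TYPE_SIZE - 4) = 3,
   as expected. */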
#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   hence that the latter should always be used. */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

/* Give defaults for UMUL_TIME and UDIV_TIME. */
#ifndef UMUL_TIME
#define UMUL_TIME 1
#endif

#ifndef UDIV_TIME
#define UDIV_TIME UMUL_TIME
#endif