#ifndef MULTI_ARITH_H
#define MULTI_ARITH_H
#if 0 /* old code... */
/* Unsigned only, because we don't need signs to multiply and divide. */
typedef unsigned int int128[4];

/* Word order within an int128: most significant word first */
enum {
	MSW128,
	NMSW128,
	NLSW128,
	LSW128
};

/* Halves of a 64-bit value (big-endian, so the high word comes first) */
#define LO_WORD(ll) (((unsigned int *) &ll)[1])
#define HI_WORD(ll) (((unsigned int *) &ll)[0])
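
/* Added illustration (not in the original source): m68k is big-endian,
   so the high 32 bits of a 64-bit value sit at the lower address.  For
   unsigned long long v = 0x0000000200000001ULL, HI_WORD(v) == 2 and
   LO_WORD(v) == 1. */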
/* Convenience functions to stuff various integer values into int128s */

static inline void zero128(int128 a)
{
	a[LSW128] = a[NLSW128] = a[NMSW128] = a[MSW128] = 0;
}

/* Human-readable word order in the arguments */
static inline void set128(unsigned int i3, unsigned int i2, unsigned int i1,
			  unsigned int i0, int128 a)
{
	a[LSW128] = i0;
	a[NLSW128] = i1;
	a[NMSW128] = i2;
	a[MSW128] = i3;
}

/* Convenience functions (for testing as well) */
static inline void int64_to_128(unsigned long long src, int128 dest)
{
	dest[LSW128] = (unsigned int) src;
	dest[NLSW128] = src >> 32;
	dest[NMSW128] = dest[MSW128] = 0;
}

static inline void int128_to_64(const int128 src, unsigned long long *dest)
{
	*dest = src[LSW128] | (long long) src[NLSW128] << 32;
}

static inline void put_i128(const int128 a)
{
	printk("%08x %08x %08x %08x\n", a[MSW128], a[NMSW128],
	       a[NLSW128], a[LSW128]);
}

/* Internal shifters:

   Note that these are only good for 0 < count < 32. */

static inline void _lsl128(unsigned int count, int128 a)
{
	a[MSW128] = (a[MSW128] << count) | (a[NMSW128] >> (32 - count));
	a[NMSW128] = (a[NMSW128] << count) | (a[NLSW128] >> (32 - count));
	a[NLSW128] = (a[NLSW128] << count) | (a[LSW128] >> (32 - count));
	a[LSW128] <<= count;
}

static inline void _lsr128(unsigned int count, int128 a)
{
	a[LSW128] = (a[LSW128] >> count) | (a[NLSW128] << (32 - count));
	a[NLSW128] = (a[NLSW128] >> count) | (a[NMSW128] << (32 - count));
	a[NMSW128] = (a[NMSW128] >> count) | (a[MSW128] << (32 - count));
	a[MSW128] >>= count;
}
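
/* Added note (not in the original source): the bounds matter because a
   count of 0 would evaluate "x >> 32" and a count of 32 would evaluate
   "x << 32", both undefined for 32-bit operands in C.  The generalized
   shifters below only call these helpers with 0 < count % 32 < 32. */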

/* Should be faster, one would hope */

static inline void lslone128(int128 a)
{
	asm volatile ("lsl.l #1,%0\n roxl.l #1,%1\n roxl.l #1,%2\n roxl.l #1,%3"
		      : "=d" (a[LSW128]), "=d" (a[NLSW128]),
			"=d" (a[NMSW128]), "=d" (a[MSW128])
		      : "0" (a[LSW128]), "1" (a[NLSW128]),
			"2" (a[NMSW128]), "3" (a[MSW128]));
}

static inline void lsrone128(int128 a)
{
	asm volatile ("lsr.l #1,%0\n roxr.l #1,%1\n roxr.l #1,%2\n roxr.l #1,%3"
		      : "=d" (a[MSW128]), "=d" (a[NMSW128]),
			"=d" (a[NLSW128]), "=d" (a[LSW128])
		      : "0" (a[MSW128]), "1" (a[NMSW128]),
			"2" (a[NLSW128]), "3" (a[LSW128]));
}

/* Generalized 128-bit shifters:

   These bit-shift to a multiple of 32, then move whole longwords. */

static inline void lsl128(unsigned int count, int128 a)
{
	int wordcount, i;

	if (count % 32)
		_lsl128(count % 32, a);

	if (0 == (wordcount = count / 32))
		return;

	/* argh, gak, endian-sensitive */
	for (i = 0; i < 4 - wordcount; i++) {
		a[i] = a[i + wordcount];
	}

	for (i = 3; i >= 4 - wordcount; --i) {
		a[i] = 0;
	}
}

static inline void lsr128(unsigned int count, int128 a)
{
	int wordcount, i;

	if (count % 32)
		_lsr128(count % 32, a);

	if (0 == (wordcount = count / 32))
		return;

	for (i = 3; i >= wordcount; --i) {
		a[i] = a[i - wordcount];
	}

	for (i = 0; i < wordcount; i++) {
		a[i] = 0;
	}
}
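
/* Usage sketch (added; not in the original source): a shift by 40
   decomposes into a bit-shift by 8 (40 % 32) plus a move by one
   longword (40 / 32). */
static inline void lsl128_example(void)
{
	int128 x;

	set128(0, 0, 0, 0x12345678, x);
	lsl128(40, x);
	put_i128(x);	/* prints: 00000000 00000012 34567800 00000000 */
}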

static inline void orl128(int a, int128 b)
{
	b[LSW128] |= a;
}

static inline int btsthi128(const int128 a)
{
	return a[MSW128] & 0x80000000;
}

/* test bits (numbered from 0 = LSB) up to and including "top" */
static inline int bftestlo128(int top, const int128 a)
{
	int r = 0;

	if (top > 31)
		r |= a[LSW128];
	if (top > 63)
		r |= a[NLSW128];
	if (top > 95)
		r |= a[NMSW128];
	r |= a[3 - (top / 32)] & ((1 << (top % 32 + 1)) - 1);

	return (r != 0);
}

/* Aargh.  We need these because GCC is broken */
/* FIXME: do them in assembly, for goodness' sake! */
static inline void mask64(int pos, unsigned long long *mask)
{
	*mask = 0;

	if (pos < 32) {
		LO_WORD(*mask) = (1 << pos) - 1;
		return;
	}
	LO_WORD(*mask) = 0xffffffff;
	HI_WORD(*mask) = (1 << (pos - 32)) - 1;
}

static inline void bset64(int pos, unsigned long long *dest)
{
	/* This conditional will be optimized away.  Thanks, GCC! */
	if (pos < 32)
		asm volatile ("bset %1,%0" : "=m"
			      (LO_WORD(*dest)) : "id" (pos));
	else
		asm volatile ("bset %1,%0" : "=m"
			      (HI_WORD(*dest)) : "id" (pos - 32));
}

static inline int btst64(int pos, unsigned long long dest)
{
	if (pos < 32)
		return (0 != (LO_WORD(dest) & (1 << pos)));
	else
		return (0 != (HI_WORD(dest) & (1 << (pos - 32))));
}
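
/* Usage sketch (added; not in the original source): building and
   testing a 64-bit mask with the helpers above. */
static inline void mask64_example(void)
{
	unsigned long long m;

	mask64(35, &m);		/* m == 0x00000007ffffffffULL */
	bset64(63, &m);		/* m == 0x80000007ffffffffULL */
	if (btst64(63, m))
		printk("bit 63 is set\n");
}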

static inline void lsl64(int count, unsigned long long *dest)
{
	if (count < 32) {
		HI_WORD(*dest) = (HI_WORD(*dest) << count)
		    | (LO_WORD(*dest) >> (32 - count));
		LO_WORD(*dest) <<= count;
		return;
	}
	count -= 32;
	HI_WORD(*dest) = LO_WORD(*dest) << count;
	LO_WORD(*dest) = 0;
}

static inline void lsr64(int count, unsigned long long *dest)
{
	if (count < 32) {
		LO_WORD(*dest) = (LO_WORD(*dest) >> count)
		    | (HI_WORD(*dest) << (32 - count));
		HI_WORD(*dest) >>= count;
		return;
	}
	count -= 32;
	LO_WORD(*dest) = HI_WORD(*dest) >> count;
	HI_WORD(*dest) = 0;
}

#endif /* old code */

static inline void fp_denormalize(struct fp_ext *reg, unsigned int cnt)
{
	/* ... */
}

static inline unsigned int fp_fls128(union fp_mant128 *src)
{
	unsigned long data;
	unsigned int res, off;

	if ((data = src->m32[0]))
		off = 0;
	else if ((data = src->m32[1]))
		off = 32;
	else if ((data = src->m32[2]))
		off = 64;
	else if ((data = src->m32[3]))
		off = 96;
	else
		return 128;

	asm ("bfffo %1{#0,#32},%0" : "=d" (res) : "dm" (data));
	return res + off;
}
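
/* Added note (not in the original source): unlike ffs(3), fp_fls128()
   counts from the most significant end of the mantissa.  For
   m32[] = { 0, 0x00008000, 0, 0 } it returns 32 + 16 = 48, and for an
   all-zero mantissa it returns 128. */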

/* Shift the 128-bit mantissa left (shift > 0) or right (shift < 0). */
static inline void fp_shiftmant128(union fp_mant128 *src, int shift)
{
	unsigned long sticky;

	switch (shift) {
	case 0:
		return;
	case 1:
		asm volatile ("lsl.l #1,%0"
			: "=d" (src->m32[3]) : "0" (src->m32[3]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (src->m32[2]) : "0" (src->m32[2]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (src->m32[1]) : "0" (src->m32[1]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (src->m32[0]) : "0" (src->m32[0]));
		return;
	case 2 ... 31:
		src->m32[0] = (src->m32[0] << shift) | (src->m32[1] >> (32 - shift));
		src->m32[1] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
		src->m32[2] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
		src->m32[3] = (src->m32[3] << shift);
		return;
	case 32 ... 63:
		shift -= 32;
		src->m32[0] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
		src->m32[1] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
		src->m32[2] = (src->m32[3] << shift);
		src->m32[3] = 0;
		return;
	case 64 ... 95:
		shift -= 64;
		src->m32[0] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
		src->m32[1] = (src->m32[3] << shift);
		src->m32[2] = src->m32[3] = 0;
		return;
	case 96 ... 127:
		shift -= 96;
		src->m32[0] = (src->m32[3] << shift);
		src->m32[1] = src->m32[2] = src->m32[3] = 0;
		return;
	case -31 ... -1:
		shift = -shift;
		sticky = 0;
		if (src->m32[3] << (32 - shift))
			sticky = 1;
		src->m32[3] = (src->m32[3] >> shift) | (src->m32[2] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift));
		src->m32[1] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
		src->m32[0] = (src->m32[0] >> shift);
		return;
	case -63 ... -32:
		shift = -shift - 32;
		sticky = 0;
		if ((src->m32[2] << (32 - shift)) || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
		src->m32[1] = (src->m32[0] >> shift);
		src->m32[0] = 0;
		return;
	case -95 ... -64:
		shift = -shift - 64;
		sticky = 0;
		if ((src->m32[1] << (32 - shift)) || src->m32[2] || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[0] >> shift);
		src->m32[1] = src->m32[0] = 0;
		return;
	case -127 ... -96:
		shift = -shift - 96;
		sticky = 0;
		if ((src->m32[0] << (32 - shift)) || src->m32[1] || src->m32[2] || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[0] >> shift) | sticky;
		src->m32[2] = src->m32[1] = src->m32[0] = 0;
		return;
	}

	if (shift < 0 && (src->m32[0] || src->m32[1] || src->m32[2] || src->m32[3]))
		src->m32[3] = 1;
	else
		src->m32[3] = 0;
	src->m32[2] = 0;
	src->m32[1] = 0;
	src->m32[0] = 0;
}
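
/* Added note (not in the original source): on right shifts, any nonzero
   bit that falls off the low end is OR'ed back into bit 0 as a "sticky"
   bit, so later rounding can still tell the result is inexact.  For
   example, shifting { 0, 0, 0, 5 } right by one gives
   m32[3] = (5 >> 1) | 1 = 3 rather than 2. */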

static inline void fp_putmant128(struct fp_ext *dest, union fp_mant128 *src,
				 int shift)
{
	/* ... */
}

#if 0 /* old code... */

static inline int fls(unsigned int a)
{
	int r;

	asm volatile ("bfffo %1{#0,#32},%0"
		      : "=d" (r) : "md" (a));
	return r;
}

/* fls = "find last set" (cf. ffs(3)) */
static inline int fls128(const int128 a)
{
	if (a[MSW128])
		return fls(a[MSW128]);
	if (a[NMSW128])
		return fls(a[NMSW128]) + 32;
	/* XXX: it probably never gets beyond this point in actual
	   use, but that's indicative of a more general problem in the
	   algorithm (i.e. as per the actual 68881 implementation, we
	   really only need at most 67 bits of precision [plus
	   overflow]) so I'm not going to fix it. */
	if (a[NLSW128])
		return fls(a[NLSW128]) + 64;
	if (a[LSW128])
		return fls(a[LSW128]) + 96;
	else
		return -1;
}
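
/* Added note (not in the original source): with bfffo's numbering, bit
   offset 0 is the MSB, so fls128() returns 0 when the top bit of
   a[MSW128] is set; e.g. for a[MSW128] = 0x00010000 it returns 15. */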

static inline int zerop128(const int128 a)
{
	return !(a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}

static inline int nonzerop128(const int128 a)
{
	return (a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}

/* Addition and subtraction */
/* Do these in "pure" assembly, because "extended" asm is unmanageable
   here */

static inline void add128(const int128 a, int128 b)
{
	/* rotating carry flags */
	unsigned int carry[2];

	carry[0] = a[LSW128] > (0xffffffff - b[LSW128]);
	b[LSW128] += a[LSW128];

	carry[1] = a[NLSW128] > (0xffffffff - b[NLSW128] - carry[0]);
	b[NLSW128] = a[NLSW128] + b[NLSW128] + carry[0];

	carry[0] = a[NMSW128] > (0xffffffff - b[NMSW128] - carry[1]);
	b[NMSW128] = a[NMSW128] + b[NMSW128] + carry[1];

	b[MSW128] = a[MSW128] + b[MSW128] + carry[0];
}

/* Note: assembler semantics: "b -= a" */
static inline void sub128(const int128 a, int128 b)
{
	/* rotating borrow flags */
	unsigned int borrow[2];

	borrow[0] = b[LSW128] < a[LSW128];
	b[LSW128] -= a[LSW128];

	borrow[1] = b[NLSW128] < a[NLSW128] + borrow[0];
	b[NLSW128] = b[NLSW128] - a[NLSW128] - borrow[0];

	borrow[0] = b[NMSW128] < a[NMSW128] + borrow[1];
	b[NMSW128] = b[NMSW128] - a[NMSW128] - borrow[1];

	b[MSW128] = b[MSW128] - a[MSW128] - borrow[0];
}
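
/* Usage sketch (added; not in the original source): the carry test in
   add128() uses the identity that unsigned x + y wraps around iff
   x > 0xffffffff - y. */
static inline void add128_example(void)
{
	int128 x, y;

	set128(0, 0, 0, 0xffffffff, x);
	set128(0, 0, 0, 1, y);
	add128(x, y);	/* y += x, carrying into the next word */
	put_i128(y);	/* prints: 00000000 00000000 00000001 00000000 */
}
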
/* Poor man's 64-bit expanding multiply */
static inline void mul64(unsigned long long a, unsigned long long b, int128 c)
{
	unsigned long long acc;
	int128 acc128;

	zero128(acc128);
	zero128(c);

	/* first the low words */
	if (LO_WORD(a) && LO_WORD(b)) {
		acc = (long long) LO_WORD(a) * LO_WORD(b);
		c[NLSW128] = HI_WORD(acc);
		c[LSW128] = LO_WORD(acc);
	}
	/* Next the high words */
	if (HI_WORD(a) && HI_WORD(b)) {
		acc = (long long) HI_WORD(a) * HI_WORD(b);
		c[MSW128] = HI_WORD(acc);
		c[NMSW128] = LO_WORD(acc);
	}
	/* The middle words */
	if (LO_WORD(a) && HI_WORD(b)) {
		acc = (long long) LO_WORD(a) * HI_WORD(b);
		acc128[NMSW128] = HI_WORD(acc);
		acc128[NLSW128] = LO_WORD(acc);
		add128(acc128, c);
	}
	/* The first and last words */
	if (HI_WORD(a) && LO_WORD(b)) {
		acc = (long long) HI_WORD(a) * LO_WORD(b);
		acc128[NMSW128] = HI_WORD(acc);
		acc128[NLSW128] = LO_WORD(acc);
		add128(acc128, c);
	}
}
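
/* Worked example (added; not in the original source): mul64() forms the
   four 32x32 partial products of the schoolbook method,
   a * b = (aH*bH << 64) + ((aH*bL + aL*bH) << 32) + aL*bL. */
static inline void mul64_example(void)
{
	int128 prod;

	/* (2^32 + 3) * (2^32 + 5) = 2^64 + 8*2^32 + 15 */
	mul64(0x100000003ULL, 0x100000005ULL, prod);
	put_i128(prod);	/* prints: 00000000 00000001 00000008 0000000f */
}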

/* Unsigned comparison: returns <0, 0, or >0 as a is less than, equal
   to, or greater than b */
static inline int cmp128(int128 a, int128 b)
{
	if (a[MSW128] < b[MSW128])
		return -1;
	if (a[MSW128] > b[MSW128])
		return 1;
	if (a[NMSW128] < b[NMSW128])
		return -1;
	if (a[NMSW128] > b[NMSW128])
		return 1;
	if (a[NLSW128] < b[NLSW128])
		return -1;
	if (a[NLSW128] > b[NLSW128])
		return 1;
	if (a[LSW128] < b[LSW128])
		return -1;
	if (a[LSW128] > b[LSW128])
		return 1;

	return 0;
}

static inline void div128(int128 a, int128 b, int128 c)
{
	int128 mask;

	/* Algorithm:

	   Shift the divisor up until it's at least as big as the
	   dividend, keeping track of the position to which we've
	   shifted it, i.e. the power of 2 by which we've multiplied it.

	   Then, for this power of 2 (the mask), and every one smaller
	   than it, subtract the raised divisor from the dividend and
	   add the mask to the quotient whenever the dividend is at
	   least as big as the raised divisor.  After each step, divide
	   the raised divisor and the mask by 2 (i.e. shift one place
	   to the right).  Lather, rinse, and repeat, until there are
	   no more powers of 2 left. */

	/* FIXME: needless to say, there's room for improvement here too. */

	/* Shift up */
	/* XXX: since it just has to be "at least as big", we can
	   probably eliminate this horribly wasteful loop.  I will
	   have to prove this first, though */
	set128(0, 0, 0, 1, mask);
	while (cmp128(b, a) < 0 && !btsthi128(b)) {
		lslone128(b);
		lslone128(mask);
	}

	/* Shift down */
	zero128(c);
	do {
		if (cmp128(a, b) >= 0) {
			sub128(b, a);
			add128(mask, c);
		}
		lsrone128(mask);
		lsrone128(b);
	} while (nonzerop128(mask));

	/* The remainder is in a... */
}
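
/* Usage sketch (added; not in the original source): div128() leaves the
   quotient in its third argument and the remainder in the (destroyed)
   dividend; the divisor is destroyed as well. */
static inline void div128_example(void)
{
	int128 a, b, q;

	set128(0, 0, 0, 100, a);	/* dividend */
	set128(0, 0, 0, 7, b);		/* divisor */
	div128(a, b, q);
	put_i128(q);	/* quotient:  00000000 00000000 00000000 0000000e */
	put_i128(a);	/* remainder: 00000000 00000000 00000000 00000002 */
}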

#endif /* old code */

#endif /* MULTI_ARITH_H */