~ubuntu-branches/ubuntu/quantal/flint/quantal : revision 1

1

/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.

2

3

4

2004, 2005 Free Software Foundation, Inc.

5

6

This file is free software; you can redistribute it and/or modify

7

it under the terms of the GNU Lesser General Public License as published by

8

the Free Software Foundation; either version 2.1 of the License, or (at your

9

option) any later version.

10

11

This file is distributed in the hope that it will be useful, but

12

WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY

13

or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public

14

License for more details.

15

16

You should have received a copy of the GNU Lesser General Public License

17

along with this file; see the file COPYING.LIB. If not, write to

18

the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,

19

MA 02110-1301, USA. */

20

21

/* You have to define the following before including this file:

22

23

UWtype -- An unsigned type, default type for operations (typically a "word")

24

UHWtype -- An unsigned type, at least half the size of UWtype.

25

UDWtype -- An unsigned type, at least twice as large a UWtype

26

W_TYPE_SIZE -- size in bits of UWtype

27

28

SItype, USItype -- Signed and unsigned 32 bit types.

29

DItype, UDItype -- Signed and unsigned 64 bit types.

30

31

On a 32 bit machine UWtype should typically be USItype;

32

on a 64 bit machine, UWtype should typically be UDItype.

33

*/

34

35

#define __BITS4 (W_TYPE_SIZE / 4)

36

#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))

37

#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))

38

#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))

39

40

/* This is used to make sure no undesirable sharing between different libraries

41

that use this file takes place. */

42

#ifndef __MPN

43

#define __MPN(x) __##x

44

#endif

45

46

#ifndef _PROTO

47

#if (__STDC__-0) || defined (__cplusplus)

48

#define _PROTO(x) x

49

#else

50

#define _PROTO(x) ()

51

#endif

52

#endif

53

54

/* Define auxiliary asm macros.

55

56

1) umul_ppmm(high_prod, low_prod, multipler, multiplicand) multiplies two

57

UWtype integers MULTIPLER and MULTIPLICAND, and generates a two UWtype

58

word product in HIGH_PROD and LOW_PROD.

59

60

2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a

61

UDWtype product. This is just a variant of umul_ppmm.

62

63

3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,

64

denominator) divides a UDWtype, composed by the UWtype integers

65

HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient

66

in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less

67

than DENOMINATOR for correct operation. If, in addition, the most

68

significant bit of DENOMINATOR must be 1, then the pre-processor symbol

69

UDIV_NEEDS_NORMALIZATION is defined to 1.

70

71

4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,

72

denominator). Like udiv_qrnnd but the numbers are signed. The quotient

73

is rounded towards 0.

74

75

5) count_leading_zeros(count, x) counts the number of zero-bits from the

76

msb to the first non-zero bit in the UWtype X. This is the number of

77

steps X needs to be shifted left to set the msb. Undefined for X == 0,

78

unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.

79

80

6) count_trailing_zeros(count, x) like count_leading_zeros, but counts

81

from the least significant end.

82

83

7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,

84

high_addend_2, low_addend_2) adds two UWtype integers, composed by

85

HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2

86

respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow

87

(i.e. carry out) is not stored anywhere, and is lost.

88

89

8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,

90

high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,

91

composed by HIGH_MINUEND_1 and LOW_MINUEND_1, and HIGH_SUBTRAHEND_2 and

92

LOW_SUBTRAHEND_2 respectively. The result is placed in HIGH_DIFFERENCE

93

and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere,

94

and is lost.

95

96

If any of these macros are left undefined for a particular CPU,

97

C macros are used.

98

99

100

Notes:

101

102

For add_ssaaaa the two high and two low addends can both commute, but

103

unfortunately gcc only supports one "%" commutative in each asm block.

104

This has always been so but is only documented in recent versions

105

(eg. pre-release 3.3). Having two or more "%"s can cause an internal

106

compiler error in certain rare circumstances.

107

108

Apparently it was only the last "%" that was ever actually respected, so

109

the code has been updated to leave just that. Clearly there's a free

110

choice whether high or low should get it, if there's a reason to favour

111

one over the other. Also obviously when the constraints on the two

112

operands are identical there's no benefit to the reloader in any "%" at

113

all.

114

115

*/

116

117

/* The CPUs come in alphabetical order below.

118

119

Please add support for more CPUs here, or improve the current support

120

for the CPUs below! */

121

122

123

/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc

124

3.4 __builtin_clzl or __builtin_clzll, according to our limb size.

125

Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or

126

__builtin_ctzll.

127

128

These builtins are only used when we check what code comes out, on some

129

chips they're merely libgcc calls, where we will instead want an inline

130

in that case (either asm or generic C).

131

132

These builtins are better than an asm block of the same insn, since an

133

asm block doesn't give gcc any information about scheduling or resource

134

usage. We keep an asm block for use on prior versions of gcc though.

135

136

For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but

137

it's not used (for count_leading_zeros) because it generally gives extra

138

code to ensure the result is 0 when the input is 0, which we don't need

139

or want. */

140

141

#ifdef _LONG_LONG_LIMB

142

#define count_leading_zeros_gcc_clz(count,x) \

143

do { \

144

ASSERT ((x) != 0); \

145

(count) = __builtin_clzll (x); \

146

} while (0)

147

#else

148

#define count_leading_zeros_gcc_clz(count,x) \

149

do { \

150

ASSERT ((x) != 0); \

151

(count) = __builtin_clzl (x); \

152

} while (0)

153

#endif

154

155

#ifdef _LONG_LONG_LIMB

156

#define count_trailing_zeros_gcc_ctz(count,x) \

157

do { \

158

ASSERT ((x) != 0); \

159

(count) = __builtin_ctzll (x); \

160

} while (0)

161

#else

162

#define count_trailing_zeros_gcc_ctz(count,x) \

163

do { \

164

ASSERT ((x) != 0); \

165

(count) = __builtin_ctzl (x); \

166

} while (0)

167

#endif

168

169

170

/* FIXME: The macros using external routines like __MPN(count_leading_zeros)

171

don't need to be under !NO_ASM */

172

#if ! defined (NO_ASM)

173

174

#if defined (__alpha) && W_TYPE_SIZE == 64

175

/* Most alpha-based machines, except Cray systems. */

176

#if defined (__GNUC__)

177

#define umul_ppmm(ph, pl, m0, m1) \

178

do { \

179

UDItype __m0 = (m0), __m1 = (m1); \

180

__asm__ ("umulh %r1,%2,%0" \

181

: "=r" (ph) \

182

: "%rJ" (m0), "rI" (m1)); \

183

(pl) = __m0 * __m1; \

184

} while (0)

185

#define UMUL_TIME 18

186

#else /* ! __GNUC__ */

187

#include <machine/builtins.h>

188

#define umul_ppmm(ph, pl, m0, m1) \

189

do { \

190

UDItype __m0 = (m0), __m1 = (m1); \

191

(ph) = __UMULH (m0, m1); \

192

(pl) = __m0 * __m1; \

193

} while (0)

194

#endif

195

#ifndef LONGLONG_STANDALONE

196

#define udiv_qrnnd(q, r, n1, n0, d) \

197

do { UWtype __di; \

198

__di = __MPN(invert_limb) (d); \

199

udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \

200

} while (0)

201

#define UDIV_PREINV_ALWAYS 1

202

#define UDIV_NEEDS_NORMALIZATION 1

203

#define UDIV_TIME 220

204

#endif /* LONGLONG_STANDALONE */

205

206

/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm

207

always goes into libgmp.so, even when not actually used. */

208

#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

209

210

#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX

211

#define count_leading_zeros(COUNT,X) \

212

__asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))

213

#define count_trailing_zeros(COUNT,X) \

214

__asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))

215

#endif /* clz/ctz using cix */

216

217

#if ! defined (count_leading_zeros) \

218

&& defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)

219

/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.

220

"$31" is written explicitly in the asm, since an "r" constraint won't

221

select reg 31. There seems no need to worry about "r31" syntax for cray,

222

since gcc itself (pre-release 3.4) emits just $31 in various places. */

223

#define ALPHA_CMPBGE_0(dst, src) \

224

do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)

225

/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts

226

them, locating the highest non-zero byte. A second __clz_tab lookup

227

counts the leading zero bits in that byte, giving the result. */

228

#define count_leading_zeros(count, x) \

229

do { \

230

UWtype __clz__b, __clz__c, __clz__x = (x); \

231

ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \

232

__clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \

233

__clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \

234

__clz__x >>= __clz__b; \

235

__clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \

236

__clz__b = 65 - __clz__b; \

237

(count) = __clz__b - __clz__c; \

238

} while (0)

239

#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

240

#endif /* clz using cmpbge */

241

242

#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)

243

#if HAVE_ATTRIBUTE_CONST

244

long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const));

245

#else

246

long __MPN(count_leading_zeros) _PROTO ((UDItype));

247

#endif

248

#define count_leading_zeros(count, x) \

249

((count) = __MPN(count_leading_zeros) (x))

250

#endif /* clz using mpn */

251

#endif /* __alpha */

252

253

#if defined (_CRAY) && W_TYPE_SIZE == 64

254

#include <intrinsics.h>

255

#define UDIV_PREINV_ALWAYS 1

256

#define UDIV_NEEDS_NORMALIZATION 1

257

#define UDIV_TIME 220

258

long __MPN(count_leading_zeros) _PROTO ((UDItype));

259

#define count_leading_zeros(count, x) \

260

((count) = _leadz ((UWtype) (x)))

261

#if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */

262

#define umul_ppmm(ph, pl, m0, m1) \

263

do { \

264

UDItype __m0 = (m0), __m1 = (m1); \

265

(ph) = _int_mult_upper (m0, m1); \

266

(pl) = __m0 * __m1; \

267

} while (0)

268

#ifndef LONGLONG_STANDALONE

269

#define udiv_qrnnd(q, r, n1, n0, d) \

270

do { UWtype __di; \

271

__di = __MPN(invert_limb) (d); \

272

udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \

273

} while (0)

274

#endif /* LONGLONG_STANDALONE */

275

#endif /* _CRAYIEEE */

276

#endif /* _CRAY */

277

278

#if defined (__ia64) && W_TYPE_SIZE == 64

279

/* This form encourages gcc (pre-release 3.4 at least) to emit predicated

280

"sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic

281

code using "al<bl" arithmetically comes out making an actual 0 or 1 in a

282

register, which takes an extra cycle. */

283

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

284

do { \

285

UWtype __x; \

286

__x = (al) - (bl); \

287

if ((al) < (bl)) \

288

(sh) = (ah) - (bh) - 1; \

289

else \

290

(sh) = (ah) - (bh); \

291

(sl) = __x; \

292

} while (0)

293

#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)

294

/* Do both product parts in assembly, since that gives better code with

295

all gcc versions. Some callers will just use the upper part, and in

296

that situation we waste an instruction, but not any cycles. */

297

#define umul_ppmm(ph, pl, m0, m1) \

298

__asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \

299

: "=&f" (ph), "=f" (pl) \

300

: "f" (m0), "f" (m1))

301

#define UMUL_TIME 14

302

#define count_leading_zeros(count, x) \

303

do { \

304

UWtype _x = (x), _y, _a, _c; \

305

__asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \

306

__asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \

307

_c = (_a - 1) << 3; \

308

_x >>= _c; \

309

if (_x >= 1 << 4) \

310

_x >>= 4, _c += 4; \

311

if (_x >= 1 << 2) \

312

_x >>= 2, _c += 2; \

313

_c += _x >> 1; \

314

(count) = W_TYPE_SIZE - 1 - _c; \

315

} while (0)

316

/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1

317

based, and we don't need a special case for x==0 here */

318

#define count_trailing_zeros(count, x) \

319

do { \

320

UWtype __ctz_x = (x); \

321

__asm__ ("popcnt %0 = %1" \

322

: "=r" (count) \

323

: "r" ((__ctz_x-1) & ~__ctz_x)); \

324

} while (0)

325

#endif

326

#if defined (__INTEL_COMPILER)

327

#include <ia64intrin.h>

328

#define umul_ppmm(ph, pl, m0, m1) \

329

do { \

330

UWtype _m0 = (m0), _m1 = (m1); \

331

ph = _m64_xmahu (_m0, _m1, 0); \

332

pl = _m0 * _m1; \

333

} while (0)

334

#endif

335

#ifndef LONGLONG_STANDALONE

336

#define udiv_qrnnd(q, r, n1, n0, d) \

337

do { UWtype __di; \

338

__di = __MPN(invert_limb) (d); \

339

udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \

340

} while (0)

341

#define UDIV_PREINV_ALWAYS 1

342

#define UDIV_NEEDS_NORMALIZATION 1

343

#endif

344

#define UDIV_TIME 220

345

#endif

346

347

348

#if defined (__GNUC__)

349

350

/* We sometimes need to clobber "cc" with gcc2, but that would not be

351

understood by gcc1. Use cpp to avoid major code duplication. */

352

#if __GNUC__ < 2

353

#define __CLOBBER_CC

354

#define __AND_CLOBBER_CC

355

#else /* __GNUC__ >= 2 */

356

#define __CLOBBER_CC : "cc"

357

#define __AND_CLOBBER_CC , "cc"

358

#endif /* __GNUC__ < 2 */

359

360

#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32

361

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

362

__asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \

363

: "=r" (sh), "=&r" (sl) \

364

: "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))

365

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

366

__asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \

367

: "=r" (sh), "=&r" (sl) \

368

: "r" (ah), "rI" (bh), "r" (al), "rI" (bl))

369

#define umul_ppmm(xh, xl, m0, m1) \

370

do { \

371

USItype __m0 = (m0), __m1 = (m1); \

372

__asm__ ("multiplu %0,%1,%2" \

373

: "=r" (xl) \

374

: "r" (__m0), "r" (__m1)); \

375

__asm__ ("multmu %0,%1,%2" \

376

: "=r" (xh) \

377

: "r" (__m0), "r" (__m1)); \

378

} while (0)

379

#define udiv_qrnnd(q, r, n1, n0, d) \

380

__asm__ ("dividu %0,%3,%4" \

381

: "=r" (q), "=q" (r) \

382

: "1" (n1), "r" (n0), "r" (d))

383

#define count_leading_zeros(count, x) \

384

__asm__ ("clz %0,%1" \

385

: "=r" (count) \

386

: "r" (x))

387

#define COUNT_LEADING_ZEROS_0 32

388

#endif /* __a29k__ */

389

390

#if defined (__arc__)

391

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

392

__asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \

393

: "=r" (sh), \

394

"=&r" (sl) \

395

: "r" ((USItype) (ah)), \

396

"rIJ" ((USItype) (bh)), \

397

"%r" ((USItype) (al)), \

398

"rIJ" ((USItype) (bl)))

399

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

400

__asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \

401

: "=r" (sh), \

402

"=&r" (sl) \

403

: "r" ((USItype) (ah)), \

404

"rIJ" ((USItype) (bh)), \

405

"r" ((USItype) (al)), \

406

"rIJ" ((USItype) (bl)))

407

#endif

408

409

#if defined (__arm__) && W_TYPE_SIZE == 32

410

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

411

__asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \

412

: "=r" (sh), "=&r" (sl) \

413

: "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)

414

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

415

do { \

416

if (__builtin_constant_p (al)) \

417

{ \

418

if (__builtin_constant_p (ah)) \

419

__asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \

420

: "=r" (sh), "=&r" (sl) \

421

: "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \

422

else \

423

__asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \

424

: "=r" (sh), "=&r" (sl) \

425

: "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \

426

} \

427

else if (__builtin_constant_p (ah)) \

428

{ \

429

if (__builtin_constant_p (bl)) \

430

__asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \

431

: "=r" (sh), "=&r" (sl) \

432

: "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \

433

else \

434

__asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \

435

: "=r" (sh), "=&r" (sl) \

436

: "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \

437

} \

438

else if (__builtin_constant_p (bl)) \

439

{ \

440

if (__builtin_constant_p (bh)) \

441

__asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \

442

: "=r" (sh), "=&r" (sl) \

443

: "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \

444

else \

445

__asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \

446

: "=r" (sh), "=&r" (sl) \

447

: "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \

448

} \

449

else /* only bh might be a constant */ \

450

__asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \

451

: "=r" (sh), "=&r" (sl) \

452

: "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\

453

} while (0)

454

#if 1 || defined (__arm_m__) /* `M' series has widening multiply support */

455

#define umul_ppmm(xh, xl, a, b) \

456

__asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))

457

#define UMUL_TIME 5

458

#define smul_ppmm(xh, xl, a, b) \

459

__asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))

460

#ifndef LONGLONG_STANDALONE

461

#define udiv_qrnnd(q, r, n1, n0, d) \

462

do { UWtype __di; \

463

__di = __MPN(invert_limb) (d); \

464

udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \

465

} while (0)

466

#define UDIV_PREINV_ALWAYS 1

467

#define UDIV_NEEDS_NORMALIZATION 1

468

#define UDIV_TIME 70

469

#endif /* LONGLONG_STANDALONE */

470

#else

471

#define umul_ppmm(xh, xl, a, b) \

472

__asm__ ("%@ Inlined umul_ppmm\n" \

473

" mov %|r0, %2, lsr #16\n" \

474

" mov %|r2, %3, lsr #16\n" \

475

" bic %|r1, %2, %|r0, lsl #16\n" \

476

" bic %|r2, %3, %|r2, lsl #16\n" \

477

" mul %1, %|r1, %|r2\n" \

478

" mul %|r2, %|r0, %|r2\n" \

479

" mul %|r1, %0, %|r1\n" \

480

" mul %0, %|r0, %0\n" \

481

" adds %|r1, %|r2, %|r1\n" \

482

" addcs %0, %0, #65536\n" \

483

" adds %1, %1, %|r1, lsl #16\n" \

484

" adc %0, %0, %|r1, lsr #16" \

485

: "=&r" (xh), "=r" (xl) \

486

: "r" (a), "r" (b) \

487

: "r0", "r1", "r2")

488

#define UMUL_TIME 20

489

#ifndef LONGLONG_STANDALONE

490

#define udiv_qrnnd(q, r, n1, n0, d) \

491

do { UWtype __r; \

492

(q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \

493

(r) = __r; \

494

} while (0)

495

extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));

496

#define UDIV_TIME 200

497

#endif /* LONGLONG_STANDALONE */

498

#endif

499

#endif /* __arm__ */

500

501

#if defined (__clipper__) && W_TYPE_SIZE == 32

502

#define umul_ppmm(w1, w0, u, v) \

503

({union {UDItype __ll; \

504

struct {USItype __l, __h;} __i; \

505

} __x; \

506

__asm__ ("mulwux %2,%0" \

507

: "=r" (__x.__ll) \

508

: "%0" ((USItype)(u)), "r" ((USItype)(v))); \

509

(w1) = __x.__i.__h; (w0) = __x.__i.__l;})

510

#define smul_ppmm(w1, w0, u, v) \

511

({union {DItype __ll; \

512

struct {SItype __l, __h;} __i; \

513

} __x; \

514

__asm__ ("mulwx %2,%0" \

515

: "=r" (__x.__ll) \

516

: "%0" ((SItype)(u)), "r" ((SItype)(v))); \

517

(w1) = __x.__i.__h; (w0) = __x.__i.__l;})

518

#define __umulsidi3(u, v) \

519

({UDItype __w; \

520

__asm__ ("mulwux %2,%0" \

521

: "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \

522

__w; })

523

#endif /* __clipper__ */

524

525

/* Fujitsu vector computers. */

526

#if defined (__uxp__) && W_TYPE_SIZE == 32

527

#define umul_ppmm(ph, pl, u, v) \

528

do { \

529

union {UDItype __ll; \

530

struct {USItype __h, __l;} __i; \

531

} __x; \

532

__asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\

533

(ph) = __x.__i.__h; \

534

(pl) = __x.__i.__l; \

535

} while (0)

536

#define smul_ppmm(ph, pl, u, v) \

537

do { \

538

union {UDItype __ll; \

539

struct {USItype __h, __l;} __i; \

540

} __x; \

541

__asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \

542

(ph) = __x.__i.__h; \

543

(pl) = __x.__i.__l; \

544

} while (0)

545

#endif

546

547

#if defined (__gmicro__) && W_TYPE_SIZE == 32

548

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

549

__asm__ ("add.w %5,%1\n\taddx %3,%0" \

550

: "=g" (sh), "=&g" (sl) \

551

: "0" ((USItype)(ah)), "g" ((USItype)(bh)), \

552

"%1" ((USItype)(al)), "g" ((USItype)(bl)))

553

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

554

__asm__ ("sub.w %5,%1\n\tsubx %3,%0" \

555

: "=g" (sh), "=&g" (sl) \

556

: "0" ((USItype)(ah)), "g" ((USItype)(bh)), \

557

"1" ((USItype)(al)), "g" ((USItype)(bl)))

558

#define umul_ppmm(ph, pl, m0, m1) \

559

__asm__ ("mulx %3,%0,%1" \

560

: "=g" (ph), "=r" (pl) \

561

: "%0" ((USItype)(m0)), "g" ((USItype)(m1)))

562

#define udiv_qrnnd(q, r, nh, nl, d) \

563

__asm__ ("divx %4,%0,%1" \

564

: "=g" (q), "=r" (r) \

565

: "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))

566

#define count_leading_zeros(count, x) \

567

__asm__ ("bsch/1 %1,%0" \

568

: "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))

569

#endif

570

571

#if defined (__hppa) && W_TYPE_SIZE == 32

572

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

573

__asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \

574

: "=r" (sh), "=&r" (sl) \

575

: "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))

576

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

577

__asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \

578

: "=r" (sh), "=&r" (sl) \

579

: "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))

580

#if defined (_PA_RISC1_1)

581

#define umul_ppmm(wh, wl, u, v) \

582

do { \

583

union {UDItype __ll; \

584

struct {USItype __h, __l;} __i; \

585

} __x; \

586

__asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \

587

(wh) = __x.__i.__h; \

588

(wl) = __x.__i.__l; \

589

} while (0)

590

#define UMUL_TIME 8

591

#define UDIV_TIME 60

592

#else

593

#define UMUL_TIME 40

594

#define UDIV_TIME 80

595

#endif

596

#define count_leading_zeros(count, x) \

597

do { \

598

USItype __tmp; \

599

__asm__ ( \

600

"ldi 1,%0\n" \

601

" extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \

602

" extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \

603

" ldo 16(%0),%0 ; Yes. Perform add.\n" \

604

" extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \

605

" extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \

606

" ldo 8(%0),%0 ; Yes. Perform add.\n" \

607

" extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \

608

" extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \

609

" ldo 4(%0),%0 ; Yes. Perform add.\n" \

610

" extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \

611

" extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \

612

" ldo 2(%0),%0 ; Yes. Perform add.\n" \

613

" extru %1,30,1,%1 ; Extract bit 1.\n" \

614

" sub %0,%1,%0 ; Subtract it.\n" \

615

: "=r" (count), "=r" (__tmp) : "1" (x)); \

616

} while (0)

617

#endif /* hppa */

618

619

/* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC

620

(3.2) puts longlong into two adjacent 32-bit registers. Presumably this

621

is just a case of no direct support for 2.0n but treating it like 1.0. */

622

#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)

623

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

624

__asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \

625

: "=r" (sh), "=&r" (sl) \

626

: "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))

627

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

628

__asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \

629

: "=r" (sh), "=&r" (sl) \

630

: "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))

631

#endif /* hppa */

632

633

#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32

634

#define smul_ppmm(xh, xl, m0, m1) \

635

do { \

636

union {DItype __ll; \

637

struct {USItype __h, __l;} __i; \

638

} __x; \

639

__asm__ ("lr %N0,%1\n\tmr %0,%2" \

640

: "=&r" (__x.__ll) \

641

: "r" (m0), "r" (m1)); \

642

(xh) = __x.__i.__h; (xl) = __x.__i.__l; \

643

} while (0)

644

#define sdiv_qrnnd(q, r, n1, n0, d) \

645

do { \

646

union {DItype __ll; \

647

struct {USItype __h, __l;} __i; \

648

} __x; \

649

__x.__i.__h = n1; __x.__i.__l = n0; \

650

__asm__ ("dr %0,%2" \

651

: "=r" (__x.__ll) \

652

: "0" (__x.__ll), "r" (d)); \

653

(q) = __x.__i.__l; (r) = __x.__i.__h; \

654

} while (0)

655

#endif

656

657

#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32

658

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

659

__asm__ ("addl %5,%k1\n\tadcl %3,%k0" \

660

: "=r" (sh), "=&r" (sl) \

661

: "0" ((USItype)(ah)), "g" ((USItype)(bh)), \

662

"%1" ((USItype)(al)), "g" ((USItype)(bl)))

663

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

664

__asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \

665

: "=r" (sh), "=&r" (sl) \

666

: "0" ((USItype)(ah)), "g" ((USItype)(bh)), \

667

"1" ((USItype)(al)), "g" ((USItype)(bl)))

668

#define umul_ppmm(w1, w0, u, v) \

669

__asm__ ("mull %3" \

670

: "=a" (w0), "=d" (w1) \

671

: "%0" ((USItype)(u)), "rm" ((USItype)(v)))

672

#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\

673

__asm__ ("divl %4" /* stringification in K&R C */ \

674

: "=a" (q), "=d" (r) \

675

: "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))

676

677

#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx

678

/* Pentium bsrl takes between 10 and 72 cycles depending where the most

679

significant 1 bit is, hence the use of the following alternatives. bsfl

680

is slow too, between 18 and 42 depending where the least significant 1

681

bit is, so let the generic count_trailing_zeros below make use of the

682

count_leading_zeros here too. */

683

684

#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)

685

/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1

686

cache miss reading from __clz_tab. For P55 it's favoured over the float

687

below so as to avoid mixing MMX and x87, since the penalty for switching

688

between the two is about 100 cycles.

689

690

The asm block sets __shift to -3 if the high 24 bits are clear, -2 for

691

16, -1 for 8, or 0 otherwise. This could be written equivalently as

692

follows, but as of gcc 2.95.2 it results in conditional jumps.

693

694

__shift = -(__n < 0x1000000);

695

__shift -= (__n < 0x10000);

696

__shift -= (__n < 0x100);

697

698

The middle two sbbl and cmpl's pair, and with luck something gcc

699

generates might pair with the first cmpl and the last sbbl. The "32+1"

700

constant could be folded into __clz_tab[], but it doesn't seem worth

701

making a different table just for that. */

702

703

#define count_leading_zeros(c,n) \

704

do { \

705

USItype __n = (n); \

706

USItype __shift; \

707

__asm__ ("cmpl $0x1000000, %1\n" \

708

"sbbl %0, %0\n" \

709

"cmpl $0x10000, %1\n" \

710

"sbbl $0, %0\n" \

711

"cmpl $0x100, %1\n" \

712

"sbbl $0, %0\n" \

713

: "=&r" (__shift) : "r" (__n)); \

714

__shift = __shift*8 + 24 + 1; \

715

(c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \

716

} while (0)

717

#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

718

#define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */

719

720

#else /* ! pentiummmx || LONGLONG_STANDALONE */

721

/* The following should be a fixed 14 cycles or so. Some scheduling

722

opportunities should be available between the float load/store too. This

723

sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is

724

apparently suggested by the Intel optimizing manual (don't know exactly

725

where). gcc 2.95 or up will be best for this, so the "double" is

726

correctly aligned on the stack. */

727

#define count_leading_zeros(c,n) \

728

do { \

729

union { \

730

double d; \

731

unsigned a[2]; \

732

} __u; \

733

ASSERT ((n) != 0); \

734

__u.d = (UWtype) (n); \

735

(c) = 0x3FF + 31 - (__u.a[1] >> 20); \

736

} while (0)

737

#define COUNT_LEADING_ZEROS_0 (0x3FF + 31)

738

#endif /* pentiummx */

739

740

#else /* ! pentium */

741

742

#if __GMP_GNUC_PREREQ (3,4) /* using bsrl */

743

#define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x)

744

#endif /* gcc clz */

745

746

/* On P6, gcc prior to 3.0 generates a partial register stall for

747

__cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former

748

being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the

749

cost of one extra instruction. Do this for "i386" too, since that means

750

generic x86. */

751

#if ! defined (count_leading_zeros) && __GNUC__ < 3 \

752

&& (HAVE_HOST_CPU_i386 \

753

|| HAVE_HOST_CPU_i686 \

754

|| HAVE_HOST_CPU_pentiumpro \

755

|| HAVE_HOST_CPU_pentium2 \

756

|| HAVE_HOST_CPU_pentium3)

757

#define count_leading_zeros(count, x) \

758

do { \

759

USItype __cbtmp; \

760

ASSERT ((x) != 0); \

761

__asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \

762

(count) = 31 - __cbtmp; \

763

} while (0)

764

#endif /* gcc<3 asm bsrl */

765

766

#ifndef count_leading_zeros

767

#define count_leading_zeros(count, x) \

768

do { \

769

USItype __cbtmp; \

770

ASSERT ((x) != 0); \

771

__asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \

772

(count) = __cbtmp ^ 31; \

773

} while (0)

774

#endif /* asm bsrl */

775

776

#if __GMP_GNUC_PREREQ (3,4) /* using bsfl */

777

#define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x)

778

#endif /* gcc ctz */

779

780

#ifndef count_trailing_zeros

781

#define count_trailing_zeros(count, x) \

782

do { \

783

ASSERT ((x) != 0); \

784

__asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x))); \

785

} while (0)

786

#endif /* asm bsfl */

787

788

#endif /* ! pentium */

789

790

#ifndef UMUL_TIME

791

#define UMUL_TIME 10

792

#endif

793

#ifndef UDIV_TIME

794

#define UDIV_TIME 40

795

#endif

796

#endif /* 80x86 */

797

798

#if defined (__amd64__) && W_TYPE_SIZE == 64

799

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

800

__asm__ ("addq %5,%q1\n\tadcq %3,%q0" \

801

: "=r" (sh), "=&r" (sl) \

802

: "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \

803

"%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))

804

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

805

__asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \

806

: "=r" (sh), "=&r" (sl) \

807

: "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \

808

"1" ((UDItype)(al)), "rme" ((UDItype)(bl)))

809

#define umul_ppmm(w1, w0, u, v) \

810

__asm__ ("mulq %3" \

811

: "=a" (w0), "=d" (w1) \

812

: "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))

813

#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\

814

__asm__ ("divq %4" /* stringification in K&R C */ \

815

: "=a" (q), "=d" (r) \

816

: "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))

817

/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */

818

#define count_leading_zeros(count, x) \

819

do { \

820

UDItype __cbtmp; \

821

ASSERT ((x) != 0); \

822

__asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \

823

(count) = __cbtmp ^ 63; \

824

} while (0)

825

/* bsfq destination must be a 64-bit register, "%q0" forces this in case

826

count is only an int. */

827

#define count_trailing_zeros(count, x) \

828

do { \

829

ASSERT ((x) != 0); \

830

__asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \

831

} while (0)

832

#endif /* x86_64 */

833

834

#if defined (__i860__) && W_TYPE_SIZE == 32

835

#define rshift_rhlc(r,h,l,c) \

836

__asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \

837

"=r" (r) : "r" (h), "r" (l), "rn" (c))

838

#endif /* i860 */

839

840

#if defined (__i960__) && W_TYPE_SIZE == 32

841

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

842

__asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \

843

: "=r" (sh), "=&r" (sl) \

844

: "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))

845

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

846

__asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \

847

: "=r" (sh), "=&r" (sl) \

848

: "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))

849

#define umul_ppmm(w1, w0, u, v) \

850

({union {UDItype __ll; \

851

struct {USItype __l, __h;} __i; \

852

} __x; \

853

__asm__ ("emul %2,%1,%0" \

854

: "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \

855

(w1) = __x.__i.__h; (w0) = __x.__i.__l;})

856

#define __umulsidi3(u, v) \

857

({UDItype __w; \

858

__asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \

859

__w; })

860

#define udiv_qrnnd(q, r, nh, nl, d) \

861

do { \

862

union {UDItype __ll; \

863

struct {USItype __l, __h;} __i; \

864

} __nn; \

865

__nn.__i.__h = (nh); __nn.__i.__l = (nl); \

866

__asm__ ("ediv %d,%n,%0" \

867

: "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \

868

(r) = __rq.__i.__l; (q) = __rq.__i.__h; \

869

} while (0)

870

#define count_leading_zeros(count, x) \

871

do { \

872

USItype __cbtmp; \

873

__asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \

874

(count) = __cbtmp ^ 31; \

875

} while (0)

876

#define COUNT_LEADING_ZEROS_0 (-32) /* sic */

877

#if defined (__i960mx) /* what is the proper symbol to test??? */

878

#define rshift_rhlc(r,h,l,c) \

879

do { \

880

union {UDItype __ll; \

881

struct {USItype __l, __h;} __i; \

882

} __nn; \

883

__nn.__i.__h = (h); __nn.__i.__l = (l); \

884

__asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \

885

}

886

#endif /* i960mx */

887

#endif /* i960 */

888

889

#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \

890

|| defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \

891

|| defined (__mc5307__)) && W_TYPE_SIZE == 32

892

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

893

__asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \

894

: "=d" (sh), "=&d" (sl) \

895

: "0" ((USItype)(ah)), "d" ((USItype)(bh)), \

896

"%1" ((USItype)(al)), "g" ((USItype)(bl)))

897

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

898

__asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \

899

: "=d" (sh), "=&d" (sl) \

900

: "0" ((USItype)(ah)), "d" ((USItype)(bh)), \

901

"1" ((USItype)(al)), "g" ((USItype)(bl)))

902

/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */

903

#if defined (__mc68020__) || defined(mc68020) \

904

|| defined (__mc68030__) || defined (mc68030) \

905

|| defined (__mc68040__) || defined (mc68040) \

906

|| defined (__mcpu32__) || defined (mcpu32) \

907

|| defined (__NeXT__)

908

#define umul_ppmm(w1, w0, u, v) \

909

__asm__ ("mulu%.l %3,%1:%0" \

910

: "=d" (w0), "=d" (w1) \

911

: "%0" ((USItype)(u)), "dmi" ((USItype)(v)))

912

#define UMUL_TIME 45

913

#define udiv_qrnnd(q, r, n1, n0, d) \

914

__asm__ ("divu%.l %4,%1:%0" \

915

: "=d" (q), "=d" (r) \

916

: "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))

917

#define UDIV_TIME 90

918

#define sdiv_qrnnd(q, r, n1, n0, d) \

919

__asm__ ("divs%.l %4,%1:%0" \

920

: "=d" (q), "=d" (r) \

921

: "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))

922

#else /* for other 68k family members use 16x16->32 multiplication */

923

#define umul_ppmm(xh, xl, a, b) \

924

do { USItype __umul_tmp1, __umul_tmp2; \

925

__asm__ ("| Inlined umul_ppmm\n" \

926

" move%.l %5,%3\n" \

927

" move%.l %2,%0\n" \

928

" move%.w %3,%1\n" \

929

" swap %3\n" \

930

" swap %0\n" \

931

" mulu%.w %2,%1\n" \

932

" mulu%.w %3,%0\n" \

933

" mulu%.w %2,%3\n" \

934

" swap %2\n" \

935

" mulu%.w %5,%2\n" \

936

" add%.l %3,%2\n" \

937

" jcc 1f\n" \

938

" add%.l %#0x10000,%0\n" \

939

"1: move%.l %2,%3\n" \

940

" clr%.w %2\n" \

941

" swap %2\n" \

942

" swap %3\n" \

943

" clr%.w %3\n" \

944

" add%.l %3,%1\n" \

945

" addx%.l %2,%0\n" \

946

" | End inlined umul_ppmm" \

947

: "=&d" (xh), "=&d" (xl), \

948

"=d" (__umul_tmp1), "=&d" (__umul_tmp2) \

949

: "%2" ((USItype)(a)), "d" ((USItype)(b))); \

950

} while (0)

951

#define UMUL_TIME 100

952

#define UDIV_TIME 400

953

#endif /* not mc68020 */

954

/* The '020, '030, '040 and '060 have bitfield insns.

955

GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to

956

exclude bfffo on that chip (bitfield insns not available). */

957

#if (defined (__mc68020__) || defined (mc68020) \

958

|| defined (__mc68030__) || defined (mc68030) \

959

|| defined (__mc68040__) || defined (mc68040) \

960

|| defined (__mc68060__) || defined (mc68060) \

961

|| defined (__NeXT__)) \

962

&& ! defined (__mcpu32__)

963

#define count_leading_zeros(count, x) \

964

__asm__ ("bfffo %1{%b2:%b2},%0" \

965

: "=d" (count) \

966

: "od" ((USItype) (x)), "n" (0))

967

#define COUNT_LEADING_ZEROS_0 32

968

#endif

969

#endif /* mc68000 */

970

971

#if defined (__m88000__) && W_TYPE_SIZE == 32

972

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

973

__asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \

974

: "=r" (sh), "=&r" (sl) \

975

: "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))

976

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

977

__asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \

978

: "=r" (sh), "=&r" (sl) \

979

: "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))

980

#define count_leading_zeros(count, x) \

981

do { \

982

USItype __cbtmp; \

983

__asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \

984

(count) = __cbtmp ^ 31; \

985

} while (0)

986

#define COUNT_LEADING_ZEROS_0 63 /* sic */

987

#if defined (__m88110__)

988

#define umul_ppmm(wh, wl, u, v) \

989

do { \

990

union {UDItype __ll; \

991

struct {USItype __h, __l;} __i; \

992

} __x; \

993

__asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \

994

(wh) = __x.__i.__h; \

995

(wl) = __x.__i.__l; \

996

} while (0)

997

#define udiv_qrnnd(q, r, n1, n0, d) \

998

({union {UDItype __ll; \

999

struct {USItype __h, __l;} __i; \

1000

} __x, __q; \

1001

__x.__i.__h = (n1); __x.__i.__l = (n0); \

1002

__asm__ ("divu.d %0,%1,%2" \

1003

: "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \

1004

(r) = (n0) - __q.__l * (d); (q) = __q.__l; })

1005

#define UMUL_TIME 5

1006

#define UDIV_TIME 25

1007

#else

1008

#define UMUL_TIME 17

1009

#define UDIV_TIME 150

1010

#endif /* __m88110__ */

1011

#endif /* __m88000__ */

1012

1013

#if defined (__mips) && W_TYPE_SIZE == 32

1014

#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7

1015

#define umul_ppmm(w1, w0, u, v) \

1016

__asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))

1017

#else

1018

#define umul_ppmm(w1, w0, u, v) \

1019

__asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \

1020

: "=d" (w0), "=d" (w1) : "d" (u), "d" (v))

1021

#endif

1022

#define UMUL_TIME 10

1023

#define UDIV_TIME 100

1024

#endif /* __mips */

1025

1026

#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64

1027

#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7

1028

#define umul_ppmm(w1, w0, u, v) \

1029

__asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))

1030

#else

1031

#define umul_ppmm(w1, w0, u, v) \

1032

__asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \

1033

: "=d" (w0), "=d" (w1) : "d" (u), "d" (v))

1034

#endif

1035

#define UMUL_TIME 20

1036

#define UDIV_TIME 140

1037

#endif /* __mips */

1038

1039

#if defined (__ns32000__) && W_TYPE_SIZE == 32

1040

#define umul_ppmm(w1, w0, u, v) \

1041

({union {UDItype __ll; \

1042

struct {USItype __l, __h;} __i; \

1043

} __x; \

1044

__asm__ ("meid %2,%0" \

1045

: "=g" (__x.__ll) \

1046

: "%0" ((USItype)(u)), "g" ((USItype)(v))); \

1047

(w1) = __x.__i.__h; (w0) = __x.__i.__l;})

1048

#define __umulsidi3(u, v) \

1049

({UDItype __w; \

1050

__asm__ ("meid %2,%0" \

1051

: "=g" (__w) \

1052

: "%0" ((USItype)(u)), "g" ((USItype)(v))); \

1053

__w; })

1054

#define udiv_qrnnd(q, r, n1, n0, d) \

1055

({union {UDItype __ll; \

1056

struct {USItype __l, __h;} __i; \

1057

} __x; \

1058

__x.__i.__h = (n1); __x.__i.__l = (n0); \

1059

__asm__ ("deid %2,%0" \

1060

: "=g" (__x.__ll) \

1061

: "0" (__x.__ll), "g" ((USItype)(d))); \

1062

(r) = __x.__i.__l; (q) = __x.__i.__h; })

1063

#define count_trailing_zeros(count,x) \

1064

do { \

1065

__asm__ ("ffsd %2,%0" \

1066

: "=r" (count) \

1067

: "0" ((USItype) 0), "r" ((USItype) (x))); \

1068

} while (0)

1069

#endif /* __ns32000__ */

1070

1071

/* In the past we had a block of various #defines tested

1072

_ARCH_PPC - AIX

1073

_ARCH_PWR - AIX

1074

__powerpc__ - gcc

1075

__POWERPC__ - BEOS

1076

__ppc__ - Darwin

1077

PPC - old gcc, GNU/Linux, SysV

1078

The plain PPC test was not good for vxWorks, since PPC is defined on all

1079

CPUs there (eg. m68k too), as a constant one is expected to compare

1080

CPU_FAMILY against.

1081

1082

At any rate, this was pretty unattractive and a bit fragile. The use of

1083

HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of

1084

getting the desired effect.

1085

1086

ENHANCE-ME: We should test _IBMR2 here when we add assembly support for

1087

the system vendor compilers. (Is that vendor compilers with inline asm,

1088

or what?) */

1089

1090

#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \

1091

&& W_TYPE_SIZE == 32

1092

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

1093

do { \

1094

if (__builtin_constant_p (bh) && (bh) == 0) \

1095

__asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \

1096

: "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\

1097

else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \

1098

__asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \

1099

: "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\

1100

else \

1101

__asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \

1102

: "=r" (sh), "=&r" (sl) \

1103

: "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \

1104

} while (0)

1105

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

1106

do { \

1107

if (__builtin_constant_p (ah) && (ah) == 0) \

1108

__asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \

1109

: "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\

1110

else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \

1111

__asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \

1112

: "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\

1113

else if (__builtin_constant_p (bh) && (bh) == 0) \

1114

__asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \

1115

: "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\

1116

else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \

1117

__asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \

1118

: "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\

1119

else \

1120

__asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \

1121

: "=r" (sh), "=&r" (sl) \

1122

: "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \

1123

} while (0)

1124

#define count_leading_zeros(count, x) \

1125

__asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))

1126

#define COUNT_LEADING_ZEROS_0 32

1127

#if HAVE_HOST_CPU_FAMILY_powerpc

1128

#define umul_ppmm(ph, pl, m0, m1) \

1129

do { \

1130

USItype __m0 = (m0), __m1 = (m1); \

1131

__asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \

1132

(pl) = __m0 * __m1; \

1133

} while (0)

1134

#define UMUL_TIME 15

1135

#define smul_ppmm(ph, pl, m0, m1) \

1136

do { \

1137

SItype __m0 = (m0), __m1 = (m1); \

1138

__asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \

1139

(pl) = __m0 * __m1; \

1140

} while (0)

1141

#define SMUL_TIME 14

1142

#define UDIV_TIME 120

1143

#else

1144

#define UMUL_TIME 8

1145

#define smul_ppmm(xh, xl, m0, m1) \

1146

__asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))

1147

#define SMUL_TIME 4

1148

#define sdiv_qrnnd(q, r, nh, nl, d) \

1149

__asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))

1150

#define UDIV_TIME 100

1151

#endif

1152

#endif /* 32-bit POWER architecture variants. */

1153

1154

/* We should test _IBMR2 here when we add assembly support for the system

1155

vendor compilers. */

1156

#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64

1157

#if !defined (_LONG_LONG_LIMB)

1158

/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So

1159

use adde etc only when not _LONG_LONG_LIMB. */

1160

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

1161

do { \

1162

if (__builtin_constant_p (bh) && (bh) == 0) \

1163

__asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \

1164

: "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\

1165

else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \

1166

__asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \

1167

: "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\

1168

else \

1169

__asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \

1170

: "=r" (sh), "=&r" (sl) \

1171

: "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \

1172

} while (0)

1173

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

1174

do { \

1175

if (__builtin_constant_p (ah) && (ah) == 0) \

1176

__asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \

1177

: "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\

1178

else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \

1179

__asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \

1180

: "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\

1181

else if (__builtin_constant_p (bh) && (bh) == 0) \

1182

__asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \

1183

: "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\

1184

else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \

1185

__asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \

1186

: "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\

1187

else \

1188

__asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \

1189

: "=r" (sh), "=&r" (sl) \

1190

: "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \

1191

} while (0)

1192

#endif /* ! _LONG_LONG_LIMB */

1193

#define count_leading_zeros(count, x) \

1194

__asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))

1195

#define COUNT_LEADING_ZEROS_0 64

1196

#define umul_ppmm(ph, pl, m0, m1) \

1197

do { \

1198

UDItype __m0 = (m0), __m1 = (m1); \

1199

__asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \

1200

(pl) = __m0 * __m1; \

1201

} while (0)

1202

#define UMUL_TIME 15

1203

#define smul_ppmm(ph, pl, m0, m1) \

1204

do { \

1205

DItype __m0 = (m0), __m1 = (m1); \

1206

__asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \

1207

(pl) = __m0 * __m1; \

1208

} while (0)

1209

#define SMUL_TIME 14 /* ??? */

1210

#define UDIV_TIME 120 /* ??? */

1211

#endif /* 64-bit PowerPC. */

1212

1213

#if defined (__pyr__) && W_TYPE_SIZE == 32

1214

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

1215

__asm__ ("addw %5,%1\n\taddwc %3,%0" \

1216

: "=r" (sh), "=&r" (sl) \

1217

: "0" ((USItype)(ah)), "g" ((USItype)(bh)), \

1218

"%1" ((USItype)(al)), "g" ((USItype)(bl)))

1219

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

1220

__asm__ ("subw %5,%1\n\tsubwb %3,%0" \

1221

: "=r" (sh), "=&r" (sl) \

1222

: "0" ((USItype)(ah)), "g" ((USItype)(bh)), \

1223

"1" ((USItype)(al)), "g" ((USItype)(bl)))

1224

/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */

1225

#define umul_ppmm(w1, w0, u, v) \

1226

({union {UDItype __ll; \

1227

struct {USItype __h, __l;} __i; \

1228

} __x; \

1229

__asm__ ("movw %1,%R0\n\tuemul %2,%0" \

1230

: "=&r" (__x.__ll) \

1231

: "g" ((USItype) (u)), "g" ((USItype)(v))); \

1232

(w1) = __x.__i.__h; (w0) = __x.__i.__l;})

1233

#endif /* __pyr__ */

1234

1235

#if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32

1236

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

1237

__asm__ ("a %1,%5\n\tae %0,%3" \

1238

: "=r" (sh), "=&r" (sl) \

1239

: "0" ((USItype)(ah)), "r" ((USItype)(bh)), \

1240

"%1" ((USItype)(al)), "r" ((USItype)(bl)))

1241

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

1242

__asm__ ("s %1,%5\n\tse %0,%3" \

1243

: "=r" (sh), "=&r" (sl) \

1244

: "0" ((USItype)(ah)), "r" ((USItype)(bh)), \

1245

"1" ((USItype)(al)), "r" ((USItype)(bl)))

1246

#define smul_ppmm(ph, pl, m0, m1) \

1247

__asm__ ( \

1248

"s r2,r2\n" \

1249

" mts r10,%2\n" \

1250

" m r2,%3\n" \

1251

" m r2,%3\n" \

1252

" m r2,%3\n" \

1253

" m r2,%3\n" \

1254

" m r2,%3\n" \

1255

" m r2,%3\n" \

1256

" m r2,%3\n" \

1257

" m r2,%3\n" \

1258

" m r2,%3\n" \

1259

" m r2,%3\n" \

1260

" m r2,%3\n" \

1261

" m r2,%3\n" \

1262

" m r2,%3\n" \

1263

" m r2,%3\n" \

1264

" m r2,%3\n" \

1265

" m r2,%3\n" \

1266

" cas %0,r2,r0\n" \

1267

" mfs r10,%1" \

1268

: "=r" (ph), "=r" (pl) \

1269

: "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \

1270

: "r2")

1271

#define UMUL_TIME 20

1272

#define UDIV_TIME 200

1273

#define count_leading_zeros(count, x) \

1274

do { \

1275

if ((x) >= 0x10000) \

1276

__asm__ ("clz %0,%1" \

1277

: "=r" (count) : "r" ((USItype)(x) >> 16)); \

1278

else \

1279

{ \

1280

__asm__ ("clz %0,%1" \

1281

: "=r" (count) : "r" ((USItype)(x))); \

1282

(count) += 16; \

1283

} \

1284

} while (0)

1285

#endif /* RT/ROMP */

1286

1287

#if defined (__sh2__) && W_TYPE_SIZE == 32

1288

#define umul_ppmm(w1, w0, u, v) \

1289

__asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \

1290

: "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")

1291

#define UMUL_TIME 5

1292

#endif

1293

1294

#if defined (__sparc__) && W_TYPE_SIZE == 32

1295

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

1296

__asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \

1297

: "=r" (sh), "=&r" (sl) \

1298

: "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \

1299

__CLOBBER_CC)

1300

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

1301

__asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \

1302

: "=r" (sh), "=&r" (sl) \

1303

: "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \

1304

__CLOBBER_CC)

1305

/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h

1306

doesn't define anything to indicate that to us, it only sets __sparcv8. */

1307

#if defined (__sparc_v9__) || defined (__sparcv9)

1308

/* Perhaps we should use floating-point operations here? */

1309

#if 0

1310

/* Triggers a bug making mpz/tests/t-gcd.c fail.

1311

Perhaps we simply need explicitly zero-extend the inputs? */

1312

#define umul_ppmm(w1, w0, u, v) \

1313

__asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \

1314

"=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")

1315

#else

1316

/* Use v8 umul until above bug is fixed. */

1317

#define umul_ppmm(w1, w0, u, v) \

1318

__asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))

1319

#endif

1320

/* Use a plain v8 divide for v9. */

1321

#define udiv_qrnnd(q, r, n1, n0, d) \

1322

do { \

1323

USItype __q; \

1324

__asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \

1325

: "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \

1326

(r) = (n0) - __q * (d); \

1327

(q) = __q; \

1328

} while (0)

1329

#else

1330

#if defined (__sparc_v8__) /* gcc normal */ \

1331

|| defined (__sparcv8) /* gcc solaris */ \

1332

|| HAVE_HOST_CPU_supersparc

1333

/* Don't match immediate range because, 1) it is not often useful,

1334

2) the 'I' flag thinks of the range as a 13 bit signed interval,

1335

while we want to match a 13 bit interval, sign extended to 32 bits,

1336

but INTERPRETED AS UNSIGNED. */

1337

#define umul_ppmm(w1, w0, u, v) \

1338

__asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))

1339

#define UMUL_TIME 5

1340

1341

#if HAVE_HOST_CPU_supersparc

1342

#define UDIV_TIME 60 /* SuperSPARC timing */

1343

#else

1344

/* Don't use this on SuperSPARC because its udiv only handles 53 bit

1345

dividends and will trap to the kernel for the rest. */

1346

#define udiv_qrnnd(q, r, n1, n0, d) \

1347

do { \

1348

USItype __q; \

1349

__asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \

1350

: "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \

1351

(r) = (n0) - __q * (d); \

1352

(q) = __q; \

1353

} while (0)

1354

#define UDIV_TIME 25

1355

#endif /* HAVE_HOST_CPU_supersparc */

1356

1357

#else /* ! __sparc_v8__ */

1358

#if defined (__sparclite__)

1359

/* This has hardware multiply but not divide. It also has two additional

1360

instructions scan (ffs from high bit) and divscc. */

1361

#define umul_ppmm(w1, w0, u, v) \

1362

__asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))

1363

#define UMUL_TIME 5

1364

#define udiv_qrnnd(q, r, n1, n0, d) \

1365

__asm__ ("! Inlined udiv_qrnnd\n" \

1366

" wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \

1367

" tst %%g0\n" \

1368

" divscc %3,%4,%%g1\n" \

1369

" divscc %%g1,%4,%%g1\n" \

1370

" divscc %%g1,%4,%%g1\n" \

1371

" divscc %%g1,%4,%%g1\n" \

1372

" divscc %%g1,%4,%%g1\n" \

1373

" divscc %%g1,%4,%%g1\n" \

1374

" divscc %%g1,%4,%%g1\n" \

1375

" divscc %%g1,%4,%%g1\n" \

1376

" divscc %%g1,%4,%%g1\n" \

1377

" divscc %%g1,%4,%%g1\n" \

1378

" divscc %%g1,%4,%%g1\n" \

1379

" divscc %%g1,%4,%%g1\n" \

1380

" divscc %%g1,%4,%%g1\n" \

1381

" divscc %%g1,%4,%%g1\n" \

1382

" divscc %%g1,%4,%%g1\n" \

1383

" divscc %%g1,%4,%%g1\n" \

1384

" divscc %%g1,%4,%%g1\n" \

1385

" divscc %%g1,%4,%%g1\n" \

1386

" divscc %%g1,%4,%%g1\n" \

1387

" divscc %%g1,%4,%%g1\n" \

1388

" divscc %%g1,%4,%%g1\n" \

1389

" divscc %%g1,%4,%%g1\n" \

1390

" divscc %%g1,%4,%%g1\n" \

1391

" divscc %%g1,%4,%%g1\n" \

1392

" divscc %%g1,%4,%%g1\n" \

1393

" divscc %%g1,%4,%%g1\n" \

1394

" divscc %%g1,%4,%%g1\n" \

1395

" divscc %%g1,%4,%%g1\n" \

1396

" divscc %%g1,%4,%%g1\n" \

1397

" divscc %%g1,%4,%%g1\n" \

1398

" divscc %%g1,%4,%%g1\n" \

1399

" divscc %%g1,%4,%0\n" \

1400

" rd %%y,%1\n" \

1401

" bl,a 1f\n" \

1402

" add %1,%4,%1\n" \

1403

"1: ! End of inline udiv_qrnnd" \

1404

: "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \

1405

: "%g1" __AND_CLOBBER_CC)

1406

#define UDIV_TIME 37

1407

#define count_leading_zeros(count, x) \

1408

__asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))

1409

/* Early sparclites return 63 for an argument of 0, but they warn that future

1410

implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0

1411

undefined. */

1412

#endif /* __sparclite__ */

1413

#endif /* __sparc_v8__ */

1414

#endif /* __sparc_v9__ */

1415

/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */

1416

#ifndef umul_ppmm

1417

#define umul_ppmm(w1, w0, u, v) \

1418

__asm__ ("! Inlined umul_ppmm\n" \

1419

" wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \

1420

" sra %3,31,%%g2 ! Don't move this insn\n" \

1421

" and %2,%%g2,%%g2 ! Don't move this insn\n" \

1422

" andcc %%g0,0,%%g1 ! Don't move this insn\n" \

1423

" mulscc %%g1,%3,%%g1\n" \

1424

" mulscc %%g1,%3,%%g1\n" \

1425

" mulscc %%g1,%3,%%g1\n" \

1426

" mulscc %%g1,%3,%%g1\n" \

1427

" mulscc %%g1,%3,%%g1\n" \

1428

" mulscc %%g1,%3,%%g1\n" \

1429

" mulscc %%g1,%3,%%g1\n" \

1430

" mulscc %%g1,%3,%%g1\n" \

1431

" mulscc %%g1,%3,%%g1\n" \

1432

" mulscc %%g1,%3,%%g1\n" \

1433

" mulscc %%g1,%3,%%g1\n" \

1434

" mulscc %%g1,%3,%%g1\n" \

1435

" mulscc %%g1,%3,%%g1\n" \

1436

" mulscc %%g1,%3,%%g1\n" \

1437

" mulscc %%g1,%3,%%g1\n" \

1438

" mulscc %%g1,%3,%%g1\n" \

1439

" mulscc %%g1,%3,%%g1\n" \

1440

" mulscc %%g1,%3,%%g1\n" \

1441

" mulscc %%g1,%3,%%g1\n" \

1442

" mulscc %%g1,%3,%%g1\n" \

1443

" mulscc %%g1,%3,%%g1\n" \

1444

" mulscc %%g1,%3,%%g1\n" \

1445

" mulscc %%g1,%3,%%g1\n" \

1446

" mulscc %%g1,%3,%%g1\n" \

1447

" mulscc %%g1,%3,%%g1\n" \

1448

" mulscc %%g1,%3,%%g1\n" \

1449

" mulscc %%g1,%3,%%g1\n" \

1450

" mulscc %%g1,%3,%%g1\n" \

1451

" mulscc %%g1,%3,%%g1\n" \

1452

" mulscc %%g1,%3,%%g1\n" \

1453

" mulscc %%g1,%3,%%g1\n" \

1454

" mulscc %%g1,%3,%%g1\n" \

1455

" mulscc %%g1,0,%%g1\n" \

1456

" add %%g1,%%g2,%0\n" \

1457

" rd %%y,%1" \

1458

: "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \

1459

: "%g1", "%g2" __AND_CLOBBER_CC)

1460

#define UMUL_TIME 39 /* 39 instructions */

1461

#endif

1462

#ifndef udiv_qrnnd

1463

#ifndef LONGLONG_STANDALONE

1464

#define udiv_qrnnd(q, r, n1, n0, d) \

1465

do { UWtype __r; \

1466

(q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \

1467

(r) = __r; \

1468

} while (0)

1469

extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));

1470

#ifndef UDIV_TIME

1471

#define UDIV_TIME 140

1472

#endif

1473

#endif /* LONGLONG_STANDALONE */

1474

#endif /* udiv_qrnnd */

1475

#endif /* __sparc__ */

1476

1477

#if defined (__sparc__) && W_TYPE_SIZE == 64

1478

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

1479

__asm__ ( \

1480

"addcc %r4,%5,%1\n" \

1481

" addccc %r6,%7,%%g0\n" \

1482

" addc %r2,%3,%0" \

1483

: "=r" (sh), "=&r" (sl) \

1484

: "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \

1485

"%rJ" ((al) >> 32), "rI" ((bl) >> 32) \

1486

__CLOBBER_CC)

1487

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

1488

__asm__ ( \

1489

"subcc %r4,%5,%1\n" \

1490

" subccc %r6,%7,%%g0\n" \

1491

" subc %r2,%3,%0" \

1492

: "=r" (sh), "=&r" (sl) \

1493

: "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl), \

1494

"rJ" ((al) >> 32), "rI" ((bl) >> 32) \

1495

__CLOBBER_CC)

1496

#endif

1497

1498

#if defined (__vax__) && W_TYPE_SIZE == 32

1499

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

1500

__asm__ ("addl2 %5,%1\n\tadwc %3,%0" \

1501

: "=g" (sh), "=&g" (sl) \

1502

: "0" ((USItype)(ah)), "g" ((USItype)(bh)), \

1503

"%1" ((USItype)(al)), "g" ((USItype)(bl)))

1504

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

1505

__asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \

1506

: "=g" (sh), "=&g" (sl) \

1507

: "0" ((USItype)(ah)), "g" ((USItype)(bh)), \

1508

"1" ((USItype)(al)), "g" ((USItype)(bl)))

1509

#define smul_ppmm(xh, xl, m0, m1) \

1510

do { \

1511

union {UDItype __ll; \

1512

struct {USItype __l, __h;} __i; \

1513

} __x; \

1514

USItype __m0 = (m0), __m1 = (m1); \

1515

__asm__ ("emul %1,%2,$0,%0" \

1516

: "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \

1517

(xh) = __x.__i.__h; (xl) = __x.__i.__l; \

1518

} while (0)

1519

#define sdiv_qrnnd(q, r, n1, n0, d) \

1520

do { \

1521

union {DItype __ll; \

1522

struct {SItype __l, __h;} __i; \

1523

} __x; \

1524

__x.__i.__h = n1; __x.__i.__l = n0; \

1525

__asm__ ("ediv %3,%2,%0,%1" \

1526

: "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \

1527

} while (0)

1528

#if 0

1529

/* FIXME: This instruction appears to be unimplemented on some systems (vax

1530

8800 maybe). */

1531

#define count_trailing_zeros(count,x) \

1532

do { \

1533

__asm__ ("ffs 0, 31, %1, %0" \

1534

: "=g" (count) \

1535

: "g" ((USItype) (x))); \

1536

} while (0)

1537

#endif

1538

#endif /* __vax__ */

1539

1540

#if defined (__z8000__) && W_TYPE_SIZE == 16

1541

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

1542

__asm__ ("add %H1,%H5\n\tadc %H0,%H3" \

1543

: "=r" (sh), "=&r" (sl) \

1544

: "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \

1545

"%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))

1546

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

1547

__asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \

1548

: "=r" (sh), "=&r" (sl) \

1549

: "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \

1550

"1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))

1551

#define umul_ppmm(xh, xl, m0, m1) \

1552

do { \

1553

union {long int __ll; \

1554

struct {unsigned int __h, __l;} __i; \

1555

} __x; \

1556

unsigned int __m0 = (m0), __m1 = (m1); \

1557

__asm__ ("mult %S0,%H3" \

1558

: "=r" (__x.__i.__h), "=r" (__x.__i.__l) \

1559

: "%1" (m0), "rQR" (m1)); \

1560

(xh) = __x.__i.__h; (xl) = __x.__i.__l; \

1561

(xh) += ((((signed int) __m0 >> 15) & __m1) \

1562

+ (((signed int) __m1 >> 15) & __m0)); \

1563

} while (0)

1564

#endif /* __z8000__ */

1565

1566

#endif /* __GNUC__ */

1567

1568

#endif /* NO_ASM */

1569

1570

1571

#if !defined (umul_ppmm) && defined (__umulsidi3)

1572

#define umul_ppmm(ph, pl, m0, m1) \

1573

{ \

1574

UDWtype __ll = __umulsidi3 (m0, m1); \

1575

ph = (UWtype) (__ll >> W_TYPE_SIZE); \

1576

pl = (UWtype) __ll; \

1577

}

1578

#endif

1579

1580

#if !defined (__umulsidi3)

1581

#define __umulsidi3(u, v) \

1582

({UWtype __hi, __lo; \

1583

umul_ppmm (__hi, __lo, u, v); \

1584

((UDWtype) __hi << W_TYPE_SIZE) | __lo; })

1585

#endif

1586

1587

1588

/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist. The "_r"

1589

forms have "reversed" arguments, meaning the pointer is last, which

1590

sometimes allows better parameter passing, in particular on 64-bit

1591

hppa. */

1592

1593

#define mpn_umul_ppmm __MPN(umul_ppmm)

1594

extern UWtype mpn_umul_ppmm _PROTO ((UWtype *, UWtype, UWtype));

1595

1596

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \

1597

&& ! defined (LONGLONG_STANDALONE)

1598

#define umul_ppmm(wh, wl, u, v) \

1599

do { \

1600

UWtype __umul_ppmm__p0; \

1601

(wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v)); \

1602

(wl) = __umul_ppmm__p0; \

1603

} while (0)

1604

#endif

1605

1606

#define mpn_umul_ppmm_r __MPN(umul_ppmm_r)

1607

extern UWtype mpn_umul_ppmm_r _PROTO ((UWtype, UWtype, UWtype *));

1608

1609

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \

1610

&& ! defined (LONGLONG_STANDALONE)

1611

#define umul_ppmm(wh, wl, u, v) \

1612

do { \

1613

UWtype __umul_ppmm__p0; \

1614

(wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0); \

1615

(wl) = __umul_ppmm__p0; \

1616

} while (0)

1617

#endif

1618

1619

#define mpn_udiv_qrnnd __MPN(udiv_qrnnd)

1620

extern UWtype mpn_udiv_qrnnd _PROTO ((UWtype *, UWtype, UWtype, UWtype));

1621

1622

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \

1623

&& ! defined (LONGLONG_STANDALONE)

1624

#define udiv_qrnnd(q, r, n1, n0, d) \

1625

do { \

1626

UWtype __udiv_qrnnd__r; \

1627

(q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \

1628

(UWtype) (n1), (UWtype) (n0), (UWtype) d); \

1629

(r) = __udiv_qrnnd__r; \

1630

} while (0)

1631

#endif

1632

1633

#define mpn_udiv_qrnnd_r __MPN(udiv_qrnnd_r)

1634

extern UWtype mpn_udiv_qrnnd_r _PROTO ((UWtype, UWtype, UWtype, UWtype *));

1635

1636

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \

1637

&& ! defined (LONGLONG_STANDALONE)

1638

#define udiv_qrnnd(q, r, n1, n0, d) \

1639

do { \

1640

UWtype __udiv_qrnnd__r; \

1641

(q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d, \

1642

&__udiv_qrnnd__r); \

1643

(r) = __udiv_qrnnd__r; \

1644

} while (0)

1645

#endif

1646

1647

1648

/* If this machine has no inline assembler, use C macros. */

1649

1650

#if !defined (add_ssaaaa)

1651

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \

1652

do { \

1653

UWtype __x; \

1654

__x = (al) + (bl); \

1655

(sh) = (ah) + (bh) + (__x < (al)); \

1656

(sl) = __x; \

1657

} while (0)

1658

#endif

1659

1660

#if !defined (sub_ddmmss)

1661

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \

1662

do { \

1663

UWtype __x; \

1664

__x = (al) - (bl); \

1665

(sh) = (ah) - (bh) - ((al) < (bl)); \

1666

(sl) = __x; \

1667

} while (0)

1668

#endif

1669

1670

/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of

1671

smul_ppmm. */

1672

#if !defined (umul_ppmm) && defined (smul_ppmm)

1673

#define umul_ppmm(w1, w0, u, v) \

1674

do { \

1675

UWtype __w1; \

1676

UWtype __xm0 = (u), __xm1 = (v); \

1677

smul_ppmm (__w1, w0, __xm0, __xm1); \

1678

(w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \

1679

+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \

1680

} while (0)

1681

#endif

1682

1683

/* If we still don't have umul_ppmm, define it using plain C.

1684

1685

For reference, when this code is used for squaring (ie. u and v identical

1686

expressions), gcc recognises __x1 and __x2 are the same and generates 3

1687

multiplies, not 4. The subsequent additions could be optimized a bit,

1688

but the only place GMP currently uses such a square is mpn_sqr_basecase,

1689

and chips obliged to use this generic C umul will have plenty of worse

1690

performance problems than a couple of extra instructions on the diagonal

1691

of sqr_basecase. */

1692

1693

#if !defined (umul_ppmm)

1694

#define umul_ppmm(w1, w0, u, v) \

1695

do { \

1696

UWtype __x0, __x1, __x2, __x3; \

1697

UHWtype __ul, __vl, __uh, __vh; \

1698

UWtype __u = (u), __v = (v); \

1699

\

1700

__ul = __ll_lowpart (__u); \

1701

__uh = __ll_highpart (__u); \

1702

__vl = __ll_lowpart (__v); \

1703

__vh = __ll_highpart (__v); \

1704

\

1705

__x0 = (UWtype) __ul * __vl; \

1706

__x1 = (UWtype) __ul * __vh; \

1707

__x2 = (UWtype) __uh * __vl; \

1708

__x3 = (UWtype) __uh * __vh; \

1709

\

1710

__x1 += __ll_highpart (__x0);/* this can't give carry */ \

1711

__x1 += __x2; /* but this indeed can */ \

1712

if (__x1 < __x2) /* did we get it? */ \

1713

__x3 += __ll_B; /* yes, add it in the proper pos. */ \

1714

\

1715

(w1) = __x3 + __ll_highpart (__x1); \

1716

(w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \

1717

} while (0)

1718

#endif

1719

1720

/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will

1721

exist in one form or another. */

1722

#if !defined (smul_ppmm)

1723

#define smul_ppmm(w1, w0, u, v) \

1724

do { \

1725

UWtype __w1; \

1726

UWtype __xm0 = (u), __xm1 = (v); \

1727

umul_ppmm (__w1, w0, __xm0, __xm1); \

1728

(w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \

1729

- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \

1730

} while (0)

1731

#endif

1732

1733

/* Define this unconditionally, so it can be used for debugging. */

1734

#define __udiv_qrnnd_c(q, r, n1, n0, d) \

1735

do { \

1736

UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \

1737

\

1738

ASSERT ((d) != 0); \

1739

ASSERT ((n1) < (d)); \

1740

\

1741

__d1 = __ll_highpart (d); \

1742

__d0 = __ll_lowpart (d); \

1743

\

1744

__q1 = (n1) / __d1; \

1745

__r1 = (n1) - __q1 * __d1; \

1746

__m = __q1 * __d0; \

1747

__r1 = __r1 * __ll_B | __ll_highpart (n0); \

1748

if (__r1 < __m) \

1749

{ \

1750

__q1--, __r1 += (d); \

1751

if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\

1752

if (__r1 < __m) \

1753

__q1--, __r1 += (d); \

1754

} \

1755

__r1 -= __m; \

1756

\

1757

__q0 = __r1 / __d1; \

1758

__r0 = __r1 - __q0 * __d1; \

1759

__m = __q0 * __d0; \

1760

__r0 = __r0 * __ll_B | __ll_lowpart (n0); \

1761

if (__r0 < __m) \

1762

{ \

1763

__q0--, __r0 += (d); \

1764

if (__r0 >= (d)) \

1765

if (__r0 < __m) \

1766

__q0--, __r0 += (d); \

1767

} \

1768

__r0 -= __m; \

1769

\

1770

(q) = __q1 * __ll_B | __q0; \

1771

(r) = __r0; \

1772

} while (0)

1773

1774

/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through

1775

__udiv_w_sdiv (defined in libgcc or elsewhere). */

1776

#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)

1777

#define udiv_qrnnd(q, r, nh, nl, d) \

1778

do { \

1779

UWtype __r; \

1780

(q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \

1781

(r) = __r; \

1782

} while (0)

1783

#endif

1784

1785

/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */

1786

#if !defined (udiv_qrnnd)

1787

#define UDIV_NEEDS_NORMALIZATION 1

1788

#define udiv_qrnnd __udiv_qrnnd_c

1789

#endif

1790

1791

#if !defined (count_leading_zeros)

1792

#define count_leading_zeros(count, x) \

1793

do { \

1794

UWtype __xr = (x); \

1795

UWtype __a; \

1796

\

1797

if (W_TYPE_SIZE == 32) \

1798

{ \

1799

__a = __xr < ((UWtype) 1 << 2*__BITS4) \

1800

? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \

1801

: (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \

1802

: 3*__BITS4 + 1); \

1803

} \

1804

else \

1805

{ \

1806

for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \

1807

if (((__xr >> __a) & 0xff) != 0) \

1808

break; \

1809

++__a; \

1810

} \

1811

\

1812

(count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \

1813

} while (0)

1814

/* This version gives a well-defined value for zero. */

1815

#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)

1816

#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

1817

#endif

1818

1819

/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */

1820

#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY

1821

#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

1822

#endif

1823

1824

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB

1825

extern const unsigned char __GMP_DECLSPEC __clz_tab[128];

1826

#endif

1827

1828

#if !defined (count_trailing_zeros)

1829

/* Define count_trailing_zeros using count_leading_zeros. The latter might be

1830

defined in asm, but if it is not, the C version above is good enough. */

1831

#define count_trailing_zeros(count, x) \

1832

do { \

1833

UWtype __ctz_x = (x); \

1834

UWtype __ctz_c; \

1835

ASSERT (__ctz_x != 0); \

1836

count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \

1837

(count) = W_TYPE_SIZE - 1 - __ctz_c; \

1838

} while (0)

1839

#endif

1840

1841

#ifndef UDIV_NEEDS_NORMALIZATION

1842

#define UDIV_NEEDS_NORMALIZATION 0

1843

#endif

1844

1845

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and

1846

that hence the latter should always be used. */

1847

#ifndef UDIV_PREINV_ALWAYS

1848

#define UDIV_PREINV_ALWAYS 0

1849

#endif

1850

1851

/* Give defaults for UMUL_TIME and UDIV_TIME. */

1852

#ifndef UMUL_TIME

1853

#define UMUL_TIME 1

1854

#endif

1855

1856

#ifndef UDIV_TIME

1857

#define UDIV_TIME UMUL_TIME

1858

#endif