/* Header for speed and threshold things.

Copyright 1999, 2000, 2001 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA. */

/* Pad ptr,oldsize with zero limbs (at the most significant end) to make it
   ptr,newsize.  newsize must be >= oldsize, which is checked with ASSERT.
   oldsize and newsize are each evaluated more than once, so they should
   not have side effects.  The do/while wrapper makes the expansion a
   single statement, safe to use in an unbraced if/else. */
#define MPN_ZERO_EXTEND(ptr, oldsize, newsize)          \
  do {                                                  \
    ASSERT ((newsize) >= (oldsize));                    \
    MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize));    \
  } while (0)

/* A mask of the least significant n bits.  Note 1<<32 doesn't give zero on
   x86 family CPUs, hence the separate case for BITS_PER_MP_LIMB.  (A shift
   by the full width of the type is undefined in C in any case.)  n is
   evaluated twice and must be in the range 0 to BITS_PER_MP_LIMB. */
#define MP_LIMB_T_LOWBITMASK(n) \
  ((n) == BITS_PER_MP_LIMB ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1)

/* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice.

   TMP_ALLOC_ALIGNED asks TMP_ALLOC for align-1 extra bytes and hands the
   result to align_pointer, so there is slack for the pointer to be moved
   to an aligned address (presumably rounded upwards - confirm against
   align_pointer).  TMP_ALLOC_LIMBS_ALIGNED is the same with the size given
   in limbs and the result cast to mp_ptr. */
#define TMP_ALLOC_ALIGNED(bytes, align) \
  align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))
#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \
  ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), (align)))

/* 32 for pentium, 64 for athlon, might want to configure this for other
   CPUs.  In truth though nothing has yet shown up that cares about cache
   line boundaries.  The only practical effect of this is to restrict the
   range that s->align_xp can take.  Perhaps this could be a variable
   instead. */
#define CACHE_LINE_SIZE    64 /* bytes */

/* Number of limbs in a cache line, less one.  CACHE_LINE_SIZE is assumed
   to be a multiple of BYTES_PER_MP_LIMB. */
#define SPEED_TMP_ALLOC_ADJUST_MASK  (CACHE_LINE_SIZE/BYTES_PER_MP_LIMB - 1)

/* Allocate limbs with TMP_ALLOC_LIMBS, plus SPEED_TMP_ALLOC_ADJUST_MASK
   extra limbs of slack, and pass the block and the requested align to
   speed_tmp_alloc_adjust, whose mp_ptr result is the value of the macro. */
#define SPEED_TMP_ALLOC_LIMBS(limbs, align) \
  (speed_tmp_alloc_adjust \
    (TMP_ALLOC_LIMBS((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK), (align)))

/* This is the size for s->xp_block and s->yp_block, used in certain
   routines that want to run across many different data values and use
   s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.

   512 means 2kbytes of data (with 4-byte limbs) for each of xp_block and
   yp_block, making 4k total, which should fit easily in any L1 data
   cache. */
#define SPEED_BLOCK_SIZE   512 /* limbs */

extern double speed_unittime;
72
extern double speed_cycletime;
73
extern int speed_precision;
74
extern const char *speed_time_string;
75
void speed_time_init _PROTO ((void));
76
void speed_cycletime_init _PROTO ((void));
77
void speed_cycletime_need_cycles _PROTO ((void));
78
void speed_cycletime_need_seconds _PROTO ((void));
79
void speed_starttime _PROTO ((void));
80
double speed_endtime _PROTO ((void));
84
unsigned reps; /* how many times to run the routine */
85
mp_ptr xp; /* first argument */
86
mp_ptr yp; /* second argument */
87
mp_size_t size; /* size of both arguments */
88
mp_limb_t r; /* user supplied parameter */
89
mp_size_t align_xp; /* alignment of xp */
90
mp_size_t align_yp; /* alignment of yp */
91
mp_size_t align_wp; /* intended alignment of wp */
92
mp_size_t align_wp2; /* intended alignment of wp2 */
93
mp_ptr xp_block; /* first special SPEED_BLOCK_SIZE block */
94
mp_ptr yp_block; /* second special SPEED_BLOCK_SIZE block */
96
double time_divisor; /* optionally set by the speed routine */
98
/* used by the cache priming things */
100
unsigned src_num, dst_num;
107
/* Common signature of the speed_xxx measuring routines declared below:
   they take the parameter block and return a double.  Routines built from
   the SPEED_ROUTINE macros return -1.0 when SPEED_RESTRICT_COND rejects
   the parameters. */
typedef double (*speed_function_t) _PROTO ((struct speed_params *s));

double speed_measure _PROTO ((speed_function_t fun, struct speed_params *s));

/* Prototypes for speed measuring routines */
113
double speed_back_to_back (struct speed_params *s);
114
double speed_count_leading_zeros _PROTO ((struct speed_params *s));
115
double speed_count_trailing_zeros _PROTO ((struct speed_params *s));
116
double speed_find_a _PROTO ((struct speed_params *s));
117
double speed_gmp_allocate_free _PROTO ((struct speed_params *s));
118
double speed_gmp_allocate_reallocate_free _PROTO ((struct speed_params *s));
119
double speed_invert_limb _PROTO ((struct speed_params *s));
120
double speed_malloc_free _PROTO ((struct speed_params *s));
121
double speed_malloc_realloc_free _PROTO ((struct speed_params *s));
122
double speed_memcpy _PROTO ((struct speed_params *s));
123
double speed_modlimb_invert _PROTO ((struct speed_params *s));
124
double speed_modlimb_invert_mul1 _PROTO ((struct speed_params *s));
125
double speed_modlimb_invert_loop _PROTO ((struct speed_params *s));
126
double speed_modlimb_invert_cond _PROTO ((struct speed_params *s));
127
double speed_modlimb_invert_arith _PROTO ((struct speed_params *s));
129
double speed_mpf_init_clear _PROTO ((struct speed_params *s));
131
/* Measuring routines for the mpn layer (and the MPN_xxx macros). */
double speed_mpn_add_n _PROTO ((struct speed_params *s));
double speed_mpn_add_n_self _PROTO ((struct speed_params *s));
double speed_mpn_add_n_inplace _PROTO ((struct speed_params *s));
double speed_mpn_and_n _PROTO ((struct speed_params *s));
double speed_mpn_andn_n _PROTO ((struct speed_params *s));
double speed_mpn_addmul_1 _PROTO ((struct speed_params *s));
double speed_mpn_com_n _PROTO ((struct speed_params *s));
double speed_mpn_copyd _PROTO ((struct speed_params *s));
double speed_mpn_copyi _PROTO ((struct speed_params *s));
double speed_mpn_dc_divrem_n _PROTO ((struct speed_params *s));
double speed_mpn_dc_divrem_sb _PROTO ((struct speed_params *s));
double speed_mpn_dc_divrem_sb_div _PROTO ((struct speed_params *s));
double speed_mpn_dc_divrem_sb_inv _PROTO ((struct speed_params *s));
double speed_mpn_dc_tdiv_qr _PROTO ((struct speed_params *s));
double speed_MPN_COPY _PROTO ((struct speed_params *s));
double speed_MPN_COPY_DECR _PROTO ((struct speed_params *s));
double speed_MPN_COPY_INCR _PROTO ((struct speed_params *s));
double speed_mpn_divexact_1 _PROTO ((struct speed_params *s));
double speed_mpn_divexact_by3 _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1 _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1f _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1c _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1cf _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1_div _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1f_div _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1_inv _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1f_inv _PROTO ((struct speed_params *s));
double speed_mpn_divrem_2 _PROTO ((struct speed_params *s));
double speed_mpn_divrem_2_div _PROTO ((struct speed_params *s));
double speed_mpn_divrem_2_inv _PROTO ((struct speed_params *s));
double speed_mpn_fib2_ui _PROTO ((struct speed_params *s));
double speed_mpn_gcd _PROTO ((struct speed_params *s));
double speed_mpn_gcd_finda _PROTO ((struct speed_params *s));
double speed_mpn_gcd_1 _PROTO ((struct speed_params *s));
double speed_mpn_gcd_1N _PROTO ((struct speed_params *s));
double speed_mpn_gcd_binary _PROTO ((struct speed_params *s));
double speed_mpn_gcdext _PROTO ((struct speed_params *s));
double speed_mpn_gcdext_double _PROTO ((struct speed_params *s));
double speed_mpn_gcdext_one_double _PROTO ((struct speed_params *s));
double speed_mpn_gcdext_one_single _PROTO ((struct speed_params *s));
double speed_mpn_gcdext_single _PROTO ((struct speed_params *s));
double speed_mpn_get_str _PROTO ((struct speed_params *s));
double speed_mpn_hamdist _PROTO ((struct speed_params *s));
double speed_mpn_ior_n _PROTO ((struct speed_params *s));
double speed_mpn_iorn_n _PROTO ((struct speed_params *s));
double speed_mpn_jacobi_base _PROTO ((struct speed_params *s));
double speed_mpn_kara_mul_n _PROTO ((struct speed_params *s));
double speed_mpn_kara_sqr_n _PROTO ((struct speed_params *s));
double speed_mpn_lshift _PROTO ((struct speed_params *s));
double speed_mpn_mod_1 _PROTO ((struct speed_params *s));
double speed_mpn_mod_1c _PROTO ((struct speed_params *s));
double speed_mpn_mod_1_div _PROTO ((struct speed_params *s));
double speed_mpn_mod_1_inv _PROTO ((struct speed_params *s));
double speed_mpn_mod_34lsub1 _PROTO ((struct speed_params *s));
double speed_mpn_modexact_1_odd _PROTO ((struct speed_params *s));
double speed_mpn_modexact_1c_odd _PROTO ((struct speed_params *s));
double speed_mpn_mul_1 _PROTO ((struct speed_params *s));
double speed_mpn_mul_1_inplace _PROTO ((struct speed_params *s));
double speed_mpn_mul_2 _PROTO ((struct speed_params *s));
double speed_mpn_mul_basecase _PROTO ((struct speed_params *s));
double speed_mpn_mul_fft _PROTO ((struct speed_params *s));
double speed_mpn_mul_fft_sqr _PROTO ((struct speed_params *s));
double speed_mpn_mul_fft_full _PROTO ((struct speed_params *s));
double speed_mpn_mul_fft_full_sqr _PROTO ((struct speed_params *s));
double speed_mpn_mul_n _PROTO ((struct speed_params *s));
double speed_mpn_mul_n_sqr _PROTO ((struct speed_params *s));
double speed_mpn_nand_n _PROTO ((struct speed_params *s));
double speed_mpn_nior_n _PROTO ((struct speed_params *s));
double speed_mpn_popcount _PROTO ((struct speed_params *s));
double speed_mpn_preinv_mod_1 _PROTO ((struct speed_params *s));
double speed_redc _PROTO ((struct speed_params *s));
double speed_mpn_rshift _PROTO ((struct speed_params *s));
double speed_mpn_sb_divrem_m3 _PROTO ((struct speed_params *s));
double speed_mpn_sb_divrem_m3_div _PROTO ((struct speed_params *s));
double speed_mpn_sb_divrem_m3_inv _PROTO ((struct speed_params *s));
double speed_mpn_set_str _PROTO ((struct speed_params *s));
double speed_mpn_sqr_basecase _PROTO ((struct speed_params *s));
double speed_mpn_sqr_diagonal _PROTO ((struct speed_params *s));
double speed_mpn_sqr_n _PROTO ((struct speed_params *s));
double speed_mpn_sqrtrem _PROTO ((struct speed_params *s));
double speed_mpn_sub_n _PROTO ((struct speed_params *s));
double speed_mpn_submul_1 _PROTO ((struct speed_params *s));
double speed_mpn_toom3_mul_n _PROTO ((struct speed_params *s));
double speed_mpn_toom3_mul_n_mpn _PROTO ((struct speed_params *s));
double speed_mpn_toom3_mul_n_open _PROTO ((struct speed_params *s));
double speed_mpn_toom3_sqr_n _PROTO ((struct speed_params *s));
double speed_mpn_toom3_sqr_n_mpn _PROTO ((struct speed_params *s));
double speed_mpn_toom3_sqr_n_open _PROTO ((struct speed_params *s));
double speed_mpn_udiv_qrnnd _PROTO ((struct speed_params *s));
double speed_mpn_umul_ppmm _PROTO ((struct speed_params *s));
double speed_mpn_xnor_n _PROTO ((struct speed_params *s));
double speed_mpn_xor_n _PROTO ((struct speed_params *s));
double speed_MPN_ZERO _PROTO ((struct speed_params *s));

/* Measuring routines for mpq, mpz and miscellaneous operations. */
double speed_mpq_init_clear _PROTO ((struct speed_params *s));

double speed_mpz_add _PROTO ((struct speed_params *s));
double speed_mpz_bin_uiui _PROTO ((struct speed_params *s));
double speed_mpz_fac_ui _PROTO ((struct speed_params *s));
double speed_mpz_fib_ui _PROTO ((struct speed_params *s));
double speed_mpz_fib2_ui _PROTO ((struct speed_params *s));
double speed_mpz_init_clear _PROTO ((struct speed_params *s));
double speed_mpz_init_realloc_clear _PROTO ((struct speed_params *s));
double speed_mpz_jacobi _PROTO ((struct speed_params *s));
double speed_mpz_lucnum_ui _PROTO ((struct speed_params *s));
double speed_mpz_lucnum2_ui _PROTO ((struct speed_params *s));
double speed_mpz_mod _PROTO ((struct speed_params *s));
double speed_mpz_powm _PROTO ((struct speed_params *s));
double speed_mpz_powm_mod _PROTO ((struct speed_params *s));
double speed_mpz_powm_redc _PROTO ((struct speed_params *s));
double speed_mpz_powm_ui _PROTO ((struct speed_params *s));

double speed_noop _PROTO ((struct speed_params *s));
double speed_noop_wxs _PROTO ((struct speed_params *s));
double speed_noop_wxys _PROTO ((struct speed_params *s));

double speed_operator_div _PROTO ((struct speed_params *s));
double speed_operator_mod _PROTO ((struct speed_params *s));

double speed_udiv_qrnnd _PROTO ((struct speed_params *s));
double speed_udiv_qrnnd_preinv _PROTO ((struct speed_params *s));
double speed_udiv_qrnnd_preinv2norm _PROTO ((struct speed_params *s));
double speed_udiv_qrnnd_c _PROTO ((struct speed_params *s));
double speed_umul_ppmm _PROTO ((struct speed_params *s));

/* Prototypes for other routines */
260
/* low 32-bits in p[0], high 32-bits in p[1] */
261
void speed_cyclecounter _PROTO ((unsigned p[2]));
263
/* In i386 gcc -fPIC, ebx is a fixed register and can't be declared a dummy
264
output or a clobber for the cpuid, hence an explicit save and restore. A
265
clobber as such doesn't provoke an error unfortunately (gcc 3.0), so use
266
the dummy output style in non-PIC, so there's an error if somehow -fPIC
267
is used without a -DPIC to tell us about it. */
268
#if defined(__GNUC__) && ! defined (NO_ASM) \
269
&& (defined (__i386__) || defined (__i486__))
271
#define speed_cyclecounter(p) \
273
int __speed_cyclecounter__save_ebx; \
274
int __speed_cyclecounter__dummy; \
275
__asm__ __volatile__ ("movl %%ebx, %1\n" \
280
"=&rm" (__speed_cyclecounter__save_ebx), \
281
"=c" (__speed_cyclecounter__dummy), \
285
#define speed_cyclecounter(p) \
287
int __speed_cyclecounter__dummy1; \
288
int __speed_cyclecounter__dummy2; \
289
__asm__ __volatile__ ("cpuid\n" \
292
"=b" (__speed_cyclecounter__dummy1), \
293
"=c" (__speed_cyclecounter__dummy2), \
299
/* end and start are two-word counter values in the layout filled in by
   speed_cyclecounter. */
double speed_cyclecounter_diff _PROTO ((const unsigned end[2],
                                        const unsigned start[2]));
int gettimeofday_microseconds_p _PROTO ((void));
int getrusage_microseconds_p _PROTO ((void));
int cycles_works_p _PROTO ((void));

int double_cmp_ptr _PROTO ((const double *p, const double *q));
void pentium_wbinvd _PROTO ((void));
/* Comparison function type with the signature qsort expects. */
typedef int (*qsort_function_t) _PROTO ((const void *, const void *));

void noop _PROTO ((void));
void noop_1 _PROTO ((mp_limb_t n));
void noop_wxs _PROTO ((mp_ptr wp, mp_srcptr xp, mp_size_t size));
void noop_wxys _PROTO ((mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
314
void mpn_cache_fill _PROTO ((mp_srcptr ptr, mp_size_t size));
315
void mpn_cache_fill_dummy _PROTO ((mp_limb_t n));
316
mp_ptr speed_tmp_alloc_adjust _PROTO ((void *ptr, mp_size_t align));
317
void speed_cache_fill _PROTO ((struct speed_params *s));
318
void speed_operand_src _PROTO ((struct speed_params *s,
319
mp_ptr ptr, mp_size_t size));
320
void speed_operand_dst _PROTO ((struct speed_params *s,
321
mp_ptr ptr, mp_size_t size));
323
extern int speed_option_addrs;
324
extern int speed_option_verbose;
325
void speed_option_set _PROTO((const char *s));
327
mp_limb_t mpn_divrem_1_div _PROTO ((mp_ptr qp, mp_size_t xsize,
328
mp_srcptr ap, mp_size_t size,
330
mp_limb_t mpn_divrem_1_inv _PROTO ((mp_ptr qp, mp_size_t xsize,
331
mp_srcptr ap, mp_size_t size,
333
mp_limb_t mpn_divrem_2_div _PROTO ((mp_ptr qp, mp_size_t qxn,
334
mp_ptr np, mp_size_t nsize,
336
mp_limb_t mpn_divrem_2_inv _PROTO ((mp_ptr qp, mp_size_t qxn,
337
mp_ptr np, mp_size_t nsize,
339
mp_limb_t mpn_mod_1_div _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d)) __GMP_ATTRIBUTE_PURE;
340
mp_limb_t mpn_mod_1_inv _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d)) __GMP_ATTRIBUTE_PURE;
342
mp_size_t mpn_gcd_binary
343
_PROTO ((mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize));
344
mp_size_t mpn_gcdext_one_double
345
_PROTO ((mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
346
mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize));
347
mp_size_t mpn_gcdext_one_single
348
_PROTO ((mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
349
mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize));
350
mp_size_t mpn_gcdext_single
351
_PROTO ((mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
352
mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize));
353
mp_size_t mpn_gcdext_double
354
_PROTO ((mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
355
mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize));
357
mp_limb_t mpn_sb_divrem_mn_div _PROTO ((mp_ptr qp,
358
mp_ptr np, mp_size_t nsize,
359
mp_srcptr dp, mp_size_t dsize));
360
mp_limb_t mpn_sb_divrem_mn_inv _PROTO ((mp_ptr qp,
361
mp_ptr np, mp_size_t nsize,
362
mp_srcptr dp, mp_size_t dsize));
363
void mpn_toom3_mul_n_open _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t,
365
void mpn_toom3_sqr_n_open _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_ptr));
366
void mpn_toom3_mul_n_mpn _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t,
368
void mpn_toom3_sqr_n_mpn _PROTO((mp_ptr, mp_srcptr, mp_size_t, mp_ptr));
370
void mpz_powm_mod _PROTO ((mpz_ptr res, mpz_srcptr base, mpz_srcptr e,
372
void mpz_powm_redc _PROTO ((mpz_ptr res, mpz_srcptr base, mpz_srcptr e,
374
void redc _PROTO ((mp_ptr cp, mp_srcptr mp, mp_size_t n, mp_limb_t Nprim,
377
int speed_routine_count_zeros_setup _PROTO ((struct speed_params *s,
378
mp_ptr xp, int leading,
381
/* The measuring routines use these big macros to save duplication for
   similar forms.  They also get used for some automatically generated
   measuring of new implementations of functions.

   Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
   function pointer is considered undesirable since it's not the way a
   normal application will be calling, and some processors might do
   different things with an indirect call, like not branch predicting, or
   doing a full pipe flush.  At least some of the "functions" measured are
   in fact macros too.

   The net effect is to bloat the object code, possibly in a big way, but
   only what's being measured is being run, so that doesn't matter.

   Note that if a called function is __GMP_ATTRIBUTE_PURE or
   ATTRIBUTE_CONST, as for example mpn_mod_1, then the return value should
   be used in some way, to stop gcc 3 and up from discarding the calls.  See
   SPEED_ROUTINE_MPN_MOD_CALL for instance. */

/* Return -1.0 from the enclosing function unless cond holds.  The
   SPEED_ROUTINE macros use this to reject parameters (typically s->size)
   the measured function can't handle.  Note the expansion is a bare "if"
   that already ends in a semicolon, so it must not be placed directly
   before an "else". */
#define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0;
403
/* For mpn_copy or similar. */
404
#define SPEED_ROUTINE_MPN_COPY(function) \
411
SPEED_RESTRICT_COND (s->size >= 0); \
414
wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
416
speed_operand_src (s, s->xp, s->size); \
417
speed_operand_dst (s, wp, s->size); \
418
speed_cache_fill (s); \
420
speed_starttime (); \
423
function (wp, s->xp, s->size); \
425
t = speed_endtime (); \
431
#define SPEED_ROUTINE_MPN_COPYC(function) \
438
SPEED_RESTRICT_COND (s->size >= 0); \
441
wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
443
speed_operand_src (s, s->xp, s->size); \
444
speed_operand_dst (s, wp, s->size); \
445
speed_cache_fill (s); \
447
speed_starttime (); \
450
function (wp, s->xp, s->size, 0); \
452
t = speed_endtime (); \
458
/* s->size is still in limbs, and it's limbs which are copied, but
459
"function" takes a size in bytes not limbs. */
460
#define SPEED_ROUTINE_MPN_COPY_BYTES(function) \
467
SPEED_RESTRICT_COND (s->size >= 0); \
470
wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
472
speed_operand_src (s, s->xp, s->size); \
473
speed_operand_dst (s, wp, s->size); \
474
speed_cache_fill (s); \
476
speed_starttime (); \
479
function (wp, s->xp, s->size * BYTES_PER_MP_LIMB); \
481
t = speed_endtime (); \
488
/* For mpn_add_n, mpn_sub_n, or similar. */
489
#define SPEED_ROUTINE_MPN_BINARY_N_CALL(call) \
496
SPEED_RESTRICT_COND (s->size >= 1); \
499
wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
501
speed_operand_src (s, s->xp, s->size); \
502
speed_operand_src (s, s->yp, s->size); \
503
speed_operand_dst (s, wp, s->size); \
504
speed_cache_fill (s); \
506
speed_starttime (); \
511
t = speed_endtime (); \
517
/* Instances of SPEED_ROUTINE_MPN_BINARY_N_CALL.  wp is the destination
   allocated by that macro; the variants differ only in the arguments
   handed to function:
     BINARY_N          function (wp, xp, yp, size)
     BINARY_NC         the same plus a final 0 argument
     BINARY_N_SELF     xp used for both sources
     BINARY_N_INPLACE  wp used as destination and first source  */
#define SPEED_ROUTINE_MPN_BINARY_N(function) \
   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, s->xp, s->yp, s->size))

#define SPEED_ROUTINE_MPN_BINARY_NC(function) \
   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, s->xp, s->yp, s->size, 0))

#define SPEED_ROUTINE_MPN_BINARY_N_SELF(function) \
   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, s->xp, s->xp, s->size))

/* FIXME: wp is uninitialized here, should start it off from yp or something */
#define SPEED_ROUTINE_MPN_BINARY_N_INPLACE(function) \
   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, wp, s->xp, s->size))

/* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. */
532
#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call) \
539
SPEED_RESTRICT_COND (s->size >= 1); \
542
wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
544
speed_operand_src (s, s->xp, s->size); \
545
speed_operand_dst (s, wp, s->size); \
546
speed_cache_fill (s); \
548
speed_starttime (); \
553
t = speed_endtime (); \
559
/* Instances of SPEED_ROUTINE_MPN_UNARY_1_CALL.  wp is the destination
   allocated by that macro and s->r is the user supplied limb parameter.
   The "C" forms pass an extra final 0 argument.  The DIVREM forms follow
   the (qp, xsize, ap, size, d) argument order, with the "F" forms putting
   s->size in the xsize position and 0 in the size position. */
#define SPEED_ROUTINE_MPN_UNARY_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_UNARY_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))

/* FIXME: wp is uninitialized here, should start it off from xp */
#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, wp, s->size, s->r))

#define SPEED_ROUTINE_MPN_DIVEXACT_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))

#define SPEED_ROUTINE_MPN_DIVREM_1F(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1CF(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))

/* For mpn_mul_2 or similar, taking two extra limb arguments (l and h in
   the call below) rather than the single s->r. */
586
#define SPEED_ROUTINE_MPN_UNARY_2_CALL(call) \
594
SPEED_RESTRICT_COND (s->size >= 1); \
597
wp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
601
speed_operand_src (s, s->xp, s->size); \
602
speed_operand_dst (s, wp, s->size+1); \
603
speed_cache_fill (s); \
605
speed_starttime (); \
610
t = speed_endtime (); \
616
/* Instance of SPEED_ROUTINE_MPN_UNARY_2_CALL; wp, l and h are expected to
   be supplied by that macro. */
#define SPEED_ROUTINE_MPN_UNARY_2(function) \
  SPEED_ROUTINE_MPN_UNARY_2_CALL ((*function) (wp, s->xp, s->size, l, h))

/* For mpn_mul_basecase, xsize=r, ysize=s->size. */
621
#define SPEED_ROUTINE_MPN_MUL_BASECASE(function) \
629
size1 = (s->r == 0 ? s->size : s->r); \
631
SPEED_RESTRICT_COND (s->size >= 1); \
632
SPEED_RESTRICT_COND (size1 >= s->size); \
635
wp = SPEED_TMP_ALLOC_LIMBS (size1 + s->size, s->align_wp); \
637
speed_operand_src (s, s->xp, size1); \
638
speed_operand_src (s, s->yp, s->size); \
639
speed_operand_dst (s, wp, size1 + s->size); \
640
speed_cache_fill (s); \
642
speed_starttime (); \
645
function (wp, s->xp, size1, s->yp, s->size); \
647
t = speed_endtime (); \
654
#define SPEED_ROUTINE_MPN_MUL_N_CALL(call) \
661
SPEED_RESTRICT_COND (s->size >= 1); \
664
wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
666
speed_operand_src (s, s->xp, s->size); \
667
speed_operand_src (s, s->yp, s->size); \
668
speed_operand_dst (s, wp, 2*s->size); \
669
speed_cache_fill (s); \
671
speed_starttime (); \
676
t = speed_endtime (); \
682
/* Measure function (wp, xp, yp, size) where wp is the 2*size limb
   destination allocated by SPEED_ROUTINE_MPN_MUL_N_CALL.  No semicolon is
   included in the expansion; the user of the macro supplies it, as with
   the other SPEED_ROUTINE wrappers. */
#define SPEED_ROUTINE_MPN_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size))

#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize) \
693
SPEED_RESTRICT_COND (s->size >= minsize); \
696
wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
697
tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \
699
speed_operand_src (s, s->xp, s->size); \
700
speed_operand_src (s, s->yp, s->size); \
701
speed_operand_dst (s, wp, 2*s->size); \
702
speed_operand_dst (s, tspace, tsize); \
703
speed_cache_fill (s); \
705
speed_starttime (); \
710
t = speed_endtime (); \
716
/* Measure a Karatsuba style function (wp, xp, yp, size, tspace), with
   MPN_KARA_MUL_N_TSIZE(size) limbs of temporary space and sizes below
   MPN_KARA_MUL_N_MINSIZE rejected.  The second source is s->yp, matching
   the operands SPEED_ROUTINE_MPN_MUL_N_TSPACE registers and the toom3
   form below. */
#define SPEED_ROUTINE_MPN_KARA_MUL_N(function)          \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE                        \
    (function (wp, s->xp, s->yp, s->size, tspace),      \
     MPN_KARA_MUL_N_TSIZE (s->size),                    \
     MPN_KARA_MUL_N_MINSIZE)

/* Measure a toom3 style function (wp, xp, yp, size, tspace), with
   MPN_TOOM3_MUL_N_TSIZE(size) limbs of temporary space and sizes below
   MPN_TOOM3_MUL_N_MINSIZE rejected. */
#define SPEED_ROUTINE_MPN_TOOM3_MUL_N(function)         \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE                        \
    (function (wp, s->xp, s->yp, s->size, tspace),      \
     MPN_TOOM3_MUL_N_TSIZE (s->size),                   \
     MPN_TOOM3_MUL_N_MINSIZE)

#define SPEED_ROUTINE_MPN_SQR_CALL(call) \
736
SPEED_RESTRICT_COND (s->size >= 1); \
739
wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
741
speed_operand_src (s, s->xp, s->size); \
742
speed_operand_dst (s, wp, 2*s->size); \
743
speed_cache_fill (s); \
745
speed_starttime (); \
750
t = speed_endtime (); \
756
/* Measure function (wp, xp, size) where wp is the 2*size limb destination
   allocated by SPEED_ROUTINE_MPN_SQR_CALL.  SQR_DIAGONAL is simply another
   name for the same thing. */
#define SPEED_ROUTINE_MPN_SQR(function) \
  SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))

#define SPEED_ROUTINE_MPN_SQR_DIAGONAL(function) \
  SPEED_ROUTINE_MPN_SQR (function)

#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize) \
770
SPEED_RESTRICT_COND (s->size >= minsize); \
773
wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
774
tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \
776
speed_operand_src (s, s->xp, s->size); \
777
speed_operand_dst (s, wp, 2*s->size); \
778
speed_operand_dst (s, tspace, tsize); \
779
speed_cache_fill (s); \
781
speed_starttime (); \
786
t = speed_endtime (); \
792
/* Measure squaring functions of the form function (wp, xp, size, tspace).
   Each passes SPEED_ROUTINE_MPN_SQR_TSPACE the matching temporary space
   size and minimum operand size for its algorithm. */
#define SPEED_ROUTINE_MPN_KARA_SQR_N(function)                          \
  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),  \
                                MPN_KARA_SQR_N_TSIZE (s->size),         \
                                MPN_KARA_SQR_N_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM3_SQR_N(function)                         \
  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),  \
                                MPN_TOOM3_SQR_N_TSIZE (s->size),        \
                                MPN_TOOM3_SQR_N_MINSIZE)

#define SPEED_ROUTINE_MPN_MOD_CALL(call) \
806
mp_limb_t dummy = 0; \
808
SPEED_RESTRICT_COND (s->size >= 0); \
810
speed_operand_src (s, s->xp, s->size); \
811
speed_cache_fill (s); \
813
speed_starttime (); \
820
return speed_endtime (); \
823
/* Instances of SPEED_ROUTINE_MPN_MOD_CALL for functions reading xp,size
   and returning a limb.  s->r is the divisor where one is taken, and the
   "C" forms pass an extra CNST_LIMB(0) argument.  None of the expansions
   include a trailing semicolon; the user of the macro supplies it. */
#define SPEED_ROUTINE_MPN_MOD_1(function)  \
   SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_MOD_1C(function) \
   SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))

#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function) \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function) \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)))

#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function) \
   SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size))

#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function) \
842
mp_limb_t dummy = 0; \
844
SPEED_RESTRICT_COND (s->size >= 0); \
845
SPEED_RESTRICT_COND (s->r & MP_LIMB_T_HIGHBIT); \
847
invert_limb (inv, s->r); \
848
speed_operand_src (s, s->xp, s->size); \
849
speed_cache_fill (s); \
851
speed_starttime (); \
854
dummy += (*function) (s->xp, s->size, s->r, inv); \
858
return speed_endtime (); \
862
/* A division of 2*s->size by s->size limbs */
864
#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call) \
871
SPEED_RESTRICT_COND (s->size >= 1); \
874
a = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_xp); \
875
d = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_yp); \
876
q = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
877
r = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
879
MPN_COPY (a, s->xp, s->size); \
880
MPN_COPY (a+s->size, s->xp, s->size); \
882
MPN_COPY (d, s->yp, s->size); \
884
/* normalize the data */ \
885
d[s->size-1] |= MP_LIMB_T_HIGHBIT; \
886
a[2*s->size-1] = d[s->size-1] - 1; \
888
speed_operand_src (s, a, 2*s->size); \
889
speed_operand_src (s, d, s->size); \
890
speed_operand_dst (s, q, s->size+1); \
891
speed_operand_dst (s, r, s->size); \
892
speed_cache_fill (s); \
894
speed_starttime (); \
899
t = speed_endtime (); \
905
/* Instances of SPEED_ROUTINE_MPN_DC_DIVREM_CALL, which sets up q, r, a
   (2*size limbs) and d (size limbs, high bit set).  The three forms differ
   only in the argument list the measured function expects. */
#define SPEED_ROUTINE_MPN_DC_DIVREM_N(function) \
  SPEED_ROUTINE_MPN_DC_DIVREM_CALL((*function) (q, a, d, s->size))

#define SPEED_ROUTINE_MPN_DC_DIVREM_SB(function)        \
  SPEED_ROUTINE_MPN_DC_DIVREM_CALL                      \
    ((*function) (q, a, 2*s->size, d, s->size))

#define SPEED_ROUTINE_MPN_DC_TDIV_QR(function)          \
  SPEED_ROUTINE_MPN_DC_DIVREM_CALL                      \
    ((*function) (q, r, 0, a, 2*s->size, d, s->size))

/* A division of s->size by 3 limbs */
919
#define SPEED_ROUTINE_MPN_SB_DIVREM_M3(function) \
927
SPEED_RESTRICT_COND (s->size >= 3); \
930
a = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp); \
932
d = SPEED_TMP_ALLOC_LIMBS (3, s->align_yp); \
933
MPN_COPY (d, s->yp, 3); \
934
d[2] |= MP_LIMB_T_HIGHBIT; \
936
qsize = s->size - 3; \
937
q = SPEED_TMP_ALLOC_LIMBS (qsize, s->align_wp); \
939
speed_operand_dst (s, a, s->size); \
940
speed_operand_src (s, d, 3); \
941
speed_operand_dst (s, q, qsize); \
942
speed_cache_fill (s); \
944
speed_starttime (); \
948
MPN_COPY (a, s->xp, s->size); \
949
function (q, a, s->size, d, 3); \
952
t = speed_endtime (); \
959
/* A remainder 2*s->size by s->size limbs */
961
#define SPEED_ROUTINE_MPZ_MOD(function) \
966
SPEED_RESTRICT_COND (s->size >= 1); \
968
mpz_init_set_n (d, s->yp, s->size); \
970
/* high part less than d, low part a duplicate copied in */ \
971
mpz_init_set_n (a, s->xp, s->size); \
973
mpz_mul_2exp (a, a, BITS_PER_MP_LIMB * s->size); \
974
MPN_COPY (PTR(a), s->xp, s->size); \
978
speed_operand_src (s, PTR(a), SIZ(a)); \
979
speed_operand_src (s, PTR(d), SIZ(d)); \
980
speed_cache_fill (s); \
982
speed_starttime (); \
985
function (r, a, d); \
987
return speed_endtime (); \
991
#define SPEED_ROUTINE_REDC(function) \
994
mp_ptr cp, mp, tp, ap; \
999
SPEED_RESTRICT_COND (s->size >= 1); \
1001
TMP_MARK (marker); \
1002
ap = SPEED_TMP_ALLOC_LIMBS (2*s->size+1, s->align_xp); \
1003
mp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_yp); \
1004
cp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
1005
tp = SPEED_TMP_ALLOC_LIMBS (2*s->size+1, s->align_wp2); \
1007
MPN_COPY (ap, s->xp, s->size); \
1008
MPN_COPY (ap+s->size, s->xp, s->size); \
1010
/* modulus must be odd */ \
1011
MPN_COPY (mp, s->yp, s->size); \
1013
modlimb_invert (Nprim, mp[0]); \
1015
speed_operand_src (s, ap, 2*s->size+1); \
1016
speed_operand_dst (s, tp, 2*s->size+1); \
1017
speed_operand_src (s, mp, s->size); \
1018
speed_operand_dst (s, cp, s->size); \
1019
speed_cache_fill (s); \
1021
speed_starttime (); \
1024
MPN_COPY (tp, ap, 2*s->size); \
1025
function (cp, mp, s->size, Nprim, tp); \
1026
} while (--i != 0); \
1027
t = speed_endtime (); \
1029
TMP_FREE (marker); \
1034
#define SPEED_ROUTINE_MPN_POPCOUNT(function) \
1037
unsigned long dummy = 0; \
1039
SPEED_RESTRICT_COND (s->size >= 1); \
1041
speed_operand_src (s, s->xp, s->size); \
1042
speed_cache_fill (s); \
1044
speed_starttime (); \
1047
dummy += function (s->xp, s->size); \
1050
noop_1 ((mp_limb_t) dummy); \
1051
return speed_endtime (); \
1054
#define SPEED_ROUTINE_MPN_HAMDIST(function) \
1057
unsigned long dummy = 0; \
1059
SPEED_RESTRICT_COND (s->size >= 1); \
1061
speed_operand_src (s, s->xp, s->size); \
1062
speed_operand_src (s, s->yp, s->size); \
1063
speed_cache_fill (s); \
1065
speed_starttime (); \
1068
dummy += function (s->xp, s->yp, s->size); \
1071
noop_1 ((mp_limb_t) dummy); \
1072
return speed_endtime (); \
1076
#define SPEED_ROUTINE_MPZ_UI(function) \
1082
SPEED_RESTRICT_COND (s->size >= 0); \
1086
speed_starttime (); \
1089
function (z, s->size); \
1091
t = speed_endtime (); \
1097
#define SPEED_ROUTINE_MPZ_FAC_UI(function) SPEED_ROUTINE_MPZ_UI(function)
1098
#define SPEED_ROUTINE_MPZ_FIB_UI(function) SPEED_ROUTINE_MPZ_UI(function)
1099
/* Alias: expands to SPEED_ROUTINE_MPZ_UI with the same function argument.  */
#define SPEED_ROUTINE_MPZ_LUCNUM_UI(fn) \
  SPEED_ROUTINE_MPZ_UI (fn)
1102
/* Time "function (z, z2, n)" with n = s->size, so the size parameter is
   passed as the third argument itself rather than being used as a limb
   count.  s->size must be non-negative.
   NOTE(review): z and z2 are presumably two mpz_t results -- confirm.  */
#define SPEED_ROUTINE_MPZ_2_UI(function) \
1108
SPEED_RESTRICT_COND (s->size >= 0); \
1113
speed_starttime (); \
1116
function (z, z2, s->size); \
1118
t = speed_endtime (); \
1125
/* Alias: expands to SPEED_ROUTINE_MPZ_2_UI with the same function
   argument.  */
#define SPEED_ROUTINE_MPZ_FIB2_UI(fn) \
  SPEED_ROUTINE_MPZ_2_UI (fn)
1126
/* Alias: expands to SPEED_ROUTINE_MPZ_2_UI with the same function
   argument.  */
#define SPEED_ROUTINE_MPZ_LUCNUM2_UI(fn) \
  SPEED_ROUTINE_MPZ_2_UI (fn)
1129
/* Time "function (fp, f1p, n)" with n = s->size.  Both destinations are
   temporary space of MPN_FIB2_SIZE (s->size) limbs, fp aligned according
   to s->align_xp and f1p according to s->align_yp.  s->size must be
   non-negative.  */
#define SPEED_ROUTINE_MPN_FIB2_UI(function) \
1135
TMP_DECL (marker); \
1137
SPEED_RESTRICT_COND (s->size >= 0); \
1139
TMP_MARK (marker); \
1140
alloc = MPN_FIB2_SIZE (s->size); \
1141
fp = SPEED_TMP_ALLOC_LIMBS (alloc, s->align_xp); \
1142
f1p = SPEED_TMP_ALLOC_LIMBS (alloc, s->align_yp); \
1144
speed_starttime (); \
1147
function (fp, f1p, s->size); \
1149
t = speed_endtime (); \
1151
TMP_FREE (marker); \
1157
/* Calculate b^e mod m for random b and m of s->size limbs and random e of 6
1158
limbs. m is forced to odd so that redc can be used. e is limited in
1159
size so the calculation doesn't take too long. */
1160
/* Operands for the timed "function (r, b, e, m)": b is the s->size limbs at
   s->xp, m is the s->size limbs at s->yp with bit 0 set, and e is 6 limbs
   from s->xp_block.  s->size must be at least 1.  */
#define SPEED_ROUTINE_MPZ_POWM(function) \
1166
SPEED_RESTRICT_COND (s->size >= 1); \
1169
mpz_init_set_n (b, s->xp, s->size); \
1170
mpz_init_set_n (m, s->yp, s->size); \
1171
mpz_setbit (m, 0); /* force m to odd */ \
1172
mpz_init_set_n (e, s->xp_block, 6); \
1174
speed_starttime (); \
1177
function (r, b, e, m); \
1179
t = speed_endtime (); \
1188
/* (m-2)^e mod m, with e = ULONG_MAX/3, ie. 0x5555...5555 */
1189
/* Time "function (r, b, e, m)" with an unsigned long exponent.  e is
   (~0UL)/3, which is the bit pattern 0x5555...5555.  m is set from the
   s->size limbs at s->xp and b is m-2.  s->size must be at least 1.  */
#define SPEED_ROUTINE_MPZ_POWM_UI(function) \
1192
unsigned long e = (~ (unsigned long) 0) / 3; \
1196
SPEED_RESTRICT_COND (s->size >= 1); \
1200
/* force m to odd */ \
1202
mpz_set_n (m, s->xp, s->size); \
1205
mpz_init_set (b, m); \
1206
mpz_sub_ui (b, b, 2); \
1207
/* printf ("%X\n", mpz_get_ui(m)); */ \
1209
speed_starttime (); \
1211
function (r, b, e, m); \
1213
t = speed_endtime (); \
1222
/* Time "call", which is expected to use two destinations wp,wp2 and two
   sources xp,yp of s->size limbs each.  s->size must be non-negative.

   s->r selects operand overlaps: 1 makes xp the same as wp, 2 makes yp the
   same as wp2, 3 does both, and 4 crosses them (xp is wp2, yp is wp).
   Whenever xp or yp is not s->xp or s->yp respectively, the corresponding
   source data is copied into it before timing.  */
#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call) \
1224
mp_ptr wp, wp2, xp, yp; \
1227
TMP_DECL (marker); \
1229
SPEED_RESTRICT_COND (s->size >= 0); \
1231
TMP_MARK (marker); \
1232
wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
1233
wp2 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
1238
else if (s->r == 1) { xp = wp; } \
1239
else if (s->r == 2) { yp = wp2; } \
1240
else if (s->r == 3) { xp = wp; yp = wp2; } \
1241
else if (s->r == 4) { xp = wp2; yp = wp; } \
1243
TMP_FREE (marker); \
1246
if (xp != s->xp) MPN_COPY (xp, s->xp, s->size); \
1247
if (yp != s->yp) MPN_COPY (yp, s->yp, s->size); \
1249
speed_operand_src (s, xp, s->size); \
1250
speed_operand_src (s, yp, s->size); \
1251
speed_operand_dst (s, wp, s->size); \
1252
speed_operand_dst (s, wp2, s->size); \
1253
speed_cache_fill (s); \
1255
speed_starttime (); \
1260
t = speed_endtime (); \
1262
TMP_FREE (marker); \
1266
/* Time "function (wp, wp2, xp, yp, size)" through
   SPEED_ROUTINE_MPN_ADDSUB_CALL.  Note the terminating semicolon is part
   of the "call" argument passed down.  */
#define SPEED_ROUTINE_MPN_ADDSUB_N(function) \
1267
SPEED_ROUTINE_MPN_ADDSUB_CALL \
1268
(function (wp, wp2, xp, yp, s->size));
1270
/* Time "function (wp, wp2, xp, yp, size, 0)" through
   SPEED_ROUTINE_MPN_ADDSUB_CALL.  The terminating semicolon is part of the
   "call" argument passed down.
   NOTE(review): the final 0 is presumably a carry-in -- confirm.  */
#define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \
1271
SPEED_ROUTINE_MPN_ADDSUB_CALL \
1272
(function (wp, wp2, xp, yp, s->size, 0));
1275
/* Doing an Nx1 gcd with the given r. */
1276
/* Time "function (xp, size, r)" where xp is a temporary copy of the s->size
   limbs at s->xp and r is s->r.  s->size must be at least 1 and s->r must
   be non-zero.  If the copied operand is entirely zero its low limb is set
   to 1 (via refmpn_zero_p).  The return values are summed into "dummy".
   NOTE(review): speed_operand_src is given s->xp, not the copy xp which
   the function actually reads -- confirm this is intended.  */
#define SPEED_ROUTINE_MPN_GCD_1N(function) \
1281
mp_limb_t dummy = 0; \
1282
TMP_DECL (marker); \
1284
SPEED_RESTRICT_COND (s->size >= 1); \
1285
SPEED_RESTRICT_COND (s->r != 0); \
1287
TMP_MARK (marker); \
1288
xp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp); \
1289
MPN_COPY (xp, s->xp, s->size); \
1290
xp[0] |= refmpn_zero_p (xp, s->size); \
1292
speed_operand_src (s, s->xp, s->size); \
1293
speed_cache_fill (s); \
1295
speed_starttime (); \
1298
dummy += function (xp, s->size, s->r); \
1300
t = speed_endtime (); \
1303
TMP_FREE (marker); \
1308
/* SPEED_BLOCK_SIZE many one-limb GCDs, x of s->size bits and y of s->r bits
   (or s->size bits if s->r is 0). */
1310
/* Time "call" over SPEED_BLOCK_SIZE pairs of single limbs px[],py[], copied
   from s->xp_block and s->yp_block.  Here s->size is a bit count, and must
   be between 1 and mp_bits_per_limb.  Each px[i] is masked to s->size bits
   and each py[i] to s->r bits (or s->size bits when s->r is 0); a value
   which is then zero is replaced by 1.  s->time_divisor is set to
   SPEED_BLOCK_SIZE.  */
#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \
1314
mp_limb_t x_mask, y_mask; \
1315
mp_limb_t dummy = 0; \
1317
TMP_DECL (marker); \
1319
SPEED_RESTRICT_COND (s->size >= 1); \
1320
SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \
1322
TMP_MARK (marker); \
1323
px = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_xp); \
1324
py = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_yp); \
1325
MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \
1326
MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \
1328
x_mask = MP_LIMB_T_LOWBITMASK (s->size); \
1329
y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size); \
1330
for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
1332
px[i] &= x_mask; px[i] += (px[i] == 0); \
1333
py[i] &= y_mask; py[i] += (py[i] == 0); \
1337
speed_operand_src (s, px, SPEED_BLOCK_SIZE); \
1338
speed_operand_src (s, py, SPEED_BLOCK_SIZE); \
1339
speed_cache_fill (s); \
1341
speed_starttime (); \
1345
j = SPEED_BLOCK_SIZE; \
1353
t = speed_endtime (); \
1355
TMP_FREE (marker); \
1358
s->time_divisor = SPEED_BLOCK_SIZE; \
1362
/* Time "function (&px[j-1], 1, py[j-1])", ie. a one-limb operand against a
   single limb, through SPEED_ROUTINE_MPN_GCD_1_CALL with an empty "setup"
   argument.  */
#define SPEED_ROUTINE_MPN_GCD_1(function) \
1363
SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1]))
1365
/* Time "function (px[j-1], py[j-1], 0)" on single limb pairs, through
   SPEED_ROUTINE_MPN_GCD_1_CALL.  The setup argument adjusts the data to
   meet the requirements x<y, y odd, y!=1; among its steps a y of 1 is
   replaced by 3.  */
#define SPEED_ROUTINE_MPN_JACBASE(function) \
1366
SPEED_ROUTINE_MPN_GCD_1_CALL \
1368
/* require x<y, y odd, y!=1 */ \
1372
if (py[i]==1) py[i]=3; \
1374
function (px[j-1], py[j-1], 0))
1377
/* Run some GCDs of s->size limbs each. The number of different data values
1378
is decreased as s->size**2, since GCD is a quadratic algorithm.
1379
SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
1380
though, because the plain gcd is about twice as fast as gcdext. */
1382
/* Time "call" on a number of operand pairs ("pieces") of s->size limbs
   each.  s->size must be at least 1.

   The piece count is SPEED_BLOCK_SIZE*datafactor/size/size, limited to at
   most SPEED_BLOCK_SIZE/size and at least 1.  With a single piece the data
   comes from s->xp and s->yp, otherwise from s->xp_block and s->yp_block.
   The high limb of each x and y is made non-zero, and the high limb of x
   is made at least as big as that of y.

   Each piece is copied into xtmp,ytmp within the timed region, so "call"
   should operate on xtmp,ytmp.  wp and wp2 are scratch destinations of
   s->size+1 limbs.  s->time_divisor is set to the number of pieces.  */
#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call) \
1385
mp_size_t j, pieces, psize; \
1386
mp_ptr wp, wp2, xtmp, ytmp, px, py; \
1388
TMP_DECL (marker); \
1390
SPEED_RESTRICT_COND (s->size >= 1); \
1392
TMP_MARK (marker); \
1393
xtmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_xp); \
1394
ytmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_yp); \
1395
wp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
1396
wp2 = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp2); \
1398
pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size; \
1399
pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size); \
1400
pieces = MAX (pieces, 1); \
1402
psize = pieces * s->size; \
1403
px = TMP_ALLOC_LIMBS (psize); \
1404
py = TMP_ALLOC_LIMBS (psize); \
1405
MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
1406
MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
1408
/* y must be odd, x must have at least as many bits as y, \
1409
high limbs must be non-zero */ \
1410
for (j = 0; j < pieces; j++) \
1412
mp_ptr x = px+j*s->size; \
1413
mp_ptr y = py+j*s->size; \
1415
if (x[s->size-1] == 0) x[s->size-1] = 1; \
1416
if (y[s->size-1] == 0) y[s->size-1] = 1; \
1417
x[s->size-1] = MAX (x[s->size-1], y[s->size-1]); \
1420
speed_operand_src (s, px, psize); \
1421
speed_operand_src (s, py, psize); \
1422
speed_operand_dst (s, xtmp, s->size); \
1423
speed_operand_dst (s, ytmp, s->size); \
1424
speed_operand_dst (s, wp, s->size); \
1425
speed_cache_fill (s); \
1427
speed_starttime (); \
1434
MPN_COPY (xtmp, px+(j-1)*s->size, s->size); \
1435
MPN_COPY (ytmp, py+(j-1)*s->size, s->size); \
1441
t = speed_endtime (); \
1443
TMP_FREE (marker); \
1445
s->time_divisor = pieces; \
1449
/* Time "function (wp, xtmp, size, ytmp, size)" through
   SPEED_ROUTINE_MPN_GCD_CALL, using a datafactor of 8.  */
#define SPEED_ROUTINE_MPN_GCD(function) \
1450
SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size))
1452
/* Time "function (wp, wp2, &wp2size, xtmp, size, ytmp, size)" through
   SPEED_ROUTINE_MPN_GCD_CALL, using a datafactor of 4.  The call argument
   is a braced block so it can declare its own wp2size.  */
#define SPEED_ROUTINE_MPN_GCDEXT(function) \
1453
SPEED_ROUTINE_MPN_GCD_CALL \
1454
(4, { mp_size_t wp2size; \
1455
function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); })
1458
/* Time "function (wp, wp2, &wp2size, xtmp, size, ytmp, size)" where
   xtmp,ytmp start as copies of s->xp,s->yp and, within the timed region,
   have their top three limbs replaced from a set of 3-limb pieces.

   There are SPEED_BLOCK_SIZE/3 pieces, taken from s->xp_block and
   s->yp_block.  The third limb of each x and y piece is made non-zero, and
   the low limb of ytmp is forced odd before each call.  s->time_divisor is
   set to the number of pieces.

   NOTE(review): the top limbs are addressed as &xtmp[s->size-3] and
   &ytmp[s->size-3], which is below the start of the allocation when
   s->size is 1 or 2, yet only s->size >= 1 is demanded -- confirm whether
   the restriction should be s->size >= 3.  */
#define SPEED_ROUTINE_MPN_GCDEXT_ONE(function) \
1461
mp_size_t j, pieces, psize, wp2size; \
1462
mp_ptr wp, wp2, xtmp, ytmp, px, py; \
1464
TMP_DECL (marker); \
1466
SPEED_RESTRICT_COND (s->size >= 1); \
1468
TMP_MARK (marker); \
1470
xtmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_xp); \
1471
ytmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_yp); \
1472
MPN_COPY (xtmp, s->xp, s->size); \
1473
MPN_COPY (ytmp, s->yp, s->size); \
1475
wp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
1476
wp2 = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp2); \
1478
pieces = SPEED_BLOCK_SIZE / 3; \
1479
psize = 3 * pieces; \
1480
px = TMP_ALLOC_LIMBS (psize); \
1481
py = TMP_ALLOC_LIMBS (psize); \
1482
MPN_COPY (px, s->xp_block, psize); \
1483
MPN_COPY (py, s->yp_block, psize); \
1485
/* x must have at least as many bits as y, \
1486
high limbs must be non-zero */ \
1487
for (j = 0; j < pieces; j++) \
1489
mp_ptr x = px+3*j; \
1490
mp_ptr y = py+3*j; \
1491
x[2] += (x[2] == 0); \
1492
y[2] += (y[2] == 0); \
1494
MP_LIMB_T_SWAP (x[2], y[2]); \
1497
speed_operand_src (s, px, psize); \
1498
speed_operand_src (s, py, psize); \
1499
speed_operand_dst (s, xtmp, s->size); \
1500
speed_operand_dst (s, ytmp, s->size); \
1501
speed_operand_dst (s, wp, s->size); \
1502
speed_cache_fill (s); \
1504
speed_starttime (); \
1510
mp_ptr xth = &xtmp[s->size-3]; \
1511
mp_ptr yth = &ytmp[s->size-3]; \
1515
xth[0] = x[0], xth[1] = x[1], xth[2] = x[2]; \
1516
yth[0] = y[0], yth[1] = y[1], yth[2] = y[2]; \
1518
ytmp[0] |= 1; /* y must be odd, */ \
1520
function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \
1528
t = speed_endtime (); \
1530
TMP_FREE (marker); \
1532
s->time_divisor = pieces; \
1536
/* Time "function (a, b)" on mpz operands whose limb pointers are aimed at
   successive pieces of s->size limbs each.

   The piece count is SPEED_BLOCK_SIZE / MAX (s->size, 1), but at least 1,
   and s->time_divisor is set to it.  With a single piece the data comes
   from s->xp and s->yp, otherwise from s->xp_block and s->yp_block.  The
   high limb of every piece is made non-zero.  The return values are summed
   into "dummy", which is passed to noop_1 after the end time is taken.  */
#define SPEED_ROUTINE_MPZ_JACOBI(function) \
1540
mp_size_t j, pieces, psize; \
1544
TMP_DECL (marker); \
1546
TMP_MARK (marker); \
1547
pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1); \
1548
pieces = MAX (pieces, 1); \
1549
s->time_divisor = pieces; \
1551
psize = pieces * s->size; \
1552
px = TMP_ALLOC_LIMBS (psize); \
1553
py = TMP_ALLOC_LIMBS (psize); \
1554
MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
1555
MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
1557
for (j = 0; j < pieces; j++) \
1559
mp_ptr x = px+j*s->size; \
1560
mp_ptr y = py+j*s->size; \
1565
/* high limbs non-zero */ \
1566
if (x[s->size-1] == 0) x[s->size-1] = 1; \
1567
if (y[s->size-1] == 0) y[s->size-1] = 1; \
1573
speed_operand_src (s, px, psize); \
1574
speed_operand_src (s, py, psize); \
1575
speed_cache_fill (s); \
1577
speed_starttime (); \
1584
PTR(a) = px+(j-1)*s->size; \
1585
PTR(b) = py+(j-1)*s->size; \
1586
dummy += function (a, b); \
1591
t = speed_endtime (); \
1593
noop_1 ((mp_limb_t) dummy); \
1594
TMP_FREE (marker); \
1598
/* Time "function (wp, 0, xp, size, yp)" with a two limb divisor.  xp is a
   temporary copy of the s->size limbs at s->xp, since the source is
   destroyed.  yp is two limbs from s->yp_block with the high bit of yp[1]
   set so the divisor is normalized.  wp is a destination of s->size limbs.
   s->size must be at least 2.  */
#define SPEED_ROUTINE_MPN_DIVREM_2(function) \
1604
TMP_DECL (marker); \
1606
SPEED_RESTRICT_COND (s->size >= 2); \
1608
TMP_MARK (marker); \
1609
xp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp); \
1610
wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
1612
/* source is destroyed */ \
1613
MPN_COPY (xp, s->xp, s->size); \
1615
/* divisor must be normalized */ \
1616
MPN_COPY (yp, s->yp_block, 2); \
1617
yp[1] |= MP_LIMB_T_HIGHBIT; \
1619
speed_operand_src (s, xp, s->size); \
1620
speed_operand_src (s, yp, 2); \
1621
speed_operand_dst (s, wp, s->size); \
1622
speed_cache_fill (s); \
1624
speed_starttime (); \
1627
function (wp, 0, xp, s->size, yp); \
1629
t = speed_endtime (); \
1631
TMP_FREE (marker); \
1636
/* Time "function" over the SPEED_BLOCK_SIZE limbs of s->xp_block, with
   s->time_divisor set to SPEED_BLOCK_SIZE.  xp is set one limb below
   s->xp_block, apparently so an index j starting from SPEED_BLOCK_SIZE
   addresses the block.  Each step adds twice a data limb to n, which makes
   successive values depend on the data while leaving the low bit of n
   unchanged.  */
#define SPEED_ROUTINE_MODLIMB_INVERT(function) \
1643
xp = s->xp_block-1; \
1645
speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \
1646
speed_cache_fill (s); \
1648
speed_starttime (); \
1652
j = SPEED_BLOCK_SIZE; \
1655
/* randomized but successively dependent */ \
1656
n += (xp[j] << 1); \
1663
t = speed_endtime (); \
1665
/* make sure the compiler won't optimize away n */ \
1668
s->time_divisor = SPEED_BLOCK_SIZE; \
1673
/* Time "function (wp, wp2, xp, size)" on the s->size limbs at s->xp, with
   wp and wp2 each a temporary destination of s->size limbs.  s->size must
   be at least 1.  */
#define SPEED_ROUTINE_MPN_SQRTREM(function) \
1678
TMP_DECL (marker); \
1680
SPEED_RESTRICT_COND (s->size >= 1); \
1682
TMP_MARK (marker); \
1683
wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
1684
wp2 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
1686
speed_operand_src (s, s->xp, s->size); \
1687
speed_operand_dst (s, wp, s->size); \
1688
speed_operand_dst (s, wp2, s->size); \
1689
speed_cache_fill (s); \
1691
speed_starttime (); \
1694
function (wp, wp2, s->xp, s->size); \
1696
t = speed_endtime (); \
1698
TMP_FREE (marker); \
1703
/* s->size controls the number of limbs in the input, s->r is the base, or
1704
decimal by default. */
1705
/* Time "function (wp, base, xp, size)".  The base is s->r, or 10 if s->r
   is 0, and must be in the range 2 to 255.  s->size must be at least 1.

   xp is temporary space of s->size+1 limbs, refilled from s->xp before
   every call; that copy is inside the timed region.  wp is a byte buffer
   of wsize bytes, wsize being derived from the operand bit count and the
   chars_per_bit_exactly entry for the base, plus 2.  An untimed
   mpn_get_str call is first made under ASSERT_ALWAYS to check the result
   fits in wsize.  */
#define SPEED_ROUTINE_MPN_GET_STR(function) \
1707
unsigned char *wp; \
1713
TMP_DECL (marker); \
1715
SPEED_RESTRICT_COND (s->size >= 1); \
1717
base = s->r == 0 ? 10 : s->r; \
1718
SPEED_RESTRICT_COND (base >= 2 && base <= 255); \
1720
TMP_MARK (marker); \
1721
xp = SPEED_TMP_ALLOC_LIMBS (s->size + 1, s->align_xp); \
1723
wsize = ((mp_size_t) (s->size * BITS_PER_MP_LIMB \
1724
* __mp_bases[base].chars_per_bit_exactly)) + 2; \
1725
wp = TMP_ALLOC (wsize); \
1727
/* use this during development to guard against overflowing wp */ \
1729
MPN_COPY (xp, s->xp, s->size); \
1730
ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wsize); \
1733
speed_operand_src (s, s->xp, s->size); \
1734
speed_operand_dst (s, xp, s->size); \
1735
speed_operand_dst (s, (mp_ptr) wp, wsize/BYTES_PER_MP_LIMB); \
1736
speed_cache_fill (s); \
1738
speed_starttime (); \
1742
MPN_COPY (xp, s->xp, s->size); \
1743
function (wp, base, xp, s->size); \
1746
t = speed_endtime (); \
1748
TMP_FREE (marker); \
1752
/* s->size controls the number of digits in the input, s->r is the base, or
1753
decimal by default. */
1754
/* Time "function (wp, xp, size, base)".  The base is s->r, or 10 if s->r
   is 0, and must be in the range 2 to 255.  s->size must be at least 1.

   xp is a byte buffer of s->size digits, each being a limb of s->xp
   reduced modulo the base.  wp is temporary space of wsize limbs, wsize
   being derived from the digit count and the chars_per_bit_exactly entry
   for the base, plus 2.  An untimed mpn_set_str call is first made under
   ASSERT_ALWAYS to check the result fits in wsize.

   NOTE(review): wp is registered with speed_operand_dst as s->size limbs
   although only wsize limbs are allocated for it -- confirm whether wsize
   is what is wanted there.  */
#define SPEED_ROUTINE_MPN_SET_STR(function) \
1756
unsigned char *xp; \
1762
TMP_DECL (marker); \
1764
SPEED_RESTRICT_COND (s->size >= 1); \
1766
base = s->r == 0 ? 10 : s->r; \
1767
SPEED_RESTRICT_COND (base >= 2 && base <= 255); \
1769
TMP_MARK (marker); \
1771
xp = TMP_ALLOC (s->size); \
1772
for (i = 0; i < s->size; i++) \
1773
xp[i] = s->xp[i] % base; \
1775
wsize = ((mp_size_t) (s->size / __mp_bases[base].chars_per_bit_exactly)) \
1776
/ BITS_PER_MP_LIMB + 2; \
1777
wp = SPEED_TMP_ALLOC_LIMBS (wsize, s->align_wp); \
1779
/* use this during development to check wsize is big enough */ \
1781
ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wsize); \
1784
speed_operand_src (s, (mp_ptr) xp, s->size/BYTES_PER_MP_LIMB); \
1785
speed_operand_dst (s, wp, s->size); \
1786
speed_cache_fill (s); \
1788
speed_starttime (); \
1791
function (wp, xp, s->size, base); \
1793
t = speed_endtime (); \
1795
TMP_FREE (marker); \
1800
/* Run an accel gcd find_a() function over various data values. A set of
1801
values is used in case some run particularly fast or slow. The size
1802
parameter is ignored, the amount of data tested is fixed. */
1804
/* Time "function (cp[j-1])" over SPEED_BLOCK_SIZE two-limb values held in
   a local array cp.  The low limb of each value is a limb of s->xp_block
   with bit 0 set, the high limb is a limb of s->yp_block with a zero
   replaced by 1.  The return values are summed into "dummy" and
   s->time_divisor is set to SPEED_BLOCK_SIZE.  */
#define SPEED_ROUTINE_MPN_GCD_FINDA(function) \
1807
mp_limb_t cp[SPEED_BLOCK_SIZE][2]; \
1809
mp_limb_t dummy = 0; \
1810
TMP_DECL (marker); \
1812
TMP_MARK (marker); \
1814
/* low must be odd, high must be non-zero */ \
1815
for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
1817
cp[i][0] = s->xp_block[i] | 1; \
1818
cp[i][1] = s->yp_block[i] + (s->yp_block[i] == 0); \
1821
speed_operand_src (s, &cp[0][0], 2*SPEED_BLOCK_SIZE); \
1822
speed_cache_fill (s); \
1824
speed_starttime (); \
1828
j = SPEED_BLOCK_SIZE; \
1831
dummy += function (cp[j-1]); \
1836
t = speed_endtime (); \
1838
TMP_FREE (marker); \
1840
s->time_divisor = SPEED_BLOCK_SIZE; \
1845
/* "call" should do "count_foo_zeros(c,n)".
1846
Give leading=1 if foo is leading zeros, leading=0 for trailing.
1847
Give zero=1 if n=0 is allowed in the call, zero=0 if not. */
1849
/* Opening part of a count zeros timing routine, to be followed by the call
   being measured and then SPEED_ROUTINE_COUNT_ZEROS_B.  It allocates a
   block xp of SPEED_BLOCK_SIZE limbs, has speed_routine_count_zeros_setup
   prepare it according to "leading" and "zero", starts the timer and opens
   a loop over the SPEED_BLOCK_SIZE limbs.  */
#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero) \
1856
TMP_DECL (marker); \
1858
TMP_MARK (marker); \
1859
xp = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_xp); \
1861
if (! speed_routine_count_zeros_setup (s, xp, leading, zero)) \
1863
speed_operand_src (s, xp, SPEED_BLOCK_SIZE); \
1864
speed_cache_fill (s); \
1867
speed_starttime (); \
1870
for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
1875
/* Closing part of a count zeros timing routine, matching
   SPEED_ROUTINE_COUNT_ZEROS_A.  It ends the repetition loop, takes the end
   time, sets s->time_divisor to SPEED_BLOCK_SIZE and frees the temporary
   space.  */
#define SPEED_ROUTINE_COUNT_ZEROS_B() \
1877
} while (--j != 0); \
1878
t = speed_endtime (); \
1880
/* don't let c go dead */ \
1883
s->time_divisor = SPEED_BLOCK_SIZE; \
1885
TMP_FREE (marker); \
1889
/* Complete count zeros timing routine for "call", built from
   SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero) and
   SPEED_ROUTINE_COUNT_ZEROS_B ().  */
#define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero) \
1891
SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero); \
1893
SPEED_ROUTINE_COUNT_ZEROS_B (); \
1896
/* Count zeros routine for an arbitrary "call" counting leading zeros
   (leading=1), with "zero" saying whether n=0 is allowed.  */
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero) \
1897
SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero)
1898
/* Count zeros routine for "fun (c, n)" counting leading zeros (leading=1),
   with n=0 not allowed (zero=0).  */
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun) \
1899
SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0)
1901
/* Count zeros routine for an arbitrary "call" counting trailing zeros
   (leading=0), with "zero" saying whether n=0 is allowed.  */
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero) \
1902
SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero)
1903
/* Count zeros routine for "fun (c, n)" counting trailing zeros (leading=0),
   with n=0 not allowed (zero=0).  The parameter must be called "fun" since
   that is the name substituted in the expansion, the same as in
   SPEED_ROUTINE_COUNT_LEADING_ZEROS.  */
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(fun) \
  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
1907
/* Time "call", which is expected to work from a divisor d to an inverse
   dinv.  d has its high bit set before each call, so it is normalized.
   xp is set one limb below s->xp_block, apparently so an index j starting
   from SPEED_BLOCK_SIZE addresses the block.  s->time_divisor is set to
   SPEED_BLOCK_SIZE and the speed_endtime () value is returned directly.  */
#define SPEED_ROUTINE_INVERT_LIMB_CALL(call) \
1910
mp_limb_t d, dinv=0; \
1911
mp_ptr xp = s->xp_block - 1; \
1913
s->time_divisor = SPEED_BLOCK_SIZE; \
1915
speed_starttime (); \
1919
j = SPEED_BLOCK_SIZE; \
1923
d |= MP_LIMB_T_HIGHBIT; \
1924
do { call; } while (0); \
1930
/* don't let the compiler optimize everything away */ \
1933
return speed_endtime(); \
1940
/* Timing routine for "function", returning the speed_endtime () value
   directly.  No operands are registered and no cache fill is done before
   the timer is started.  */
#define SPEED_ROUTINE_MPN_BACK_TO(function) \
1943
speed_starttime (); \
1948
return speed_endtime (); \
1952
/* Time "call", which is expected to write to wp, a temporary destination
   of s->size limbs aligned according to s->align_wp.  s->size must be
   non-negative.  */
#define SPEED_ROUTINE_MPN_ZERO_CALL(call) \
1957
TMP_DECL (marker); \
1959
SPEED_RESTRICT_COND (s->size >= 0); \
1961
TMP_MARK (marker); \
1962
wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
1963
speed_operand_dst (s, wp, s->size); \
1964
speed_cache_fill (s); \
1966
speed_starttime (); \
1971
t = speed_endtime (); \
1973
TMP_FREE (marker); \
1977
/* Time "function (wp, size)" through SPEED_ROUTINE_MPN_ZERO_CALL.  */
#define SPEED_ROUTINE_MPN_ZERO(function) \
1978
SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size))