~ubuntu-branches/ubuntu/quantal/gclcvs/quantal

« back to all changes in this revision

Viewing changes to gmp3/tune/speed.h

  • Committer: Bazaar Package Importer
  • Author(s): Camm Maguire
  • Date: 2004-06-24 15:13:46 UTC
  • Revision ID: james.westby@ubuntu.com-20040624151346-xh0xaaktyyp7aorc
Tags: 2.7.0-26
C_GC_OFFSET is 2 on m68k-linux

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/* Header for speed and threshold things.
 
2
 
 
3
Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
 
4
 
 
5
This file is part of the GNU MP Library.
 
6
 
 
7
The GNU MP Library is free software; you can redistribute it and/or modify
 
8
it under the terms of the GNU Lesser General Public License as published by
 
9
the Free Software Foundation; either version 2.1 of the License, or (at your
 
10
option) any later version.
 
11
 
 
12
The GNU MP Library is distributed in the hope that it will be useful, but
 
13
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 
14
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 
15
License for more details.
 
16
 
 
17
You should have received a copy of the GNU Lesser General Public License
 
18
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
 
19
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 
20
MA 02111-1307, USA. */
 
21
 
 
22
#ifndef __SPEED_H__
 
23
#define __SPEED_H__
 
24
 
 
25
 
 
26
/* Pad {ptr,oldsize} with zero limbs at the most significant end so that it
   becomes newsize limbs long.  newsize must be >= oldsize, which is checked
   with ASSERT.  oldsize and newsize each appear more than once in the
   expansion, so the arguments should be free of side effects.  */
#define MPN_ZERO_EXTEND(ptr, oldsize, newsize)          \
  do {                                                  \
    ASSERT ((newsize) >= (oldsize));                    \
    MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize));    \
  } while (0)
 
33
 
 
34
/* A mask of the least significant n bits, for 0 <= n <= BITS_PER_MP_LIMB.
   A shift by the full width of the type is undefined in C (and in practice
   1<<32 doesn't give zero on x86 family CPUs), hence the separate case for
   n == BITS_PER_MP_LIMB.  n is evaluated twice.  */
#define MP_LIMB_T_LOWBITMASK(n) \
  ((n) == BITS_PER_MP_LIMB ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1)
 
38
 
 
39
 
 
40
/* Temporary allocations aligned to a given boundary.  align must be a power
   of 2 here, usually CACHE_LINE_SIZE is a good choice.  align-1 extra bytes
   are requested from TMP_ALLOC so that align_pointer has room to round the
   result up.  TMP_ALLOC_LIMBS_ALIGNED takes its size in limbs and yields an
   mp_ptr.  */

#define TMP_ALLOC_ALIGNED(bytes, align) \
  align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))
#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \
  ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))
 
46
 
 
47
/* 32 for pentium, 64 for athlon, might want to configure this for other
   CPUs.  In truth though nothing has yet shown up that cares about cache
   line boundaries.  The only practical effect of this is to restrict the
   range that s->align_xp can take.  Perhaps this could be a variable
   instead. */
#define CACHE_LINE_SIZE    64 /* bytes */

/* One less than the number of limbs in a cache line.  */
#define SPEED_TMP_ALLOC_ADJUST_MASK  (CACHE_LINE_SIZE/BYTES_PER_MP_LIMB - 1)

/* Allocate limbs plus SPEED_TMP_ALLOC_ADJUST_MASK spare limbs with
   TMP_ALLOC_LIMBS and pass the result, together with align, to
   speed_tmp_alloc_adjust.  */
#define SPEED_TMP_ALLOC_LIMBS(limbs, align) \
  (speed_tmp_alloc_adjust             \
    (TMP_ALLOC_LIMBS((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK), (align)))
 
59
 
 
60
 
 
61
/* This is the size for s->xp_block and s->yp_block, used in certain
   routines that want to run across many different data values and use
   s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.

   With 4-byte limbs, 512 means 2kbytes of data for each of xp_block and
   yp_block, making 4k total, which should fit easily in any L1 data cache
   (the figures double with 8-byte limbs). */

#define SPEED_BLOCK_SIZE   512 /* limbs */
 
69
 
 
70
 
 
71
extern double  speed_unittime;
 
72
extern double  speed_cycletime;
 
73
extern int     speed_precision;
 
74
extern const char *speed_time_string;
 
75
void speed_time_init _PROTO ((void));
 
76
void speed_cycletime_init _PROTO ((void));
 
77
void speed_cycletime_need_cycles _PROTO ((void));
 
78
void speed_cycletime_need_seconds _PROTO ((void));
 
79
void speed_starttime _PROTO ((void));
 
80
double speed_endtime _PROTO ((void));
 
81
 
 
82
 
 
83
struct speed_params {
 
84
  unsigned   reps;      /* how many times to run the routine */
 
85
  mp_ptr     xp;        /* first argument */
 
86
  mp_ptr     yp;        /* second argument */
 
87
  mp_size_t  size;      /* size of both arguments */
 
88
  mp_limb_t  r;         /* user supplied parameter */
 
89
  mp_size_t  align_xp;  /* alignment of xp */
 
90
  mp_size_t  align_yp;  /* alignment of yp */
 
91
  mp_size_t  align_wp;  /* intended alignment of wp */
 
92
  mp_size_t  align_wp2; /* intended alignment of wp2 */
 
93
  mp_ptr     xp_block;  /* first special SPEED_BLOCK_SIZE block */
 
94
  mp_ptr     yp_block;  /* second special SPEED_BLOCK_SIZE block */
 
95
 
 
96
  double     time_divisor; /* optionally set by the speed routine */
 
97
  
 
98
  /* used by the cache priming things */
 
99
  int        cache;
 
100
  unsigned   src_num, dst_num;
 
101
  struct {
 
102
    mp_ptr    ptr;
 
103
    mp_size_t size;
 
104
  } src[2], dst[3];
 
105
};
 
106
 
 
107
/* A measuring routine.  Those built from the SPEED_ROUTINE_* macros return
   the value of speed_endtime, or -1.0 when a SPEED_RESTRICT_COND condition
   on the parameters fails.  */
typedef double (*speed_function_t) _PROTO ((struct speed_params *s));

/* Run measuring routine fun with parameters s and return a double result
   (defined elsewhere).  */
double speed_measure _PROTO ((speed_function_t fun, struct speed_params *s));
 
110
 
 
111
/* Prototypes for speed measuring routines */
 
112
 
 
113
double speed_back_to_back (struct speed_params *s);
 
114
double speed_count_leading_zeros _PROTO ((struct speed_params *s));
 
115
double speed_count_trailing_zeros _PROTO ((struct speed_params *s));
 
116
double speed_find_a _PROTO ((struct speed_params *s));
 
117
double speed_gmp_allocate_free _PROTO ((struct speed_params *s));
 
118
double speed_gmp_allocate_reallocate_free _PROTO ((struct speed_params *s));
 
119
double speed_invert_limb _PROTO ((struct speed_params *s));
 
120
double speed_malloc_free _PROTO ((struct speed_params *s));
 
121
double speed_malloc_realloc_free _PROTO ((struct speed_params *s));
 
122
double speed_memcpy _PROTO ((struct speed_params *s));
 
123
double speed_modlimb_invert _PROTO ((struct speed_params *s));
 
124
double speed_modlimb_invert_mul1 _PROTO ((struct speed_params *s));
 
125
double speed_modlimb_invert_loop _PROTO ((struct speed_params *s));
 
126
double speed_modlimb_invert_cond _PROTO ((struct speed_params *s));
 
127
double speed_modlimb_invert_arith _PROTO ((struct speed_params *s));
 
128
 
 
129
double speed_mpf_init_clear _PROTO ((struct speed_params *s));
 
130
 
 
131
/* Measuring routines for the mpn layer, each with the speed_function_t
   signature.  speed_mpn_gcd_finda is declared once only.  */
double speed_mpn_add_n _PROTO ((struct speed_params *s));
double speed_mpn_add_n_self _PROTO ((struct speed_params *s));
double speed_mpn_add_n_inplace _PROTO ((struct speed_params *s));
double speed_mpn_and_n _PROTO ((struct speed_params *s));
double speed_mpn_andn_n _PROTO ((struct speed_params *s));
double speed_mpn_addmul_1 _PROTO ((struct speed_params *s));
double speed_mpn_com_n _PROTO ((struct speed_params *s));
double speed_mpn_copyd _PROTO ((struct speed_params *s));
double speed_mpn_copyi _PROTO ((struct speed_params *s));
double speed_mpn_dc_divrem_n _PROTO ((struct speed_params *s));
double speed_mpn_dc_divrem_sb _PROTO ((struct speed_params *s));
double speed_mpn_dc_divrem_sb_div _PROTO ((struct speed_params *s));
double speed_mpn_dc_divrem_sb_inv _PROTO ((struct speed_params *s));
double speed_mpn_dc_tdiv_qr _PROTO ((struct speed_params *s));
double speed_MPN_COPY _PROTO ((struct speed_params *s));
double speed_MPN_COPY_DECR _PROTO ((struct speed_params *s));
double speed_MPN_COPY_INCR _PROTO ((struct speed_params *s));
double speed_mpn_divexact_1 _PROTO ((struct speed_params *s));
double speed_mpn_divexact_by3 _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1 _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1f _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1c _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1cf _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1_div _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1f_div _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1_inv _PROTO ((struct speed_params *s));
double speed_mpn_divrem_1f_inv _PROTO ((struct speed_params *s));
double speed_mpn_divrem_2 _PROTO ((struct speed_params *s));
double speed_mpn_divrem_2_div _PROTO ((struct speed_params *s));
double speed_mpn_divrem_2_inv _PROTO ((struct speed_params *s));
double speed_mpn_fib2_ui _PROTO ((struct speed_params *s));
double speed_mpn_gcd _PROTO ((struct speed_params *s));
double speed_mpn_gcd_finda _PROTO ((struct speed_params *s));
double speed_mpn_gcd_1 _PROTO ((struct speed_params *s));
double speed_mpn_gcd_1N _PROTO ((struct speed_params *s));
double speed_mpn_gcd_binary _PROTO ((struct speed_params *s));
double speed_mpn_gcdext _PROTO ((struct speed_params *s));
double speed_mpn_gcdext_double _PROTO ((struct speed_params *s));
double speed_mpn_gcdext_one_double _PROTO ((struct speed_params *s));
double speed_mpn_gcdext_one_single _PROTO ((struct speed_params *s));
double speed_mpn_gcdext_single _PROTO ((struct speed_params *s));
double speed_mpn_get_str _PROTO ((struct speed_params *s));
double speed_mpn_hamdist _PROTO ((struct speed_params *s));
double speed_mpn_ior_n _PROTO ((struct speed_params *s));
double speed_mpn_iorn_n _PROTO ((struct speed_params *s));
double speed_mpn_jacobi_base _PROTO ((struct speed_params *s));
double speed_mpn_kara_mul_n _PROTO ((struct speed_params *s));
double speed_mpn_kara_sqr_n _PROTO ((struct speed_params *s));
double speed_mpn_lshift _PROTO ((struct speed_params *s));
double speed_mpn_mod_1 _PROTO ((struct speed_params *s));
double speed_mpn_mod_1c _PROTO ((struct speed_params *s));
double speed_mpn_mod_1_div _PROTO ((struct speed_params *s));
double speed_mpn_mod_1_inv _PROTO ((struct speed_params *s));
double speed_mpn_mod_34lsub1 _PROTO ((struct speed_params *s));
double speed_mpn_modexact_1_odd _PROTO ((struct speed_params *s));
double speed_mpn_modexact_1c_odd _PROTO ((struct speed_params *s));
double speed_mpn_mul_1 _PROTO ((struct speed_params *s));
double speed_mpn_mul_1_inplace _PROTO ((struct speed_params *s));
double speed_mpn_mul_2 _PROTO ((struct speed_params *s));
double speed_mpn_mul_basecase _PROTO ((struct speed_params *s));
double speed_mpn_mul_fft _PROTO ((struct speed_params *s));
double speed_mpn_mul_fft_sqr _PROTO ((struct speed_params *s));
double speed_mpn_mul_fft_full _PROTO ((struct speed_params *s));
double speed_mpn_mul_fft_full_sqr _PROTO ((struct speed_params *s));
double speed_mpn_mul_n _PROTO ((struct speed_params *s));
double speed_mpn_mul_n_sqr _PROTO ((struct speed_params *s));
double speed_mpn_nand_n _PROTO ((struct speed_params *s));
double speed_mpn_nior_n _PROTO ((struct speed_params *s));
double speed_mpn_popcount _PROTO ((struct speed_params *s));
double speed_mpn_preinv_mod_1 _PROTO ((struct speed_params *s));
double speed_redc _PROTO ((struct speed_params *s));
double speed_mpn_rshift _PROTO ((struct speed_params *s));
double speed_mpn_sb_divrem_m3 _PROTO ((struct speed_params *s));
double speed_mpn_sb_divrem_m3_div _PROTO ((struct speed_params *s));
double speed_mpn_sb_divrem_m3_inv _PROTO ((struct speed_params *s));
double speed_mpn_set_str _PROTO ((struct speed_params *s));
double speed_mpn_sqr_basecase _PROTO ((struct speed_params *s));
double speed_mpn_sqr_diagonal _PROTO ((struct speed_params *s));
double speed_mpn_sqr_n _PROTO ((struct speed_params *s));
double speed_mpn_sqrtrem _PROTO ((struct speed_params *s));
double speed_mpn_sub_n _PROTO ((struct speed_params *s));
double speed_mpn_submul_1 _PROTO ((struct speed_params *s));
double speed_mpn_toom3_mul_n _PROTO ((struct speed_params *s));
double speed_mpn_toom3_mul_n_mpn _PROTO ((struct speed_params *s));
double speed_mpn_toom3_mul_n_open _PROTO ((struct speed_params *s));
double speed_mpn_toom3_sqr_n _PROTO ((struct speed_params *s));
double speed_mpn_toom3_sqr_n_mpn _PROTO ((struct speed_params *s));
double speed_mpn_toom3_sqr_n_open _PROTO ((struct speed_params *s));
double speed_mpn_udiv_qrnnd _PROTO ((struct speed_params *s));
double speed_mpn_umul_ppmm _PROTO ((struct speed_params *s));
double speed_mpn_xnor_n _PROTO ((struct speed_params *s));
double speed_mpn_xor_n _PROTO ((struct speed_params *s));
double speed_MPN_ZERO _PROTO ((struct speed_params *s));
 
225
 
 
226
/* Measuring routines for the mpq and mpz layers, each with the
   speed_function_t signature.  */
double speed_mpq_init_clear _PROTO ((struct speed_params *s));

double speed_mpz_add _PROTO ((struct speed_params *s));
double speed_mpz_bin_uiui _PROTO ((struct speed_params *s));
double speed_mpz_fac_ui _PROTO ((struct speed_params *s));
double speed_mpz_fib_ui _PROTO ((struct speed_params *s));
double speed_mpz_fib2_ui _PROTO ((struct speed_params *s));
double speed_mpz_init_clear _PROTO ((struct speed_params *s));
double speed_mpz_init_realloc_clear _PROTO ((struct speed_params *s));
double speed_mpz_jacobi _PROTO ((struct speed_params *s));
double speed_mpz_lucnum_ui _PROTO ((struct speed_params *s));
double speed_mpz_lucnum2_ui _PROTO ((struct speed_params *s));
double speed_mpz_mod _PROTO ((struct speed_params *s));
double speed_mpz_powm _PROTO ((struct speed_params *s));
double speed_mpz_powm_mod _PROTO ((struct speed_params *s));
double speed_mpz_powm_redc _PROTO ((struct speed_params *s));
double speed_mpz_powm_ui _PROTO ((struct speed_params *s));
 
243
 
 
244
/* Remaining measuring routines, each with the speed_function_t signature.
   All are declared through _PROTO, like the rest of this header.  */
double speed_noop _PROTO ((struct speed_params *s));
double speed_noop_wxs _PROTO ((struct speed_params *s));
double speed_noop_wxys _PROTO ((struct speed_params *s));

double speed_operator_div _PROTO ((struct speed_params *s));
double speed_operator_mod _PROTO ((struct speed_params *s));

double speed_udiv_qrnnd _PROTO ((struct speed_params *s));
double speed_udiv_qrnnd_preinv _PROTO ((struct speed_params *s));
double speed_udiv_qrnnd_preinv2norm _PROTO ((struct speed_params *s));
double speed_udiv_qrnnd_c _PROTO ((struct speed_params *s));
double speed_umul_ppmm _PROTO ((struct speed_params *s));
 
256
 
 
257
 
 
258
/* Prototypes for other routines */
 
259
 
 
260
/* low 32-bits in p[0], high 32-bits in p[1] */
 
261
void speed_cyclecounter _PROTO ((unsigned p[2]));
 
262
 
 
263
/* Inline speed_cyclecounter for gcc on x86: execute cpuid and then rdtsc,
   storing eax in (p)[0] and edx in (p)[1].

   In i386 gcc -fPIC, ebx is a fixed register and can't be declared a dummy
   output or a clobber for the cpuid, hence an explicit save and restore.  A
   clobber as such doesn't provoke an error unfortunately (gcc 3.0), so use
   the dummy output style in non-PIC, so there's an error if somehow -fPIC
   is used without a -DPIC to tell us about it.  */
#if defined(__GNUC__) && ! defined (NO_ASM)     \
  && (defined (__i386__) || defined (__i486__))
#ifdef PIC
#define speed_cyclecounter(p)                                           \
  do {                                                                  \
    int  __speed_cyclecounter__save_ebx;                                \
    int  __speed_cyclecounter__dummy;                                   \
    __asm__ __volatile__ ("movl %%ebx, %1\n"                            \
                          "cpuid\n"                                     \
                          "movl %1, %%ebx\n"                            \
                          "rdtsc"                                       \
                          : "=a"   ((p)[0]),                            \
                            "=&rm" (__speed_cyclecounter__save_ebx),    \
                            "=c"   (__speed_cyclecounter__dummy),       \
                            "=d"   ((p)[1]));                           \
  } while (0)
#else
#define speed_cyclecounter(p)                                           \
  do {                                                                  \
    int  __speed_cyclecounter__dummy1;                                  \
    int  __speed_cyclecounter__dummy2;                                  \
    __asm__ __volatile__ ("cpuid\n"                                     \
                          "rdtsc"                                       \
                          : "=a" ((p)[0]),                              \
                            "=b" (__speed_cyclecounter__dummy1),        \
                            "=c" (__speed_cyclecounter__dummy2),        \
                            "=d" ((p)[1]));                             \
  } while (0)
#endif
#endif
 
298
 
 
299
double speed_cyclecounter_diff _PROTO ((const unsigned end[2],
 
300
                                        const unsigned start[2]));
 
301
int gettimeofday_microseconds_p _PROTO ((void));
 
302
int getrusage_microseconds_p _PROTO ((void));
 
303
int cycles_works_p _PROTO ((void));
 
304
 
 
305
int double_cmp_ptr _PROTO ((const double *p, const double *q));
 
306
void pentium_wbinvd _PROTO ((void));
 
307
typedef int (*qsort_function_t) _PROTO ((const void *, const void *));
 
308
 
 
309
void noop _PROTO ((void));
 
310
void noop_1 _PROTO ((mp_limb_t n));
 
311
void noop_wxs _PROTO ((mp_ptr wp, mp_srcptr xp, mp_size_t size));
 
312
void noop_wxys _PROTO ((mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
 
313
                        mp_size_t size));
 
314
void mpn_cache_fill _PROTO ((mp_srcptr ptr, mp_size_t size));
 
315
void mpn_cache_fill_dummy _PROTO ((mp_limb_t n));
 
316
mp_ptr speed_tmp_alloc_adjust _PROTO ((void *ptr, mp_size_t align));
 
317
void speed_cache_fill _PROTO ((struct speed_params *s));
 
318
void speed_operand_src _PROTO ((struct speed_params *s,
 
319
                                mp_ptr ptr, mp_size_t size));
 
320
void speed_operand_dst _PROTO ((struct speed_params *s,
 
321
                                mp_ptr ptr, mp_size_t size));
 
322
 
 
323
extern int  speed_option_addrs;
 
324
extern int  speed_option_verbose;
 
325
void speed_option_set _PROTO((const char *s));
 
326
 
 
327
mp_limb_t mpn_divrem_1_div _PROTO ((mp_ptr qp, mp_size_t xsize,
 
328
                                    mp_srcptr ap, mp_size_t size,
 
329
                                    mp_limb_t d));
 
330
mp_limb_t mpn_divrem_1_inv _PROTO ((mp_ptr qp, mp_size_t xsize,
 
331
                                    mp_srcptr ap, mp_size_t size,
 
332
                                    mp_limb_t d));
 
333
mp_limb_t mpn_divrem_2_div _PROTO ((mp_ptr qp, mp_size_t qxn,
 
334
                                    mp_ptr np, mp_size_t nsize,
 
335
                                    mp_srcptr dp));
 
336
mp_limb_t mpn_divrem_2_inv _PROTO ((mp_ptr qp, mp_size_t qxn,
 
337
                                    mp_ptr np, mp_size_t nsize,
 
338
                                    mp_srcptr dp));
 
339
mp_limb_t mpn_mod_1_div _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d)) __GMP_ATTRIBUTE_PURE;
 
340
mp_limb_t mpn_mod_1_inv _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d)) __GMP_ATTRIBUTE_PURE;
 
341
 
 
342
mp_size_t mpn_gcd_binary
 
343
  _PROTO ((mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize));
 
344
mp_size_t mpn_gcdext_one_double
 
345
  _PROTO ((mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
 
346
           mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize));
 
347
mp_size_t mpn_gcdext_one_single
 
348
  _PROTO ((mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
 
349
           mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize));
 
350
mp_size_t mpn_gcdext_single
 
351
  _PROTO ((mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
 
352
           mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize));
 
353
mp_size_t mpn_gcdext_double
 
354
  _PROTO ((mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
 
355
           mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize));
 
356
 
 
357
mp_limb_t mpn_sb_divrem_mn_div _PROTO ((mp_ptr qp,
 
358
                                        mp_ptr np,    mp_size_t nsize,
 
359
                                        mp_srcptr dp, mp_size_t dsize));
 
360
mp_limb_t mpn_sb_divrem_mn_inv _PROTO ((mp_ptr qp,
 
361
                                        mp_ptr np,    mp_size_t nsize,
 
362
                                        mp_srcptr dp, mp_size_t dsize));
 
363
void mpn_toom3_mul_n_open _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t,
 
364
                                   mp_ptr));
 
365
void mpn_toom3_sqr_n_open _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_ptr));
 
366
void mpn_toom3_mul_n_mpn _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t,
 
367
                                  mp_ptr));
 
368
void mpn_toom3_sqr_n_mpn _PROTO((mp_ptr, mp_srcptr, mp_size_t, mp_ptr));
 
369
 
 
370
void mpz_powm_mod _PROTO ((mpz_ptr res, mpz_srcptr base, mpz_srcptr e,
 
371
                           mpz_srcptr mod));
 
372
void mpz_powm_redc _PROTO ((mpz_ptr res, mpz_srcptr base, mpz_srcptr e,
 
373
                            mpz_srcptr mod));
 
374
void redc _PROTO ((mp_ptr cp, mp_srcptr mp, mp_size_t n, mp_limb_t Nprim,
 
375
                   mp_ptr tp));
 
376
 
 
377
int speed_routine_count_zeros_setup _PROTO ((struct speed_params *s,
 
378
                                             mp_ptr xp, int leading,
 
379
                                             int zero));
 
380
 
 
381
/* The measuring routines use these big macros to save duplication for
 
382
   similar forms.  They also get used for some automatically generated
 
383
   measuring of new implementations of functions.
 
384
 
 
385
   Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
 
386
   function pointer is considered undesirable since it's not the way a
 
387
   normal application will be calling, and some processors might do
 
388
   different things with an indirect call, like not branch predicting, or
 
389
   doing a full pipe flush.  At least some of the "functions" measured are
 
390
   actually macros too.
 
391
 
 
392
   The net effect is to bloat the object code, possibly in a big way, but
 
393
   only what's being measured is being run, so that doesn't matter.
 
394
 
 
395
   Note that if a called function is __GMP_ATTRIBUTE_PURE or
 
396
   ATTRIBUTE_CONST, as for example mpn_mod_1, then the return value should
 
397
   be used in some way, to stop gcc 3 and up from discarding the calls.  See
 
398
   SPEED_ROUTINE_MPN_MOD_CALL for instance.  */
 
399
 
 
400
 
 
401
/* Bail out of a measuring routine when cond does not hold: expands to an
   `if' statement that returns -1.0 from the enclosing function.  The
   expansion already ends in a semicolon, so the usual
   "SPEED_RESTRICT_COND (x);" leaves an extra empty statement behind.
   Because it is a bare `if', don't use it as the unbraced body of another
   if/else.  */
#define SPEED_RESTRICT_COND(cond)   if (!(cond)) return -1.0;
 
402
 
 
403
/* For mpn_copy or similar.

   Expands to the complete body of a measuring routine whose parameter is
   "struct speed_params *s".  Returns -1.0 if s->size is negative.
   Otherwise a destination wp of s->size limbs is allocated, the operands
   are registered and speed_cache_fill is called, then
   function (wp, s->xp, s->size) is run s->reps times between
   speed_starttime and speed_endtime, whose value is returned.  The loop is
   a do/while counting down an unsigned, so s->reps must be nonzero.  */
#define SPEED_ROUTINE_MPN_COPY(function)                \
  {                                                     \
    mp_ptr    wp;                                       \
    unsigned  i;                                        \
    double    t;                                        \
    TMP_DECL (marker);                                  \
                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                 \
                                                        \
    TMP_MARK (marker);                                  \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp);  \
                                                        \
    speed_operand_src (s, s->xp, s->size);              \
    speed_operand_dst (s, wp, s->size);                 \
    speed_cache_fill (s);                               \
                                                        \
    speed_starttime ();                                 \
    i = s->reps;                                        \
    do                                                  \
      function (wp, s->xp, s->size);                    \
    while (--i != 0);                                   \
    t = speed_endtime ();                               \
                                                        \
    TMP_FREE (marker);                                  \
    return t;                                           \
  }
 
430
 
 
431
/* As SPEED_ROUTINE_MPN_COPY, but "function" takes an extra final argument,
   which is passed as 0: function (wp, s->xp, s->size, 0).  Expands to a
   complete routine body using "struct speed_params *s"; returns -1.0 if
   s->size is negative, otherwise the value of speed_endtime.  s->reps must
   be nonzero.  */
#define SPEED_ROUTINE_MPN_COPYC(function)               \
  {                                                     \
    mp_ptr    wp;                                       \
    unsigned  i;                                        \
    double    t;                                        \
    TMP_DECL (marker);                                  \
                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                 \
                                                        \
    TMP_MARK (marker);                                  \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp);  \
                                                        \
    speed_operand_src (s, s->xp, s->size);              \
    speed_operand_dst (s, wp, s->size);                 \
    speed_cache_fill (s);                               \
                                                        \
    speed_starttime ();                                 \
    i = s->reps;                                        \
    do                                                  \
      function (wp, s->xp, s->size, 0);                 \
    while (--i != 0);                                   \
    t = speed_endtime ();                               \
                                                        \
    TMP_FREE (marker);                                  \
    return t;                                           \
  }
 
457
 
 
458
/* s->size is still in limbs, and it's limbs which are copied, but
   "function" takes a size in bytes not limbs, so it is called as
   function (wp, s->xp, s->size * BYTES_PER_MP_LIMB).

   Expands to a complete routine body using "struct speed_params *s";
   returns -1.0 if s->size is negative, otherwise the value of
   speed_endtime.  s->reps must be nonzero.  */
#define SPEED_ROUTINE_MPN_COPY_BYTES(function)                  \
  {                                                             \
    mp_ptr    wp;                                               \
    unsigned  i;                                                \
    double    t;                                                \
    TMP_DECL (marker);                                          \
                                                                \
    SPEED_RESTRICT_COND (s->size >= 0);                         \
                                                                \
    TMP_MARK (marker);                                          \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp);          \
                                                                \
    speed_operand_src (s, s->xp, s->size);                      \
    speed_operand_dst (s, wp, s->size);                         \
    speed_cache_fill (s);                                       \
                                                                \
    speed_starttime ();                                         \
    i = s->reps;                                                \
    do                                                          \
      function (wp, s->xp, s->size * BYTES_PER_MP_LIMB);        \
    while (--i != 0);                                           \
    t = speed_endtime ();                                       \
                                                                \
    TMP_FREE (marker);                                          \
    return t;                                                   \
  }
 
486
 
 
487
 
 
488
/* For mpn_add_n, mpn_sub_n, or similar.

   Expands to a complete routine body using "struct speed_params *s".
   "call" is an expression that may refer to s and to the local wp, a
   destination of s->size limbs; it is evaluated s->reps times between
   speed_starttime and speed_endtime.  Returns -1.0 if s->size is less than
   1, otherwise the value of speed_endtime.  s->xp and s->yp are registered
   as sources and wp as destination before speed_cache_fill.  s->reps must
   be nonzero.  */
#define SPEED_ROUTINE_MPN_BINARY_N_CALL(call)           \
  {                                                     \
    mp_ptr    wp;                                       \
    unsigned  i;                                        \
    double    t;                                        \
    TMP_DECL (marker);                                  \
                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                 \
                                                        \
    TMP_MARK (marker);                                  \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp);  \
                                                        \
    speed_operand_src (s, s->xp, s->size);              \
    speed_operand_src (s, s->yp, s->size);              \
    speed_operand_dst (s, wp, s->size);                 \
    speed_cache_fill (s);                               \
                                                        \
    speed_starttime ();                                 \
    i = s->reps;                                        \
    do                                                  \
      call;                                             \
    while (--i != 0);                                   \
    t = speed_endtime ();                               \
                                                        \
    TMP_FREE (marker);                                  \
    return t;                                           \
  }
 
516
 
 
517
/* Wrappers around SPEED_ROUTINE_MPN_BINARY_N_CALL for particular argument
   layouts: plain (wp, xp, yp, size), the same with a trailing 0 argument,
   both sources the same operand, and destination doubling as first
   source.  */
#define SPEED_ROUTINE_MPN_BINARY_N(function) \
   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, s->xp, s->yp, s->size))

#define SPEED_ROUTINE_MPN_BINARY_NC(function) \
   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, s->xp, s->yp, s->size, 0))

#define SPEED_ROUTINE_MPN_BINARY_N_SELF(function) \
   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, s->xp, s->xp, s->size))

/* FIXME: wp is uninitialized here, should start it off from yp or something */
#define SPEED_ROUTINE_MPN_BINARY_N_INPLACE(function) \
   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, wp, s->xp, s->size))
 
529
 
 
530
 
 
531
/* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar.

   Expands to a complete routine body using "struct speed_params *s".
   "call" is an expression that may refer to s and to the local wp, a
   destination of s->size limbs; it is evaluated s->reps times between
   speed_starttime and speed_endtime.  Returns -1.0 if s->size is less than
   1, otherwise the value of speed_endtime.  Only s->xp is registered as a
   source.  s->reps must be nonzero.  */
#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call)            \
  {                                                     \
    mp_ptr    wp;                                       \
    unsigned  i;                                        \
    double    t;                                        \
    TMP_DECL (marker);                                  \
                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                 \
                                                        \
    TMP_MARK (marker);                                  \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp);  \
                                                        \
    speed_operand_src (s, s->xp, s->size);              \
    speed_operand_dst (s, wp, s->size);                 \
    speed_cache_fill (s);                               \
                                                        \
    speed_starttime ();                                 \
    i = s->reps;                                        \
    do                                                  \
      call;                                             \
    while (--i != 0);                                   \
    t = speed_endtime ();                               \
                                                        \
    TMP_FREE (marker);                                  \
    return t;                                           \
  }
 
558
 
 
559
/* Time FUNCTION invoked as (*function) (wp, s->xp, s->size, s->r). */
#define SPEED_ROUTINE_MPN_UNARY_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
 
561
 
 
562
/* As SPEED_ROUTINE_MPN_UNARY_1, but FUNCTION takes one extra trailing
   argument, passed as 0 (presumably an initial carry -- confirm against
   the routines being measured). */
#define SPEED_ROUTINE_MPN_UNARY_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
 
564
 
 
565
/* In-place form: wp is passed as both destination and source, so s->xp is
   not read by the call itself.
   FIXME: wp is uninitialized here, should start it off from xp */
#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, wp, s->size, s->r))
 
568
 
 
569
/* Time FUNCTION invoked as (*function) (wp, s->xp, s->size, s->r), the same
   calling shape as SPEED_ROUTINE_MPN_UNARY_1. */
#define SPEED_ROUTINE_MPN_DIVEXACT_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
 
571
 
 
572
/* Time FUNCTION invoked as (*function) (wp, 0, s->xp, s->size, s->r).  The
   literal 0 is the second argument (presumably the count of fraction limbs
   -- confirm against mpn_divrem_1). */
#define SPEED_ROUTINE_MPN_DIVREM_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
 
574
 
 
575
/* As SPEED_ROUTINE_MPN_DIVREM_1, but FUNCTION takes one extra trailing
   argument, passed as 0 (presumably an initial carry -- confirm). */
#define SPEED_ROUTINE_MPN_DIVREM_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
 
577
 
 
578
/* Time FUNCTION invoked as (*function) (wp, s->size, s->xp, 0, s->r), i.e.
   with s->size as the second argument and 0 as the source size (presumably
   the all-fraction-limbs case of mpn_divrem_1 -- confirm). */
#define SPEED_ROUTINE_MPN_DIVREM_1F(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
 
580
 
 
581
/* As SPEED_ROUTINE_MPN_DIVREM_1F, but FUNCTION takes one extra trailing
   argument, passed as 0 (presumably an initial carry -- confirm). */
#define SPEED_ROUTINE_MPN_DIVREM_1CF(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
 
583
 
 
584
 
 
585
/* Function body that times CALL for routines taking a limb vector plus a
   two-limb parameter.  Expects a variable `s' in scope.  CALL may use the
   locals `wp' (s->size+1 limbs of temporary space), `l' (= s->yp[0]) and
   `h' (= s->yp[1]).  CALL is run s->reps times and the value from
   speed_endtime is returned.
   NOTE(review): s->yp[1] is read although only s->size >= 1 is required;
   confirm that s->yp always holds at least two limbs. */
#define SPEED_ROUTINE_MPN_UNARY_2_CALL(call) \
  { \
    mp_ptr wp; \
    mp_limb_t h, l; \
    unsigned left; \
    double elapsed; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
    l = s->yp[0]; \
    h = s->yp[1]; \
    \
    /* destination is one limb longer than the source */ \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size+1); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        call; \
      } \
    while (--left != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return elapsed; \
  }
 
615
 
 
616
/* Time FUNCTION invoked as (*function) (wp, s->xp, s->size, l, h), with
   l and h supplied by SPEED_ROUTINE_MPN_UNARY_2_CALL. */
#define SPEED_ROUTINE_MPN_UNARY_2(function) \
  SPEED_ROUTINE_MPN_UNARY_2_CALL ((*function) (wp, s->xp, s->size, l, h))
 
618
 
 
619
 
 
620
/* For mpn_mul_basecase.  Function body timing
   function (wp, s->xp, xsize, s->yp, s->size), where xsize is s->r, or
   s->size when s->r is 0.  Requires s->size >= 1 and xsize >= s->size.
   Expects a variable `s' in scope; the product area wp is xsize + s->size
   limbs of temporary space. */
#define SPEED_ROUTINE_MPN_MUL_BASECASE(function) \
  { \
    mp_ptr wp; \
    mp_size_t xsize; \
    unsigned left; \
    double elapsed; \
    TMP_DECL (marker); \
    \
    /* s->r == 0 means "same size as the second operand" */ \
    xsize = (s->r == 0 ? s->size : s->r); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (xsize >= s->size); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (xsize + s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, xsize); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, xsize + s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        function (wp, s->xp, xsize, s->yp, s->size); \
      } \
    while (--left != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return elapsed; \
  }
 
652
 
 
653
 
 
654
/* Function body that times CALL for an N by N limb multiply.  Expects a
   variable `s' in scope.  CALL may use the local `wp', a 2*s->size limb
   temporary; s->xp and s->yp (s->size limbs each) are the sources.
   Requires s->size >= 1.  CALL is run s->reps times and the value from
   speed_endtime is returned. */
#define SPEED_ROUTINE_MPN_MUL_N_CALL(call) \
  { \
    mp_ptr wp; \
    unsigned left; \
    double elapsed; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        call; \
      } \
    while (--left != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return elapsed; \
  }
 
681
 
 
682
/* Time FUNCTION invoked as function (wp, s->xp, s->yp, s->size).
   The expansion is a complete braced function body, so the macro itself
   carries no trailing semicolon; any semicolon belongs to the user of the
   macro. */
#define SPEED_ROUTINE_MPN_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size))
 
684
 
 
685
 
 
686
/* Function body that times CALL for an N by N limb multiply needing
   scratch space.  Expects a variable `s' in scope.  CALL may use the
   locals `wp' (2*s->size limbs) and `tspace' (TSIZE limbs).  Requires
   s->size >= MINSIZE.  CALL is run s->reps times and the value from
   speed_endtime is returned. */
#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize) \
  { \
    mp_ptr wp, tspace; \
    unsigned left; \
    double elapsed; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= minsize); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
    tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \
    \
    /* the scratch area is declared as a second destination */ \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_operand_dst (s, tspace, tsize); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        call; \
      } \
    while (--left != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return elapsed; \
  }
 
715
 
 
716
/* Time FUNCTION invoked as function (wp, s->xp, s->yp, s->size, tspace),
   with MPN_KARA_MUL_N_TSIZE (s->size) limbs of scratch and a minimum size
   of MPN_KARA_MUL_N_MINSIZE.  The two sources are s->xp and s->yp, the
   operands SPEED_ROUTINE_MPN_MUL_N_TSPACE declares to the harness, so
   that a genuine multiply rather than a square is measured. */
#define SPEED_ROUTINE_MPN_KARA_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->yp, s->size, tspace), \
     MPN_KARA_MUL_N_TSIZE (s->size), \
     MPN_KARA_MUL_N_MINSIZE)
 
721
 
 
722
/* Time FUNCTION invoked as function (wp, s->xp, s->yp, s->size, tspace),
   with MPN_TOOM3_MUL_N_TSIZE (s->size) limbs of scratch and a minimum
   size of MPN_TOOM3_MUL_N_MINSIZE. */
#define SPEED_ROUTINE_MPN_TOOM3_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (function (wp, s->xp, s->yp, s->size, tspace), \
                                  MPN_TOOM3_MUL_N_TSIZE (s->size), \
                                  MPN_TOOM3_MUL_N_MINSIZE)
 
727
 
 
728
 
 
729
/* Function body that times CALL for squaring an s->size limb operand.
   Expects a variable `s' in scope.  CALL may use the local `wp', a
   2*s->size limb temporary; s->xp is the only source.  Requires
   s->size >= 1.  CALL is run s->reps times and the value from
   speed_endtime is returned. */
#define SPEED_ROUTINE_MPN_SQR_CALL(call) \
  { \
    mp_ptr wp; \
    unsigned left; \
    double elapsed; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        call; \
      } \
    while (--left != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return elapsed; \
  }
 
755
 
 
756
/* Time FUNCTION invoked as function (wp, s->xp, s->size). */
#define SPEED_ROUTINE_MPN_SQR(function) \
  SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))
 
758
 
 
759
/* Measured exactly like SPEED_ROUTINE_MPN_SQR: function (wp, s->xp,
   s->size) with a 2*s->size limb destination. */
#define SPEED_ROUTINE_MPN_SQR_DIAGONAL(function) \
  SPEED_ROUTINE_MPN_SQR (function)
 
761
 
 
762
 
 
763
/* Function body that times CALL for a squaring routine needing scratch
   space.  Expects a variable `s' in scope.  CALL may use the locals `wp'
   (2*s->size limbs) and `tspace' (TSIZE limbs).  Requires
   s->size >= MINSIZE.  CALL is run s->reps times and the value from
   speed_endtime is returned. */
#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize) \
  { \
    mp_ptr wp, tspace; \
    unsigned left; \
    double elapsed; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= minsize); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
    tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \
    \
    /* the scratch area is declared as a second destination */ \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_operand_dst (s, tspace, tsize); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        call; \
      } \
    while (--left != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return elapsed; \
  }
 
791
 
 
792
/* Time FUNCTION invoked as function (wp, s->xp, s->size, tspace), with
   MPN_KARA_SQR_N_TSIZE (s->size) limbs of scratch and a minimum size of
   MPN_KARA_SQR_N_MINSIZE. */
#define SPEED_ROUTINE_MPN_KARA_SQR_N(function) \
  SPEED_ROUTINE_MPN_SQR_TSPACE \
    (function (wp, s->xp, s->size, tspace), \
     MPN_KARA_SQR_N_TSIZE (s->size), \
     MPN_KARA_SQR_N_MINSIZE)
 
796
 
 
797
/* Time FUNCTION invoked as function (wp, s->xp, s->size, tspace), with
   MPN_TOOM3_SQR_N_TSIZE (s->size) limbs of scratch and a minimum size of
   MPN_TOOM3_SQR_N_MINSIZE. */
#define SPEED_ROUTINE_MPN_TOOM3_SQR_N(function) \
  SPEED_ROUTINE_MPN_SQR_TSPACE \
    (function (wp, s->xp, s->size, tspace), \
     MPN_TOOM3_SQR_N_TSIZE (s->size), \
     MPN_TOOM3_SQR_N_MINSIZE)
 
801
 
 
802
 
 
803
/* Function body that times CALL, an expression yielding a limb computed
   from s->xp (s->size limbs, s->size >= 0).  Expects a variable `s' in
   scope.  The results are summed and the sum handed to noop_1 before the
   clock is stopped (presumably so the calls are not optimized away --
   confirm against noop_1).  No temporary space is allocated. */
#define SPEED_ROUTINE_MPN_MOD_CALL(call) \
  { \
    mp_limb_t sink = 0; \
    unsigned left; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        sink += call; \
      } \
    while (--left != 0); \
    \
    noop_1 (sink); \
    return speed_endtime (); \
  }
 
822
 
 
823
/* Time FUNCTION invoked as (*function) (s->xp, s->size, s->r). */
#define SPEED_ROUTINE_MPN_MOD_1(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))
 
825
 
 
826
/* As SPEED_ROUTINE_MPN_MOD_1, but FUNCTION takes one extra trailing
   argument, passed as CNST_LIMB(0) (presumably an initial carry --
   confirm). */
#define SPEED_ROUTINE_MPN_MOD_1C(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))
 
828
 
 
829
/* Time FUNCTION invoked as function (s->xp, s->size, s->r).
   The expansion is a complete braced function body, so the macro itself
   carries no trailing semicolon; any semicolon belongs to the user of the
   macro. */
#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function) \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r))
 
831
 
 
832
/* Time FUNCTION invoked as function (s->xp, s->size, s->r, CNST_LIMB(0)).
   The expansion is a complete braced function body, so the macro itself
   carries no trailing semicolon; any semicolon belongs to the user of the
   macro. */
#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function) \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)))
 
834
 
 
835
/* Time FUNCTION invoked as (*function) (s->xp, s->size); s->r is not
   used. */
#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size))
 
837
 
 
838
/* Function body timing (*function) (s->xp, s->size, s->r, inv), where inv
   is obtained once, outside the timed region, from invert_limb (inv, s->r).
   Expects a variable `s' in scope.  Requires s->size >= 0 and s->r to have
   its MP_LIMB_T_HIGHBIT bit set.  The results are summed and the sum
   passed to noop_1 before the clock is stopped. */
#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function) \
  { \
    mp_limb_t sink = 0; \
    mp_limb_t inv; \
    unsigned left; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    SPEED_RESTRICT_COND (s->r & MP_LIMB_T_HIGHBIT); \
    \
    invert_limb (inv, s->r); \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        sink += (*function) (s->xp, s->size, s->r, inv); \
      } \
    while (--left != 0); \
    \
    noop_1 (sink); \
    return speed_endtime (); \
  }
 
860
 
 
861
 
 
862
/* A division of 2*s->size by s->size limbs.

   Function body that times CALL.  Expects a variable `s' in scope.  CALL
   may use these temporaries:
     a  dividend, 2*s->size limbs: s->xp copied into both halves
     d  divisor, s->size limbs copied from s->yp
     q  quotient space, s->size+1 limbs
     r  remainder space, s->size limbs
   Before timing, the high bit of d's top limb is set and a's top limb is
   set to one less than d's top limb.  Requires s->size >= 1.  CALL is run
   s->reps times and the value from speed_endtime is returned. */
#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call) \
  { \
    mp_ptr a, d, q, r; \
    unsigned left; \
    double elapsed; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    a = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_xp); \
    d = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_yp); \
    q = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
    r = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
    \
    /* dividend: xp in the low half and again in the high half */ \
    MPN_COPY (a, s->xp, s->size); \
    MPN_COPY (a+s->size, s->xp, s->size); \
    \
    MPN_COPY (d, s->yp, s->size); \
    \
    /* normalize the data */ \
    d[s->size-1] |= MP_LIMB_T_HIGHBIT; \
    a[2*s->size-1] = d[s->size-1] - 1; \
    \
    speed_operand_src (s, a, 2*s->size); \
    speed_operand_src (s, d, s->size); \
    speed_operand_dst (s, q, s->size+1); \
    speed_operand_dst (s, r, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        call; \
      } \
    while (--left != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return elapsed; \
  }
 
904
 
 
905
/* Time FUNCTION invoked as (*function) (q, a, d, s->size). */
#define SPEED_ROUTINE_MPN_DC_DIVREM_N(function) \
  SPEED_ROUTINE_MPN_DC_DIVREM_CALL ((*function) (q, a, d, s->size))
 
907
 
 
908
/* Time FUNCTION invoked as (*function) (q, a, 2*s->size, d, s->size). */
#define SPEED_ROUTINE_MPN_DC_DIVREM_SB(function) \
  SPEED_ROUTINE_MPN_DC_DIVREM_CALL ((*function) (q, a, 2*s->size, d, s->size))
 
911
 
 
912
/* Time FUNCTION invoked as
   (*function) (q, r, 0, a, 2*s->size, d, s->size); this is the wrapper
   that uses the remainder area r. */
#define SPEED_ROUTINE_MPN_DC_TDIV_QR(function) \
  SPEED_ROUTINE_MPN_DC_DIVREM_CALL \
    ((*function) (q, r, 0, a, 2*s->size, d, s->size))
 
915
 
 
916
 
 
917
/* A division of s->size by 3 limbs.

   Function body timing function (q, a, s->size, d, 3).  Expects a variable
   `s' in scope.  The divisor d is three limbs copied from s->yp with the
   high bit of its top limb set.  The dividend a is refreshed from s->xp
   inside the timed loop before every call, so that copy is part of the
   measured time.  Requires s->size >= 3; the quotient area q is
   s->size - 3 limbs. */
#define SPEED_ROUTINE_MPN_SB_DIVREM_M3(function) \
  { \
    mp_ptr a, d, q; \
    mp_size_t qlimbs; \
    unsigned left; \
    double elapsed; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 3); \
    \
    TMP_MARK (marker); \
    a = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp); \
    \
    d = SPEED_TMP_ALLOC_LIMBS (3, s->align_yp); \
    MPN_COPY (d, s->yp, 3); \
    d[2] |= MP_LIMB_T_HIGHBIT; \
    \
    qlimbs = s->size - 3; \
    q = SPEED_TMP_ALLOC_LIMBS (qlimbs, s->align_wp); \
    \
    /* a is declared as a destination: it is rewritten on every pass */ \
    speed_operand_dst (s, a, s->size); \
    speed_operand_src (s, d, 3); \
    speed_operand_dst (s, q, qlimbs); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        MPN_COPY (a, s->xp, s->size); \
        function (q, a, s->size, d, 3); \
      } \
    while (--left != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return elapsed; \
  }
 
957
 
 
958
 
 
959
/* A remainder 2*s->size by s->size limbs.

   Function body timing function (r, a, d).  Expects a variable `s' in
   scope.  d is set up from s->yp and a from s->xp with mpz_init_set_n; a is
   then reduced mod d, shifted up by s->size limbs, and s->xp is copied
   into its low limbs.  Requires s->size >= 1.

   The three mpz_t temporaries are cleared once the clock has been stopped,
   so repeated measurements do not leak memory. */
#define SPEED_ROUTINE_MPZ_MOD(function) \
  { \
    unsigned   i; \
    mpz_t      a, d, r; \
    double     t; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    mpz_init_set_n (d, s->yp, s->size); \
    \
    /* high part less than d, low part a duplicate copied in */ \
    mpz_init_set_n (a, s->xp, s->size); \
    mpz_mod (a, a, d); \
    mpz_mul_2exp (a, a, BITS_PER_MP_LIMB * s->size); \
    MPN_COPY (PTR(a), s->xp, s->size); \
    \
    mpz_init (r); \
    \
    speed_operand_src (s, PTR(a), SIZ(a)); \
    speed_operand_src (s, PTR(d), SIZ(d)); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      function (r, a, d); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    /* release the temporaries outside the timed region */ \
    mpz_clear (r); \
    mpz_clear (a); \
    mpz_clear (d); \
    return t; \
  }
 
989
 
 
990
 
 
991
/* Function body timing function (cp, mp, s->size, Nprim, tp).  Expects a
   variable `s' in scope.  Temporaries:
     ap  2*s->size+1 limbs; s->xp is copied into limbs [0, s->size) and
         again into [s->size, 2*s->size)
     mp  modulus, s->size limbs copied from s->yp with the low bit set
     cp  result space, s->size limbs
     tp  2*s->size+1 limbs, refreshed from ap (2*s->size limbs) inside the
         timed loop before every call, so that copy is part of the
         measured time
   Nprim is obtained from modlimb_invert (Nprim, mp[0]).  Requires
   s->size >= 1. */
#define SPEED_ROUTINE_REDC(function) \
  { \
    mp_ptr cp, mp, tp, ap; \
    mp_limb_t Nprim; \
    unsigned left; \
    double elapsed; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    ap = SPEED_TMP_ALLOC_LIMBS (2*s->size+1, s->align_xp); \
    mp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_yp); \
    cp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
    tp = SPEED_TMP_ALLOC_LIMBS (2*s->size+1, s->align_wp2); \
    \
    MPN_COPY (ap, s->xp, s->size); \
    MPN_COPY (ap+s->size, s->xp, s->size); \
    \
    /* modulus must be odd */ \
    MPN_COPY (mp, s->yp, s->size); \
    mp[0] |= 1; \
    modlimb_invert (Nprim, mp[0]); \
    \
    speed_operand_src (s, ap, 2*s->size+1); \
    speed_operand_dst (s, tp, 2*s->size+1); \
    speed_operand_src (s, mp, s->size); \
    speed_operand_dst (s, cp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (cp, mp, s->size, Nprim, tp); \
      } \
    while (--left != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return elapsed; \
  }
 
1032
 
 
1033
 
 
1034
/* Function body timing function (s->xp, s->size).  Expects a variable `s'
   in scope.  Requires s->size >= 1.  The return values are summed in an
   unsigned long and the sum, cast to mp_limb_t, is passed to noop_1 before
   the clock is stopped. */
#define SPEED_ROUTINE_MPN_POPCOUNT(function) \
  { \
    unsigned long total = 0; \
    unsigned left; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        total += function (s->xp, s->size); \
      } \
    while (--left != 0); \
    \
    noop_1 ((mp_limb_t) total); \
    return speed_endtime (); \
  }
 
1053
 
 
1054
/* Function body timing function (s->xp, s->yp, s->size).  Expects a
   variable `s' in scope.  Requires s->size >= 1.  The return values are
   summed in an unsigned long and the sum, cast to mp_limb_t, is passed to
   noop_1 before the clock is stopped. */
#define SPEED_ROUTINE_MPN_HAMDIST(function) \
  { \
    unsigned long total = 0; \
    unsigned left; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        total += function (s->xp, s->yp, s->size); \
      } \
    while (--left != 0); \
    \
    noop_1 ((mp_limb_t) total); \
    return speed_endtime (); \
  }
 
1074
 
 
1075
 
 
1076
/* Function body timing function (z, s->size), where z is a freshly
   initialized mpz_t and s->size is used as the integer argument rather
   than as a limb count.  Expects a variable `s' in scope.  Requires
   s->size >= 0.  z is cleared after the clock has been stopped. */
#define SPEED_ROUTINE_MPZ_UI(function) \
  { \
    mpz_t z; \
    unsigned left; \
    double elapsed; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    mpz_init (z); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        function (z, s->size); \
      } \
    while (--left != 0); \
    elapsed = speed_endtime (); \
    \
    mpz_clear (z); \
    return elapsed; \
  }
 
1096
 
 
1097
/* The FAC_UI, FIB_UI and LUCNUM_UI measurements all take the same
   (mpz_t result, s->size) form, so they are plain aliases for
   SPEED_ROUTINE_MPZ_UI.  */
#define SPEED_ROUTINE_MPZ_FAC_UI(fn)     SPEED_ROUTINE_MPZ_UI (fn)
#define SPEED_ROUTINE_MPZ_FIB_UI(fn)     SPEED_ROUTINE_MPZ_UI (fn)
#define SPEED_ROUTINE_MPZ_LUCNUM_UI(fn)  SPEED_ROUTINE_MPZ_UI (fn)
 
1100
 
 
1101
 
 
1102
/* Time `function (z, z2, n)' with n = s->size.  Two mpz_t results are
   initialised up front, the routine is run s->reps times between
   speed_starttime() and speed_endtime(), then both results are cleared and
   the measured time is returned.  */
#define SPEED_ROUTINE_MPZ_2_UI(function)        \
  {                                             \
    double    elapsed;                          \
    unsigned  remaining;                        \
    mpz_t     first, second;                    \
                                                \
    SPEED_RESTRICT_COND (s->size >= 0);         \
                                                \
    mpz_init (first);                           \
    mpz_init (second);                          \
                                                \
    speed_starttime ();                         \
    remaining = s->reps;                        \
    do                                          \
      {                                         \
        function (first, second, s->size);      \
      }                                         \
    while (--remaining != 0);                   \
    elapsed = speed_endtime ();                 \
                                                \
    mpz_clear (first);                          \
    mpz_clear (second);                         \
    return elapsed;                             \
  }
 
1124
 
 
1125
/* The FIB2_UI and LUCNUM2_UI measurements both take the two-result
   (mpz_t, mpz_t, s->size) form, so they alias SPEED_ROUTINE_MPZ_2_UI.  */
#define SPEED_ROUTINE_MPZ_FIB2_UI(fn)     SPEED_ROUTINE_MPZ_2_UI (fn)
#define SPEED_ROUTINE_MPZ_LUCNUM2_UI(fn)  SPEED_ROUTINE_MPZ_2_UI (fn)
 
1127
 
 
1128
 
 
1129
/* Time `function (fp, f1p, n)' with n = s->size.  Both destination areas
   are temporaries of MPN_FIB2_SIZE (s->size) limbs, allocated with the
   alignments s->align_xp and s->align_yp respectively.  The routine runs
   s->reps times inside the timed region; the temporaries are released
   before the measured time is returned.  */
#define SPEED_ROUTINE_MPN_FIB2_UI(function)                     \
  {                                                             \
    mp_ptr     fp, f1p;                                         \
    mp_size_t  limbs;                                           \
    unsigned   remaining;                                       \
    double     elapsed;                                         \
    TMP_DECL (marker);                                          \
                                                                \
    SPEED_RESTRICT_COND (s->size >= 0);                         \
                                                                \
    TMP_MARK (marker);                                          \
    limbs = MPN_FIB2_SIZE (s->size);                            \
    fp  = SPEED_TMP_ALLOC_LIMBS (limbs, s->align_xp);           \
    f1p = SPEED_TMP_ALLOC_LIMBS (limbs, s->align_yp);           \
                                                                \
    speed_starttime ();                                         \
    remaining = s->reps;                                        \
    do                                                          \
      {                                                         \
        function (fp, f1p, s->size);                            \
      }                                                         \
    while (--remaining != 0);                                   \
    elapsed = speed_endtime ();                                 \
                                                                \
    TMP_FREE (marker);                                          \
    return elapsed;                                             \
  }
 
1154
 
 
1155
 
 
1156
 
 
1157
/* Time `function (r, b, e, m)', i.e. b^e mod m.  b is taken from
   s->xp,s->size and m from s->yp,s->size, with bit 0 of m set so that m is
   odd and redc can be used.  e is the first 6 limbs of s->xp_block; its
   size is kept fixed so the calculation doesn't take too long.  */
#define SPEED_ROUTINE_MPZ_POWM(function)                \
  {                                                     \
    mpz_t     res, b, e, m;                             \
    unsigned  remaining;                                \
    double    elapsed;                                  \
                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                 \
                                                        \
    mpz_init (res);                                     \
    mpz_init_set_n (b, s->xp, s->size);                 \
    mpz_init_set_n (m, s->yp, s->size);                 \
    mpz_setbit (m, 0);           /* m must be odd */    \
    mpz_init_set_n (e, s->xp_block, 6);                 \
                                                        \
    speed_starttime ();                                 \
    remaining = s->reps;                                \
    do                                                  \
      {                                                 \
        function (res, b, e, m);                        \
      }                                                 \
    while (--remaining != 0);                           \
    elapsed = speed_endtime ();                         \
                                                        \
    mpz_clear (res);                                    \
    mpz_clear (b);                                      \
    mpz_clear (e);                                      \
    mpz_clear (m);                                      \
    return elapsed;                                     \
  }
 
1187
 
 
1188
/* Time `function (r, b, e, m)' computing (m-2)^e mod m.  m is taken from
   s->xp,s->size and forced odd, b is m-2, and the exponent e is the largest
   unsigned long divided by 3, i.e. the alternating bit pattern 0x5555...55
   (not 0xAAAA...AA).  */
#define SPEED_ROUTINE_MPZ_POWM_UI(function)             \
  {                                                     \
    mpz_t          r, b, m;                             \
    unsigned long  e = (~ (unsigned long) 0) / 3;       \
    unsigned       i;                                   \
    double         t;                                   \
                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                 \
                                                        \
    mpz_init (r);                                       \
                                                        \
    /* Force m to odd.  mpz_setbit is used, as in       \
       SPEED_ROUTINE_MPZ_POWM, rather than or-ing into  \
       PTR(m)[0] directly: a direct limb write leaves   \
       SIZ(m) untouched, so if the source limbs were    \
       all zero m would presumably still be zero        \
       (assuming mpz_set_n normalizes -- TODO confirm). \
       */                                               \
    mpz_init (m);                                       \
    mpz_set_n (m, s->xp, s->size);                      \
    mpz_setbit (m, 0);                                  \
                                                        \
    mpz_init_set (b, m);                                \
    mpz_sub_ui (b, b, 2);                               \
                                                        \
    i = s->reps;                                        \
    speed_starttime ();                                 \
    do                                                  \
      function (r, b, e, m);                            \
    while (--i != 0);                                   \
    t = speed_endtime ();                               \
                                                        \
    mpz_clear (r);                                      \
    mpz_clear (b);                                      \
    mpz_clear (m);                                      \
    return t;                                           \
  }
 
1220
 
 
1221
 
 
1222
/* Time the statement given as the macro argument, which may refer to the
   locals wp, wp2 (destinations, s->size limbs each) and xp, yp (sources).
   s->r selects how sources and destinations overlap:

       r == 0   xp = s->xp   yp = s->yp    (no overlap)
       r == 1   xp = wp
       r == 2                yp = wp2
       r == 3   xp = wp      yp = wp2
       r == 4   xp = wp2     yp = wp

   Any other r releases the temporaries and returns -1.0.  When a source is
   redirected onto a destination the original source data is copied there
   first.  */
#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call)                     \
  {                                                             \
    mp_ptr    wp, wp2, xp, yp;                                  \
    unsigned  remaining;                                        \
    double    elapsed;                                          \
    TMP_DECL (marker);                                          \
                                                                \
    SPEED_RESTRICT_COND (s->size >= 0);                         \
                                                                \
    TMP_MARK (marker);                                          \
    wp  = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp);         \
    wp2 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2);        \
    xp = s->xp;                                                 \
    yp = s->yp;                                                 \
                                                                \
    switch (s->r)                                               \
      {                                                         \
      case 0:                         break;                    \
      case 1:   xp = wp;              break;                    \
      case 2:   yp = wp2;             break;                    \
      case 3:   xp = wp;   yp = wp2;  break;                    \
      case 4:   xp = wp2;  yp = wp;   break;                    \
      default:                                                  \
        TMP_FREE (marker);                                      \
        return -1.0;                                            \
      }                                                         \
                                                                \
    if (xp != s->xp)                                            \
      MPN_COPY (xp, s->xp, s->size);                            \
    if (yp != s->yp)                                            \
      MPN_COPY (yp, s->yp, s->size);                            \
                                                                \
    speed_operand_src (s, xp, s->size);                         \
    speed_operand_src (s, yp, s->size);                         \
    speed_operand_dst (s, wp, s->size);                         \
    speed_operand_dst (s, wp2, s->size);                        \
    speed_cache_fill (s);                                       \
                                                                \
    speed_starttime ();                                         \
    remaining = s->reps;                                        \
    do                                                          \
      {                                                         \
        call;                                                   \
      }                                                         \
    while (--remaining != 0);                                   \
    elapsed = speed_endtime ();                                 \
                                                                \
    TMP_FREE (marker);                                          \
    return elapsed;                                             \
  }
 
1265
 
 
1266
/* Measure `function (wp, wp2, xp, yp, s->size)' via
   SPEED_ROUTINE_MPN_ADDSUB_CALL.  No trailing semicolon is baked into the
   expansion, matching the other wrapper macros in this file; a caller that
   writes one after the invocation still gets valid code.  */
#define SPEED_ROUTINE_MPN_ADDSUB_N(function)    \
  SPEED_ROUTINE_MPN_ADDSUB_CALL                 \
    (function (wp, wp2, xp, yp, s->size))
 
1269
 
 
1270
/* Measure `function (wp, wp2, xp, yp, s->size, 0)' via
   SPEED_ROUTINE_MPN_ADDSUB_CALL; the final argument is always passed as 0.
   No trailing semicolon is baked into the expansion, matching the other
   wrapper macros in this file.  */
#define SPEED_ROUTINE_MPN_ADDSUB_NC(function)   \
  SPEED_ROUTINE_MPN_ADDSUB_CALL                 \
    (function (wp, wp2, xp, yp, s->size, 0))
 
1273
 
 
1274
 
 
1275
/* Time an Nx1 gcd: `function (xp, s->size, s->r)', with s->r required to be
   non-zero.  xp is a private copy of s->xp whose low limb is or-ed with 1
   when the whole operand is zero, so the routine never sees a zero N-limb
   operand.  The return values are summed and passed to noop_1.  */
#define SPEED_ROUTINE_MPN_GCD_1N(function)              \
  {                                                     \
    mp_ptr    xp;                                       \
    unsigned  i;                                        \
    double    t;                                        \
    mp_limb_t dummy = 0;                                \
    TMP_DECL (marker);                                  \
                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                 \
    SPEED_RESTRICT_COND (s->r != 0);                    \
                                                        \
    TMP_MARK (marker);                                  \
    xp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp);  \
    MPN_COPY (xp, s->xp, s->size);                      \
    xp[0] |= refmpn_zero_p (xp, s->size);               \
                                                        \
    /* Register the buffer actually handed to the       \
       routine (the copy), not s->xp, as the other      \
       macros that work on a copy do.  Presumably this  \
       is what speed_cache_fill acts on -- TODO         \
       confirm.  */                                     \
    speed_operand_src (s, xp, s->size);                 \
    speed_cache_fill (s);                               \
                                                        \
    speed_starttime ();                                 \
    i = s->reps;                                        \
    do                                                  \
      dummy += function (xp, s->size, s->r);            \
    while (--i != 0);                                   \
    t = speed_endtime ();                               \
                                                        \
    noop_1 (dummy);                                     \
    TMP_FREE (marker);                                  \
    return t;                                           \
  }
 
1306
 
 
1307
 
 
1308
/* Time SPEED_BLOCK_SIZE single-limb operations per repetition.  The x
   operands (copied from s->xp_block) are masked to s->size bits and the y
   operands (copied from s->yp_block) to s->r bits, or to s->size bits when
   s->r is 0; an operand that becomes zero is replaced by 1.  `setup' runs
   once per index i after that masking and may refer to px[i] and py[i].
   `call' is evaluated for j = SPEED_BLOCK_SIZE down to 1 and may refer to
   px[j-1] and py[j-1]; its values are summed and handed to noop_1.
   s->time_divisor is set to SPEED_BLOCK_SIZE.  */

#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call)               \
  {                                                             \
    mp_ptr    px, py;                                           \
    mp_limb_t xmask, ymask;                                     \
    mp_limb_t sink = 0;                                         \
    unsigned  i, j;                                             \
    double    elapsed;                                          \
    TMP_DECL (marker);                                          \
                                                                \
    SPEED_RESTRICT_COND (s->size >= 1);                         \
    SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb);          \
                                                                \
    TMP_MARK (marker);                                          \
    px = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_xp); \
    py = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_yp); \
    MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE);               \
    MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE);               \
                                                                \
    xmask = MP_LIMB_T_LOWBITMASK (s->size);                     \
    ymask = MP_LIMB_T_LOWBITMASK (s->r == 0 ? s->size : s->r);  \
    for (i = 0; i < SPEED_BLOCK_SIZE; i++)                      \
      {                                                         \
        px[i] &= xmask;                                         \
        if (px[i] == 0)                                         \
          px[i] = 1;                                            \
        py[i] &= ymask;                                         \
        if (py[i] == 0)                                         \
          py[i] = 1;                                            \
        setup;                                                  \
      }                                                         \
                                                                \
    speed_operand_src (s, px, SPEED_BLOCK_SIZE);                \
    speed_operand_src (s, py, SPEED_BLOCK_SIZE);                \
    speed_cache_fill (s);                                       \
                                                                \
    speed_starttime ();                                         \
    i = s->reps;                                                \
    do                                                          \
      {                                                         \
        j = SPEED_BLOCK_SIZE;                                   \
        do                                                      \
          sink += call;                                         \
        while (--j != 0);                                       \
      }                                                         \
    while (--i != 0);                                           \
    elapsed = speed_endtime ();                                 \
                                                                \
    TMP_FREE (marker);                                          \
                                                                \
    noop_1 (sink);                                              \
    s->time_divisor = SPEED_BLOCK_SIZE;                         \
    return elapsed;                                             \
  }
 
1361
 
 
1362
/* One-limb gcd: no extra per-operand setup (the first argument is left
   empty), and each timed call is `fn (&px[j-1], 1, py[j-1])'.  */
#define SPEED_ROUTINE_MPN_GCD_1(fn)             \
  SPEED_ROUTINE_MPN_GCD_1_CALL                  \
    ( , fn (&px[j-1], 1, py[j-1]))
 
1364
 
 
1365
/* Single-limb Jacobi symbol base case: each timed call is
   `fn (px[j-1], py[j-1], 0)'.  The per-operand setup reduces x modulo y,
   then sets the low bit of both and bumps y from 1 to 3.
   NOTE(review): when y is odd and x mod y equals y-1, setting the low bit
   of x makes x == y, so the stated x<y requirement is not strictly
   guaranteed -- confirm whether that matters to the measured routine.  */
#define SPEED_ROUTINE_MPN_JACBASE(fn)           \
  SPEED_ROUTINE_MPN_GCD_1_CALL                  \
    ({                                          \
       /* require x<y, y odd, y!=1 */           \
       px[i] %= py[i];                          \
       py[i] |= 1;                              \
       px[i] |= 1;                              \
       if (py[i] == 1)                          \
         py[i] = 3;                             \
     },                                         \
     fn (px[j-1], py[j-1], 0))
 
1375
 
 
1376
 
 
1377
/* Run some GCDs of s->size limbs each.  The number of different data values
   is decreased as s->size**2, since GCD is a quadratic algorithm.
   SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
   though, because the plain gcd is about twice as fast as gcdext.

   `datafactor' scales the number of operand pairs ("pieces"); the count is
   clamped to between 1 and SPEED_BLOCK_SIZE / s->size.  `call' is a
   statement that may refer to the locals wp, wp2, xtmp and ytmp; xtmp and
   ytmp are refreshed from the current operand pair before every call.
   s->time_divisor is set to the number of pieces.  */

#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call)            \
  {                                                             \
    unsigned  i;                                                \
    mp_size_t j, pieces, psize;                                 \
    mp_ptr    wp, wp2, xtmp, ytmp, px, py;                      \
    double    t;                                                \
    TMP_DECL (marker);                                          \
                                                                \
    SPEED_RESTRICT_COND (s->size >= 1);                         \
                                                                \
    TMP_MARK (marker);                                          \
    xtmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_xp);      \
    ytmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_yp);      \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp);        \
    wp2 = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp2);      \
                                                                \
    /* datafactor is parenthesized so that an expression        \
       argument keeps its intended precedence */                \
    pieces = SPEED_BLOCK_SIZE * (datafactor)                    \
             / s->size / s->size;                               \
    pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size);          \
    pieces = MAX (pieces, 1);                                   \
                                                                \
    psize = pieces * s->size;                                   \
    px = TMP_ALLOC_LIMBS (psize);                               \
    py = TMP_ALLOC_LIMBS (psize);                               \
    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);      \
    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);      \
                                                                \
    /* y must be odd, x must have at least as many bits as y,   \
       high limbs must be non-zero */                           \
    for (j = 0; j < pieces; j++)                                \
      {                                                         \
        mp_ptr  x = px+j*s->size;                               \
        mp_ptr  y = py+j*s->size;                               \
        y[0] |= 1;                                              \
        if (x[s->size-1] == 0) x[s->size-1] = 1;                \
        if (y[s->size-1] == 0) y[s->size-1] = 1;                \
        x[s->size-1] = MAX (x[s->size-1], y[s->size-1]);        \
      }                                                         \
                                                                \
    speed_operand_src (s, px, psize);                           \
    speed_operand_src (s, py, psize);                           \
    speed_operand_dst (s, xtmp, s->size);                       \
    speed_operand_dst (s, ytmp, s->size);                       \
    speed_operand_dst (s, wp, s->size);                         \
    speed_cache_fill (s);                                       \
                                                                \
    speed_starttime ();                                         \
    i = s->reps;                                                \
    do                                                          \
      {                                                         \
        j = pieces;                                             \
        do                                                      \
          {                                                     \
            MPN_COPY (xtmp, px+(j-1)*s->size, s->size);         \
            MPN_COPY (ytmp, py+(j-1)*s->size, s->size);         \
            call;                                               \
          }                                                     \
        while (--j != 0);                                       \
      }                                                         \
    while (--i != 0);                                           \
    t = speed_endtime ();                                       \
                                                                \
    TMP_FREE (marker);                                          \
                                                                \
    s->time_divisor = pieces;                                   \
    return t;                                                   \
  }
 
1448
 
 
1449
/* Plain gcd: data factor 8, each timed call being
   `fn (wp, xtmp, s->size, ytmp, s->size)'.  */
#define SPEED_ROUTINE_MPN_GCD(fn)                               \
  SPEED_ROUTINE_MPN_GCD_CALL                                    \
    (8, fn (wp, xtmp, s->size, ytmp, s->size))
 
1451
 
 
1452
/* Extended gcd: data factor 4.  Each timed call is
   `fn (wp, wp2, &size, xtmp, s->size, ytmp, s->size)', where the size
   written through the third argument goes to a block-local variable and
   is discarded.  */
#define SPEED_ROUTINE_MPN_GCDEXT(fn)                                    \
  SPEED_ROUTINE_MPN_GCD_CALL                                            \
    (4,                                                                 \
     {                                                                  \
       mp_size_t  wp2n;                                                 \
       fn (wp, wp2, &wp2n, xtmp, s->size, ytmp, s->size);               \
     })
 
1456
 
 
1457
 
 
1458
/* Time `function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size)' on
   operands of s->size limbs.  xtmp and ytmp start as copies of s->xp and
   s->yp; before every call their top three limbs are overwritten with the
   next three-limb group from copies of s->xp_block / s->yp_block, and the
   low bit of ytmp is set.  There are SPEED_BLOCK_SIZE/3 such groups
   ("pieces") and s->time_divisor is set to that count.

   Because the top three limbs of xtmp and ytmp are addressed as
   &xtmp[s->size-3] and &ytmp[s->size-3], s->size must be at least 3;
   smaller sizes would write before the start of those buffers, so they are
   rejected.  */
#define SPEED_ROUTINE_MPN_GCDEXT_ONE(function)                          \
  {                                                                     \
    unsigned  i;                                                        \
    mp_size_t j, pieces, psize, wp2size;                                \
    mp_ptr    wp, wp2, xtmp, ytmp, px, py;                              \
    double    t;                                                        \
    TMP_DECL (marker);                                                  \
                                                                        \
    /* the top three limbs are replaced below, so need size >= 3 */     \
    SPEED_RESTRICT_COND (s->size >= 3);                                 \
                                                                        \
    TMP_MARK (marker);                                                  \
                                                                        \
    xtmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_xp);              \
    ytmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_yp);              \
    MPN_COPY (xtmp, s->xp, s->size);                                    \
    MPN_COPY (ytmp, s->yp, s->size);                                    \
                                                                        \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp);                \
    wp2 = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp2);              \
                                                                        \
    pieces = SPEED_BLOCK_SIZE / 3;                                      \
    psize = 3 * pieces;                                                 \
    px = TMP_ALLOC_LIMBS (psize);                                       \
    py = TMP_ALLOC_LIMBS (psize);                                       \
    MPN_COPY (px, s->xp_block, psize);                                  \
    MPN_COPY (py, s->yp_block, psize);                                  \
                                                                        \
    /* x must have at least as many bits as y,                          \
       high limbs must be non-zero */                                   \
    for (j = 0; j < pieces; j++)                                        \
      {                                                                 \
        mp_ptr  x = px+3*j;                                             \
        mp_ptr  y = py+3*j;                                             \
        x[2] += (x[2] == 0);                                            \
        y[2] += (y[2] == 0);                                            \
        if (x[2] < y[2])                                                \
          MP_LIMB_T_SWAP (x[2], y[2]);                                  \
      }                                                                 \
                                                                        \
    speed_operand_src (s, px, psize);                                   \
    speed_operand_src (s, py, psize);                                   \
    speed_operand_dst (s, xtmp, s->size);                               \
    speed_operand_dst (s, ytmp, s->size);                               \
    speed_operand_dst (s, wp, s->size);                                 \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      {                                                                 \
        mp_ptr  x = px;                                                 \
        mp_ptr  y = py;                                                 \
        mp_ptr  xth = &xtmp[s->size-3];                                 \
        mp_ptr  yth = &ytmp[s->size-3];                                 \
        j = pieces;                                                     \
        do                                                              \
          {                                                             \
            xth[0] = x[0], xth[1] = x[1], xth[2] = x[2];                \
            yth[0] = y[0], yth[1] = y[1], yth[2] = y[2];                \
                                                                        \
            ytmp[0] |= 1; /* y must be odd, */                          \
                                                                        \
            function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \
                                                                        \
            x += 3;                                                     \
            y += 3;                                                     \
          }                                                             \
        while (--j != 0);                                               \
      }                                                                 \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE (marker);                                                  \
                                                                        \
    s->time_divisor = pieces;                                           \
    return t;                                                           \
  }
 
1535
 
 
1536
/* Time `function (a, b)' on mpz_t operands of s->size limbs each.  The
   operand data is a block of SPEED_BLOCK_SIZE / s->size pairs (at least
   one) copied from s->xp_block / s->yp_block, or from s->xp / s->yp when
   there is only one pair.  Every y has its low bit set and every x and y
   has a zero high limb replaced by 1.  a and b are never mpz_init'ed: only
   their size and pointer fields are filled in, the pointer being moved
   from pair to pair inside the timed loop.  s->time_divisor is set to the
   number of pairs and the summed results go to noop_1.

   s->size must be at least 1: the setup loop writes y[0] and inspects
   x[s->size-1] and y[s->size-1], which would fall outside zero-length
   allocations for s->size == 0.  */
#define SPEED_ROUTINE_MPZ_JACOBI(function)                      \
  {                                                             \
    mpz_t     a, b;                                             \
    unsigned  i;                                                \
    mp_size_t j, pieces, psize;                                 \
    mp_ptr    px, py;                                           \
    double    t;                                                \
    int       dummy = 0;                                        \
    TMP_DECL (marker);                                          \
                                                                \
    /* the setup below indexes limb s->size-1 */                \
    SPEED_RESTRICT_COND (s->size >= 1);                         \
                                                                \
    TMP_MARK (marker);                                          \
    pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1);               \
    pieces = MAX (pieces, 1);                                   \
    s->time_divisor = pieces;                                   \
                                                                \
    psize = pieces * s->size;                                   \
    px = TMP_ALLOC_LIMBS (psize);                               \
    py = TMP_ALLOC_LIMBS (psize);                               \
    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);      \
    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);      \
                                                                \
    for (j = 0; j < pieces; j++)                                \
      {                                                         \
        mp_ptr  x = px+j*s->size;                               \
        mp_ptr  y = py+j*s->size;                               \
                                                                \
        /* y odd */                                             \
        y[0] |= 1;                                              \
                                                                \
        /* high limbs non-zero */                               \
        if (x[s->size-1] == 0) x[s->size-1] = 1;                \
        if (y[s->size-1] == 0) y[s->size-1] = 1;                \
      }                                                         \
                                                                \
    SIZ(a) = s->size;                                           \
    SIZ(b) = s->size;                                           \
                                                                \
    speed_operand_src (s, px, psize);                           \
    speed_operand_src (s, py, psize);                           \
    speed_cache_fill (s);                                       \
                                                                \
    speed_starttime ();                                         \
    i = s->reps;                                                \
    do                                                          \
      {                                                         \
        j = pieces;                                             \
        do                                                      \
          {                                                     \
            PTR(a) = px+(j-1)*s->size;                          \
            PTR(b) = py+(j-1)*s->size;                          \
            dummy += function (a, b);                           \
          }                                                     \
        while (--j != 0);                                       \
      }                                                         \
    while (--i != 0);                                           \
    t = speed_endtime ();                                       \
                                                                \
    noop_1 ((mp_limb_t) dummy);                                 \
    TMP_FREE (marker);                                          \
    return t;                                                   \
  }
 
1597
 
 
1598
/* Measure "function (quot, 0, dividend, s->size, divisor)" repeated s->reps
   times.  dividend is a private s->size limb copy of s->xp, divisor is two
   limbs taken from s->yp_block with the high bit of the top limb forced on,
   and quot is an s->size limb destination.  Restricted to s->size >= 2.
   Expands to a function body using "s" and returning the speed_endtime()
   result.  */
#define SPEED_ROUTINE_MPN_DIVREM_2(function)                            \
  {                                                                     \
    mp_limb_t  divisor[2];                                              \
    mp_ptr     quot, dividend;                                          \
    double     elapsed;                                                 \
    unsigned   count;                                                   \
    TMP_DECL (marker);                                                  \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 2);                                 \
                                                                        \
    TMP_MARK (marker);                                                  \
    dividend = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp);            \
    quot = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp);                \
                                                                        \
    /* the source operand gets destroyed, so work on a copy */          \
    MPN_COPY (dividend, s->xp, s->size);                                \
                                                                        \
    /* the divisor has to be normalized, hence the forced high bit */   \
    MPN_COPY (divisor, s->yp_block, 2);                                 \
    divisor[1] |= MP_LIMB_T_HIGHBIT;                                    \
                                                                        \
    speed_operand_src (s, dividend, s->size);                           \
    speed_operand_src (s, divisor, 2);                                  \
    speed_operand_dst (s, quot, s->size);                               \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    count = s->reps;                                                    \
    do                                                                  \
      {                                                                 \
        function (quot, 0, dividend, s->size, divisor);                 \
      }                                                                 \
    while (--count != 0);                                               \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE (marker);                                                  \
    return elapsed;                                                     \
  }
 
1634
 
 
1635
 
 
1636
/* Measure "function (val, val)" on a running limb value.  val starts at 1
   and, before every call, has twice a limb of s->xp_block added to it.
   Each of the s->reps passes makes SPEED_BLOCK_SIZE calls, and
   s->time_divisor is set to SPEED_BLOCK_SIZE to match.  Expands to a
   function body using "s".  */
#define SPEED_ROUTINE_MODLIMB_INVERT(function)                          \
  {                                                                     \
    mp_limb_t  val = 1;                                                 \
    mp_ptr     data;                                                    \
    unsigned   count, k;                                                \
    double     elapsed;                                                 \
                                                                        \
    /* one limb below the block, so k = SPEED_BLOCK_SIZE .. 1 below */  \
    /* indexes exactly the limbs of s->xp_block */                      \
    data = s->xp_block - 1;                                             \
                                                                        \
    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);               \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    count = s->reps;                                                    \
    do                                                                  \
      {                                                                 \
        k = SPEED_BLOCK_SIZE;                                           \
        do                                                              \
          {                                                             \
            /* randomized, but each value depends on the previous */    \
            val += (data[k] << 1);                                      \
                                                                        \
            function (val, val);                                        \
          }                                                             \
        while (--k != 0);                                               \
      }                                                                 \
    while (--count != 0);                                               \
    elapsed = speed_endtime ();                                         \
                                                                        \
    /* pass val on so the compiler can't optimize it away */            \
    noop_1 (val);                                                       \
                                                                        \
    s->time_divisor = SPEED_BLOCK_SIZE;                                 \
    return elapsed;                                                     \
  }
 
1671
 
 
1672
 
 
1673
/* Measure "function (dst1, dst2, s->xp, s->size)" repeated s->reps times,
   where dst1 and dst2 are separate s->size limb destinations.  Restricted
   to s->size >= 1.  Expands to a function body using "s" and returning
   the speed_endtime() result.  */
#define SPEED_ROUTINE_MPN_SQRTREM(function)                             \
  {                                                                     \
    mp_ptr    dst1, dst2;                                               \
    double    elapsed;                                                  \
    unsigned  count;                                                    \
    TMP_DECL (marker);                                                  \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    TMP_MARK (marker);                                                  \
    dst1 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp);                \
    dst2 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2);               \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_dst (s, dst1, s->size);                               \
    speed_operand_dst (s, dst2, s->size);                               \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    count = s->reps;                                                    \
    do                                                                  \
      {                                                                 \
        function (dst1, dst2, s->xp, s->size);                          \
      }                                                                 \
    while (--count != 0);                                               \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE (marker);                                                  \
    return elapsed;                                                     \
  }
 
1701
 
 
1702
 
 
1703
/* s->size is the number of limbs in the input and s->r is the base, with
   s->r == 0 selecting decimal.  Every timed iteration first copies s->xp
   into a scratch area and then calls
   "function (str, base, scratch, s->size)".  Restricted to s->size >= 1
   and 2 <= base <= 255.  */
#define SPEED_ROUTINE_MPN_GET_STR(function)                             \
  {                                                                     \
    unsigned char  *str;                                                \
    mp_size_t      str_size;                                            \
    mp_ptr         scratch;                                             \
    double         elapsed;                                             \
    unsigned       count;                                               \
    int            base;                                                \
    TMP_DECL (marker);                                                  \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    base = (s->r != 0 ? s->r : 10);                                     \
    SPEED_RESTRICT_COND (base >= 2 && base <= 255);                     \
                                                                        \
    TMP_MARK (marker);                                                  \
    scratch = SPEED_TMP_ALLOC_LIMBS (s->size + 1, s->align_xp);         \
                                                                        \
    /* bytes for the string: bits of input times chars per bit, */      \
    /* truncated to an integer, plus 2 */                               \
    str_size = ((mp_size_t) (s->size * BITS_PER_MP_LIMB                 \
                * __mp_bases[base].chars_per_bit_exactly)) + 2;         \
    str = TMP_ALLOC (str_size);                                         \
                                                                        \
    /* enable this during development to guard against overflowing */   \
    /* the string buffer */                                             \
    /*                                                                  \
    MPN_COPY (scratch, s->xp, s->size);                                 \
    ASSERT_ALWAYS (mpn_get_str (str, base, scratch, s->size)            \
                   <= str_size);                                        \
    */                                                                  \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_dst (s, scratch, s->size);                            \
    speed_operand_dst (s, (mp_ptr) str, str_size/BYTES_PER_MP_LIMB);    \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    count = s->reps;                                                    \
    do                                                                  \
      {                                                                 \
        MPN_COPY (scratch, s->xp, s->size);                             \
        function (str, base, scratch, s->size);                         \
      }                                                                 \
    while (--count != 0);                                               \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE (marker);                                                  \
    return elapsed;                                                     \
  }
 
1751
 
 
1752
/* s->size controls the number of digits in the input, s->r is the base, or
   decimal by default (s->r == 0).  The digit string is built from the
   limbs of s->xp, each reduced modulo the base, and
   "function (wp, xp, s->size, base)" is then timed over s->reps calls.
   Restricted to s->size >= 1 and 2 <= base <= 255.  Expands to a function
   body using "s" and returning the speed_endtime() result.  */
#define SPEED_ROUTINE_MPN_SET_STR(function)                                  \
  {                                                                          \
    unsigned char *xp;                                                       \
    mp_ptr     wp;                                                           \
    mp_size_t  wsize;                                                        \
    unsigned   i;                                                            \
    int        base;                                                         \
    double     t;                                                            \
    TMP_DECL (marker);                                                       \
                                                                             \
    SPEED_RESTRICT_COND (s->size >= 1);                                      \
                                                                             \
    base = s->r == 0 ? 10 : s->r;                                            \
    SPEED_RESTRICT_COND (base >= 2 && base <= 255);                          \
                                                                             \
    TMP_MARK (marker);                                                       \
                                                                             \
    /* one byte per digit, each digit in the range 0 to base-1 */            \
    xp = TMP_ALLOC (s->size);                                                \
    for (i = 0; i < s->size; i++)                                            \
      xp[i] = s->xp[i] % base;                                               \
                                                                             \
    /* limbs for the result: digits converted to bits, then to limbs, */     \
    /* plus 2 */                                                             \
    wsize = ((mp_size_t) (s->size / __mp_bases[base].chars_per_bit_exactly)) \
      / BITS_PER_MP_LIMB + 2;                                                \
    wp = SPEED_TMP_ALLOC_LIMBS (wsize, s->align_wp);                         \
                                                                             \
    /* use this during development to check wsize is big enough */           \
    /*                                                                       \
    ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wsize);            \
    */                                                                       \
                                                                             \
    speed_operand_src (s, (mp_ptr) xp, s->size/BYTES_PER_MP_LIMB);           \
    /* wp holds wsize limbs; s->size is a digit count, so it must not */     \
    /* be used as the size of the destination */                             \
    speed_operand_dst (s, wp, wsize);                                        \
    speed_cache_fill (s);                                                    \
                                                                             \
    speed_starttime ();                                                      \
    i = s->reps;                                                             \
    do                                                                       \
      function (wp, xp, s->size, base);                                      \
    while (--i != 0);                                                        \
    t = speed_endtime ();                                                    \
                                                                             \
    TMP_FREE (marker);                                                       \
    return t;                                                                \
  }
 
1798
 
 
1799
 
 
1800
/* Run an accel gcd find_a() function over various data values.  A set of
   values is used in case some run particularly fast or slow.  The size
   parameter is ignored, the amount of data tested is fixed at
   SPEED_BLOCK_SIZE two-limb pairs, and s->time_divisor is set to
   SPEED_BLOCK_SIZE to match.  */

#define SPEED_ROUTINE_MPN_GCD_FINDA(function)                           \
  {                                                                     \
    mp_limb_t  pairs[SPEED_BLOCK_SIZE][2];                              \
    mp_limb_t  sink = 0;                                                \
    mp_limb_t  high;                                                    \
    unsigned   count, k;                                                \
    double     elapsed;                                                 \
    TMP_DECL (marker);                                                  \
                                                                        \
    TMP_MARK (marker);                                                  \
                                                                        \
    /* every pair gets an odd low limb and a non-zero high limb */      \
    for (k = 0; k < SPEED_BLOCK_SIZE; k++)                              \
      {                                                                 \
        high = s->yp_block[k];                                          \
        pairs[k][0] = s->xp_block[k] | 1;                               \
        pairs[k][1] = (high != 0 ? high : 1);                           \
      }                                                                 \
                                                                        \
    speed_operand_src (s, &pairs[0][0], 2*SPEED_BLOCK_SIZE);            \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    count = s->reps;                                                    \
    do                                                                  \
      {                                                                 \
        /* walk the pairs from the last down to the first */            \
        k = SPEED_BLOCK_SIZE;                                           \
        do                                                              \
          {                                                             \
            sink += function (pairs[k-1]);                              \
          }                                                             \
        while (--k != 0);                                               \
      }                                                                 \
    while (--count != 0);                                               \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE (marker);                                                  \
                                                                        \
    s->time_divisor = SPEED_BLOCK_SIZE;                                 \
    return elapsed;                                                     \
  }
 
1843
 
 
1844
 
 
1845
/* "call" should do "count_foo_zeros(c,n)".
 
1846
   Give leading=1 if foo is leading zeros, leading=0 for trailing.
 
1847
   Give zero=1 if n=0 is allowed in the call, zero=0 if not.  */
 
1848
 
 
1849
/* First half of a count-zeros measuring function body; it is deliberately
   left open inside the inner loop and must be followed by the statement to
   measure and then SPEED_ROUTINE_COUNT_ZEROS_B, which closes the loops and
   the block.  The statement in between sees the current value in n and
   the int c; c is xored into the next n, so successive iterations depend
   on each other.

   xp is a SPEED_BLOCK_SIZE limb temporary handed to
   speed_routine_count_zeros_setup along with leading and zero (presumably
   to be filled with suitable test data -- see that function).  If the
   setup reports failure the temporary memory is released and -1.0 is
   returned.  */
#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero)                      \
  {                                                                     \
    mp_ptr     xp;                                                      \
    int        i, c;                                                    \
    unsigned   j;                                                       \
    mp_limb_t  n;                                                       \
    double     t;                                                       \
    TMP_DECL (marker);                                                  \
                                                                        \
    TMP_MARK (marker);                                                  \
    xp = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_xp);         \
                                                                        \
    if (! speed_routine_count_zeros_setup (s, xp, leading, zero))       \
      {                                                                 \
        /* don't leave the TMP_MARK unbalanced on the failure path */   \
        TMP_FREE (marker);                                              \
        return -1.0;                                                    \
      }                                                                 \
    speed_operand_src (s, xp, SPEED_BLOCK_SIZE);                        \
    speed_cache_fill (s);                                               \
                                                                        \
    c = 0;                                                              \
    speed_starttime ();                                                 \
    j = s->reps;                                                        \
    do {                                                                \
      for (i = 0; i < SPEED_BLOCK_SIZE; i++)                            \
        {                                                               \
          n = xp[i];                                                    \
          n ^= c;
 
1874
 
 
1875
/* Second half of a count-zeros measuring function body.  Closes the inner
   "for" loop, the s->reps loop and the block opened by
   SPEED_ROUTINE_COUNT_ZEROS_A, then stops the timer, records
   SPEED_BLOCK_SIZE as the time divisor, frees the temporary memory and
   returns the measured time.  Uses the locals c, j, t and marker declared
   by the _A half.  */
#define SPEED_ROUTINE_COUNT_ZEROS_B()                                   \
        }                                                               \
    } while (--j != 0);                                                 \
    t = speed_endtime ();                                               \
                                                                        \
    /* hand c to noop_1 so it doesn't go dead */                        \
    noop_1 (c);                                                         \
                                                                        \
    TMP_FREE (marker);                                                  \
                                                                        \
    s->time_divisor = SPEED_BLOCK_SIZE;                                 \
    return t;                                                           \
  }
 
1888
 
 
1889
/* Complete count-zeros measuring body: the _A half, then the statement
   "stmt" to be measured, then the _B half, all wrapped in a do/while (0).
   is_leading and zero_ok are passed straight through to the _A half.  */
#define SPEED_ROUTINE_COUNT_ZEROS_C(stmt, is_leading, zero_ok)          \
  do                                                                    \
    {                                                                   \
      SPEED_ROUTINE_COUNT_ZEROS_A (is_leading, zero_ok);                \
      stmt;                                                             \
      SPEED_ROUTINE_COUNT_ZEROS_B ();                                   \
    }                                                                   \
  while (0)
 
1895
 
 
1896
/* Count-zeros measurement of the statement "stmt", with leading=1.  */
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(stmt, zero_ok)      \
  SPEED_ROUTINE_COUNT_ZEROS_C (stmt, 1, zero_ok)
 
1898
/* Count-zeros measurement of "counter (c, n)", with leading=1 and
   zero=0.  */
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(counter)              \
  SPEED_ROUTINE_COUNT_ZEROS_C (counter (c, n), 1, 0)
 
1900
 
 
1901
/* Count-zeros measurement of the statement "stmt", with leading=0.  */
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(stmt, zero_ok)     \
  SPEED_ROUTINE_COUNT_ZEROS_C (stmt, 0, zero_ok)
 
1903
/* Count-zeros measurement of "fun (c, n)", with leading=0 and zero=0.
   The parameter is named fun, as in SPEED_ROUTINE_COUNT_LEADING_ZEROS, so
   that the argument given is what actually gets called.  */
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(fun)         \
  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
 
1905
 
 
1906
 
 
1907
/* Time the statement "call", run SPEED_BLOCK_SIZE times in each of s->reps
   passes.  Before every run d is set to dinv xor a limb of s->xp_block,
   with the high bit forced on; "call" may refer to d and dinv.  dinv
   starts at 0 and is handed to noop_1 at the end.  s->time_divisor is set
   to SPEED_BLOCK_SIZE.  Expands to a function body using "s".  */
#define SPEED_ROUTINE_INVERT_LIMB_CALL(call)                            \
  {                                                                     \
    mp_ptr     xp;                                                      \
    mp_limb_t  d, dinv;                                                 \
    unsigned   i, j;                                                    \
                                                                        \
    dinv = 0;                                                           \
    /* one limb below the block, so j = SPEED_BLOCK_SIZE .. 1 below */  \
    /* indexes exactly the limbs of s->xp_block */                      \
    xp = s->xp_block - 1;                                               \
                                                                        \
    s->time_divisor = SPEED_BLOCK_SIZE;                                 \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do {                                                                \
      j = SPEED_BLOCK_SIZE;                                             \
      do {                                                              \
        d = (dinv ^ xp[j]) | MP_LIMB_T_HIGHBIT;                         \
        do { call; } while (0);                                         \
      } while (--j != 0);                                               \
    } while (--i != 0);                                                 \
                                                                        \
    /* use dinv so the compiler can't optimize everything away */       \
    noop_1 (dinv);                                                      \
                                                                        \
    return speed_endtime();                                             \
  }
 
1935
 
 
1936
 
 
1937
#endif
 
1938
 
 
1939
 
 
1940
/* Time s->reps calls of "function ()", which is called with no arguments.
   Expands to a function body using "s" and returning the speed_endtime()
   result.  */
#define SPEED_ROUTINE_MPN_BACK_TO(function)             \
  {                                                     \
    unsigned  count;                                    \
                                                        \
    speed_starttime ();                                 \
    count = s->reps;                                    \
    do                                                  \
      {                                                 \
        function ();                                    \
      }                                                 \
    while (--count != 0);                               \
    return speed_endtime ();                            \
  }
 
1950
 
 
1951
 
 
1952
/* Time the statement "call" repeated s->reps times.  "call" may refer to
   wp, an s->size limb temporary registered as a destination operand, and
   to s.  Restricted to s->size >= 0.  Expands to a function body using
   "s" and returning the speed_endtime() result.  */
#define SPEED_ROUTINE_MPN_ZERO_CALL(call)                       \
  {                                                             \
    double    t;                                                \
    unsigned  i;                                                \
    mp_ptr    wp;                                               \
    TMP_DECL (marker);                                          \
                                                                \
    SPEED_RESTRICT_COND (s->size >= 0);                         \
                                                                \
    TMP_MARK (marker);                                          \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp);          \
    speed_operand_dst (s, wp, s->size);                         \
    speed_cache_fill (s);                                       \
                                                                \
    speed_starttime ();                                         \
    i = s->reps;                                                \
    do                                                          \
      {                                                         \
        call;                                                   \
      }                                                         \
    while (--i != 0);                                           \
    t = speed_endtime ();                                       \
                                                                \
    TMP_FREE (marker);                                          \
    return t;                                                   \
  }
 
1976
 
 
1977
/* Time "zero_fn (wp, s->size)", where wp is the destination set up by
   SPEED_ROUTINE_MPN_ZERO_CALL.  */
#define SPEED_ROUTINE_MPN_ZERO(zero_fn)                         \
  SPEED_ROUTINE_MPN_ZERO_CALL (zero_fn (wp, s->size))