dnl Intel P5 mpn_mod_1 -- mpn by limb remainder.
dnl
dnl P5: 28.0 cycles/limb
dnl
dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl Suite 330, Boston, MA 02111-1307, USA.
dnl Pull in the GMP configure-generated m4 definitions (defframe, deflit,
dnl PROLOGUE/EPILOGUE, PIC handling, etc.) required by everything below.
dnl (Stray numeric residue lines from a bad extraction removed: a bare
dnl number on its own line is neither m4 nor assembler and breaks the build.)
include(`../config.m4')
C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                       mp_limb_t carry);
C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                             mp_limb_t inverse);
C
C This code is not unlike mpn/x86/p6/mod_1.asm, it does the same sort of
C multiply by inverse without on-the-fly shifts.  See that code for some
C further comments.
C
C P5 shldl is 4 cycles, so shifting on the fly would be at least 5 cycles
C slower, probably more depending what it did to register usage.  Using MMX
C on P55 would be better, but still at least 4 or 5 instructions and so 2 or
C 3 cycles or more.
47
dnl These thresholds are the sizes where the multiply by inverse method is
dnl used, rather than plain "divl"s.  Minimum value 2.
dnl
dnl MUL_NORM_THRESHOLD is for an already normalized divisor (high bit set),
dnl MUL_UNNORM_THRESHOLD for an unnormalized divisor.
dnl
dnl With the divl loop at 44 c/l and the inverse at 28 c/l with about 70
dnl cycles to setup, the threshold should be about ceil(70/16)==5, which is
dnl what happens in practice.
dnl
dnl An unnormalized divisor gets an extra 40 cycles at the end for the
dnl final (r*2^n)%(d*2^n) and shift.  This increases the threshold by about
dnl 40/16, ie. 3.
dnl
dnl PIC adds between 4 and 7 cycles (not sure why it varies), but this
dnl doesn't change the thresholds.
dnl
dnl The entry sequence code that chooses between MUL_NORM_THRESHOLD and
dnl MUL_UNNORM_THRESHOLD is a bit horrible, but it adds only 2 cycles
dnl (branch free) and ensures the choice between div or mul is optimal.
dnl Size thresholds above which the multiply-by-inverse path is taken
dnl instead of a plain divl loop.  The ifdef(`PIC',...) form allows a
dnl different value under PIC; currently both cases use the same value.
deflit(MUL_NORM_THRESHOLD,   ifdef(`PIC',5,5))
deflit(MUL_UNNORM_THRESHOLD, ifdef(`PIC',8,8))

dnl Difference between the two thresholds (negative), used by the branch-free
dnl entry sequence to pick the right threshold from the divisor's high bit.
deflit(MUL_NORM_DELTA, eval(MUL_NORM_THRESHOLD - MUL_UNNORM_THRESHOLD))

dnl Stack-frame offsets of the incoming parameters (32-bit cdecl: return
dnl address at 0, first argument at 4).  PARAM_INVERSE and PARAM_CARRY share
dnl offset 16 because they belong to different entry points
dnl (mpn_preinv_mod_1 vs mpn_mod_1c) and are never both present.
defframe(PARAM_INVERSE, 16)  dnl mpn_preinv_mod_1
defframe(PARAM_CARRY,   16)  dnl mpn_mod_1c
defframe(PARAM_DIVISOR, 12)
defframe(PARAM_SIZE,     8)
defframe(PARAM_SRC,      4)

dnl re-using parameter space: once the divisor and size have been loaded
dnl into registers, their stack slots hold the shift count and the
dnl precomputed inverse respectively.
define(VAR_NORM,    `PARAM_DIVISOR')
define(VAR_INVERSE, `PARAM_SIZE')
87
PROLOGUE(mpn_preinv_mod_1)
90
pushl %ebp FRAME_pushl()
91
pushl %esi FRAME_pushl()
96
pushl %edi FRAME_pushl()
97
pushl %ebx FRAME_pushl()
99
movl PARAM_DIVISOR, %ebp
100
movl PARAM_INVERSE, %eax
102
movl -4(%esi,%edx,4), %edi C src high limb
103
leal -8(%esi,%edx,4), %esi C &src[size-2]
108
jnz LF(mpn_mod_1,start_preinv)
110
subl %ebp, %edi C src-divisor
113
sbbl %ecx, %ecx C -1 if underflow
114
movl %edi, %eax C src-divisor
116
andl %ebp, %ecx C d if underflow
119
addl %ecx, %eax C remainder, with possible addback
133
movl PARAM_DIVISOR, %eax
134
movl PARAM_SIZE, %ecx
136
sarl $31, %eax C d highbit
137
movl PARAM_CARRY, %edx
140
jz LF(mpn_mod_1,done_edx) C result==carry if size==0
142
andl $MUL_NORM_DELTA, %eax
143
pushl %ebp FRAME_pushl()
145
addl $MUL_UNNORM_THRESHOLD, %eax C norm or unnorm thresh
146
pushl %esi FRAME_pushl()
149
movl PARAM_DIVISOR, %ebp
152
jb LF(mpn_mod_1,divide_top)
154
movl %edx, %eax C carry as pretend src high limb
155
leal 1(%ecx), %edx C size+1
157
cmpl $0x1000000, %ebp
158
jmp LF(mpn_mod_1,mul_by_inverse_1c)
167
movl PARAM_SIZE, %ecx
168
pushl %ebp FRAME_pushl()
174
movl PARAM_DIVISOR, %ebp
176
sarl $31, %ebp C -1 if divisor normalized
177
movl -4(%eax,%ecx,4), %eax C src high limb
179
movl PARAM_DIVISOR, %edx
180
pushl %esi FRAME_pushl()
182
andl $MUL_NORM_DELTA, %ebp
183
cmpl %edx, %eax C carry flag if high<divisor
185
sbbl %edx, %edx C -1 if high<divisor
186
addl $MUL_UNNORM_THRESHOLD, %ebp C norm or unnorm thresh
188
addl %edx, %ecx C size-1 if high<divisor
192
movl PARAM_DIVISOR, %ebp
195
jae L(mul_by_inverse)
197
andl %eax, %edx C high as initial carry if high<divisor
201
C eax scratch (quotient)
203
C ecx counter, limbs, decrementing
204
C edx scratch (remainder)
209
movl -4(%esi,%ecx,4), %eax
233
C -----------------------------------------------------------------------------
235
C The divisor is normalized using the same code as the pentium
236
C count_leading_zeros in longlong.h. Going through the GOT for PIC costs a
237
C couple of cycles, but is more or less unavoidable.
250
movl PARAM_SIZE, %edx
251
cmpl $0x1000000, %ebp
253
L(mul_by_inverse_1c):
261
pushl %edi FRAME_pushl()
263
pushl %ebx FRAME_pushl()
270
leal 25(,%ecx,8), %ecx C 0,-1,-2,-3 -> 25,17,9,1
273
addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %edi
276
movl __clz_tab@GOT(%edi), %edi
280
movb (%ebx,%edi), %bl
283
leal 25(,%ecx,8), %ecx C 0,-1,-2,-3 -> 25,17,9,1
289
movb __clz_tab(%ebx), %bl
291
movl %eax, %edi C carry -> n1
293
addl %ebx, %ecx C -34 + c + __clz_tab[d>>c] = -clz-1
294
leal -8(%esi,%edx,4), %esi C &src[size-2]
299
ASSERT(e,`pushl %eax C clz calculation same as bsrl
305
shll %cl, %ebp C d normalized
308
subl %ebp, %edx C (b-d)-1, so edx:eax = b*(b-d)-1
311
divl %ebp C floor (b*(b-d)-1) / d
314
movl %eax, VAR_INVERSE
317
movl %ecx, %edx C fake high, will cancel
320
C For mpn_mod_1 and mpn_preinv_mod_1, the initial carry in %edi is the src
321
C high limb, and this may be greater than the divisor and may need one copy
322
C of the divisor subtracted (only one, because the divisor is normalized).
323
C This is accomplished by having the initial ecx:edi act as a fake previous
324
C n2:n10. The initial edx:eax is d, acting as a fake (q1+1)*d which is
325
C subtracted from ecx:edi, with the usual addback if it produces an
330
C eax scratch (n10, n1, q1, etc)
331
C ebx scratch (nadj, src limit)
334
C esi src pointer, &src[size-2] to &src[0]
338
subl %eax, %edi C low n - (q1+1)*d
339
movl (%esi), %eax C new n10
341
sbbl %edx, %ecx C high n - (q1+1)*d, 0 or -1
345
andl %ebp, %ecx C d if underflow
347
addl %edi, %ecx C remainder -> n2, and possible addback
348
ASSERT(b,`cmpl %ebp, %ecx')
349
andl %eax, %ebx C -n1 & d
351
movl (%esi), %edi C n10
354
addl %ecx, %eax C n2+n1
355
addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
357
mull VAR_INVERSE C m*(n2+n1)
359
addl %eax, %ebx C low(m*(n2+n1) + nadj), giving carry flag
360
leal 1(%ecx), %eax C 1+n2
362
adcl %edx, %eax C 1 + high[n2<<32 + m*(n2+n1) + nadj] = q1+1
365
sbbl $0, %eax C use q1 if q1+1 overflows
366
subl $4, %esi C step src ptr
375
C %edi (after subtract and addback) is the remainder modulo d*2^n
376
C and must be reduced to 0<=r<d by calculating r*2^n mod d*2^n and
377
C right shifting by n.
379
C If d was already normalized on entry so that n==0 then nothing is
380
C needed here. This is always the case for preinv_mod_1. For mod_1
381
C or mod_1c the chance of n==0 is low, but about 40 cycles can be
384
subl %eax, %edi C low n - (q1+1)*d
387
sbbl %edx, %ebx C high n - (q1+1)*d, 0 or -1
388
xorl %esi, %esi C next n2
390
andl %ebp, %ebx C d if underflow
393
addl %ebx, %edi C remainder, with possible addback
399
C Here using %esi=n2 and %edi=n10, unlike the above
401
shldl( %cl, %edi, %esi) C n2
405
movl %edi, %eax C n10
406
movl %edi, %ebx C n10
411
andl %ebp, %ebx C -n1 & d
413
addl %esi, %eax C n2+n1
414
addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
416
mull VAR_INVERSE C m*(n2+n1)
418
addl %eax, %ebx C m*(n2+n1) + nadj, low giving carry flag
419
leal 1(%esi), %eax C 1+n2
421
adcl %edx, %eax C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
423
sbbl $0, %eax C use q1 if q1+1 overflows
427
subl %eax, %edi C low n - (q1+1)*d
430
sbbl %edx, %esi C high n - (q1+1)*d, 0 or -1
433
andl %ebp, %esi C d if underflow
436
addl %esi, %eax C addback if underflow
439
shrl %cl, %eax C denorm remainder