dnl Intel P6 mpn_mod_1 -- mpn by limb remainder.

dnl P6: 21.5 cycles/limb

dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl Suite 330, Boston, MA 02111-1307, USA.

include(`../config.m4')

C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                       mp_limb_t carry);
C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                             mp_limb_t inverse);

C The code here is in two parts, a simple divl loop and a mul-by-inverse.
C The divl is used by mod_1 and mod_1c for small sizes, until the savings in
C the mul-by-inverse can overcome the time to calculate an inverse.
C preinv_mod_1 goes straight to the mul-by-inverse.

C The mul-by-inverse normalizes the divisor (or for preinv_mod_1 it's
C already normalized). The calculation done is r=a%(d*2^n) followed by a
C final (r*2^n)%(d*2^n), where a is the dividend, d the divisor, and n is
C the number of leading zero bits on d. This means there are no bit shifts
C in the main loop, at the cost of an extra divide step at the end.
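
C As a minimal C sketch of that scheme (illustration only, assuming 32-bit
C limbs and a count_leading_zeros as in longlong.h):
C
C	n = count_leading_zeros (d);            /* d != 0 */
C	dnorm = d << n;                         /* normalized divisor */
C	r = a % dnorm;                          /* main loop, no shifts */
C	r = ((uint64_t) r << n) % dnorm;        /* extra divide step */
C	return r >> n;                          /* true remainder a % d */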

C The simple divl for mod_1 is able to skip one divide step if
C high<divisor. For mod_1c the carry parameter is the high of the first
C divide step, and no attempt is made to skip that step since carry==0
C will be very rare.
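
C In C the divl loop amounts to the following (illustration only, 32-bit
C limbs assumed; each 64/32 bit "%" is one divl):
C
C	r = 0;
C	if (src[size-1] < divisor)              /* skip one divide step */
C	  r = src[--size];
C	while (size > 0)
C	  r = (((uint64_t) r << 32) | src[--size]) % divisor;
C	return r;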

C The mul-by-inverse always skips one divide step, but then needs an extra
C step at the end, unless the divisor was already normalized (n==0). This
C leads to different mul-by-inverse thresholds for normalized and
C unnormalized divisors, in mod_1 and mod_1c.

C If n is small then the extra divide step could be done by a few shift and
C trial subtract steps instead of a full divide. That would probably be 3
C or 4 cycles/bit, so say up to n=8 might benefit from that over a 21 cycle
C divide. However it's considered that small divisors, meaning biggish n,
C are more likely than small n, and that it's not worth the branch
C mispredicts of a loop.
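
C Such a step would look roughly like this (illustration only), with r the
C remainder from the main loop, 0 <= r < dnorm = d*2^n:
C
C	uint64_t t = r;
C	for (i = 0; i < n; i++)
C	  {
C	    t <<= 1;                            /* 3 or 4 cycles per bit */
C	    if (t >= dnorm)
C	      t -= dnorm;                       /* trial subtract */
C	  }
C	return (uint32_t) (t >> n);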

C There used to be some MMX based code for P-II and P-III, roughly following
C the K7 form, but it was slower (about 24.0 c/l) than the code here. That
C code did have an advantage that mod_1 was able to do one less divide step
C when high<divisor and the divisor was unnormalized, but the speed advantage
C of the current code soon overcomes that.

C It's not clear whether what's here is optimal. A rough count of micro-ops
C on the dependent chain would suggest a couple of cycles could be shaved.

dnl The following thresholds are the sizes where the multiply by inverse
dnl method is used instead of plain divl's. Minimum value 2 each.
dnl
dnl MUL_NORM_THRESHOLD is for normalized divisors (high bit set),
dnl MUL_UNNORM_THRESHOLD for unnormalized divisors.
dnl
dnl With the divl loop at 39 c/l, and the inverse loop at 21.5 c/l but
dnl setups for the inverse of about 50, the threshold should be around
dnl 50/(39-21.5)==2.85. An unnormalized divisor gets an extra divide step
dnl at the end, so if that's about 25 cycles then that threshold might be
dnl around (50+25)/(39-21.5) == 4.3.

deflit(MUL_NORM_THRESHOLD, 4)
deflit(MUL_UNNORM_THRESHOLD, 5)

deflit(MUL_NORM_DELTA, eval(MUL_NORM_THRESHOLD - MUL_UNNORM_THRESHOLD))

defframe(PARAM_INVERSE, 16)	dnl mpn_preinv_mod_1
defframe(PARAM_CARRY, 16)	dnl mpn_mod_1c
defframe(PARAM_DIVISOR, 12)
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC, 4)

defframe(SAVE_EBX, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)

defframe(VAR_NORM, -20)
defframe(VAR_INVERSE, -24)

deflit(STACK_SPACE, 24)
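
C Under GMP's defframe convention those definitions give roughly the
C following frame, byte offsets from %esp after the "subl $STACK_SPACE"
C in the prologues below (a sketch of the layout, for reference):
C
C	 0	VAR_INVERSE
C	 4	VAR_NORM
C	 8	SAVE_EBP
C	12	SAVE_EDI
C	16	SAVE_ESI
C	20	SAVE_EBX
C	24	return address
C	28	PARAM_SRC
C	32	PARAM_SIZE
C	36	PARAM_DIVISOR
C	40	PARAM_CARRY / PARAM_INVERSE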

PROLOGUE(mpn_preinv_mod_1)

	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	PARAM_SRC, %edx
	movl	PARAM_SIZE, %ebx

	movl	PARAM_DIVISOR, %ebp
	movl	PARAM_INVERSE, %eax

	movl	-4(%edx,%ebx,4), %edi	C src high limb
	leal	-8(%edx,%ebx,4), %ecx	C &src[size-2]

	movl	%edi, %esi		C copy of high limb
	subl	%ebp, %edi		C high-divisor

	cmovc(	%esi, %edi)		C restore if underflow
	jnz	LF(mpn_mod_1,preinv_entry)

	jmp	LF(mpn_mod_1,done_edi)

EPILOGUE()

PROLOGUE(mpn_mod_1c)

	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	PARAM_DIVISOR, %eax
	movl	PARAM_CARRY, %edx

	movl	PARAM_SRC, %esi
	orl	%ecx, %ecx		C test size
	jz	LF(mpn_mod_1,done_edx)	C result==carry if size==0

	sarl	$31, %eax		C -1 if divisor normalized, else 0
	movl	PARAM_DIVISOR, %ebp

	andl	$MUL_NORM_DELTA, %eax	C MUL_NORM_DELTA if normalized, else 0
	addl	$MUL_UNNORM_THRESHOLD, %eax	C threshold for this divisor

	cmpl	%eax, %ecx		C size versus threshold
	jb	LF(mpn_mod_1,divide_top)	C divl loop if below threshold

	C The carry parameter pretends to be the src high limb.

	leal	1(%ecx), %ebx		C size+1
	movl	%edx, %eax		C carry

	jmp	LF(mpn_mod_1,mul_by_inverse_1c)

EPILOGUE()

PROLOGUE(mpn_mod_1)

	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	$0, %edx		C initial carry (if can't skip a div)
	movl	PARAM_SRC, %eax

	movl	PARAM_DIVISOR, %ebp
	movl	PARAM_DIVISOR, %esi

	sarl	$31, %ebp		C -1 if divisor normalized, else 0
	movl	-4(%eax,%ecx,4), %eax	C src high limb

	andl	$MUL_NORM_DELTA, %ebp	C MUL_NORM_DELTA if normalized, else 0
	addl	$MUL_UNNORM_THRESHOLD, %ebp	C threshold for this divisor

	cmpl	%esi, %eax		C carry flag if high<divisor

	cmovc(	%eax, %edx)		C src high limb as initial carry
	sbbl	$0, %ecx		C size-1 to skip one div
	jz	L(done_eax)		C done if had size==1

	cmpl	%ebp, %ecx		C size versus threshold
	movl	PARAM_DIVISOR, %ebp
	jae	L(mul_by_inverse)

	movl	PARAM_SRC, %esi

L(divide_top):
	C eax	scratch (quotient)
	C ecx	counter, limbs, decrementing
	C edx	scratch (remainder)
	C esi	src
	C ebp	divisor

	movl	-4(%esi,%ecx,4), %eax	C next src limb

	divl	%ebp			C remainder carries in %edx

	decl	%ecx
	jnz	L(divide_top)

	movl	%edx, %eax		C remainder
L(done_eax):
	addl	$STACK_SPACE, %esp
	ret

C -----------------------------------------------------------------------------

L(mul_by_inverse):
	movl	PARAM_SIZE, %ebx	C size

L(mul_by_inverse_1c):
	C eax	src high limb, or carry for mod_1c
	C ebx	size, or size+1 for mod_1c
	C ebp	divisor

	bsrl	%ebp, %ecx		C 31-l

	movl	PARAM_SRC, %esi

	xorl	$31, %ecx		C l = leading zeros on d
	movl	%ecx, VAR_NORM

	shll	%cl, %ebp		C d normalized

	movl	%eax, %edi		C src high -> n2
	subl	%ebp, %eax		C high-divisor

	cmovnc(	%eax, %edi)		C n2-divisor if no underflow

	movl	$-1, %eax		C b-1
	movl	$-1, %edx		C b-1

	subl	%ebp, %edx		C (b-d)-1 so edx:eax = b*(b-d)-1
	leal	-8(%esi,%ebx,4), %ecx	C &src[size-2]

	divl	%ebp			C floor (b*(b-d)-1) / d

	movl	%eax, VAR_INVERSE
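
C In C, with b=2^32 and d the normalized divisor, this inverse is
C floor((b*(b-d)-1)/d) = floor((b^2-1)/d) - b, which a sketch might
C compute as (illustration only, 32-bit limbs assumed):
C
C	inv = (uint32_t) (~((uint64_t) d << 32) / d);   /* d highbit set */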

C No special scheduling of loads is necessary in this loop, out of order
C execution hides the latencies already.
C
C The way q1+1 is generated in %ebx and d is moved to %eax for the multiply
C seems fastest. The obvious change to generate q1+1 in %eax and then just
C multiply by %ebp (as per mpn/x86/pentium/mod_1.asm in fact) runs 1 cycle
C slower, for no obvious reason.

L(inverse_top):
	C eax	n10 (then scratch)
	C ebx	scratch (nadj, q1)
	C ecx	src pointer, decrementing
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor (normalized)

	movl	(%ecx), %eax		C next src limb
	movl	%eax, %esi		C n10

	sarl	$31, %eax		C -n1
	movl	%ebp, %ebx		C d

	andl	%eax, %ebx		C -n1 & d
	negl	%eax			C n1

	addl	%edi, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	addl	%esi, %ebx		C nadj = n10 + (-n1 & d), ignoring overflow
	subl	$4, %ecx		C src--

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
	jz	L(q1_ff)		C if q1+1 wrapped to 0

	mull	%ebx			C (q1+1)*d

	subl	%eax, %esi		C low n - (q1+1)*d
	sbbl	%edx, %edi		C high n - (q1+1)*d, 0 or -1

	andl	%ebp, %edi		C d if underflow

	addl	%esi, %edi		C remainder with addback if necessary
	cmpl	PARAM_SRC, %ecx

	jae	L(inverse_top)
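
C Per iteration this is one remainder-only division step in the style of
C udiv_qrnnd_preinv (gmp-impl.h). A rough C equivalent, with m the inverse,
C d the normalized divisor, n2 the running remainder and n10 the incoming
C limb (32-bit limbs assumed):
C
C	n1   = n10 >> 31;                       /* high bit of n10 */
C	nadj = n10 + (n1 ? d : 0);              /* mod 2^32 */
C	q1p1 = n2 + 1                           /* q1+1 */
C	       + (uint32_t) (((uint64_t) m * (n2 + n1) + nadj) >> 32);
C	r = (((uint64_t) n2 << 32) | n10) - (uint64_t) q1p1 * d;
C	if (r >> 32)                            /* borrow out */
C	  r += d;                               /* addback */
C	n2 = (uint32_t) r;
C
C The q1p1==0 wraparound is the L(q1_ff) special case below.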

C -----------------------------------------------------------------------------

L(inverse_loop_done):

C %edi is the remainder modulo d*2^n and now must be reduced to
C 0<=r<d by calculating r*2^n mod d*2^n and then right shifting by
C n. If d was already normalized on entry so that n==0 then nothing
C is needed here. The chance of n==0 is low, but it's true of say
C PP from gmp-impl.h.
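
C The step amounts to (a sketch, 32-bit limbs assumed, n from VAR_NORM):
C
C	n2  = r >> (32 - n);                    /* the shldl below */
C	n10 = r << n;
C	/* then one more division step as in the loop above, and the */
C	/* resulting remainder shifted right by n is the final r%d   */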

	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	norm count n
	C edx	scratch
	C esi	n2 (high of r*2^n)
	C edi	n10 (low of r*2^n)
	C ebp	divisor (normalized)

C Here use %edi=n10 and %esi=n2, opposite to the loop above.
C
C The q1=0xFFFFFFFF case is handled with an sbbl to adjust q1+1
C back, rather than q1_ff special case code. This is simpler and
C avoids a special case branch for something that will almost
C never happen.

	movl	VAR_NORM, %ecx		C n

	xorl	%esi, %esi		C n2 = 0 ready for shldl

	shldl(	%cl, %edi, %esi)	C n2 = high of r*2^n

	shll	%cl, %edi		C n10 = low of r*2^n

	movl	%edi, %eax		C n10
	sarl	$31, %eax		C -n1

	movl	%ebp, %ebx		C d
	andl	%eax, %ebx		C -n1 & d

	negl	%eax			C n1
	addl	%edi, %ebx		C nadj = n10 + (-n1 & d), ignoring overflow

	addl	%esi, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%esi), %ebx		C n2+1

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
	sbbl	$0, %ebx		C q1 if q1+1 overflowed
	movl	%ebp, %eax		C d

	mull	%ebx			C (q1+1)*d

	subl	%eax, %edi		C low n - (q1+1)*d is remainder
	sbbl	%edx, %esi		C high n - (q1+1)*d, 0 or -1

	andl	%ebp, %esi		C d if underflow

	leal	(%esi,%edi), %eax	C remainder, with addback if necessary

	shrl	%cl, %eax		C denorm remainder

	addl	$STACK_SPACE, %esp
	ret

	addl	$STACK_SPACE, %esp
	ret

C -----------------------------------------------------------------------------

C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d.
C
C This is reached only very rarely.

L(q1_ff):
	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2

	cmpl	PARAM_SRC, %ecx
	jae	L(inverse_top)

	jmp	L(inverse_loop_done)

EPILOGUE()