dnl Intel P5 mpn_mod_1 -- mpn by limb remainder.

dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl Suite 330, Boston, MA 02111-1307, USA.
include(`../config.m4')
C P5: 28.0 cycles/limb
C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                       mp_limb_t carry);
C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                             mp_limb_t inverse);
C
C This code is not unlike mpn/x86/p6/mod_1.asm, it does the same sort of
C multiply by inverse without on-the-fly shifts.  See that code for some
C general comments.
C
C P5 shldl is 4 cycles, so shifting on the fly would be at least 5 cycles
C slower, probably more depending what it did to register usage.  Using MMX
C on P55 would be better, but still at least 4 or 5 instructions and so 2 or
C 3 cycles.
46
dnl These thresholds are the sizes where the multiply by inverse method is
dnl used, rather than plain "divl"s.  Minimum value 2.
dnl
dnl MUL_NORM_THRESHOLD is for an already normalized divisor (high bit set),
dnl MUL_UNNORM_THRESHOLD for an unnormalized divisor.
dnl
dnl With the divl loop at 44 c/l and the inverse at 28 c/l with about 70
dnl cycles to setup, the threshold should be about ceil(70/16)==5, which is
dnl what happens in practice.
dnl
dnl An unnormalized divisor gets an extra 40 cycles at the end for the
dnl final (r*2^n)%(d*2^n) and shift.  This increases the threshold by about
dnl 40/16==3.
dnl
dnl PIC adds between 4 and 7 cycles (not sure why it varies), but this
dnl doesn't change the thresholds.
dnl
dnl The entry sequence code that chooses between MUL_NORM_THRESHOLD and
dnl MUL_UNNORM_THRESHOLD is a bit horrible, but it adds only 2 cycles
dnl (branch free) and ensures the choice between div or mul is optimal.
67
dnl Size thresholds for switching from plain divl to multiply-by-inverse.
dnl PIC and non-PIC values happen to coincide on P5 (the PIC setup cost is
dnl absorbed without moving the crossover point).
deflit(MUL_NORM_THRESHOLD,   ifdef(`PIC',5,5))
deflit(MUL_UNNORM_THRESHOLD, ifdef(`PIC',8,8))

dnl Difference between the two thresholds (a negative value), used by the
dnl entry code to select the right threshold branch-free: masking this with
dnl the divisor's sign bit and adding MUL_UNNORM_THRESHOLD yields either
dnl threshold.
deflit(MUL_NORM_DELTA, eval(MUL_NORM_THRESHOLD - MUL_UNNORM_THRESHOLD))

dnl Stack-frame offsets of the incoming parameters.  PARAM_INVERSE and
dnl PARAM_CARRY share offset 16: the 4th argument is the inverse for
dnl mpn_preinv_mod_1 but the carry for mpn_mod_1c.
defframe(PARAM_INVERSE, 16)	dnl mpn_preinv_mod_1
defframe(PARAM_CARRY,   16)	dnl mpn_mod_1c
defframe(PARAM_DIVISOR, 12)
defframe(PARAM_SIZE,     8)
defframe(PARAM_SRC,      4)

dnl re-using parameter space
define(VAR_NORM,    `PARAM_DIVISOR')
define(VAR_INVERSE, `PARAM_SIZE')
86
PROLOGUE(mpn_preinv_mod_1)
89
pushl %ebp FRAME_pushl()
90
pushl %esi FRAME_pushl()
95
pushl %edi FRAME_pushl()
96
pushl %ebx FRAME_pushl()
98
movl PARAM_DIVISOR, %ebp
99
movl PARAM_INVERSE, %eax
101
movl -4(%esi,%edx,4), %edi C src high limb
102
leal -8(%esi,%edx,4), %esi C &src[size-2]
109
subl %ebp, %edi C src-divisor
112
sbbl %ecx, %ecx C -1 if underflow
113
movl %edi, %eax C src-divisor
115
andl %ebp, %ecx C d if underflow
118
addl %ecx, %eax C remainder, with possible addback
132
movl PARAM_DIVISOR, %eax
133
movl PARAM_SIZE, %ecx
135
sarl $31, %eax C d highbit
136
movl PARAM_CARRY, %edx
139
jz L(done_edx) C result==carry if size==0
141
andl $MUL_NORM_DELTA, %eax
142
pushl %ebp FRAME_pushl()
144
addl $MUL_UNNORM_THRESHOLD, %eax C norm or unnorm thresh
145
pushl %esi FRAME_pushl()
148
movl PARAM_DIVISOR, %ebp
153
movl %edx, %eax C carry as pretend src high limb
154
leal 1(%ecx), %edx C size+1
156
cmpl $0x1000000, %ebp
157
jmp L(mul_by_inverse_1c)
166
movl PARAM_SIZE, %ecx
167
pushl %ebp FRAME_pushl()
173
movl PARAM_DIVISOR, %ebp
175
sarl $31, %ebp C -1 if divisor normalized
176
movl -4(%eax,%ecx,4), %eax C src high limb
178
movl PARAM_DIVISOR, %edx
179
pushl %esi FRAME_pushl()
181
andl $MUL_NORM_DELTA, %ebp
182
cmpl %edx, %eax C carry flag if high<divisor
184
sbbl %edx, %edx C -1 if high<divisor
185
addl $MUL_UNNORM_THRESHOLD, %ebp C norm or unnorm thresh
187
addl %edx, %ecx C size-1 if high<divisor
191
movl PARAM_DIVISOR, %ebp
194
jae L(mul_by_inverse)
196
andl %eax, %edx C high as initial carry if high<divisor
200
C eax scratch (quotient)
202
C ecx counter, limbs, decrementing
203
C edx scratch (remainder)
208
movl -4(%esi,%ecx,4), %eax
232
C -----------------------------------------------------------------------------
234
C The divisor is normalized using the same code as the pentium
235
C count_leading_zeros in longlong.h. Going through the GOT for PIC costs a
236
C couple of cycles, but is more or less unavoidable.
249
movl PARAM_SIZE, %edx
250
cmpl $0x1000000, %ebp
252
L(mul_by_inverse_1c):
260
pushl %edi FRAME_pushl()
262
pushl %ebx FRAME_pushl()
269
leal 25(,%ecx,8), %ecx C 0,-1,-2,-3 -> 25,17,9,1
272
addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %edi
275
movl __clz_tab@GOT(%edi), %edi
279
movb (%ebx,%edi), %bl
282
leal 25(,%ecx,8), %ecx C 0,-1,-2,-3 -> 25,17,9,1
288
movb __clz_tab(%ebx), %bl
290
movl %eax, %edi C carry -> n1
292
addl %ebx, %ecx C -34 + c + __clz_tab[d>>c] = -clz-1
293
leal -8(%esi,%edx,4), %esi C &src[size-2]
298
ASSERT(e,`pushl %eax C clz calculation same as bsrl
304
shll %cl, %ebp C d normalized
307
subl %ebp, %edx C (b-d)-1, so edx:eax = b*(b-d)-1
310
divl %ebp C floor (b*(b-d)-1) / d
313
movl %eax, VAR_INVERSE
316
movl %ecx, %edx C fake high, will cancel
319
C For mpn_mod_1 and mpn_preinv_mod_1, the initial carry in %edi is the src
320
C high limb, and this may be greater than the divisor and may need one copy
321
C of the divisor subtracted (only one, because the divisor is normalized).
322
C This is accomplished by having the initial ecx:edi act as a fake previous
323
C n2:n10. The initial edx:eax is d, acting as a fake (q1+1)*d which is
324
C subtracted from ecx:edi, with the usual addback if it produces an
329
C eax scratch (n10, n1, q1, etc)
330
C ebx scratch (nadj, src limit)
333
C esi src pointer, &src[size-2] to &src[0]
337
subl %eax, %edi C low n - (q1+1)*d
338
movl (%esi), %eax C new n10
340
sbbl %edx, %ecx C high n - (q1+1)*d, 0 or -1
344
andl %ebp, %ecx C d if underflow
346
addl %edi, %ecx C remainder -> n2, and possible addback
347
ASSERT(b,`cmpl %ebp, %ecx')
348
andl %eax, %ebx C -n1 & d
350
movl (%esi), %edi C n10
353
addl %ecx, %eax C n2+n1
354
addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
356
mull VAR_INVERSE C m*(n2+n1)
358
addl %eax, %ebx C low(m*(n2+n1) + nadj), giving carry flag
359
leal 1(%ecx), %eax C 1+n2
361
adcl %edx, %eax C 1 + high[n2<<32 + m*(n2+n1) + nadj] = q1+1
364
sbbl $0, %eax C use q1 if q1+1 overflows
365
subl $4, %esi C step src ptr
374
C %edi (after subtract and addback) is the remainder modulo d*2^n
375
C and must be reduced to 0<=r<d by calculating r*2^n mod d*2^n and
376
C right shifting by n.
378
C If d was already normalized on entry so that n==0 then nothing is
379
C needed here. This is always the case for preinv_mod_1. For mod_1
380
C or mod_1c the chance of n==0 is low, but about 40 cycles can be
383
subl %eax, %edi C low n - (q1+1)*d
386
sbbl %edx, %ebx C high n - (q1+1)*d, 0 or -1
387
xorl %esi, %esi C next n2
389
andl %ebp, %ebx C d if underflow
392
addl %ebx, %edi C remainder, with possible addback
398
C Here using %esi=n2 and %edi=n10, unlike the above
400
shldl( %cl, %edi, %esi) C n2
404
movl %edi, %eax C n10
405
movl %edi, %ebx C n10
410
andl %ebp, %ebx C -n1 & d
412
addl %esi, %eax C n2+n1
413
addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
415
mull VAR_INVERSE C m*(n2+n1)
417
addl %eax, %ebx C m*(n2+n1) + nadj, low giving carry flag
418
leal 1(%esi), %eax C 1+n2
420
adcl %edx, %eax C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
422
sbbl $0, %eax C use q1 if q1+1 overflows
426
subl %eax, %edi C low n - (q1+1)*d
429
sbbl %edx, %esi C high n - (q1+1)*d, 0 or -1
432
andl %ebp, %esi C d if underflow
435
addl %esi, %eax C addback if underflow
438
shrl %cl, %eax C denorm remainder