1
dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
5
dnl P54: 24.5 cycles/limb (odd divisor), 30.5 cycles/limb (even divisor)
8
dnl Copyright 2001 Free Software Foundation, Inc.
10
dnl This file is part of the GNU MP Library.
12
dnl The GNU MP Library is free software; you can redistribute it and/or
13
dnl modify it under the terms of the GNU Lesser General Public License as
14
dnl published by the Free Software Foundation; either version 2.1 of the
15
dnl License, or (at your option) any later version.
17
dnl The GNU MP Library is distributed in the hope that it will be useful,
18
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
19
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
dnl Lesser General Public License for more details.
22
dnl You should have received a copy of the GNU Lesser General Public
23
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
24
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
25
dnl Suite 330, Boston, MA 02111-1307, USA.
27
include(`../config.m4')
30
C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
33
C Plain divl is used for small sizes, since the inverse takes a while to
34
C set up. Multiplying works out faster for size>=3 when the divisor is odd,
35
C or size>=4 when the divisor is even. Actually on P55 size==2 for odd or
36
C size==3 for even are about the same speed with either divl or mul, but the
37
C former (divl) is used since it will use up less code cache.
39
C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
40
C expected. On P54 in the even case the shrdl pairing nonsense (see
41
C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
42
C further 1.5 cycle slowdown for both odd and even.
44
defframe(PARAM_DIVISOR,16)
45
defframe(PARAM_SIZE, 12)
46
defframe(PARAM_SRC, 8)
47
defframe(PARAM_DST, 4)
49
dnl re-use parameter space: VAR_INVERSE lives in the PARAM_DST stack slot
50
define(VAR_INVERSE,`PARAM_DST')
55
PROLOGUE(mpn_divexact_1)
58
movl PARAM_DIVISOR, %eax
61
pushl %esi FRAME_pushl()
62
push %edi FRAME_pushl()
68
addl %ecx, %eax C size if even, size+1 if odd
76
movl -4(%esi,%ecx,4), %eax
80
movl %eax, -4(%edi,%ecx,4)
93
movl PARAM_DIVISOR, %eax
97
ASSERT(nz, `orl %eax, %eax')
99
incl %ecx C shift count
103
leal 1(%eax,%eax), %edx C d
104
andl $127, %eax C d/2, 7 bits
106
pushl %ebx FRAME_pushl()
107
pushl %ebp FRAME_pushl()
114
addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
116
movl modlimb_invert_table@GOT(%ebp), %ebp
118
movzbl (%eax,%ebp), %eax
122
movzbl modlimb_invert_table(%eax), %eax C inv 8 bits
125
movl %eax, %ebp C inv
126
addl %eax, %eax C 2*inv
128
imull %ebp, %ebp C inv*inv
130
imull %edx, %ebp C inv*inv*d
132
subl %ebp, %eax C inv = 2*inv - inv*inv*d
133
movl PARAM_SIZE, %ebx
136
addl %eax, %eax C 2*inv
138
imull %ebp, %ebp C inv*inv
140
imull %edx, %ebp C inv*inv*d
142
subl %ebp, %eax C inv = 2*inv - inv*inv*d
143
movl %edx, PARAM_DIVISOR C d without twos
145
leal (%esi,%ebx,4), %esi C src end
146
leal (%edi,%ebx,4), %edi C dst end
150
ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
151
pushl %eax FRAME_pushl()
152
imull PARAM_DIVISOR, %eax
154
popl %eax FRAME_popl()')
156
movl %eax, VAR_INVERSE
157
xorl %ebp, %ebp C initial carry bit
159
movl (%esi,%ebx,4), %eax C src low limb
160
orl %ecx, %ecx C shift
162
movl 4(%esi,%ebx,4), %edx C src second limb (for even)
165
shrdl( %cl, %edx, %eax)
174
C ebx counter, limbs, negative
179
C ebp carry bit, 0 or -1
183
movl (%esi,%ebx,4), %eax
191
imull VAR_INVERSE, %eax
193
movl %eax, (%edi,%ebx,4)
210
C ebx counter, limbs, negative
215
C ebp carry bit, 0 or -1
219
subl %ebp, %edx C carry bit
220
movl -4(%esi,%ebx,4), %eax C src limb
222
movl (%esi,%ebx,4), %ebp C and one above it
224
shrdl( %cl, %ebp, %eax)
226
subl %edx, %eax C carry limb
231
imull VAR_INVERSE, %eax
233
movl %eax, -4(%edi,%ebx,4)
242
movl -4(%esi), %eax C src high limb
247
subl %edx, %eax C no carry if division is exact
249
imull VAR_INVERSE, %eax
251
movl %eax, -4(%edi) C dst high limb
252
nop C protect against cache bank clash