1
dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division.
3
dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
5
dnl This file is part of the GNU MP Library.
7
dnl The GNU MP Library is free software; you can redistribute it and/or
8
dnl modify it under the terms of the GNU Lesser General Public License as
9
dnl published by the Free Software Foundation; either version 2.1 of the
10
dnl License, or (at your option) any later version.
12
dnl The GNU MP Library is distributed in the hope that it will be useful,
13
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
dnl Lesser General Public License for more details.
17
dnl You should have received a copy of the GNU Lesser General Public
18
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20
dnl Suite 330, Boston, MA 02111-1307, USA.
22
include(`../config.m4')
27
C K6: 10.0 cycles/limb for an odd divisor, 12.0 cycles/limb for an even divisor
31
C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
34
C A simple divl is used for size==1. This is about 10 cycles faster for an
35
C odd divisor or 20 cycles for an even divisor.
37
C The loops are quite sensitive to code alignment, speeds should be
38
C rechecked (odd and even divisor, pic and non-pic) if contemplating changes.
41
dnl Names for the four arguments, each given as (name, byte offset).
dnl The offsets rise in argument order: dst 4, src 8, size 12, divisor 16.
dnl defframe comes from the included m4 configuration and is not visible
dnl here; presumably each name expands to a stack-relative operand that the
dnl FRAME_pushl and FRAME_popl annotations keep in step with pushes and
dnl pops -- TODO confirm against the macro definitions.
defframe(PARAM_DIVISOR,16)
42
defframe(PARAM_SIZE, 12)
43
defframe(PARAM_SRC, 8)
44
defframe(PARAM_DST, 4)
46
dnl re-use parameter space
dnl VAR_INVERSE is an alias for the PARAM_DST slot, not a new stack slot.
dnl The routine stores the computed inverse of the divisor through this name
dnl and reloads it from there around each multiply.  The code uses edi as the
dnl dst pointer before that store, so the original argument value is
dnl presumably no longer needed in memory by then -- verify if reordering.
47
define(VAR_INVERSE,`PARAM_DST')
52
PROLOGUE(mpn_divexact_1)
74
movl PARAM_DIVISOR, %eax
75
pushl %ebx FRAME_pushl()
78
pushl %ebp FRAME_pushl()
82
incl %edx C will get shift+1
85
pushl %esi FRAME_pushl()
87
leal 1(%eax,%eax), %esi C d without twos
88
andl $127, %eax C d/2, 7 bits
93
addl $_GLOBAL_OFFSET_TABLE_, %ebp
95
movl modlimb_invert_table@GOT(%ebp), %ebp
97
Zdisp( movzbl, 0,(%eax,%ebp), %eax)
101
movzbl modlimb_invert_table(%eax), %eax C inv 8 bits
103
pushl %edi FRAME_pushl()
105
leal (%eax,%eax), %ebp C 2*inv
107
imull %eax, %eax C inv*inv
111
imull %esi, %eax C inv*inv*d
113
subl %eax, %ebp C inv = 2*inv - inv*inv*d
114
leal (%ebp,%ebp), %eax C 2*inv
116
imull %ebp, %ebp C inv*inv
118
movl %esi, PARAM_DIVISOR C d without twos
119
leal (%ebx,%ecx,4), %ebx C src end
121
imull %esi, %ebp C inv*inv*d
123
leal (%edi,%ecx,4), %edi C dst end
126
subl %ebp, %eax C inv = 2*inv - inv*inv*d
127
subl $1, %edx C shift amount, and clear carry
129
ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
130
pushl %eax FRAME_pushl()
131
imull PARAM_DIVISOR, %eax
133
popl %eax FRAME_popl()')
135
movl %eax, VAR_INVERSE
138
movl (%ebx,%ecx,4), %esi C src low limb
147
C ecx counter, limbs, negative
149
C esi next limb, adjusted for carry
151
C ebp carry bit, 0 or -1
155
movl PARAM_DIVISOR, %eax
156
movl %esi, -4(%edi,%ecx,4)
158
mull %esi C carry limb in edx
160
subl %ebp, %edx C apply carry bit
161
movl (%ebx,%ecx,4), %esi
164
subl %edx, %esi C apply carry limb
165
movl VAR_INVERSE, %edx
167
sbbl %ebp, %ebp C 0 or -1
175
movl %esi, -4(%edi,%ecx,4)
192
nop C code alignment, necessary for claimed speed
195
C non-PIC code alignment already ok at 0x9a
208
Zdisp( movq, 0,(%ebx,%ecx,4), %mm0) C src[0,1]
211
movl VAR_INVERSE, %edx
217
jz L(even_two) C if only two limbs
220
C Out-of-order execution is good enough to hide the load/rshift/movd
221
C latency. Having imul at the top of the loop gives 11.5 c/l instead of 12,
222
C on K6-2. In fact there's only 11 of decode, but nothing running at 11 has
223
C been found. Maybe the fact every second movq is unaligned costs the extra 0.5.
229
C ecx counter, limbs, negative
231
C esi next limb, adjusted for carry
233
C ebp carry bit, 0 or -1
235
C mm0 scratch, source limbs
240
movl %esi, -8(%edi,%ecx,4)
241
movl PARAM_DIVISOR, %eax
243
mull %esi C carry limb in edx
245
movq -4(%ebx,%ecx,4), %mm0
249
subl %ebp, %edx C apply carry bit
251
subl %edx, %esi C apply carry limb
252
movl VAR_INVERSE, %edx
254
sbbl %ebp, %ebp C 0 or -1
261
movd -4(%ebx), %mm0 C src high limb
267
movl PARAM_DIVISOR, %eax
269
mull %esi C carry limb in edx
272
subl %ebp, %edx C apply carry bit
274
movl VAR_INVERSE, %eax
275
subl %edx, %esi C apply carry limb