1
dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
3
dnl P4: 19.0 cycles/limb
5
dnl Copyright 2001 Free Software Foundation, Inc.
7
dnl This file is part of the GNU MP Library.
9
dnl The GNU MP Library is free software; you can redistribute it and/or
10
dnl modify it under the terms of the GNU Lesser General Public License as
11
dnl published by the Free Software Foundation; either version 2.1 of the
12
dnl License, or (at your option) any later version.
14
dnl The GNU MP Library is distributed in the hope that it will be useful,
15
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
16
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17
dnl Lesser General Public License for more details.
19
dnl You should have received a copy of the GNU Lesser General Public
20
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
21
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
22
dnl Suite 330, Boston, MA 02111-1307, USA.
24
include(`../config.m4')
27
C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
30
C Pairs of movd's are used to avoid unaligned loads. Despite the loads not
31
C being on the dependent chain and there being plenty of cycles available,
32
C using an unaligned movq on every second iteration measured about 23 c/l.
34
C Using divl for size==1 seems a touch quicker than mul-by-inverse. The mul
35
C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
36
C that might be hidden by out-of-order execution, whereas divl is around 60.
37
C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
40
defframe(PARAM_DIVISOR,16)
41
defframe(PARAM_SIZE, 12)
42
defframe(PARAM_SRC, 8)
43
defframe(PARAM_DST, 4)
48
PROLOGUE(mpn_divexact_1)
55
movl PARAM_DIVISOR, %ecx
76
bsfl %ecx, %ecx C trailing twos
78
shrl %cl, %eax C d = divisor without twos
80
movd %ecx, %mm7 C shift
84
andl $127, %eax C d/2, 7 bits
89
addl $_GLOBAL_OFFSET_TABLE_, %ecx
91
movl modlimb_invert_table@GOT(%ecx), %ecx
93
movzbl (%eax,%ecx), %eax C inv 8 bits
96
movzbl modlimb_invert_table(%eax), %eax C inv 8 bits
101
movd %eax, %mm5 C inv
103
movd %eax, %mm0 C inv
105
pmuludq %mm5, %mm5 C inv*inv
109
pmuludq %mm6, %mm5 C inv*inv*d
110
paddd %mm0, %mm0 C 2*inv
114
psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d
118
pmuludq %mm0, %mm0 C inv*inv
121
psrlq $32, %mm4 C 0x00000000FFFFFFFF
125
pmuludq %mm6, %mm0 C inv*inv*d
126
paddd %mm5, %mm5 C 2*inv
130
pxor %mm1, %mm1 C initial carry limb
134
psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d
136
ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
137
pushl %eax FRAME_pushl()
142
popl %eax FRAME_popl()')
144
pxor %mm0, %mm0 C initial carry bit
147
C The dependent chain here is as follows.
150
C psubq s = (src-cbit) - climb 2
151
C pmuludq q = s*inverse 8
152
C pmuludq prod = q*divisor 8
153
C psrlq climb = high(prod) 2
157
C Yet the loop measures 19.0 c/l, so obviously there's something gained
158
C there over a straight reading of the chip documentation.
161
C eax src, incrementing
163
C ecx dst, incrementing
164
C edx counter, size-1 iterations
168
C mm4 0x00000000FFFFFFFF
179
pand %mm4, %mm2 C src
180
psubq %mm0, %mm2 C src - cbit
182
psubq %mm1, %mm2 C src - cbit - climb
184
psrlq $63, %mm0 C new cbit
186
pmuludq %mm5, %mm2 C s*inverse
187
movd %mm2, (%ecx) C q
191
pmuludq %mm2, %mm1 C q*divisor
192
psrlq $32, %mm1 C new climb
200
psrlq %mm7, %mm2 C src
201
psubq %mm0, %mm2 C src - cbit
203
psubq %mm1, %mm2 C src - cbit - climb
205
pmuludq %mm5, %mm2 C s*inverse
206
movd %mm2, (%ecx) C q