1
dnl AMD K6-2 mpn_rshift -- mpn right shift.
3
dnl K6-2: 1.75 cycles/limb
6
dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8
dnl This file is part of the GNU MP Library.
10
dnl The GNU MP Library is free software; you can redistribute it and/or
11
dnl modify it under the terms of the GNU Lesser General Public License as
12
dnl published by the Free Software Foundation; either version 2.1 of the
13
dnl License, or (at your option) any later version.
15
dnl The GNU MP Library is distributed in the hope that it will be useful,
16
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18
dnl Lesser General Public License for more details.
20
dnl You should have received a copy of the GNU Lesser General Public
21
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23
dnl Suite 330, Boston, MA 02111-1307, USA.
26
include(`../config.m4')
29
C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
33
defframe(PARAM_SHIFT,16)
34
defframe(PARAM_SIZE, 12)
35
defframe(PARAM_SRC, 8)
36
defframe(PARAM_DST, 4)
39
dnl Minimum 9, because the unrolled loop can't handle less.
41
deflit(UNROLL_THRESHOLD, 9)
49
C The 1 limb case can be done without the push %ebx, but it's then
50
C still the same speed. The push is left as a free helping hand for
51
C the two_or_more code.
54
pushl %ebx FRAME_pushl()
59
movl PARAM_SHIFT, %ecx
62
movl (%ebx), %edx C src limb
65
shrdl( %cl, %edx, %eax) C return value
69
movl %edx, (%ebx) C dst limb
75
C -----------------------------------------------------------------------------
76
ALIGN(16) C avoid offset 0x1f
83
movl (%ebx), %edx C src low limb
87
movd PARAM_SHIFT, %mm6
90
cmpl $UNROLL_THRESHOLD-1, %eax
103
leal (%ebx,%eax,4), %ebx
105
leal -4(%ecx,%eax,4), %ecx
108
C This loop runs at about 3 cycles/limb, which is the amount of
109
C decoding, and this is despite every second access being unaligned.
112
C eax counter, -(size-1) to -1
120
Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
125
Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
138
C -----------------------------------------------------------------------------
149
subl $7, %eax C size-8
154
movq (%ebx), %mm2 C src low qword
155
leal (%ebx,%eax,4), %ebx C src end - 32
158
leal (%ecx,%eax,4), %ecx C dst end - 32
160
notl %eax C -(size-7)
166
Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb
167
movq 4(%ebx,%eax,4), %mm2 C new src low qword
170
movq 12(%ebx,%eax,4), %mm0 C src second lowest qword
171
nop C avoid bad cache line crossing
174
C This loop is the important bit, the rest is just support for it.
175
C Four src limbs are held at the start, and four more will be read.
176
C Four dst limbs will be written. This schedule seems necessary for
C full speed.
179
C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
180
C leaves 0 to 3 which can be tested with test $1 and $2.
183
C eax counter, -(size-7) step by +4 until >=0
201
movq 4(%ebx,%eax,4), %mm0
204
movq %mm2, -12(%ecx,%eax,4)
210
movq 12(%ebx,%eax,4), %mm0
212
movq %mm1, -4(%ecx,%eax,4)
213
ja L(top) C jump if no carry and not zero
217
C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
218
C to 3 representing respectively 3 to 0 further limbs.
220
testl $2, %eax C testl to avoid bad cache line crossings
223
C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
224
C becomes new mm2 and a new mm0 is loaded.
233
movq 12(%ebx,%eax,4), %mm0
235
movq %mm2, -4(%ecx,%eax,4)
249
movq %mm2, 4(%ecx,%eax,4)
253
C one further extra limb to process
255
movd 32-4(%ebx), %mm0 C src[size-1], most significant limb
264
movq %mm1, 32-12(%ecx) C dst[size-3,size-2]
265
movd %mm2, 32-4(%ecx) C dst[size-1]
267
movl %edx, %eax C retval
273
nop C avoid bad cache line crossing
275
C no further extra limbs
277
movq %mm1, 32-8(%ecx) C dst[size-2,size-1]
278
movl %edx, %eax C retval