dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.

dnl  K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data

dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl  Lesser General Public License for more details.

dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB. If
dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl  Suite 330, Boston, MA 02111-1307, USA.

include(`../config.m4')


C NOTE(review): this chunk is an incomplete extraction of the original
C file.  Stray decimal line-number lines that had been fused into the
C text have been removed, but the gaps in those numbers show that many
C source lines are still absent -- the PROLOGUE/EPILOGUE, labels, the
C conditional branches, the rep-movs fallback path, and the closing
C quotes of the forloop/ifelse constructs below.  Confirm against the
C full file before assembling; the remaining genuine lines are kept
C byte-identical (only comments and column formatting were touched).


dnl  UNROLL_COUNT cycles/limb
dnl  Maximum possible with the current code is 64, the minimum is 2.

deflit(UNROLL_COUNT, 32)


C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Copy src,size to dst,size, processing limbs from high to low addresses.
C
C The comments in copyi.asm apply here too.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	C Compare size against the unroll threshold; the conditional
	C branch that acts on this is among the missing lines.
	cmpl	$UNROLL_COUNT, %ecx

	C Point esi/edi at the last (highest) limb: ptr + size*4 - 4,
	C since this copy runs from high to low addresses.
	leal	-4(%esi,%ecx,4), %esi
	leal	-4(%edi,%ecx,4), %edi

C if src and dst are different alignments mod8, then use rep movs
C if src and dst are both 4mod8 then process one limb to get 0mod8

	C ebx = esi+edi; presumably tested for the combined mod-8
	C alignment of src and dst (per the comment above) -- the test
	C instruction itself is among the missing lines.
	leal	(%esi,%edi), %ebx

	C Bias the counter by the unroll count for the loop below.
	leal	-UNROLL_COUNT(%ecx), %ecx
	jnz	L(already_aligned)


ifelse(UNROLL_BYTES,256,`
C offset 0x3D here, but gets full speed without further alignment

	C esi	src, decrementing
	C edi	dst, decrementing

C `disp' is never 0, so don't need to force 0(%esi).

deflit(CHUNK_COUNT, 2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
	C Copy CHUNK_COUNT limbs (8 bytes) at a time through an MMX register.
	movq	disp(%esi), %mm0
	movq	%mm0, disp(%edi)

	C Step both pointers down one whole unrolled chunk per iteration.
	leal	-UNROLL_BYTES(%esi), %esi
	subl	$UNROLL_COUNT, %ecx
	leal	-UNROLL_BYTES(%edi), %edi

C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
C UNROLL_COUNT-1 limbs remaining

	C Test the UNROLL_COUNT/2 bit of the limbs-remaining count (the
	C -UNROLL_COUNT bias only affects higher bits of %cl, so the
	C test works on the biased value too).
	testb	$eval(UNROLL_COUNT/2), %cl

	C Undo the bias: %ecx = limbs remaining, 0 to UNROLL_COUNT-1.
	leal	UNROLL_COUNT(%ecx), %ecx

C at an unroll count of 32 this block of code is 16 cycles faster than
C the rep movs, less 3 or 4 to test whether to do it

forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
	deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
	movq	disp(%esi), %mm0
	movq	%mm0, disp(%edi)

	C Half-unroll cleanup: step down half a chunk.
	subl	$eval(UNROLL_BYTES/2), %esi
	subl	$eval(UNROLL_BYTES/2), %edi

	subl	$eval(UNROLL_COUNT/2), %ecx

ifelse(UNROLL_BYTES,256,`