1
dnl Intel Pentium mpn_copyi -- copy limb vector, incrementing.
3
dnl Copyright 1996, 2001, 2002 Free Software Foundation, Inc.
5
dnl This file is part of the GNU MP Library.
7
dnl The GNU MP Library is free software; you can redistribute it and/or
8
dnl modify it under the terms of the GNU Lesser General Public License as
9
dnl published by the Free Software Foundation; either version 2.1 of the
10
dnl License, or (at your option) any later version.
12
dnl The GNU MP Library is distributed in the hope that it will be useful,
13
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
dnl Lesser General Public License for more details.
17
dnl You should have received a copy of the GNU Lesser General Public
18
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20
dnl Suite 330, Boston, MA 02111-1307, USA.
22
include(`../config.m4')
25
C P5: 1.25 cycles/limb
28
C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
30
C Destination prefetching is done to avoid repeated write-throughs on lines
C not in L1.
33
C At least one of the src or dst pointer needs to be incremented rather than
34
C using indexing, so that there's somewhere to put the loop control without
35
C an AGI. Incrementing one and not two lets us keep loop overhead to 2
36
C cycles. Making it the src pointer incremented avoids an AGI on the %ecx
37
C subtracts in the finishup code.
39
C The block of finishup code is almost as big as the main loop itself, which
40
C is unfortunate, but it's faster that way than with say rep movsl, by about
41
C 10 cycles for instance on P55.
43
C There's nothing to be gained from MMX on P55, since it can do only one
44
C movq load (or store) per cycle, so the throughput would be the same as the
45
C code here (and even then only if src and dst have the same alignment mod
C 8).
48
C Declare the three parameter slots of mpn_copyi with defframe. The names
C follow the prototype's (dst, src, size) order, at offsets 4, 8 and 12.
C NOTE(review): presumably these are byte offsets from the stack pointer at
C function entry (defframe is defined outside this file) -- confirm.
defframe(PARAM_SIZE,12)
49
defframe(PARAM_SRC, 8)
50
defframe(PARAM_DST, 4)
58
pushl %ebx FRAME_pushl()
59
pushl %esi FRAME_pushl()
61
leal (%edx,%ecx,4), %edx C &dst[size-1]
62
xorl $-1, %ecx C -size-1
65
addl $8, %ecx C -size+7
69
movl -28(%edx,%ecx,4), %eax C fetch destination cache line, dst[0]
75
C ecx counter, limbs, negative
77
C esi src, incrementing
81
movl (%edx,%ecx,4), %eax C fetch destination cache line
84
movl (%esi), %eax C read words pairwise
86
movl %eax, -60(%edx,%ecx,4) C store words pairwise
87
movl %ebx, -56(%edx,%ecx,4)
91
movl %eax, -52(%edx,%ecx,4)
92
movl %ebx, -48(%edx,%ecx,4)
96
movl %eax, -44(%edx,%ecx,4)
97
movl %ebx, -40(%edx,%ecx,4)
101
movl %eax, -36(%edx,%ecx,4)
102
movl %ebx, -32(%edx,%ecx,4)
109
C ecx 0 to 7, representing respectively 7 to 0 limbs remaining
111
C edx dst, next location to store
118
movl %eax, -12(%edx,%ecx,4)
119
movl %ebx, -8(%edx,%ecx,4)
123
movl %eax, -4(%edx,%ecx,4)
124
movl %ebx, (%edx,%ecx,4)
135
movl %eax, -4(%edx,%ecx,4)
136
movl %ebx, (%edx,%ecx,4)
145
movl %eax, -4(%edx,%ecx,4) C risk of cache bank clash here