1
dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
3
dnl K7: 1.64 cycles/limb (at 16 limb/loop).
6
dnl Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
8
dnl This file is part of the GNU MP Library.
10
dnl The GNU MP Library is free software; you can redistribute it and/or
11
dnl modify it under the terms of the GNU Lesser General Public License as
12
dnl published by the Free Software Foundation; either version 2.1 of the
13
dnl License, or (at your option) any later version.
15
dnl The GNU MP Library is distributed in the hope that it will be useful,
16
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18
dnl Lesser General Public License for more details.
20
dnl You should have received a copy of the GNU Lesser General Public
21
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23
dnl Suite 330, Boston, MA 02111-1307, USA.
26
include(`../config.m4')
29
dnl K7: UNROLL_COUNT cycles/limb
34
dnl Maximum possible with the current code is 64.
36
dnl Number of limbs handled by one pass of the unrolled loop: the forloop
dnl further down emits UNROLL_COUNT/CHUNK_COUNT chunks of CHUNK_COUNT limbs.
deflit(UNROLL_COUNT, 16)
39
dnl Select the operation from whichever OPERATION_xxx symbol is defined.
dnl Each branch sets the two entry point names and the word substituted into
dnl the description comment.  Neither symbol being defined is reported
dnl through m4_error.
dnl NOTE(review): M4_inst is used by the code below but no define for it is
dnl visible in this excerpt, and the closing quotes and parentheses of the
dnl m4_error branch are not visible either -- confirm against the full file.
ifdef(`OPERATION_add_n', `
41
define(M4_function_n, mpn_add_n)
42
define(M4_function_nc, mpn_add_nc)
43
define(M4_description, add)
44
',`ifdef(`OPERATION_sub_n', `
46
define(M4_function_n, mpn_sub_n)
47
define(M4_function_nc, mpn_sub_nc)
48
define(M4_description, subtract)
49
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
52
dnl Lists the four function names this one source can provide.
dnl NOTE(review): presumably consumed by the build system to map each name to
dnl this file -- confirm against the MULFUNC_PROLOGUE definition, which is not
dnl part of this file.
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
55
C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size);
57
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
58
C mp_size_t size, mp_limb_t carry);
60
C Calculate src1,size M4_description src2,size, and store the result in
61
C dst,size. The return value is the carry bit from the top of the result (1
C or 0).
64
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
65
C the calculation. Note values other than 1 or 0 here will lead to garbage
C results.
68
C This code runs at 1.64 cycles/limb, which is probably the best possible
69
C with plain integer operations. Each limb is 2 loads and 1 store, and in
70
C one cycle the K7 can do two loads, or a load and a store, leading to 1.5
C cycles/limb.
73
dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
75
dnl Size value that ecx is compared against at function entry.
dnl NOTE(review): the same value is defined twice here; presumably these were
dnl the two alternatives of a conditional whose framing is not visible in
dnl this excerpt -- confirm before removing either one.
deflit(UNROLL_THRESHOLD, 8)
77
deflit(UNROLL_THRESHOLD, 8)
80
dnl Stack frame layout.
dnl
dnl The five PARAM_ slots are 4 bytes apart at positive offsets, one for each
dnl argument of the _nc form: dst, src1, src2, size, carry.  PARAM_CARRY is
dnl only read by the _nc entry point.
dnl
dnl The four SAVE_ slots are 4 bytes apart at negative offsets and together
dnl cover the STACK_SPACE bytes that the function subtracts from esp.
dnl NOTE(review): presumably each SAVE_ slot holds the callee-saved register
dnl it is named after; the stores and reloads are not visible in this
dnl excerpt.
defframe(PARAM_CARRY,20)
81
defframe(PARAM_SIZE, 16)
82
defframe(PARAM_SRC2, 12)
83
defframe(PARAM_SRC1, 8)
84
defframe(PARAM_DST, 4)
86
defframe(SAVE_EBP, -4)
87
defframe(SAVE_ESI, -8)
88
defframe(SAVE_EBX, -12)
89
defframe(SAVE_EDI, -16)
90
deflit(STACK_SPACE, 16)
96
C Carry-in entry point.  Loads the caller supplied carry argument into eax
C and jumps to the start label inside M4_function_n, so both entry points
C share one body.
C NOTE(review): the start label itself and the matching EPILOGUE are not
C visible in this excerpt.
PROLOGUE(M4_function_nc)
97
movl PARAM_CARRY, %eax
98
jmp LF(M4_function_n,start)
101
C Entry point without a carry argument: eax is cleared so the shared code
C begins with a zero carry-in.
C NOTE(review): the start label that M4_function_nc jumps to, the load of
C the dst pointer into edi, the loop label with its branch, and the final
C ret are not visible in this excerpt.
PROLOGUE(M4_function_n)
103
xorl %eax, %eax C carry
105
C ecx = size
movl PARAM_SIZE, %ecx
106
C Reserve STACK_SPACE bytes of stack and record the adjustment in FRAME.
subl $STACK_SPACE, %esp
107
deflit(`FRAME',STACK_SPACE)
111
C Compare size with the unroll threshold.  The branch that consumes these
C flags is not visible here.
cmpl $UNROLL_THRESHOLD, %ecx
113
C edx = src2, ebx = src1
movl PARAM_SRC2, %edx
114
movl PARAM_SRC1, %ebx
118
C Step each pointer forward by ecx limbs of 4 bytes.
leal (%ebx,%ecx,4), %ebx
119
leal (%edx,%ecx,4), %edx
121
leal (%edi,%ecx,4), %edi
125
C This loop is in a single 16 byte code block already, so no
126
C alignment necessary.
136
C Per-limb step, indexed by ecx: load a limb from the src1 pointer, apply
C M4_inst with the limb at the src2 pointer, store the result at the dst
C pointer.
movl (%ebx,%ecx,4), %eax
137
M4_inst (%edx,%ecx,4), %eax
138
movl %eax, (%edi,%ecx,4)
147
C Release the STACK_SPACE bytes reserved on entry.
addl $STACK_SPACE, %esp
152
C -----------------------------------------------------------------------------
153
C This is at 0x55, close enough to aligned.
155
C Setup for the unrolled loop: split off the low bit of size, then compute
C an entry address inside the unrolled code and adjust the three pointers
C to match.
C NOTE(review): the section label and several instructions between the ones
C shown here are not visible in this excerpt, so register contents on entry
C to each step cannot be fully confirmed from this view.
deflit(`FRAME',STACK_SPACE)
157
C ecx keeps size with the low bit cleared; the PARAM_SIZE stack slot is
C overwritten with just that low bit.
andl $-2, %ecx C size low bit masked out
158
andl $1, PARAM_SIZE C size low bit kept
164
shrl $UNROLL_LOG2, %ecx
168
andl $UNROLL_MASK, %edi
174
C esi = L(entry) + 9*edi, an address inside the unrolled code.  The register
C notes of the loop section refer to esi as the computed jump.
leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
179
C ebx and edx move forward by 4*edi bytes and edi becomes ebp + 4*edi, each
C with an extra 128 byte bias when UNROLL_BYTES is 256.  The displacements
C generated for the loop body subtract the same 128.
leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
180
leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
181
leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
188
C See mpn/x86/README about old gas bugs
189
C Same 9*edi scaling as above, with the entry address added as the
C difference L(entry)-L(here).
C NOTE(review): presumably an alternative to the direct form above, selected
C by a conditional that is not visible here -- confirm.
leal (%edi,%edi,8), %esi
190
addl $L(entry)-L(here), %esi
196
C -----------------------------------------------------------------------------
203
C esi scratch (was computed jump)
207
C Unrolled loop body and tail.
C
C The src2 pointer in edx is advanced by UNROLL_BYTES with leal, which does
C not modify the flags.
C
C CHUNK_COUNT is 2, so each forloop pass emits code for two limbs: one at
C disp0 through esi and one at disp1 = disp0 + 4 through ebp.  The forloop
C index runs from 0 to UNROLL_COUNT/CHUNK_COUNT-1, giving UNROLL_COUNT limbs
C in total.  disp0 is i*CHUNK_COUNT*4, minus 128 when UNROLL_BYTES is 256.
C
C For each limb the generated code loads from the src1 pointer in ebx,
C applies M4_inst with the operand at the src2 pointer in edx, and stores
C to the dst pointer in edi.
C
C After the generated chunks, ebx and edi are advanced by UNROLL_BYTES, again
C with leal.
C
C NOTE(review): Zdisp is defined outside this file; presumably it controls
C how a zero displacement is encoded so the generated code keeps a fixed
C size -- confirm.  The loop label, the closing of the forloop quote, the
C loop branch, the code around the final M4_inst on ecx, and the ret are not
C visible in this excerpt.
leal UNROLL_BYTES(%edx), %edx
210
deflit(CHUNK_COUNT, 2)
211
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
212
deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
213
deflit(`disp1', eval(disp0 + 4))
215
Zdisp( movl, disp0,(%ebx), %esi)
216
movl disp1(%ebx), %ebp
217
Zdisp( M4_inst,disp0,(%edx), %esi)
218
Zdisp( movl, %esi, disp0,(%edi))
219
M4_inst disp1(%edx), %ebp
220
movl %ebp, disp1(%edi)
224
leal UNROLL_BYTES(%ebx), %ebx
225
leal UNROLL_BYTES(%edi), %edi
237
M4_inst UNROLL_BYTES(%edx), %ecx
246
addl $STACK_SPACE, %esp