1
dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
3
dnl K7: 3.9 cycles/limb.
5
dnl Future: It should be possible to avoid the separate mul after the
6
dnl unrolled loop by moving the movl/adcl to the top.
9
dnl Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
11
dnl This file is part of the GNU MP Library.
13
dnl The GNU MP Library is free software; you can redistribute it and/or
14
dnl modify it under the terms of the GNU Lesser General Public License as
15
dnl published by the Free Software Foundation; either version 2.1 of the
16
dnl License, or (at your option) any later version.
18
dnl The GNU MP Library is distributed in the hope that it will be useful,
19
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
20
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
dnl Lesser General Public License for more details.
23
dnl You should have received a copy of the GNU Lesser General Public
24
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
25
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
26
dnl Suite 330, Boston, MA 02111-1307, USA.
29
include(`../config.m4')
32
dnl UNROLL_COUNT is the number of limbs handled by one pass of the unrolled
dnl loop: the loop body is generated as UNROLL_COUNT/CHUNK_COUNT chunks of
dnl CHUNK_COUNT limbs each.
dnl
dnl K7: UNROLL_COUNT cycles/limb
dnl NOTE(review): the line above reads like the heading of a timing table
dnl whose rows are not visible here; the file header quotes 3.9 cycles/limb
dnl -- confirm against the complete file.
38
dnl Maximum possible with the current code is 64.
40
deflit(UNROLL_COUNT, 16)
43
dnl Select the flavour this file is built as.  If OPERATION_addmul_1 is
dnl defined the mpn_addmul_1 and mpn_addmul_1c names are used, otherwise if
dnl OPERATION_submul_1 is defined the mpn_submul_1 and mpn_submul_1c names
dnl are used, otherwise m4_error is invoked.
dnl
dnl   M4_function_1    entry point without a carry-in parameter
dnl   M4_function_1c   entry point with a carry-in parameter
dnl   M4_description   wording for the operation description comment
dnl   M4_desc_retval   wording for the return value comment
dnl
dnl NOTE(review): M4_inst is used by the code further down but no define of
dnl it is visible in either branch here, and the end of the m4_error message
dnl together with the closing of both ifdefs is not visible either --
dnl confirm against the complete file.
ifdef(`OPERATION_addmul_1',`
45
define(M4_function_1, mpn_addmul_1)
46
define(M4_function_1c, mpn_addmul_1c)
47
define(M4_description, add it to)
48
define(M4_desc_retval, carry)
49
',`ifdef(`OPERATION_submul_1',`
51
define(M4_function_1, mpn_submul_1)
52
define(M4_function_1c, mpn_submul_1c)
53
define(M4_description, subtract it from)
54
define(M4_desc_retval, borrow)
55
',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
58
C Names of the four entry points this file can provide, two per flavour.
C NOTE(review): presumably this list is read by the build system -- confirm.
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
61
C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                          mp_limb_t mult);
63
C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
64
C mp_limb_t mult, mp_limb_t carry);
66
C Calculate src,size multiplied by mult and M4_description dst,size.
67
C Return the M4_desc_retval limb from the top of the result.
70
dnl UNROLL_THRESHOLD is compared against the size register in the
dnl M4_function_1c setup code, presumably to choose between the simple loop
dnl and the unrolled loop (the branch itself is not visible here).
dnl
dnl NOTE(review): two unconditional definitions appear back to back, so
dnl only one value can take effect.  The two different L(entry) address
dnl computations further down suggest a PIC / non-PIC conditional that once
dnl selected between 9 and 6 -- confirm against the complete file.
deflit(UNROLL_THRESHOLD, 9)
72
deflit(UNROLL_THRESHOLD, 6)
75
dnl Stack parameters.  The offsets 4, 8, 12, 16, 20 follow the prototype
dnl order dst, src, size, mult, carry.  PARAM_CARRY appears only in the
dnl M4_function_1c prototype.
defframe(PARAM_CARRY, 20)
76
defframe(PARAM_MULTIPLIER,16)
77
defframe(PARAM_SIZE, 12)
78
defframe(PARAM_SRC, 8)
79
defframe(PARAM_DST, 4)
82
dnl Slots named for saving ebx, esi, edi and ebp, at negative offsets.  The
dnl code makes room for them by subtracting SAVE_SIZE from esp and releases
dnl the space with the matching addl.
dnl NOTE(review): the definition of SAVE_SIZE is not visible here; four
dnl 4-byte slots suggest 16 -- confirm.
defframe(SAVE_EBX, -4)
83
defframe(SAVE_ESI, -8)
84
defframe(SAVE_EDI, -12)
85
defframe(SAVE_EBP, -16)
90
C M4_function_1: entry point with no carry-in parameter.
C
C NOTE(review): only two instructions of this routine are visible.  The jnz
C goes to the start_1 label named via LF for M4_function_1c; the instruction
C that sets the flags it tests is not shown.  On the fall-through path mull
C forms the 64-bit product edx:eax = eax * PARAM_MULTIPLIER.
PROLOGUE(M4_function_1)
96
jnz LF(M4_function_1c,start_1)
101
mull PARAM_MULTIPLIER
111
C M4_function_1c: entry point taking an initial carry limb in PARAM_CARRY.
C
C NOTE(review): several instructions of this routine are not visible in
C this excerpt, including the tests feeding the conditional jumps, the
C register saves and the loop branch; the comments below cover only what
C is shown.
PROLOGUE(M4_function_1c)
112
C edx = size
movl PARAM_SIZE, %edx
116
C going by the label name, sizes above one limb branch to the general code
jnz L(more_than_one_limb)
121
C fall-through case: edx:eax = eax * multiplier
mull PARAM_MULTIPLIER
123
C add the initial carry into the low product limb
addl PARAM_CARRY, %eax
134
C offset 0x44 so close enough to aligned
135
L(more_than_one_limb):
136
C ecx = initial carry
movl PARAM_CARRY, %ecx
141
C make room below esp for the register save slots
subl $SAVE_SIZE, %esp
146
movl %edx, %ebx C size-1
150
C compare against the unrolling threshold; the conditional jump using the
C result is not visible here
cmpl $UNROLL_THRESHOLD, %edx
152
C ebp = multiplier
movl PARAM_MULTIPLIER, %ebp
155
movl (%esi), %eax C src low limb
162
C esi and edi are moved to the high end of src and dst so that ebx can
C serve as a limb index relative to them
leal 4(%esi,%ebx,4), %esi C point one limb past last
163
leal (%edi,%ebx,4), %edi C point at last limb
166
C The movl to load the next source limb is done well ahead of the
167
C mul. This is necessary for full speed, and leads to one limb
168
C handled separately at the end.
184
C apply ecx to the dst limb at index ebx
M4_inst %ecx, (%edi,%ebx,4)
185
C fetch the src limb at index ebx ahead of the mul that will use it
movl (%esi,%ebx,4), %eax
206
C release the register save area
addl $SAVE_SIZE, %esp
212
C -----------------------------------------------------------------------------
C Setup for the unrolled loop: work out the loop counter and the address
C within the unrolled code at which to start (the computed jump).
C
C NOTE(review): instructions are missing from this excerpt, including the
C label that begins the section and whatever sets eax and edx before they
C are used; comments below describe only the visible lines.
223
dnl VAR_COUNTER and VAR_JUMP are stack variables overlapping with parameters
dnl no longer needed: they reuse the PARAM_SIZE and PARAM_MULTIPLIER slots.
224
define(VAR_COUNTER,`PARAM_SIZE')
225
define(VAR_JUMP, `PARAM_MULTIPLIER')
227
subl $2, %ebx C (size-2)-1
230
C scale down by the unroll factor; the result is stored as the loop counter
shrl $UNROLL_LOG2, %ebx
233
movl %ebx, VAR_COUNTER
234
C keep only the bits of edx selected by UNROLL_MASK
andl $UNROLL_MASK, %edx
243
C edx = L(entry) + edx + ebx, presumably the address to start at within
C the unrolled code
leal L(entry) (%edx,%ebx,1), %edx
250
addl %eax, %ecx C initial carry, becomes low carry
254
movl 4(%esi), %eax C src second limb
255
C esi += 4*ebx + 8 and edi += 4*ebx, each with a 128 byte bias when
C UNROLL_BYTES is 256, matching the -128 in the loop displacements
leal ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi
256
leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebx,4), %edi
258
movl %edx, %ebx C high carry
259
cmovnz( %ecx, %ebx) C high,low carry other way around
267
C See mpn/x86/README about old gas bugs
C NOTE(review): the two instructions below form the entry address relative
C to L(here) instead of using L(entry) directly as above; presumably the
C two are PIC and non-PIC alternatives whose selecting conditional is not
C visible here -- confirm.
268
leal (%edx,%ebx,1), %edx
269
addl $L(entry)-L(here), %edx
275
C -----------------------------------------------------------------------------
276
C This code uses a "two carry limbs" scheme. At the top of the loop the
277
C carries are ebx=lo, ecx=hi, then they swap for each limb processed. For
278
C the computed jump an odd size means they start one way around, an even
279
C size the other. Either way one limb is handled separately at the start of
C the loop.
C
282
C The positioning of the movl to load the next source limb is important.
283
C Moving it after the adcl with a view to avoiding a separate mul at the end
284
C of the loop slows the code down.
296
C VAR_COUNTER loop counter
C
C The unrolled body is generated by forloop as UNROLL_COUNT/CHUNK_COUNT
C chunks.  Each chunk touches two limbs, at byte displacements disp0 and
C disp1 = disp0 + 4 from esi and edi, with the carry registers ecx and ebx
C used alternately.  disp0 carries a -128 bias when UNROLL_BYTES is 256.
C Zdisp wraps the disp0 instructions, presumably because disp0 can be
C zero -- confirm.
C
C NOTE(review): the mul and adc instructions of each chunk, the loop
C branch and the closing of the forloop body are not visible in this
C excerpt.
301
deflit(CHUNK_COUNT,2)
302
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
303
deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
304
deflit(`disp1', eval(disp0 + 4))
308
Zdisp( M4_inst,%ecx, disp0,(%edi))
313
Zdisp( movl, disp0,(%esi), %eax)
319
M4_inst %ebx, disp1(%edi)
324
movl disp1(%esi), %eax
329
C advance src and dst pointers by UNROLL_BYTES
leal UNROLL_BYTES(%esi), %esi
330
leal UNROLL_BYTES(%edi), %edi
340
C edi dst (points at second last limb)
342
deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
343
deflit(`disp1', eval(disp0-0 + 4))
347
C apply ecx and eax to the two dst limbs at disp0 and disp1
M4_inst %ecx, disp0(%edi)
355
M4_inst %eax, disp1(%edi)
359
C release the register save area
addl $SAVE_SIZE, %esp