1
dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
3
dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
5
dnl This file is part of the GNU MP Library.
7
dnl The GNU MP Library is free software; you can redistribute it and/or
8
dnl modify it under the terms of the GNU Lesser General Public License as
9
dnl published by the Free Software Foundation; either version 2.1 of the
10
dnl License, or (at your option) any later version.
12
dnl The GNU MP Library is distributed in the hope that it will be useful,
13
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
dnl Lesser General Public License for more details.
17
dnl You should have received a copy of the GNU Lesser General Public
18
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20
dnl Suite 330, Boston, MA 02111-1307, USA.
22
include(`../config.m4')
25
C P6: 6.35 cycles/limb (at 16 limbs/loop).
28
dnl P6 UNROLL_COUNT cycles/limb
33
dnl Maximum possible with the current code is 64.
35
deflit(UNROLL_COUNT, 16)
38
ifdef(`OPERATION_addmul_1', `
40
define(M4_function_1, mpn_addmul_1)
41
define(M4_function_1c, mpn_addmul_1c)
42
define(M4_description, add it to)
43
define(M4_desc_retval, carry)
44
',`ifdef(`OPERATION_submul_1', `
46
define(M4_function_1, mpn_submul_1)
47
define(M4_function_1c, mpn_submul_1c)
48
define(M4_description, subtract it from)
49
define(M4_desc_retval, borrow)
50
',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
53
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
56
C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
58
C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
59
C mp_limb_t mult, mp_limb_t carry);
61
C Calculate src,size multiplied by mult and M4_description dst,size.
62
C Return the M4_desc_retval limb from the top of the result.
64
C This code is pretty much the same as the K6 code. The unrolled loop is
65
C the same, but there are just a few scheduling tweaks in the setups and the
68
C A number of variations have been tried for the unrolled loop, with one or
69
C two carries, and with loads scheduled earlier, but nothing faster than 6
70
C cycles/limb has been found.
73
deflit(UNROLL_THRESHOLD, 5)
75
deflit(UNROLL_THRESHOLD, 5)
78
dnl  Frame offsets for the five arguments, listed from last to first:
dnl  carry, mult, size, src, dst (the C prototype comment gives them as
dnl  dst, src, size, mult, carry).  The offsets step by 4, from 4 up to 20.
dnl  NOTE(review): defframe is defined outside this file; presumably these
dnl  are byte offsets relative to the stack pointer at entry -- confirm.
dnl  PARAM_CARRY is loaded into %ebx only after PROLOGUE(M4_function_1c);
dnl  the plain M4_function_1 entry zeroes %ebx instead.
defframe(PARAM_CARRY, 20)
79
defframe(PARAM_MULTIPLIER,16)
80
defframe(PARAM_SIZE, 12)
81
defframe(PARAM_SRC, 8)
82
defframe(PARAM_DST, 4)
87
PROLOGUE(M4_function_1c)
90
movl PARAM_CARRY, %ebx
94
PROLOGUE(M4_function_1)
97
xorl %ebx, %ebx C initial carry
100
movl PARAM_SIZE, %ecx
111
cmpl $UNROLL_THRESHOLD, %ecx
113
movl PARAM_MULTIPLIER, %ebp
118
C this is offset 0x22, so close enough to aligned
136
M4_inst %eax, -4(%edi)
157
C------------------------------------------------------------------------------
158
C VAR_JUMP holds the computed jump temporarily because there are not enough
159
C registers when doing the mul for the initial two carry limbs.
161
C The add/adc for the initial carry in %ebx is necessary only for the
162
C mpn_add/submul_1c entry points. Duplicating the startup code to
163
C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
166
dnl overlapping with parameters already fetched
167
define(VAR_COUNTER,`PARAM_SIZE')
168
define(VAR_JUMP, `PARAM_DST')
170
C this is offset 0x43, so close enough to aligned
186
shrl $UNROLL_LOG2, %edx
187
andl $UNROLL_MASK, %ecx
189
movl %edx, VAR_COUNTER
192
C 15 code bytes per limb
200
leal L(entry) (%edx,%ecx,1), %edx
202
movl (%esi), %eax C src low limb
205
leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
209
addl %ebx, %eax C initial carry (from _1c)
212
movl %edx, %ebx C high carry
213
leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
217
movl %eax, %ecx C low carry
219
cmovnz( %ebx, %ecx) C high,low carry other way around
230
C See mpn/x86/README about old gas bugs
231
leal (%edx,%ecx,1), %edx
232
addl $L(entry)-L(here), %edx
240
C -----------------------------------------------------------
252
C VAR_COUNTER loop counter
254
C 15 code bytes per limb
256
addl $UNROLL_BYTES, %edi
259
deflit(CHUNK_COUNT,2)
260
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
261
deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
262
deflit(`disp1', eval(disp0 + 4))
264
Zdisp( movl, disp0,(%esi), %eax)
266
Zdisp( M4_inst,%ecx, disp0,(%edi))
271
movl disp1(%esi), %eax
273
M4_inst %ebx, disp1(%edi)
280
leal UNROLL_BYTES(%esi), %esi
285
deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
287
M4_inst %ecx, disp0(%edi)