1
dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
3
dnl P6: 6.35 cycles/limb (at 16 limbs/loop).
6
dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8
dnl This file is part of the GNU MP Library.
10
dnl The GNU MP Library is free software; you can redistribute it and/or
11
dnl modify it under the terms of the GNU Lesser General Public License as
12
dnl published by the Free Software Foundation; either version 2.1 of the
13
dnl License, or (at your option) any later version.
15
dnl The GNU MP Library is distributed in the hope that it will be useful,
16
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18
dnl Lesser General Public License for more details.
20
dnl You should have received a copy of the GNU Lesser General Public
21
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23
dnl Suite 330, Boston, MA 02111-1307, USA.
26
include(`../config.m4')
29
dnl P6 UNROLL_COUNT cycles/limb
34
dnl Maximum possible with the current code is 64.
36
deflit(UNROLL_COUNT, 16)
39
dnl Select the add or subtract flavour of this file. Depending on whether
dnl OPERATION_addmul_1 or OPERATION_submul_1 is defined, M4_function_1 and
dnl M4_function_1c become the exported function names, and M4_description
dnl and M4_desc_retval supply the wording used in the header comments.
dnl If neither symbol is defined, m4_error is invoked.
dnl NOTE(review): M4_inst is used by the code further down but no define
dnl for it appears in the branches below -- confirm where it is set.
ifdef(`OPERATION_addmul_1', `
41
define(M4_function_1, mpn_addmul_1)
42
define(M4_function_1c, mpn_addmul_1c)
43
define(M4_description, add it to)
44
define(M4_desc_retval, carry)
45
',`ifdef(`OPERATION_submul_1', `
47
define(M4_function_1, mpn_submul_1)
48
define(M4_function_1c, mpn_submul_1c)
49
define(M4_description, subtract it from)
50
define(M4_desc_retval, borrow)
51
',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
54
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
57
C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t mult);
59
C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
60
C mp_limb_t mult, mp_limb_t carry);
62
C Calculate src,size multiplied by mult and M4_description dst,size.
63
C Return the M4_desc_retval limb from the top of the result.
65
C This code is pretty much the same as the K6 code. The unrolled loop is
66
C the same, but there are just a few scheduling tweaks in the setups and the simple loop.
69
C A number of variations have been tried for the unrolled loop, with one or
70
C two carries, and with loads scheduled earlier, but nothing faster than 6
71
C cycles/limb has been found.
74
dnl UNROLL_THRESHOLD is the limb count that the size argument is compared
dnl against at the start of M4_function_1. Both definitions below set the
dnl same value.
dnl NOTE(review): presumably sizes below the threshold avoid the unrolled
dnl loop -- confirm against the branch that follows the comparison.
deflit(UNROLL_THRESHOLD, 5)
76
deflit(UNROLL_THRESHOLD, 5)
79
dnl Names for the five argument slots, listed from the highest offset down.
dnl The offsets step by 4 from PARAM_DST at 4 up to PARAM_CARRY at 20, in
dnl the same order as the arguments in the prototype comment (dst, src,
dnl size, mult, carry). PARAM_CARRY is loaded by the M4_function_1c entry
dnl point.
defframe(PARAM_CARRY, 20)
80
defframe(PARAM_MULTIPLIER,16)
81
defframe(PARAM_SIZE, 12)
82
defframe(PARAM_SRC, 8)
83
defframe(PARAM_DST, 4)
88
C Carry-in entry point. The caller-supplied carry argument is loaded into
C %ebx, the register the plain entry point clears, and control then jumps
C to the start_nc label of M4_function_1.
PROLOGUE(M4_function_1c)
91
movl PARAM_CARRY, %ebx
92
jmp LF(M4_function_1,start_nc)
95
C Plain entry point. There is no carry argument here, so the initial carry
C held in %ebx is zero.
PROLOGUE(M4_function_1)
98
xorl %ebx, %ebx C initial carry
101
C Limb count goes in %ecx and is compared with UNROLL_THRESHOLD.
movl PARAM_SIZE, %ecx
112
cmpl $UNROLL_THRESHOLD, %ecx
114
C Multiplier goes in %ebp. movl does not change the flags, so the result
C of the cmpl above is still available after this load.
movl PARAM_MULTIPLIER, %ebp
119
C this is offset 0x22, so close enough to aligned
137
M4_inst %eax, -4(%edi)
158
C------------------------------------------------------------------------------
159
C VAR_JUMP holds the computed jump temporarily because there's not enough
160
C registers when doing the mul for the initial two carry limbs.
162
C The add/adc for the initial carry in %ebx is necessary only for the
163
C mpn_add/submul_1c entry points. Duplicating the startup code to
164
C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good idea.
167
dnl overlapping with parameters already fetched
dnl VAR_COUNTER is an alias for the PARAM_SIZE slot and VAR_JUMP is an alias
dnl for the PARAM_DST slot, so writing either variable overwrites the
dnl corresponding argument on the stack.
168
define(VAR_COUNTER,`PARAM_SIZE')
169
define(VAR_JUMP, `PARAM_DST')
171
C this is offset 0x43, so close enough to aligned
187
C Split the count in two: %edx shifted right by UNROLL_LOG2 is stored as
C the loop counter, and %ecx is reduced with UNROLL_MASK.
shrl $UNROLL_LOG2, %edx
188
andl $UNROLL_MASK, %ecx
190
C VAR_COUNTER occupies the PARAM_SIZE stack slot.
movl %edx, VAR_COUNTER
193
C 15 code bytes per limb
201
C Form a code address relative to L(entry) from %edx and %ecx.
C NOTE(review): presumably this is the computed jump kept in VAR_JUMP;
C confirm against the instructions that scale %edx and %ecx.
leal L(entry) (%edx,%ecx,1), %edx
203
movl (%esi), %eax C src low limb
206
C Move %esi on by %ecx limbs plus 4 bytes; when UNROLL_BYTES is 256 a
C further 128 is added, matching the -128 applied to the displacements
C in the unrolled loop.
leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
210
addl %ebx, %eax C initial carry (from _1c)
213
movl %edx, %ebx C high carry
214
C Move %edi on by %ecx limbs, again with 128 added when UNROLL_BYTES is 256.
leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
218
movl %eax, %ecx C low carry
220
cmovnz( %ebx, %ecx) C high,low carry other way around
231
C See README.family about old gas bugs
232
C Same %edx plus %ecx sum as above, but L(entry) is applied as the
C difference L(entry)-L(here) instead of as an absolute address.
leal (%edx,%ecx,1), %edx
233
addl $L(entry)-L(here), %edx
241
C -----------------------------------------------------------
253
C VAR_COUNTER loop counter
255
C 15 code bytes per limb
257
C Destination pointer advances by one full unrolled block.
addl $UNROLL_BYTES, %edi
260
C The forloop below generates UNROLL_COUNT/CHUNK_COUNT copies of its body.
C Each copy covers CHUNK_COUNT (2) limbs, at byte displacements disp0 and
C disp1 = disp0 + 4 from %esi (loads) and %edi (M4_inst updates). When
C UNROLL_BYTES is 256 every displacement is reduced by 128.
C NOTE(review): the -128 presumably keeps the displacements within a
C signed byte -- confirm.
deflit(CHUNK_COUNT,2)
261
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
262
deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
263
deflit(`disp1', eval(disp0 + 4))
265
Zdisp( movl, disp0,(%esi), %eax)
267
Zdisp( M4_inst,%ecx, disp0,(%edi))
272
movl disp1(%esi), %eax
274
M4_inst %ebx, disp1(%edi)
281
C Source pointer advances by one full unrolled block. leal is an address
C calculation and leaves the flags unchanged.
leal UNROLL_BYTES(%esi), %esi
286
deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
288
M4_inst %ecx, disp0(%edi)