1
dnl AMD K7 mpn_gcd_1 -- mpn by 1 gcd.
3
dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
5
dnl This file is part of the GNU MP Library.
7
dnl The GNU MP Library is free software; you can redistribute it and/or
8
dnl modify it under the terms of the GNU Lesser General Public License as
9
dnl published by the Free Software Foundation; either version 2.1 of the
10
dnl License, or (at your option) any later version.
12
dnl The GNU MP Library is distributed in the hope that it will be useful,
13
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
dnl Lesser General Public License for more details.
17
dnl You should have received a copy of the GNU Lesser General Public
18
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20
dnl Suite 330, Boston, MA 02111-1307, USA.
22
include(`../config.m4')
25
C K7: 6.75 cycles/bit (approx) 1x1 gcd
26
C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
29
dnl Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y,
30
dnl where x is the larger of the two. See tune/README for more.
32
dnl divl at 40 cycles compared to the gcd at about 7 cycles/bitpair
33
dnl suggests a threshold of 40/7*2=11.4, but 7 seems to be about right.
35
deflit(DIV_THRESHOLD, 7)
38
C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
40
C This is mixed in with the code, but as per the k7 optimization manual it's
41
C a full cache line and suitably aligned so it won't get swapped between
42
C code and data. Having it in TEXT rather than RODATA saves needing a GOT
C entry when PIC.
45
C Actually, there doesn't seem to be a measurable difference between this in
46
C its own cache line or plonked in the middle of the code. Presumably
47
C since TEXT is read-only there are no worries about coherency.
57
` .byte m4_count_trailing_zeros(i)
61
C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb);
64
defframe(PARAM_LIMB, 12)
65
defframe(PARAM_SIZE, 8)
66
defframe(PARAM_SRC, 4)
68
defframe(SAVE_EBX, -4)
69
defframe(SAVE_ESI, -8)
70
defframe(SAVE_EDI, -12)
71
defframe(SAVE_EBP, -16)
72
defframe(CALL_DIVISOR,-20)
73
defframe(CALL_SIZE, -24)
74
defframe(CALL_SRC, -28)
76
deflit(STACK_SPACE, 28)
84
ASSERT(ne, `cmpl $0, PARAM_LIMB') C y!=0
85
ASSERT(ae, `cmpl $1, PARAM_SIZE') C size>=1
89
subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
94
movl (%eax), %esi C src low limb
98
call L(movl_eip_to_edi)
100
addl $L(table)-L(here), %edi
110
jnc L(twos) C 3/4 chance of x or y odd already
114
movl %ecx, %esi C common twos
116
movl PARAM_SIZE, %ecx
132
cmovb( %ebx, %eax) C swap to make x bigger than y
145
ASSERT(nz,`orl %ebx,%ebx')
161
shrl $DIV_THRESHOLD, %eax
165
ja L(strip_x_entry) C do x%y if x much bigger than y
173
movl %edx, %eax C remainder -> x
180
C Offset 0x9D here for non-PIC. About 0.4 cycles/bit is saved by
181
C ensuring the end of the jnz at the end of this loop doesn't cross
182
C into the next cache line at 0xC0.
184
C PIC on the other hand is offset 0xAC here and extends to 0xC9, so
185
C it crosses but doesn't suffer any measurable slowdown.
192
C esi twos, for use at end
195
cmovc( %ebx, %ecx) C if x-y gave carry, use x and y-x
203
ASSERT(nz, `orl %eax, %eax')
206
movb (%ecx,%edi), %cl
208
movb L(table) (%ecx), %cl
218
ASSERT(nz, `testl $1, %eax') C both odd
219
ASSERT(nz, `testl $1, %edx')
241
C -----------------------------------------------------------------------------
244
dnl MODEXACT_THRESHOLD is the size at which it's better to call
245
dnl mpn_modexact_1_odd than do an inline loop.
247
deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5))
259
ASSERT(nz,`orl %edx,%edx')
261
jnc L(divide_strip_y)
262
leal 1(%edx,%edx), %ebx C y now odd
266
movl -4(%eax,%ecx,4), %eax C src high limb
268
cmp $MODEXACT_THRESHOLD, %ecx
271
cmpl %ebx, %eax C high cmp divisor
274
cmovc( %eax, %edx) C skip a div if high<divisor
279
C eax scratch (quotient)
281
C ecx counter (size to 1, inclusive)
282
C edx carry (remainder)
287
movl -4(%ebp,%ecx,4), %eax
330
movl %edi, %ebx C L(table)
332
addl $_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx
333
movl %ebp, CALL_DIVISOR
336
call GSYM_PREFIX`'mpn_modexact_1_odd@PLT
339
movl %ebx, CALL_DIVISOR
343
call GSYM_PREFIX`'mpn_modexact_1_odd
355
movl ifdef(`PIC',`%ebp',`%ebx'), %edx