1
dnl AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
3
dnl K7: 1.0 cycles/limb
5
dnl Copyright 2000, 2001 Free Software Foundation, Inc.
7
dnl This file is part of the GNU MP Library.
9
dnl The GNU MP Library is free software; you can redistribute it and/or
10
dnl modify it under the terms of the GNU Lesser General Public License as
11
dnl published by the Free Software Foundation; either version 2.1 of the
12
dnl License, or (at your option) any later version.
14
dnl The GNU MP Library is distributed in the hope that it will be useful,
15
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
16
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17
dnl Lesser General Public License for more details.
19
dnl You should have received a copy of the GNU Lesser General Public
20
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
21
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
22
dnl Suite 330, Boston, MA 02111-1307, USA.
24
include(`../config.m4')
27
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
29
C The loop form below and the 64 byte code alignment seem necessary for the
30
C claimed speed. This is a bit strange, since normally k7 isn't very
31
C sensitive to such things. Perhaps there has to be 6 instructions in the
32
C first 16 bytes for the BTB entry or something.
34
defframe(PARAM_SIZE, 8)
35
defframe(PARAM_SRC, 4)
37
dnl re-use parameter space
38
define(SAVE_EDI, `PARAM_SIZE')
42
PROLOGUE(mpn_mod_34lsub1)
56
shrl $24, %eax C src[0] high
58
andl $0xFFFFFF, %edx C src[0] low
63
shrl $16, %edx C src[1] high
66
shll $8, %ecx C src[1] low
82
pushl %ebx FRAME_pushl()
87
pushl %esi FRAME_pushl()
88
xorl %esi, %esi C and clear carry flag
91
C code offset 0x40 at this point
123
C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
126
movl $0xFFFFFFFF, %edi
131
movl $0xFFFFFF00, %edi
135
movl $0xFFFF0000, %edi
147
sbbl %ecx, %ecx C carry
148
movl %eax, %edx C 0mod3
149
shrl $24, %eax C 0mod3 high
151
andl %edi, %ecx C carry masked
152
andl $0x00FFFFFF, %edx C 0mod3 low
153
movl %ebx, %edi C 1mod3
155
subl %ecx, %eax C apply carry
156
shrl $16, %ebx C 1mod3 high
159
addl %edx, %eax C apply 0mod3 low
160
movl %esi, %edx C 2mod3
161
shll $8, %edi C 1mod3 low
163
addl %ebx, %eax C apply 1mod3 high
164
shrl $8, %esi C 2mod3 high
165
andl $0xFF, %edx C 2mod3 low
167
addl %edi, %eax C apply 1mod3 low
168
shll $16, %edx C 2mod3 low
170
addl %esi, %eax C apply 2mod3 high
171
popl %esi FRAME_popl()
174
addl %edx, %eax C apply 2mod3 low
175
popl %ebx FRAME_popl()