1
dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
3
dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
5
dnl This file is part of the GNU MP Library.
7
dnl The GNU MP Library is free software; you can redistribute it and/or
8
dnl modify it under the terms of the GNU Lesser General Public License as
9
dnl published by the Free Software Foundation; either version 2.1 of the
10
dnl License, or (at your option) any later version.
12
dnl The GNU MP Library is distributed in the hope that it will be useful,
13
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
dnl Lesser General Public License for more details.
17
dnl You should have received a copy of the GNU Lesser General Public
18
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20
dnl Suite 330, Boston, MA 02111-1307, USA.
22
include(`../config.m4')
25
C Pentium4: 1.0 cycles/limb
28
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
32
C There might be a couple of cycles to save by using plain integer code for
33
C more small sizes. 2 limbs measures about 20 cycles, but 3 limbs jumps to
34
C about 46 (inclusive of some function call overheads).
36
dnl Frame names for the two arguments listed in the prototype comment
dnl earlier in this file (src first, then size).  defframe is supplied by
dnl the included m4 configuration and is not defined here; its second
dnl argument is presumably the byte offset of the argument on the stack
dnl at function entry -- TODO confirm against the defframe definition.
defframe(PARAM_SIZE, 8)
37
defframe(PARAM_SRC, 4)
39
dnl re-use parameter space
dnl SAVE_EBX is an alias for PARAM_SRC and SAVE_ESI is an alias for
dnl PARAM_SIZE, so register saves land in the incoming argument slots
dnl rather than in newly allocated stack space.
dnl NOTE(review): this is only safe if each parameter is read before its
dnl slot is overwritten; the loads and saves are not visible in this
dnl excerpt, so verify the ordering in the function body.
40
define(SAVE_EBX, `PARAM_SRC')
41
define(SAVE_ESI, `PARAM_SIZE')
45
PROLOGUE(mpn_mod_34lsub1)
58
shrl $24, %eax C src[0] high
60
andl $0x00FFFFFF, %ecx C src[0] low
66
shrl $16, %ecx C src[1] low
69
andl $0x00FFFF00, %edx C src[1] high
82
psrlq $32, %mm7 C 0x00000000FFFFFFFF, low 32 bits
85
psrlq $40, %mm6 C 0x0000000000FFFFFF, low 24 bits
90
C ecx counter, size-2 to 0, -1 or -2
91
C edx src, incrementing
99
C mm6 0x0000000000FFFFFF
100
C mm7 0x00000000FFFFFFFF
116
C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
119
js L(combine) C 0 more
124
jz L(combine) C 1 more
130
movq %mm7, %mm3 C low halves
139
psrlq $32, %mm0 C high halves
143
paddq %mm0, %mm4 C fold high halves to give 33 bits each
147
psllq $8, %mm4 C combine at respective offsets
150
paddq %mm5, %mm3 C 0x000cxxxxxxxxxxxx, 50 bits
152
pand %mm3, %mm6 C fold at 24 bits
158
ASSERT(z, C nothing left in high dword