1
dnl Intel Pentium-4 mpn_divexact_by3 -- mpn exact division by 3.
3
dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
5
dnl This file is part of the GNU MP Library.
7
dnl The GNU MP Library is free software; you can redistribute it and/or
8
dnl modify it under the terms of the GNU Library General Public License as
9
dnl published by the Free Software Foundation; either version 2 of the
10
dnl License, or (at your option) any later version.
12
dnl The GNU MP Library is distributed in the hope that it will be useful,
13
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
dnl Library General Public License for more details.
17
dnl You should have received a copy of the GNU Library General Public
18
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20
dnl Suite 330, Boston, MA 02111-1307, USA.
22
include(`../config.m4')
25
C P4: 18.0 cycles/limb
28
C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                              mp_limb_t carry);
31
C The dependent chain in the loop is as follows, and this is what the code
34
C psubq (src-cbit) - climb 2
43
C Perhaps the s*inverse can be taken off the dependent chain as described in
44
C mpn/generic/diveby3.c, with a modified 3*q calculation that can give
48
C Named frame offsets for the four arguments.  The offsets step by 4 bytes
C in the order dst (4), src (8), size (12), carry (16), so carry is the last
C argument.
C NOTE(review): defframe itself is not defined in this file, presumably it
C comes from the included m4 definitions -- confirm what base the offsets
C are taken relative to.
defframe(PARAM_CARRY,16)
49
defframe(PARAM_SIZE, 12)
50
defframe(PARAM_SRC, 8)
51
defframe(PARAM_DST, 4)
54
C 0xAAAAAAAB is the multiplicative inverse of 3, modulo 2^32
C (3 * 0xAAAAAAAB = 2^33 + 1, which is 1 mod 2^32)
62
PROLOGUE(mpn_divexact_by3c)
68
movd PARAM_CARRY, %mm1
71
movl $0xAAAAAAAB, %edx
75
psrlq $32, %mm6 C 0x00000000FFFFFFFF
80
C eax src, incrementing
82
C ecx counter, limbs, decrementing
83
C edx dst, incrementing
87
C mm6 0x00000000FFFFFFFF
93
psubq %mm0, %mm2 C src - cbit
95
psubq %mm1, %mm2 C src - cbit - climb
97
psrlq $63, %mm0 C new cbit
99
pmuludq %mm7, %mm2 C s*inverse
100
movd %mm2, (%edx) C q