2
; Copyright 2001, 2002 Free Software Foundation, Inc.
4
; This file is part of the GNU MP Library.
6
; The GNU MP Library is free software; you can redistribute it and/or
7
; modify it under the terms of the GNU Lesser General Public License as
8
; published by the Free Software Foundation; either version 2.1 of the
9
; License, or (at your option) any later version.
11
; The GNU MP Library is distributed in the hope that it will be useful,
12
; but WITHOUT ANY WARRANTY; without even the implied warranty of
13
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
; Lesser General Public License for more details.
16
; You should have received a copy of the GNU Lesser General Public
17
; License along with the GNU MP Library; see the file COPYING.LIB. If
18
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
19
; Suite 330, Boston, MA 02111-1307, USA.
21
; Translation of AT&T syntax code by Brian Gladman
23
; NOTE(review): the bare numeric lines in this section (25, 26, 29, ...) look
; like line-number residue from a text extraction and are not NASM source;
; confirm against the upstream file before assembling.
;
; Shared include file. The `frame` symbol used by the %defines below is not
; defined anywhere in this excerpt; presumably it comes from this include or
; from code outside this view -- TODO confirm.
%include "..\x86i.inc"
25
; Byte table defined in another module; the setup code indexes it to fetch
; an 8-bit starting inverse.
extern ___gmp_modlimb_invert_table
26
; The routine's symbol is both made global and exported.
global ___gmpn_divexact_1
29
export ___gmpn_divexact_1
32
; Argument slots, addressed at positive offsets from esp+frame, 4 bytes apart.
; PARAM_DIVISOR is also written by the setup code, not only read.
%define PARAM_DIVISOR esp+frame+16
33
%define PARAM_SIZE esp+frame+12
34
%define PARAM_SRC esp+frame+8
35
%define PARAM_DST esp+frame+4
37
; Slots at negative offsets from esp+frame. Going by their names these hold
; the saved ebx/esi/edi/ebp, but the save/restore instructions are not in
; this excerpt -- TODO confirm.
%define SAVE_EBX esp+frame-4
38
%define SAVE_ESI esp+frame-8
39
%define SAVE_EDI esp+frame-12
40
%define SAVE_EBP esp+frame-16
41
; Local slot read by the `imul eax,[VAR_INVERSE]` instructions in the loops.
%define VAR_INVERSE esp+frame-20
42
; 20 bytes = the five dword slots above (offsets -4 .. -20). The instruction
; that reserves this space is not visible here.
%define STACK_SPACE 20
50
; Body fragments of ___gmpn_divexact_1.
;
; NOTE(review): this excerpt is incomplete. The bare numeric lines look like
; line-number residue from a text extraction, and the gaps between them hide
; labels, register saves and other instructions. Comments below describe
; only what the visible instructions do; anything depending on hidden lines
; is marked as an assumption.
;
; Setup: strip the factors of two from the divisor and look up a starting
; inverse.
mov eax,[PARAM_DIVISOR]
57
bsf ecx,eax ; trailing twos
59
shr eax,cl ; d without twos
61
; eax becomes the byte index used for the invert-table lookups below.
shr eax,1 ; d/2 without twos
62
; Overwrites the divisor slot with edx. The instruction that sets edx is not
; in this excerpt; presumably edx holds the divisor with its trailing twos
; removed, since the slot is later used as "d" -- TODO confirm.
mov [PARAM_DIVISOR],edx
67
; Table lookup through _GLOBAL_OFFSET_TABLE_. This and the direct lookup
; further down both load the 8-bit inverse into ebp; presumably they are
; alternatives selected by a conditional-assembly guard that is not visible
; here -- TODO confirm.
add ebp,_GLOBAL_OFFSET_TABLE_
68
mov ebp,[___gmp_modlimb_invert_table+edx+ebp]
69
movzx ebp,byte [eax+ebp] ; inv 8 bits
71
; Direct lookup: zero-extend the table byte at index eax (d/2) into ebp.
movzx ebp,byte [___gmp_modlimb_invert_table+eax] ; inv 8 bits
74
; First refinement step, inv = 2*inv - inv*inv*d; result ends up in eax.
; Pointer adjustments are interleaved with the arithmetic.
lea eax,[ebp+ebp] ; 2*inv
75
imul ebp,ebp ; inv*inv
78
; esi and ebx are loaded outside this excerpt; presumably esi = src and
; ebx = size on entry to this line -- TODO confirm.
lea esi,[esi+ebx*4] ; src end
79
imul ebp,[PARAM_DIVISOR] ; inv*inv*d
80
sub eax,ebp ; inv = 2*inv - inv*inv*d
81
; Second refinement step, same formula; result ends up in ebp.
lea ebp,[eax+eax] ; 2*inv
82
imul eax,eax ; inv*inv
83
; edi is loaded outside this excerpt; presumably edi = dst here.
lea edi,[edi+ebx*4] ; dst end
86
imul eax,[PARAM_DIVISOR] ; inv*inv*d
87
sub ebp,eax ; inv = 2*inv - inv*inv*d
90
; With esi at the end of src, reading src[0] through ebx implies ebx was
; turned into a negative limb count by an instruction not shown here
; (consistent with the "counter, limbs, negative" register notes below).
mov eax,[esi+ebx*4] ; src[0]
93
; Lodd_entry is defined outside this excerpt.
jmp Lodd_entry ; ecx initial carry is zero
95
; The dependent chain here is
99
; mull PARAM_DIVISOR 5
103
; and this is the measured speed. No special scheduling is necessary;
104
; out-of-order execution hides the load latency.
106
; Register roles (as listed in the original; some entries are missing from
; this excerpt):
; eax scratch (src limb)
107
; ebx counter, limbs, negative
109
; edx carry limb, high of last product
114
; edx:eax = eax * [PARAM_DIVISOR]; the high half in edx is the carry limb
; noted above.
mul dword [PARAM_DIVISOR]
121
; eax = low 32 bits of eax * inverse.
imul eax,[VAR_INVERSE]
134
; ebx counter, limbs, negative
138
; Clear both carries before entering a loop.
xor ebp,ebp ; initial carry bit
139
xor edx,edx ; initial carry limb (for size==1)
142
; Uses edi as scratch for a source limb (see the edi note below).
mov edi,[esi+ebx*4] ; src[1]
147
; ebx counter, limbs, negative
151
; edi &dst[size] and scratch
156
mul dword [PARAM_DIVISOR]
157
; Load the source limb one dword below index ebx.
mov eax,[-4+esi+ebx*4]
165
imul eax,[VAR_INVERSE]
168
; Store the limb in eax to the matching dst position.
mov [-4+edi+ebx*4],eax
171
mul dword [PARAM_DIVISOR]
180
imul eax,[VAR_INVERSE]