2
; AMD64 mpn_addmul_1/mpn_submul_1 -- multiply a limb vector by a limb and add to / subtract from a second limb vector.
4
; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
6
; This file is part of the GNU MP Library.
8
; The GNU MP Library is free software; you can redistribute it and/or
9
; modify it under the terms of the GNU Lesser General Public License as
10
; published by the Free Software Foundation; either version 2.1 of the
11
; License, or (at your option) any later version.
13
; The GNU MP Library is distributed in the hope that it will be useful,
14
; but WITHOUT ANY WARRANTY; without even the implied warranty of
15
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
; Lesser General Public License for more details.
18
; You should have received a copy of the GNU Lesser General Public
19
; License along with the GNU MP Library; see the file COPYING.LIB. If
20
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
21
; Suite 330, Boston, MA 02111-1307, USA.
23
; Adapted by Brian Gladman for AMD64 using the Microsoft VC++ v8 64-bit
24
; compiler and the YASM assembler.
28
; mp_limb_t __gmpn_<op>mul_1( <op> = add or sub
35
; mp_limb_t __gmpn_<op>mul_1c(
40
; mp_limb_t carry [rsp+0x28]
43
; Calculate src[size] multiplied by mult[1] and add to / subtract from dst[size] and
44
; return the carry or borrow from the top of the result
53
; Unrolling parameters. UNROLL_LOG2 is defined outside this excerpt.
; UNROLL_COUNT: number of limbs covered by one pass of the unrolled code.
%define UNROLL_COUNT (1 << UNROLL_LOG2)
54
; UNROLL_MASK: UNROLL_COUNT - 1, i.e. the low UNROLL_LOG2 bits set.
; NOTE(review): its use is not visible in this excerpt.
%define UNROLL_MASK (UNROLL_COUNT - 1)
55
; UNROLL_BYTES: bytes covered by one pass (8 bytes per limb); src and dst are
; advanced by this amount at the end of each pass.
%define UNROLL_BYTES (8 * UNROLL_COUNT)
56
; UNROLL_THRESHOLD: len is compared against this value; above it the unrolled
; code is used, otherwise the simple one-limb-per-iteration loop.
%define UNROLL_THRESHOLD 9
58
; Assembly-time sanity check on the unroll size: 256 bytes or more is rejected.
; NOTE(review): presumably because the unrolled code uses byte-sized
; displacements ([byte dst+disp0]) -- confirm.
%if UNROLL_BYTES >= 256
59
%error unroll count is too large
60
; NOTE(review): the body of this branch and the closing directive are not
; visible in this excerpt.
%elif UNROLL_BYTES >= 128
80
; Entry path without a carry-in (carry starts at zero).
; The register aliases src, cry, len, mlt and dst are defined outside this
; excerpt. The source pointer arrives in rdx and is copied away first because
; mul overwrites rdx with the high half of the product.
mov src,rdx ; source ptr
81
xor cry,cry ; carry = 0
82
; len becomes size - 1; ZF is set only when there was exactly one limb.
dec len ; test for one limb only
83
; The branch target (macro-local label 0) is not visible in this excerpt.
jnz %%0 ; if more than one
84
; Single-limb case: multiply the only source limb by mlt.
mov rax,[src] ; get limb value
85
mul mlt ; rax * mlt -> rdx (hi), rax (lo)
86
; Macro parameter 2 is the operation (add or sub); it applies the low half of
; the product to the destination limb and leaves the carry/borrow in CF.
%2 [dst],rax ; add/sub from destination
87
; Fold that carry/borrow into the high half, which is the outgoing carry limb.
adc rdx,byte 0 ; add any carry into high word
88
mov rax,rdx ; and return the carry value
92
; Entry path with a caller-supplied carry-in.
; The source pointer arrives in rdx and is copied away because mul overwrites
; rdx with the high half of the product.
mov src,rdx ; source pointer
93
; The carry argument is read from the stack slot given in the prototype
; comment at the top of the file ([rsp+0x28]).
mov cry,[rsp+0x28] ; carry value
94
; len becomes size - 1; ZF is set only when there was exactly one limb.
dec len ; test for one limb
95
; The branch target (macro-local label 0) is not visible in this excerpt.
jnz %%0 ; if more than one
96
; Single-limb case: multiply the only source limb by mlt.
mov rax,[src] ; get limb value
97
mul mlt ; rax * mlt -> rdx (hi), rax (lo)
98
; Add the incoming carry to the low half and ripple any overflow into the
; high half before touching the destination.
add rax,cry ; add in input carry
99
adc rdx,byte 0 ; propagate it into rdx
100
; Macro parameter 2 is the operation (add or sub); CF afterwards holds the
; carry/borrow out of the destination limb.
%2 [dst],rax ; add or subtract rax from dest limb
101
; rdx now holds the outgoing carry limb.
; NOTE(review): the instructions that return it are not visible in this excerpt.
adc rdx,byte 0 ; propagate carry into high word
106
; Multi-limb path: choose between the simple loop and the unrolled code.
cmp len,byte UNROLL_THRESHOLD
107
; mov does not modify flags, so the ja below still tests the cmp above.
mov rax,[src] ; first limb of source
108
; The branch target (macro-local label 2) is not visible in this excerpt.
ja %%2 ; unroll for many limbs
109
; Simple loop setup: rebase both pointers by len limbs so that the loop can
; index them with len.
; NOTE(review): the [..+len*8] addressing combined with "inc len" below
; suggests len is negated and counts up towards zero; the instruction doing
; that and the loop label are not visible in this excerpt -- confirm.
lea src,[src+len*8+8] ; next source limb
110
lea dst,[dst+len*8] ; current dst limb
113
; Loop body: one source limb per iteration, rax already holds the limb.
mul mlt ; multiply current src limb -> rdx, rax
114
add rax,cry ; add in carry
115
adc rdx,byte 0 ; propagate carry into rdx
116
; Macro parameter 2 is the operation (add or sub); CF holds its carry/borrow.
%2 [dst+len*8],rax ; add or subtract rax from dest limb
117
; The load is placed between the add/sub and the adc; mov leaves CF intact.
mov rax,[src+len*8] ; get next source limb
118
adc rdx,byte 0 ; add carry or borrow into high word
119
; inc sets ZF for the loop test without modifying CF.
; NOTE(review): the conditional branch back to the loop top is not visible
; in this excerpt.
inc len ; go to next limb
120
mov cry,rdx ; high word -> carry
122
; After the loop: the final limb is already loaded in rax.
; NOTE(review): the add/sub steps for this last limb are not visible in this
; excerpt.
mul mlt ; one more limb to do
127
mov rax,rdx ; return carry value as a limb
130
; Extra register aliases used only by the unrolled code.
; NOTE(review): rbx, rbp and rsi are callee-saved under the Microsoft x64
; calling convention; their save/restore is not visible in this excerpt --
; confirm it exists.
%define jmp_val rbp ; jump into code sequence
131
%define rep_cnt rbx ; repeats for full sequence
132
%define cry_hi rsi ; second carry for alternate block
140
; rep_cnt >>= UNROLL_LOG2, i.e. divided by UNROLL_COUNT.
; NOTE(review): the instructions that initialise rep_cnt and jmp_val are not
; visible in this excerpt.
shr rep_cnt,UNROLL_LOG2
144
; cry_hi and jmp_val are borrowed as scratch to build a byte offset into the
; unrolled code: cry_hi * 16 + jmp_val * 4.
mov cry_hi,len ; cry_hi and jmp_val are temporary
145
shl jmp_val,2 ; values for calculating the jump
146
shl cry_hi,4 ; offset into the unrolled code
148
; Form the entry address: label 3 plus the offset computed above.
; NOTE(review): the next three lea instructions (rip-relative form) and the
; single lea after them (absolute form) look like two alternatives selected
; by conditional-assembly directives that are not visible here -- confirm.
lea cry_hi,[cry_hi+jmp_val]
149
lea jmp_val,[%%3 wrt rip]
150
lea jmp_val,[jmp_val+cry_hi]
152
lea jmp_val,[cry_hi+jmp_val+%%3]
156
add cry,rax ; initial carry, becomes low carry
160
mov rax,[src+8] ; src second limb
161
; Rebase src and dst by len limbs plus the bias "off" ("off" is defined
; outside this excerpt).
lea src,[src+len*8+off+16]
162
lea dst,[dst+len*8+off]
163
; mov and lea do not modify flags, so this tests flags set by an earlier
; instruction.
; NOTE(review): which instruction sets them is not visible in this excerpt.
cmovnz cry_hi,cry ; high, low carry other way around
169
; Each repetition of the unrolled body handles CHUNK_COUNT limbs, alternating
; between the two carry registers cry and cry_hi.
%define CHUNK_COUNT 2
171
%rep UNROLL_COUNT / CHUNK_COUNT
172
; disp0 is the byte displacement of this chunk's first limb, with the "off"
; bias removed again ("i" is maintained outside this excerpt).
%assign disp0 8 * i * CHUNK_COUNT - off
175
; First limb of the chunk: apply the low carry word to the destination.
%2 [byte dst+disp0],cry
176
; Zero cry with mov rather than xor: mov leaves CF from the add/sub intact.
; Relies on len being zero at this point, as the original comment states.
mov cry,len ; len = 0
178
mov rax,[byte src+disp0]
181
; Second limb of the chunk: same pattern using the other carry register.
%2 [byte dst+disp0+8],cry_hi
182
mov cry_hi,len ; len = 0
184
mov rax,[byte src+disp0+8]
191
; Advance both pointers by one full unrolled pass; lea does not modify flags.
lea src,[src+UNROLL_BYTES]
192
lea dst,[dst+UNROLL_BYTES]
211
; Instantiate the code template twice. The second argument is the instruction
; substituted at every macro-parameter-2 site: add for addmul, sub for submul.
; NOTE(review): the first argument looks like a symbol prefix and the last two
; like the names of the no-carry and carry-in entry points; the macro header
; is not visible in this excerpt -- confirm.
mac_sub __g,add,mpn_addmul_1,mpn_addmul_1c
212
mac_sub __g,sub,mpn_submul_1,mpn_submul_1c