2
; Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
4
; This file is part of the GNU MP Library.
6
; The GNU MP Library is free software; you can redistribute it and/or
7
; modify it under the terms of the GNU Lesser General Public License as
8
; published by the Free Software Foundation; either version 2.1 of the
9
; License, or (at your option) any later version.
11
; The GNU MP Library is distributed in the hope that it will be useful,
12
; but WITHOUT ANY WARRANTY; without even the implied warranty of
13
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
; Lesser General Public License for more details.
16
; You should have received a copy of the GNU Lesser General Public
17
; License along with the GNU MP Library; see the file COPYING.LIB. If
18
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
19
; Suite 330, Boston, MA 02111-1307, USA.
21
; Translation of AT&T syntax code by Brian Gladman
23
%include "..\\x86i.inc"
25
global ___gmpn_sqr_basecase
28
export ___gmpn_sqr_basecase
31
; UNROLL_COUNT sizes the unrolled addmul code: chunk generation starts at
; i = UNROLL_COUNT, and the value limits the src operand size that code
; can handle.  Values above 32 make OFFSET non-zero so that the chunk
; displacements still fit in a byte.
%define UNROLL_COUNT 64 ; seems to be maximum required (I hope!)
33
; Incoming arguments, addressed relative to esp.  `frame` is the number of
; bytes this routine has taken off esp so far, so esp+frame is the stack
; pointer as it was on entry, and these names keep addressing the same
; memory when `frame` is redefined after a stack adjustment.
; NOTE(review): +4/+8/+12 presumes 32-bit stack-passed arguments with the
; return address at esp+frame -- confirm against the callers.
%define PARAM_SIZE esp+frame+12
34
%define PARAM_SRC esp+frame+8
35
%define PARAM_DST esp+frame+4
67
; Slots just below the entry stack pointer, named for the registers they
; are meant to preserve (esi, ebx, edi, ebp); 16 bytes in total.
%define SAVE_ESI esp+frame-4
68
%define SAVE_EBX esp+frame-8
69
%define SAVE_EDI esp+frame-12
70
%define SAVE_EBP esp+frame-16
79
mov [ecx],eax ; dst[0]
89
mul dword [4+esi] ; src[0]*src[1]
143
mul dword [4+esi] ; src[0] * src[1]
147
mul dword [8+esi] ; src[0] * src[2]
152
mul dword [8+esi] ; src[1] * src[2]
160
; esi zero,will be dst[5]
186
adc eax,esi ; no carry out of this
199
; First multiply src[0]*src[1..size-1] and store at dst[1..size].
201
; Two extra locals placed below the SAVE_* register slots:
;   VAR_COUNTER  outer loop counter, running from -(size-4) to -1
;   VAR_JMP      computed jump address into the unrolled code
; STACK_SPACE is the whole frame: 16 bytes of SAVE_* slots plus these two
; dwords.
%define VAR_COUNTER esp+frame-20
202
%define VAR_JMP esp+frame-24
203
%define STACK_SPACE 24
206
; `frame` bytes are already allocated, so only the difference is taken off
; esp here.  `frame` is then redefined so that every esp+frame-relative
; name (PARAM_*, SAVE_*, VAR_*) keeps addressing the same memory.
sub esp,STACK_SPACE-frame
207
%define frame STACK_SPACE
212
sub ecx,edx ; -(size-1)
214
mov ebx,0 ; initial carry
215
lea esi,[esi+edx*4] ; &src[size]
216
mov ebp,eax ; multiplier
217
lea edi,[-4+edi+edx*4] ; &dst[size-1]
219
; This loop runs at just over 6 c/l.
222
; ecx counter,limbs,negative,-(size-1) to -1
230
mul dword [esi+ecx*4]
234
mov [4+edi+ecx*4],eax
239
; Addmul src[n]*src[n+1..size-1] at dst[2*n-1...],for each n=1..size-2.
241
; The last two addmuls,which are the bottom right corner of the product
242
; triangle,are left to the end. These are src[size-3]*src[size-2,size-1]
243
; and src[size-2]*src[size-1]. If size is 4 then it's only these corner
244
; cases that need to be done.
246
; The unrolled code is the same as mpn_addmul_1(),see that routine for some
249
; VAR_COUNTER is the outer loop,running from -(size-4) to -1,inclusive.
251
; VAR_JMP is the computed jump into the unrolled code,stepped by one code
252
; chunk each outer loop.
254
; This is also hard-coded in the address calculation below.
256
; With &src[size] and &dst[size-1] pointers,the displacements in the
257
; unrolled code fit in a byte for UNROLL_COUNT values up to 32,but above
258
; that an offset must be added to them.
268
%define CODE_BYTES_PER_LIMB 15
269
%if UNROLL_COUNT > 32
270
%define OFFSET 4*(UNROLL_COUNT-32)
288
add ecx,Lunroll_inner_end-Lhere-(2*CODE_BYTES_PER_LIMB)
292
lea ecx,[Lunroll_inner_end-2*CODE_BYTES_PER_LIMB+ecx+edx]
299
; The calculated jump mustn't be before the start of the available
300
; code. This is the limit that UNROLL_COUNT puts on the src operand
301
; size,but checked here using the jump address directly.
303
; ASSERT(ae): ecx (the calculated jump address) must be above or equal to
; Lunroll_inner_start, i.e. `cmp ecx,eax` with eax = Lunroll_inner_start.
306
mov eax,Lunroll_inner_start
308
jae Lunroll_outer_top
313
; ebx high limb to store
315
; edx VAR_COUNTER,limbs,negative
316
; esi &src[size],constant
317
; edi dst ptr,second highest limb of last addmul
320
%if UNROLL_COUNT % 2 == 1
328
mov ebp,[-12+OFFSET+esi+edx*4] ; multiplier
329
mov [VAR_COUNTER],edx
330
mov eax,[-8+OFFSET+esi+edx*4] ; first limb of multiplicand
333
mov ebx,edx ; high carry
336
mov ecx,eax ; low carry
337
lea edx,[CODE_BYTES_PER_LIMB+edx]
343
; Must be on an even address here so the low bit of the jump address
344
; will indicate which way around ecx/ebx should start.
354
; 15 code bytes each limb
355
; ecx/ebx reversed on each chunk
361
%assign i UNROLL_COUNT
363
%assign disp_src OFFSET-4*i
364
%assign disp_dst disp_src
365
; m4_assert(disp_src>=-128 && disp_src<128)
366
; m4_assert(disp_dst>=-128 && disp_dst<128)
368
mov eax,[byte disp_src+esi]
371
add [byte disp_dst+edi],ebx
376
add [byte disp_dst+edi],ecx
386
mov edx,[VAR_COUNTER]
388
mov [OFFSET+4+edi],ecx
391
jnz Lunroll_outer_top
429
; Left shift of dst[1..2*size-2],the bit shifted out becomes dst[2*size-1].
432
xor eax,eax ; ready for final adcl,and clear carry
438
; ecx counter,size-1 to 1
439
; edx size-1 (for later use)
440
; esi src (for later use)
441
; edi dst,incrementing
451
mov [4+edi],eax ; dst most significant limb
452
mov eax,[esi] ; src[0]
453
lea esi,[4+esi+edx*4] ; &src[size]
454
sub ecx,edx ; -(size-1)
456
; Now add in the squares on the diagonal,src[0]^2,src[1]^2,...,
457
; src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the
458
; low limb of src[0]^2.
461
mov [edi+ecx*8],eax ; dst[0]
465
; ecx counter,negative
475
add [4+edi+ecx*8],ebx
476
adc [8+edi+ecx*8],eax
482
add [4+edi],edx ; dst most significant limb