2
; Intel P6 mpn_sqr_basecase -- square an mpn number.
4
; Copyright 1999,2000,2002 Free Software Foundation,Inc.
6
; This file is part of the GNU MP Library.
8
; The GNU MP Library is free software; you can redistribute it and/or
9
; modify it under the terms of the GNU Lesser General Public License as
10
; published by the Free Software Foundation; either version 2.1 of the
11
; License,or (at your option) any later version.
13
; The GNU MP Library is distributed in the hope that it will be useful,
14
; but WITHOUT ANY WARRANTY; without even the implied warranty of
15
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
; Lesser General Public License for more details.
18
; You should have received a copy of the GNU Lesser General Public
19
; License along with the GNU MP Library; see the file COPYING.LIB. If
20
; not,write to the Free Software Foundation,Inc.,59 Temple Place -
21
; Suite 330,Boston,MA 02111-1307,USA.
23
; P6: approx 4.0 cycles per cross product,or 7.75 cycles per triangular
24
; product (measured on the speed difference between 20 and 40 limbs,
25
; which is the Karatsuba recursing range).
27
; These are the same as in mpn/x86/k6/sqr_basecase.asm,see that file for
28
; a description. The only difference here is that UNROLL_COUNT can go up
29
; to 64 (not 63) making SQR_KARATSUBA_THRESHOLD_MAX 67.
31
; void mpn_sqr_basecase (mp_ptr dst,mp_srcptr src,mp_size_t size);
33
; The algorithm is basically the same as mpn/generic/sqr_basecase.c,but a
34
; lot of function call overheads are avoided,especially when the given size
37
; The code size might look a bit excessive,but not all of it is executed so
38
; it won't all get into the code cache. The 1x1,2x2 and 3x3 special cases
39
; clearly apply only to those sizes; mid sizes like 10x10 only need part of
40
; the unrolled addmul; and big sizes like 40x40 that do use the full
41
; unrolling will least be making good use of it,because 40x40 will take
42
; something like 7000 cycles.
44
%include "..\\x86i.inc"
46
%define SQR_KARATSUBA_THRESHOLD 10
47
%define SQR_KARATSUBA_THRESHOLD_MAX 67
49
%ifdef SQR_KARATSUBA_THRESHOLD_OVERRIDE
50
%define SQR_KARATSUBA_THRESHOLD SQR_KARATSUBA_THRESHOLD_OVERRIDE
53
%define UNROLL_COUNT SQR_KARATSUBA_THRESHOLD-3
60
global ___gmpn_sqr_basecase
94
%define STACK_SPACE 16
103
mov [ecx],eax ; dst[0]
113
mul dword [4+esi] ; src[0]*src[1]
173
mul dword [4+esi] ; src[0] * src[1]
177
mul dword [8+esi] ; src[0] * src[2]
182
mul dword [8+esi] ; src[1] * src[2]
190
; esi zero,will be dst[5]
216
adc eax,esi ; no carry out of this
230
%define VAR_COUNTER esp+frame-20
231
%define VAR_JMP esp+frame-24
232
%define STACK_SPACE 24
235
; First multiply src[0]*src[1..size-1] and store at dst[1..size].
238
sub esp,STACK_SPACE-frame
239
%define frame STACK_SPACE
244
sub ecx,edx ; -(size-1)
246
mov ebx,0 ; initial carry
247
lea esi,[esi+edx*4] ; &src[size]
248
mov ebp,eax ; multiplier
249
lea edi,[-4+edi+edx*4] ; &dst[size-1]
251
; This loop runs at just over 6 c/l.
255
; ecx counter,limbs,negative,-(size-1) to -1
263
mul dword [esi+ecx*4]
267
mov [4+edi+ecx*4],eax
272
; Addmul src[n]*src[n+1..size-1] at dst[2*n-1...],for each n=1..size-2.
274
; The last two addmuls,which are the bottom right corner of the product
275
; triangle,are left to the end. These are src[size-3]*src[size-2,size-1]
276
; and src[size-2]*src[size-1]. If size is 4 then it's only these corner
277
; cases that need to be done.
279
; The unrolled code is the same as mpn_addmul_1(),see that routine for some
282
; VAR_COUNTER is the outer loop,running from -(size-4) to -1,inclusive.
284
; VAR_JMP is the computed jump into the unrolled code,stepped by one code
285
; chunk each outer loop.
287
; This is also hard-coded in the address calculation below.
289
; With &src[size] and &dst[size-1] pointers,the displacements in the
290
; unrolled code fit in a byte for UNROLL_COUNT values up to 32,but above
291
; that an offset must be added to them.
301
%define CODE_BYTES_PER_LIMB 15
303
%if UNROLL_COUNT > 32
304
%define OFFSET UNROLL_COUNT-32
323
add ecx,Lunroll_inner_end-Lhere-2*CODE_BYTES_PER_LIMB
327
lea ecx,[Lunroll_inner_end-2*CODE_BYTES_PER_LIMB+ecx+edx]
335
; The calculated jump mustn't be before the start of the available
336
; code. This is the limit that UNROLL_COUNT puts on the src operand
337
; size,but checked here using the jump address directly.
340
mov eax,Lunroll_inner_start
342
jae Lunroll_outer_top
347
; ebx high limb to store
349
; edx VAR_COUNTER,limbs,negative
350
; esi &src[size],constant
351
; edi dst ptr,second highest limb of last addmul
356
mov ebp,[-12+OFFSET+esi+edx*4] ; multiplier
357
mov [VAR_COUNTER],edx
358
mov eax,[-8+OFFSET+esi+edx*4] ; first limb of multiplicand
361
%if UNROLL_COUNT % 2 == 1
368
mov ebx,edx ; high carry
371
mov ecx,eax ; low carry
372
lea edx,[CODE_BYTES_PER_LIMB+edx]
378
; Must be on an even address here so the low bit of the jump address
379
; will indicate which way around ecx/ebx should start.
389
; 15 code bytes each limb
390
; ecx/ebx reversed on each chunk
395
%assign i UNROLL_COUNT
397
%assign disp OFFSET-4*i
399
mov eax,[byte disp+esi]
401
add [byte disp+edi],ebx
406
; this one comes out last
407
mov eax,[byte disp+esi]
409
add [byte disp+edi],ecx
419
mov edx,[VAR_COUNTER]
421
mov [OFFSET+4+edi],ecx
424
jnz Lunroll_outer_top
461
; Left shift of dst[1..2*size-2],the bit shifted out becomes dst[2*size-1].
464
xor eax,eax ; ready for final adcl,and clear carry
470
; ecx counter,size-1 to 1
471
; edx size-1 (for later use)
472
; esi src (for later use)
473
; edi dst,incrementing
483
mov [4+edi],eax ; dst most significant limb
484
mov eax,[esi] ; src[0]
485
lea esi,[4+esi+edx*4] ; &src[size]
486
sub ecx,edx ; -(size-1)
488
; Now add in the squares on the diagonal,src[0]^2,src[1]^2,...,
489
; src[size-1]^2. dst[0] hasn't yet been set at all yet,and just gets the
490
; low limb of src[0]^2.
493
mov [edi+ecx*8],eax ; dst[0]
497
; ecx counter,negative
507
add [4+edi+ecx*8],ebx
508
adc [8+edi+ecx*8],eax
514
add [4+edi],edx ; dst most significant limb