1
dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
3
dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
5
dnl This file is part of the GNU MP Library.
7
dnl The GNU MP Library is free software; you can redistribute it and/or
8
dnl modify it under the terms of the GNU Lesser General Public License as
9
dnl published by the Free Software Foundation; either version 2.1 of the
10
dnl License, or (at your option) any later version.
12
dnl The GNU MP Library is distributed in the hope that it will be useful,
13
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
dnl Lesser General Public License for more details.
17
dnl You should have received a copy of the GNU Lesser General Public
18
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20
dnl Suite 330, Boston, MA 02111-1307, USA.
22
include(`../config.m4')
25
C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
26
C product (measured on the speed difference between 20 and 40 limbs,
27
C which is the Karatsuba recursing range).
30
dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
31
dnl a description. The only difference here is that UNROLL_COUNT can go up
32
dnl to 64 (not 63) making SQR_KARATSUBA_THRESHOLD_MAX 67.
34
dnl  Hard upper limit on the threshold this unrolled code can support.
deflit(SQR_KARATSUBA_THRESHOLD_MAX, 67)
36
dnl  Allow a tuning/testing override of the threshold before falling back
dnl  to the value configured in gmp-mparam.h.
ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE',
`define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)')
37
39
m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD')
40
dnl  UNROLL_COUNT = SQR_KARATSUBA_THRESHOLD-3, per the scheme described in
dnl  mpn/x86/k6/sqr_basecase.asm (referenced in the comments above).
deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3))
43
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
45
C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
46
C lot of function call overheads are avoided, especially when the given size
49
C The code size might look a bit excessive, but not all of it is executed so
50
C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
51
C clearly apply only to those sizes; mid sizes like 10x10 only need part of
52
C the unrolled addmul; and big sizes like 40x40 that do use the full
53
C unrolling will at least be making good use of it, because 40x40 will take
54
C something like 7000 cycles.
56
dnl  Incoming stack parameters, matching the C prototype above: cdecl
dnl  right-to-left push order puts dst at 4(%esp), src at 8(%esp) and
dnl  size at 12(%esp) on entry (return address at 0).
defframe(PARAM_SIZE,12)
57
defframe(PARAM_SRC, 8)
58
defframe(PARAM_DST, 4)
62
PROLOGUE(mpn_sqr_basecase)
77
C -----------------------------------------------------------------------------
92
C -----------------------------------------------------------------------------
99
defframe(SAVE_ESI, -4)
100
defframe(SAVE_EBX, -8)
101
defframe(SAVE_EDI, -12)
102
defframe(SAVE_EBP, -16)
103
deflit(`STACK_SPACE',16)
105
subl $STACK_SPACE, %esp
106
deflit(`FRAME',STACK_SPACE)
114
movl %eax, (%ecx) C dst[0]
118
movl %edx, %ebx C dst[1]
123
movl %eax, %edi C dst[2]
127
movl %edx, %ebp C dst[3]
129
mull 4(%esi) C src[0]*src[1]
156
C -----------------------------------------------------------------------------
164
pushl %esi defframe_pushl(`SAVE_ESI')
171
C -----------------------------------------------------------------------------
182
pushl %ebp defframe_pushl(`SAVE_EBP')
183
pushl %edi defframe_pushl(`SAVE_EDI')
185
mull %eax C src[0] ^ 2
193
mull %eax C src[1] ^ 2
199
pushl %ebx defframe_pushl(`SAVE_EBX')
201
mull %eax C src[2] ^ 2
208
mull 4(%esi) C src[0] * src[1]
215
mull 8(%esi) C src[0] * src[2]
223
mull 8(%esi) C src[1] * src[2]
232
C esi zero, will be dst[5]
270
adcl %esi, %eax C no carry out of this
280
C -----------------------------------------------------------------------------
281
defframe(VAR_COUNTER,-20)
282
defframe(VAR_JMP, -24)
283
deflit(`STACK_SPACE',24)
293
deflit(`FRAME',4) dnl %esi already pushed
295
C First multiply src[0]*src[1..size-1] and store at dst[1..size].
297
subl $STACK_SPACE-FRAME, %esp
298
deflit(`FRAME',STACK_SPACE)
305
subl %edx, %ecx C -(size-1)
308
movl $0, %ebx C initial carry
310
leal (%esi,%edx,4), %esi C &src[size]
311
movl %eax, %ebp C multiplier
313
leal -4(%edi,%edx,4), %edi C &dst[size-1]
316
C This loop runs at just over 6 c/l.
321
C ecx counter, limbs, negative, -(size-1) to -1
335
movl %eax, 4(%edi,%ecx,4)
344
C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
346
C The last two addmuls, which are the bottom right corner of the product
347
C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
348
C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
349
C cases that need to be done.
351
C The unrolled code is the same as mpn_addmul_1(), see that routine for some
354
C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
356
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
357
C chunk each outer loop.
359
dnl This is also hard-coded in the address calculation below.
360
deflit(CODE_BYTES_PER_LIMB, 15)
362
dnl With &src[size] and &dst[size-1] pointers, the displacements in the
363
dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
364
dnl that an offset must be added to them.
366
ifelse(eval(UNROLL_COUNT>32),1,
367
eval((UNROLL_COUNT-32)*4),
378
movl PARAM_SIZE, %ecx
387
ifelse(OFFSET,0,,`subl $OFFSET, %esi')
393
leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
397
ifelse(OFFSET,0,,`subl $OFFSET, %edi')
399
C The calculated jump mustn't be before the start of the available
400
C code. This is the limit that UNROLL_COUNT puts on the src operand
401
C size, but checked here using the jump address directly.
404
`movl_text_address( L(unroll_inner_start), %eax)
408
C -----------------------------------------------------------------------------
412
C ebx high limb to store
414
C edx VAR_COUNTER, limbs, negative
415
C esi &src[size], constant
416
C edi dst ptr, second highest limb of last addmul
419
movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
420
movl %edx, VAR_COUNTER
422
movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
426
define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
430
movl %edx, %ebx C high carry
433
movl %ecx, %edx C jump
435
movl %eax, %ecx C low carry
436
leal CODE_BYTES_PER_LIMB(%edx), %edx
438
cmovX( %ebx, %ecx) C high carry reverse
439
cmovX( %eax, %ebx) C low carry reverse
444
C Must be on an even address here so the low bit of the jump address
445
C will indicate which way around ecx/ebx should start.
449
L(unroll_inner_start):
458
C 15 code bytes each limb
459
C ecx/ebx reversed on each chunk
461
forloop(`i', UNROLL_COUNT, 1, `
462
deflit(`disp_src', eval(-i*4 + OFFSET))
463
deflit(`disp_dst', eval(disp_src))
465
m4_assert(`disp_src>=-128 && disp_src<128')
466
m4_assert(`disp_dst>=-128 && disp_dst<128')
469
Zdisp( movl, disp_src,(%esi), %eax)
471
Zdisp( addl, %ebx, disp_dst,(%edi))
476
dnl this one comes out last
477
Zdisp( movl, disp_src,(%esi), %eax)
479
Zdisp( addl, %ecx, disp_dst,(%edi))
487
addl %ebx, m4_empty_if_zero(OFFSET)(%edi)
489
movl VAR_COUNTER, %edx
492
movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi)
496
jnz L(unroll_outer_top)
505
C -----------------------------------------------------------------------------
540
movl PARAM_SIZE, %ecx
551
C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
553
subl $1, %ecx C size-1
554
xorl %eax, %eax C ready for final adcl, and clear carry
563
C ecx counter, size-1 to 1
564
C edx size-1 (for later use)
565
C esi src (for later use)
566
C edi dst, incrementing
579
movl %eax, 4(%edi) C dst most significant limb
580
movl (%esi), %eax C src[0]
582
leal 4(%esi,%edx,4), %esi C &src[size]
583
subl %edx, %ecx C -(size-1)
586
C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
587
C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
588
C low limb of src[0]^2.
593
movl %eax, (%edi,%ecx,8) C dst[0]
599
C ecx counter, negative
605
movl (%esi,%ecx,4), %eax
610
addl %ebx, 4(%edi,%ecx,8)
611
adcl %eax, 8(%edi,%ecx,8)
621
addl %edx, 4(%edi) C dst most significant limb
630
C -----------------------------------------------------------------------------
634
addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx