1
dnl Intel Pentium-4 mpn_sqr_basecase -- square an mpn number.
3
dnl P4: approx 3.5 cycles per crossproduct, or 7 cycles per triangular
4
dnl product, at around 30x30 limbs.
6
dnl Copyright 2001 Free Software Foundation, Inc.
8
dnl This file is part of the GNU MP Library.
10
dnl The GNU MP Library is free software; you can redistribute it and/or
11
dnl modify it under the terms of the GNU Lesser General Public License as
12
dnl published by the Free Software Foundation; either version 2.1 of the
13
dnl License, or (at your option) any later version.
15
dnl The GNU MP Library is distributed in the hope that it will be useful,
16
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18
dnl Lesser General Public License for more details.
20
dnl You should have received a copy of the GNU Lesser General Public
21
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23
dnl Suite 330, Boston, MA 02111-1307, USA.
25
include(`../config.m4')
28
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
30
C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
31
C lot of function call overheads are avoided, especially when the size is
C small.
34
C On small sizes there's only a small speedup over mpn_mul_basecase,
35
C presumably branch mispredictions are a bigger fraction of the work done.
36
C It's not clear how to help this.
38
defframe(PARAM_SIZE,12)
39
defframe(PARAM_SRC, 8)
40
defframe(PARAM_DST, 4)
44
PROLOGUE(mpn_sqr_basecase)
56
C -----------------------------------------------------------------------------
71
C -----------------------------------------------------------------------------
80
pmuludq %mm1, %mm0 C src[0]*src[1]
82
pmuludq %mm1, %mm1 C src[0]^2
85
pmuludq %mm2, %mm2 C src[1]^2
87
movd %mm1, (%ecx) C dst[0]
91
psrlq $32, %mm3 C 0x00000000FFFFFFFF
92
pand %mm0, %mm3 C low(src[0]*src[1])
93
psrlq $32, %mm0 C high(src[0]*src[1])
95
psllq $1, %mm3 C 2*low(src[0]*src[1])
96
paddq %mm3, %mm1 C high(src[0]^2)
97
movd %mm1, 4(%ecx) C dst[1]
100
psrlq $32, %mm4 C 0x00000000FFFFFFFF
101
pand %mm2, %mm4 C low(src[1]^2)
102
psrlq $32, %mm2 C high(src[1]^2)
104
psllq $1, %mm0 C 2*high(src[0]*src[1])
105
psrlq $32, %mm1 C carry
107
paddq %mm4, %mm0 C low(src[1]^2)
108
movd %mm0, 8(%ecx) C dst[2]
110
psrlq $32, %mm0 C carry
111
paddq %mm2, %mm0 C high(src[1]^2)
112
movd %mm0, 12(%ecx) C dst[3]
123
C -----------------------------------------------------------------------------
134
C First multiply src[0]*src[1..size-1] and store at dst[1..size].
136
defframe(SAVE_ESI, -4)
137
defframe(SAVE_EDI, -8)
138
defframe(SAVE_EBP, -12)
139
deflit(STACK_SPACE, 12)
141
subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
142
pxor %mm0, %mm0 C initial carry
143
movd (%eax), %mm7 C multiplier
154
C First multiply src[0]*src[1..size-1] and store at dst[1..size].
156
C eax src, incrementing
158
C ecx dst, incrementing
159
C edx counter, size-1 iterations
178
movl PARAM_SIZE, %ebp
183
C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
184
C n=1..size-2. The last two products, which are the end corner of
185
C the product triangle, are handled separately to save looping
C overhead.
193
C esi src, incrementing
194
C edi dst, incrementing
195
C ebp size, decrementing
199
movd 4(%esi), %mm7 C multiplier
200
movd %mm0, 4(%ecx) C prev carry
202
leal 8(%esi), %eax C next src
205
leal 8(%edi), %ecx C next dst
208
leal 1(%ebp), %edx C counter
210
pxor %mm0, %mm0 C initial carry limb (MMX pxor leaves EFLAGS untouched)
213
C eax src, incrementing
215
C ecx dst, incrementing
248
C | | | src[size-2]*src[size-1]
253
pmuludq %mm2, %mm1 C src[size-1]*src[size-2]
257
pmuludq %mm2, %mm2 C src[0]^2
263
movd 4(%edx), %mm3 C dst[1]
266
movd %mm0, 12(%edi) C dst[2*size-3]
269
movd %mm0, 16(%edi) C dst[2*size-2]
271
movd %mm2, (%edx) C dst[0]
274
psllq $1, %mm3 C 2*dst[1]
279
movl PARAM_SIZE, %ecx
282
C Now form squares on the diagonal src[0]^2,...,src[size-1]^2, and
283
C add to the triangular parts dst[1..2*size-2] with those left
C shifted by 1 bit.
287
C eax src, incrementing
289
C ecx counter, size-2 iterations
290
C edx dst, incrementing
296
C mm7 0x00000000FFFFFFFF
298
movd 4(%eax), %mm0 C src limb
302
pand %mm0, %mm1 C diagonal low
303
psrlq $32, %mm0 C diagonal high
306
psllq $1, %mm3 C 2*dst[i]
313
psllq $1, %mm3 C 2*dst[i+1]
324
movd 4(%eax), %mm0 C src[size-1]
326
pand %mm0, %mm7 C diagonal low
327
psrlq $32, %mm0 C diagonal high
329
movd 8(%edx), %mm3 C dst[2*size-2]
337
movd %mm2, 12(%edx) C dst[2*size-1]
339
ASSERT(z,` C no further carry
348
addl $STACK_SPACE, %esp