1
dnl Intel P5 mpn_lshift -- mpn left shift.
3
dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
5
dnl This file is part of the GNU MP Library.
7
dnl The GNU MP Library is free software; you can redistribute it and/or
8
dnl modify it under the terms of the GNU Lesser General Public License as
9
dnl published by the Free Software Foundation; either version 2.1 of the
10
dnl License, or (at your option) any later version.
12
dnl The GNU MP Library is distributed in the hope that it will be useful,
13
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
dnl Lesser General Public License for more details.
17
dnl You should have received a copy of the GNU Lesser General Public
18
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20
dnl Suite 330, Boston, MA 02111-1307, USA.
22
include(`../config.m4')
25
C P5: 1.75 cycles/limb.
28
C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift);
31
C Shift src,size left by shift many bits and store the result in dst,size.
32
C Zeros are shifted in at the right. Return the bits shifted out at the left.
35
C The comments in mpn_rshift apply here too.
37
C Argument frame: each defframe gives one mpn_lshift argument a PARAM_ name.
C The numbers rise in steps of 4 in the order dst, src, size, shift -- the
C same order in which the prototype comment lists dst, src and size.
C NOTE(review): presumably these are byte offsets from the stack pointer at
C function entry; confirm against the defframe definition.
defframe(PARAM_SHIFT,16)
38
defframe(PARAM_SIZE, 12)
39
defframe(PARAM_SRC, 8)
40
defframe(PARAM_DST, 4)
43
dnl minimum 5, because the unrolled loop can't handle less
44
C UNROLL_THRESHOLD is used as an immediate in a cmp against %eax in the
C code that follows.
deflit(UNROLL_THRESHOLD, 5)
59
movl PARAM_SHIFT, %ecx
61
cmp $UNROLL_THRESHOLD, %eax
64
movl -4(%ebx,%eax,4), %edi C src high limb
69
shldl( %cl, %edi, %eax) C eax was decremented to zero
73
movl %edi, (%edx) C dst low limb
74
popl %edi C risk of data cache bank clash
81
C -----------------------------------------------------------------------------
92
movd (%ebx,%eax,4), %mm5 C src high limb
94
movd %ecx, %mm6 C lshift
101
psrlq $32, %mm5 C retval
105
C eax counter, limbs, negative
117
movq -4(%ebx,%eax,4), %mm0
124
movd %mm0, 4(%edx,%eax,4)
143
C -----------------------------------------------------------------------------
155
movd -4(%ebx,%eax,4), %mm5 C src high limb
156
leal (%ebx,%eax,4), %edi
158
movd %ecx, %mm6 C lshift
162
jz L(start_src_aligned)
165
C src isn't aligned, process high limb separately (marked xxx) to make it so.
168
C source -8(%ebx,%eax,4)
170
C +-------+-------+-------+--
172
C +-------+-------+-------+--
178
C +-------+-------+--
180
C +-------+-------+--
182
movq -8(%ebx,%eax,4), %mm0 C unaligned load
191
movd %mm0, (%edx,%eax,4)
192
L(start_src_aligned):
194
movq -8(%ebx,%eax,4), %mm1 C src high qword
195
leal (%edx,%eax,4), %edi
198
psrlq $32, %mm5 C return value
200
movq -16(%ebx,%eax,4), %mm3 C src second highest qword
201
jz L(start_dst_aligned)
203
C dst isn't aligned, subtract 4 to make it so, and pretend the shift
204
C is 32 bits extra. High limb of dst (marked xxx) handled here separately.
207
C source -8(%ebx,%eax,4)
209
C +-------+-------+--
211
C +-------+-------+--
217
C +-------+-------+-------+--
219
C +-------+-------+-------+--
223
addl $32, %ecx C new shift
230
C wasted cycle here waiting for %mm0
232
movd %mm0, -4(%edx,%eax,4)
234
L(start_dst_aligned):
240
addl $64, %ecx C 64-shift
244
subl $8, %eax C size-8
248
por %mm1, %mm3 C mm3 ready to store
252
C The comments in mpn_rshift apply here too.
265
C mm2 src qword from 16(%ebx,%eax,4)
266
C mm3 dst qword ready to store to 24(%edx,%eax,4)
272
movq 8(%ebx,%eax,4), %mm0
278
movq %mm3, 24(%edx,%eax,4) C prev
281
movq (%ebx,%eax,4), %mm3 C
284
movq %mm0, 16(%edx,%eax,4)
296
C eax -4 to -1 representing respectively 0 to 3 limbs remaining
302
movq 8(%ebx,%eax,4), %mm0
308
movq %mm3, 24(%edx,%eax,4) C prev
318
C eax -4 or -3 representing respectively 0 or 1 limbs remaining
320
C mm2 src prev qword, from 16(%ebx,%eax,4)
321
C mm3 dst qword, for 24(%edx,%eax,4)
324
movd %mm5, %eax C retval
330
C One extra src limb, destination was aligned.
333
C --+---------------+-------+
335
C --+---------------+-------+
337
C dest edx+12 edx+4 edx
338
C --+---------------+---------------+-------+
340
C --+---------------+---------------+-------+
343
C mm7 = ecx = 64-shift
346
C One extra src limb, destination was unaligned.
349
C --+---------------+-------+
351
C --+---------------+-------+
354
C --+---------------+---------------+
356
C --+---------------+---------------+
359
C mm7 = ecx = 64-(shift+32)
362
C In both cases there's one extra limb of src to fetch and combine
363
C with mm2 to make a qword at 4(%edx), and in the aligned case
364
C there's an extra limb of dst to be formed from that extra src limb left shifted.
386
jz L(finish_one_unaligned)
389
L(finish_one_unaligned):
398
C No extra src limbs, destination was aligned.
401
C --+---------------+
403
C --+---------------+
406
C --+---------------+---------------+
408
C --+---------------+---------------+
411
C mm7 = ecx = 64-shift
414
C No extra src limbs, destination was unaligned.
417
C --+---------------+
419
C --+---------------+
422
C --+---------------+-------+
424
C --+---------------+-------+
427
C mm7 = ecx = 64-(shift+32)
430
C The movd for the unaligned case writes the same data to 4(%edx)
431
C that the movq does for the aligned case.
438
jz L(finish_zero_unaligned)
441
L(finish_zero_unaligned):
446
movd %mm5, %eax C retval