1
dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
4
dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
6
dnl This file is part of the GNU MP Library.
8
dnl The GNU MP Library is free software; you can redistribute it and/or
9
dnl modify it under the terms of the GNU Lesser General Public License as
10
dnl published by the Free Software Foundation; either version 2.1 of the
11
dnl License, or (at your option) any later version.
13
dnl The GNU MP Library is distributed in the hope that it will be useful,
14
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
dnl Lesser General Public License for more details.
18
dnl You should have received a copy of the GNU Lesser General Public
19
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
20
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
21
dnl Suite 330, Boston, MA 02111-1307, USA.
23
include(`../config.m4')
26
C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
29
C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
30
C mp_srcptr src, mp_size_t size,
32
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
33
C mp_srcptr src, mp_size_t size,
34
C mp_limb_t divisor, mp_limb_t carry);
35
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
36
C mp_srcptr src, mp_size_t size,
37
C mp_limb_t divisor, mp_limb_t inverse,
40
C The method and nomenclature follow part 8 of "Division by Invariant
41
C Integers using Multiplication" by Granlund and Montgomery, reference in
44
C The "and"s shown in the paper are done here with "cmov"s. "m" is written
45
C for m', and "d" for d_norm, which won't cause any confusion since it's
46
C only the normalized divisor that's of any use in the code. "b" is written
47
C for 2^N, the size of a limb, N being 32 here.
49
C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
50
C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
51
C carry, since in normal circumstances that will be a very rare event.
53
C The test for skipping a division is branch free (once size>=1 is tested).
54
C The store to the destination high limb is 0 when a divide is skipped, or
55
C if it's not skipped then a copy of the src high limb is used. The latter
56
C is in case src==dst.
58
C There's a small bias towards expecting xsize==0, by having code for
59
C xsize==0 in a straight line and xsize!=0 under forward jumps.
63
C If the divisor is normalized (high bit set) then a division step can
64
C always be skipped, since the high destination limb is always 0 or 1 in
65
C that case. It doesn't seem worth checking for this though, since it
66
C probably occurs infrequently, in particular note that big_base for a
67
C decimal mpn_get_str is not normalized in a 32-bit limb.
70
dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
71
dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
73
dnl The inverse takes about 50 cycles to calculate, but after that the
74
dnl multiply is 17 c/l versus division at 42 c/l.
76
dnl At 3 limbs the mul is a touch faster than div on the integer part, and
77
dnl even more so on the fractional part.
79
dnl  Tuning constant: the multiply-by-inverse method is used once
dnl  xsize+size reaches this many limbs (see the discussion above);
dnl  below it plain "divl" loops are used.  Minimum value 1.
deflit(MUL_THRESHOLD, 3)
82
dnl  Incoming stack parameters, as byte offsets for the defframe scheme.
dnl  PARAM_PREINV_* apply only to mpn_preinv_divrem_1 and PARAM_CARRY only
dnl  to mpn_divrem_1c, hence the deliberately overlapping offset 24 --
dnl  the three entry points take different trailing arguments.
defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
83
defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
84
defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
85
defframe(PARAM_DIVISOR,20)
86
defframe(PARAM_SIZE, 16)
87
defframe(PARAM_SRC, 12)
88
defframe(PARAM_XSIZE, 8)
89
defframe(PARAM_DST, 4)
91
dnl  Stack locals, at negative offsets from the entry %esp.  The SAVE_*
dnl  slots are presumably where the callee-saved registers named are
dnl  preserved (the save/restore instructions are elsewhere in the
dnl  routine); the VAR_* slots are working storage for the loops.
defframe(SAVE_EBX, -4)
92
defframe(SAVE_ESI, -8)
93
defframe(SAVE_EDI, -12)
94
defframe(SAVE_EBP, -16)
96
defframe(VAR_NORM, -20)
97
defframe(VAR_INVERSE, -24)
98
defframe(VAR_SRC, -28)
99
defframe(VAR_DST, -32)
100
defframe(VAR_DST_STOP,-36)
102
dnl  Total frame size in bytes, reserved by "subl $STACK_SPACE, %esp" in
dnl  each prologue; matches the deepest slot above (VAR_DST_STOP at -36).
deflit(STACK_SPACE, 36)
107
PROLOGUE(mpn_preinv_divrem_1)
109
movl PARAM_XSIZE, %ecx
111
subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
117
movl PARAM_SIZE, %ebx
119
leal 8(%edx,%ecx,4), %edx C &dst[xsize+2]
121
movl PARAM_DIVISOR, %ebp
123
movl %edx, VAR_DST_STOP C &dst[xsize+2]
125
xorl %edi, %edi C carry
127
movl -4(%esi,%ebx,4), %eax C src high limb
134
cmpl %ebp, %eax C high cmp divisor
136
cmovc( %eax, %edi) C high is carry if high<divisor
137
cmovnc( %eax, %ecx) C 0 if skip div, src high if not
138
C (the latter in case src==dst)
140
movl %ecx, -12(%edx,%ebx,4) C dst high limb
141
sbbl $0, %ebx C skip one division if high<divisor
142
movl PARAM_PREINV_SHIFT, %ecx
144
leal -8(%edx,%ebx,4), %edx C &dst[xsize+size]
147
movl %edx, VAR_DST C &dst[xsize+size]
149
shll %cl, %ebp C d normalized
153
movd %eax, %mm7 C rshift
154
movl PARAM_PREINV_INVERSE, %eax
162
PROLOGUE(mpn_divrem_1c)
164
movl PARAM_CARRY, %edx
165
movl PARAM_SIZE, %ecx
166
subl $STACK_SPACE, %esp
167
deflit(`FRAME',STACK_SPACE)
170
movl PARAM_XSIZE, %ebx
176
movl PARAM_DIVISOR, %ebp
181
leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
187
C offset 0xa1, close enough to aligned
188
PROLOGUE(mpn_divrem_1)
191
movl PARAM_SIZE, %ecx
192
movl $0, %edx C initial carry (if can't skip a div)
193
subl $STACK_SPACE, %esp
194
deflit(`FRAME',STACK_SPACE)
200
movl PARAM_XSIZE, %ebx
203
movl PARAM_DIVISOR, %ebp
204
orl %ecx, %ecx C size
208
leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
210
jz L(no_skip_div) C if size==0
211
movl -4(%esi,%ecx,4), %eax C src high limb
214
cmpl %ebp, %eax C high cmp divisor
216
cmovc( %eax, %edx) C high is carry if high<divisor
217
cmovnc( %eax, %esi) C 0 if skip div, src high if not
219
movl %esi, (%edi,%ecx,4) C dst high limb
220
sbbl $0, %ecx C size-1 if high<divisor
221
movl PARAM_SRC, %esi C reload
234
leal (%ebx,%ecx), %eax C size+xsize
235
cmpl $MUL_THRESHOLD, %eax
236
jae L(mul_by_inverse)
239
C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
240
C It'd be possible to write them out without the looping, but no speedup
243
C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
244
C integer part, but curiously not on the fractional part, where %ebp is a
245
C (fixed) couple of cycles faster.
248
jz L(divide_no_integer)
251
C eax scratch (quotient)
254
C edx scratch (remainder)
259
movl -4(%esi,%ecx,4), %eax
263
movl %eax, (%edi,%ecx,4)
265
jnz L(divide_integer)
268
L(divide_no_integer):
271
jnz L(divide_fraction)
280
addl $STACK_SPACE, %esp
286
C eax scratch (quotient)
289
C edx scratch (remainder)
298
movl %eax, -4(%edi,%ebx,4)
300
jnz L(divide_fraction)
306
C -----------------------------------------------------------------------------
317
bsrl %ebp, %eax C 31-l
319
leal 12(%edi), %ebx C &dst[xsize+2], loop dst stop
320
leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
323
movl %ebx, VAR_DST_STOP
325
movl %ecx, %ebx C size
328
movl %edx, %edi C carry
336
shll %cl, %ebp C d normalized
342
subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
344
divl %ebp C floor (b*(b-d)-1) / d
357
orl %ebx, %ebx C size
358
movl %eax, VAR_INVERSE
359
leal -12(%esi,%ebx,4), %eax C &src[size-3]
365
movl 8(%eax), %esi C src high limb
368
L(start_two_or_more):
369
movl 4(%eax), %edx C src second highest limb
371
shldl( %cl, %esi, %edi) C n2 = carry,high << l
373
shldl( %cl, %edx, %esi) C n10 = high,second << l
376
je L(integer_two_left)
381
shldl( %cl, %esi, %edi) C n2 = carry,high << l
383
shll %cl, %esi C n10 = high << l
385
jmp L(integer_one_left)
389
C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
390
C skipped a division.
392
shll %cl, %edi C n2 = carry << l
393
movl %edi, %eax C return value for zero_done
401
C -----------------------------------------------------------------------------
403
C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
404
C execution. The instruction scheduling is important, with various
405
C apparently equivalent forms running 1 to 5 cycles slower.
407
C A lower bound for the time would seem to be 16 cycles, based on the
408
C following successive dependencies.
420
C This chain is what the loop has already, but 16 cycles isn't achieved.
421
C K7 has enough decode, and probably enough execute (depending maybe on what
422
C a mul actually consumes), but nothing running under 17 has been found.
424
C In theory n2+n1 could be done in the sub and addback stages (by
425
C calculating both n2 and n2+n1 there), but lack of registers makes this an
426
C unlikely proposition.
428
C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow
429
C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
430
C chain, and nothing better than 18 cycles has been found when using it.
431
C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
432
C be an extremely rare event.
434
C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
435
C if some special data is coming out with this always, the q1_ff special
436
C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to
437
C induce the q1_ff case, for speed measurements or testing. Note that
438
C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
440
C The instruction groupings and empty comments show the cycles for a naive
441
C in-order view of the code (conveniently ignoring the load latency on
442
C VAR_INVERSE). This shows some of where the time is going, but is nonsense
443
C to the extent that out-of-order execution rearranges it. In this case
444
C there's 19 cycles shown, but it executes at 17.
449
C ebx scratch (nadj, q1)
450
C ecx scratch (src, dst)
456
C mm0 scratch (src qword)
457
C mm7 rshift for normalization
459
cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
463
leal (%ebp,%esi), %ebx
464
cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
465
sbbl $-1, %eax C n2+n1
467
mull VAR_INVERSE C m*(n2+n1)
469
movq (%ecx), %mm0 C next limb and the one below it
476
addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
477
leal 1(%edi), %ebx C n2+1
482
adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
495
movl VAR_DST_STOP, %eax
499
sbbl %edx, %edi C n - (q1+1)*d
500
movl %esi, %edi C remainder -> n2
501
leal (%ebp,%esi), %edx
505
cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
514
L(integer_loop_done):
517
C -----------------------------------------------------------------------------
519
C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
520
C q1_ff special case.  This makes the code a bit smaller and simpler, and
521
C costs only 1 cycle (each).
525
C ebx scratch (nadj, q1)
526
C ecx scratch (src, dst)
534
cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
538
leal (%ebp,%esi), %ebx
539
cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
540
sbbl $-1, %eax C n2+n1
542
mull VAR_INVERSE C m*(n2+n1)
544
movd (%ecx), %mm0 C src low limb
546
movl VAR_DST_STOP, %ecx
550
addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
551
leal 1(%edi), %ebx C n2+1
554
adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
570
sbbl %edx, %edi C n - (q1+1)*d
571
movl %esi, %edi C remainder -> n2
572
leal (%ebp,%esi), %edx
576
cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
582
C -----------------------------------------------------------------------------
585
C ebx scratch (nadj, q1)
594
movl VAR_DST_STOP, %ecx
595
cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
598
leal (%ebp,%esi), %ebx
599
cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
600
sbbl $-1, %eax C n2+n1
602
mull VAR_INVERSE C m*(n2+n1)
610
addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
611
leal 1(%edi), %ebx C n2+1
616
adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
618
sbbl $0, %ebx C q1 if q1+1 overflowed
632
sbbl %edx, %edi C n - (q1+1)*d
633
movl %esi, %edi C remainder -> n2
634
leal (%ebp,%esi), %edx
636
cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
658
addl $STACK_SPACE, %esp
666
C -----------------------------------------------------------------------------
668
C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
669
C of q*d is simply -d and the remainder n-q*d = n10+d
681
movl VAR_DST_STOP, %edx
685
leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
688
movd %mm0, %esi C next n10
694
jmp L(integer_loop_done)
698
C -----------------------------------------------------------------------------
700
C Being the fractional part, the "source" limbs are all zero, meaning
701
C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
703
C The loop runs at 15 cycles. The dependent chain is the same as the
704
C general case above, but without the n2+n1 stage (due to n1==0), so 15
705
C would seem to be the lower bound.
707
C A not entirely obvious simplification is that q1+1 never overflows a limb,
708
C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
709
C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
710
C rnd() means rounding down to a multiple of d.
712
C m*n2 + b*n2 <= m*(d-1) + b*(d-1)
713
C = m*d + b*d - m - b
714
C = floor((b(b-d)-1)/d)*d + b*d - m - b
715
C = rnd(b(b-d)-1) + b*d - m - b
716
C = rnd(b(b-d)-1 + b*d) - m - b
717
C = rnd(b*b-1) - m - b
720
C Unchanged from the general case is that the final quotient limb q can be
721
C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from
722
C equation 8.4 of the paper which simplifies as follows when n1==0 and
725
C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
727
C As before, the instruction groupings and empty comments show a naive
728
C in-order view of the code, which is made a nonsense by out of order
729
C execution. There's 17 cycles shown, but it executes at 15.
731
C Rotating the store q and remainder->n2 instructions up to the top of the
732
C loop gets the run time down from 16 to 15.
745
movl VAR_DST_STOP, %ecx C &dst[xsize+2]
748
subl $8, %ecx C &dst[xsize]
749
jmp L(fraction_entry)
754
C eax n2 carry, then scratch
755
C ebx scratch (nadj, q1)
756
C ecx dst, decrementing
762
movl %ebx, (%ecx) C previous q
763
movl %eax, %edi C remainder->n2
766
mull VAR_INVERSE C m*n2
780
addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
790
negl %eax C low of n - (q1+1)*d
794
sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
795
leal (%ebp,%eax), %edx
797
cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1