1
dnl Alpha ev6 nails mpn_addmul_3.
3
dnl Copyright 2002 Free Software Foundation, Inc.
5
dnl This file is part of the GNU MP Library.
7
dnl The GNU MP Library is free software; you can redistribute it and/or
8
dnl modify it under the terms of the GNU Lesser General Public License as
9
dnl published by the Free Software Foundation; either version 2.1 of the
10
dnl License, or (at your option) any later version.
12
dnl The GNU MP Library is distributed in the hope that it will be useful,
13
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
dnl Lesser General Public License for more details.
17
dnl You should have received a copy of the GNU Lesser General Public
18
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20
dnl Suite 330, Boston, MA 02111-1307, USA.
22
include(`../config.m4')
30
dnl Useful register aliases
31
define(`numb_mask',`r24')
50
dnl Used for temps: r8 r19 r28
52
define(`NAIL_BITS',`GMP_NAIL_BITS')
53
define(`NUMB_BITS',`GMP_NUMB_BITS')
55
dnl This declaration is munged by configure
58
dnl Runs at 3.0 cycles/limb. With unrolling, the ulimb load and the 3
59
dnl bookkeeping increments and the `bis' that copies from r22 to r6 could be
60
dnl removed and the instruction count reduced from 26 to 21. We could
61
dnl thereby probably reach 2 cycles/limb, the IMUL bandwidth.
63
dnl If this is going to be a Karatsuba basecase building block, we need some
64
dnl of the combinations below. That way, we won't ever hit the
65
dnl slower mpn_addmul_1 for any huge multiplication.
67
dnl Alt 3 Alt 4 Alt 5 Alt 6
68
dnl addmul_2 addmul_2 addmul_3 addmul_3
69
dnl addmul_3 addmul_3 addmul_4 addmul_4
70
dnl addmul_4 addmul_5 addmul_5
74
dnl callee-saves: r9 r10 r11 r12 r13 r14 r15
75
dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
76
dnl r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
77
dnl return address: 26
78
dnl global pointer: 29
82
C mpn_addmul_3, Alpha ev6 variant for builds with nails.
C
C NOTE(review): this listing looks incomplete.  The bare numbers between the
C instructions jump over gaps, and no loads, stores, labels, branches or
C return are visible.  Of the register aliases used below, only numb_mask
C (r24) is defined in the visible text; ulimb, v0-v2, m0a..m2b and acc0-acc2
C are presumably defined in lines not shown -- TODO confirm against the
C complete file before relying on any of the notes in this routine.
C
C Visible temporaries are r8, r19 and r28.
PROLOGUE(mpn_addmul_3)
84
srl numb_mask,NAIL_BITS,numb_mask C numb_mask >>= NAIL_BITS; presumably all ones before this (set in a line not shown) -- TODO confirm
90
bis r31, r31, acc0 C zero acc0
92
bis r31, r31, acc1 C zero acc1
94
bis r31, r31, acc2 C zero acc2
C Products of ulimb with v0, v1 and v2: mulq gives the low 64 bits of each
C product (m0a, m1a, m2a) and umulh gives the high 64 bits (m0b, m1b, m2b).
101
mulq v0, ulimb, m0a C U1
102
umulh v0, ulimb, m0b C U1
103
mulq v1, ulimb, m1a C U1
104
umulh v1, ulimb, m1b C U1
106
mulq v2, ulimb, m2a C U1
107
umulh v2, ulimb, m2b C U1
C The same six products are issued again below, interleaved with the
C shift/mask/add work.  Presumably this is the loop body and ulimb is
C reloaded in a line not shown -- TODO confirm.
111
bis r31, r31, r31 C nop
114
addq r19, acc0, acc0 C propagate nail
C r8 = m0a >> NAIL_BITS.  r8 is rewritten by each of the srl-into-r8
C instructions below and its consumer is not visible in this listing.
117
srl m0a,NAIL_BITS, r8 C U0
119
mulq v0, ulimb, m0a C U1
123
umulh v0, ulimb, m0b C U1
124
bis r31, r31, r31 C nop
127
srl m1a,NAIL_BITS, r8 C U0
128
bis r31, r31, r31 C nop
129
mulq v1, ulimb, m1a C U1
133
umulh v1, ulimb, m1b C U1
C r28 = r19 AND numb_mask.
134
and r19,numb_mask, r28 C extract numb part
136
bis r31, r31, r31 C nop
137
srl m2a,NAIL_BITS, r8 C U0
139
mulq v2, ulimb, m2a C U1
143
umulh v2, ulimb, m2b C U1
C r19 = r19 >> NUMB_BITS, which is then added into acc0 below.
144
srl r19,NUMB_BITS, r19 C extract nail part
146
bis r31, r31, r31 C nop
153
addq r19, acc0, acc0 C propagate nail
C No further multiplies are visible from here on; presumably this is the
C wind-down after the last ulimb -- TODO confirm.
155
srl m0a,NAIL_BITS, r8 C U0
159
srl m1a,NAIL_BITS, r8 C U0
162
and r19,numb_mask, r28 C extract limb
163
srl m2a,NAIL_BITS, r8 C U0
166
srl r19,NUMB_BITS, r19 C extract nail
169
addq r19, acc0, acc0 C propagate nail
C Split acc0 and then acc1 into a masked part (r28) and the bits shifted
C down by NUMB_BITS (r19).  Whatever consumes r28 and r19 between these
C steps is not visible in this listing.
170
and acc0,numb_mask, r28 C r28 = acc0 AND numb_mask
172
srl acc0,NUMB_BITS, r19 C r19 = acc0 >> NUMB_BITS
175
and acc1,numb_mask, r28 C r28 = acc1 AND numb_mask
177
srl acc1,NUMB_BITS, r19 C r19 = acc1 >> NUMB_BITS
181
EPILOGUE(mpn_addmul_3)