16
16
dnl License for more details.
18
18
dnl You should have received a copy of the GNU Lesser General Public License
19
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20
dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
dnl MA 02111-1307, USA.
24
dnl This approaches ?? cycles/limb on PA8000 and 6.75 cycles/limb on PA8500
25
dnl for huge operands.
27
dnl The feed-in and wind-down code has not yet been scheduled. Many cycles
28
dnl could be saved there per call.
31
dnl The main loop "BIG" is 4-way unrolled, mainly to allow
32
dnl effective use of ADD,DC. Delays in moving data via the cache from the FP
33
dnl registers to the IU registers, have demanded a deep software pipeline, and
34
dnl a lot of stack slots for partial products in flight.
37
dnl save-some-registers
38
dnl do 0, 1, 2, or 3 limbs
39
dnl if done, restore-some-regs and return
45
dnl HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
46
dnl slots marked FREE, as well as some slots in the caller's "frame marker".
83
dnl -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
86
include(`../config.m4')
89
define(`rp',`%r26') dnl
90
define(`up',`%r25') dnl
91
define(`n',`%r24') dnl
92
define(`vlimb',`%r23') dnl
94
define(`climb',`%r23') dnl
19
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write
20
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21
dnl Boston, MA 02110-1301, USA.
23
include(`../config.m4')
29
C The feed-in and wind-down code has not yet been scheduled. Many cycles
30
C could be saved there per call.
33
C The main loop "BIG" is 4-way unrolled, mainly to allow
34
C effective use of ADD,DC. Delays in moving data via the cache from the FP
35
C registers to the IU registers, have demanded a deep software pipeline, and
36
C a lot of stack slots for partial products in flight.
40
C do 0, 1, 2, or 3 limbs
41
C if done, restore-some-regs and return
47
C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
48
C slots marked FREE, as well as some slots in the caller's "frame marker".
85
C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
88
include(`../config.m4')
94
define(`vlimb',`%r23') C
96
define(`climb',`%r23') C
96
98
ifdef(`HAVE_ABI_2_0w',
108
110
ldo 0(%r0), climb C clear climb
109
111
fldd -0x138(%r30), %fr8 C put vlimb in fp register
111
define(`p032a1',`%r1') dnl
112
define(`p032a2',`%r19') dnl
114
define(`m032',`%r20') dnl
115
define(`m096',`%r21') dnl
117
define(`p000a',`%r22') dnl
118
define(`p064a',`%r29') dnl
120
define(`s000',`%r31') dnl
122
define(`ma000',`%r4') dnl
123
define(`ma064',`%r20') dnl
125
define(`r000',`%r3') dnl
113
define(`p032a1',`%r1') C
114
define(`p032a2',`%r19') C
116
define(`m032',`%r20') C
117
define(`m096',`%r21') C
119
define(`p000a',`%r22') C
120
define(`p064a',`%r29') C
122
define(`s000',`%r31') C
124
define(`ma000',`%r4') C
125
define(`ma064',`%r20') C
127
define(`r000',`%r3') C
127
129
extrd,u n, 63, 2, %r5
128
130
cmpb,= %r5, %r0, L(BIG)
172
174
depd m096, 31, 32, ma064
176
178
add p032a1, p032a2, m032
177
179
add,dc %r0, %r0, m096
178
180
depd,z m032, 31, 32, ma000
179
181
extrd,u m032, 31, 32, ma064
181
dnl addib,= -1, %r5, L(0_out)
183
C addib,= -1, %r5, L(0_out)
182
184
depd m096, 31, 32, ma064
184
dnl xmpyu %fr8R, %fr4L, %fr22
185
dnl xmpyu %fr8L, %fr4R, %fr23
186
dnl ldd -0x78(%r30), p032a1
187
dnl fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
189
dnl xmpyu %fr8R, %fr4R, %fr24
190
dnl xmpyu %fr8L, %fr4L, %fr25
191
dnl ldd -0x70(%r30), p032a2
192
dnl fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
195
dnl add climb, p000a, s000
196
dnl ldd -0x80(%r30), p000a
197
dnl fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
199
dnl add,dc p064a, %r0, climb
201
dnl ldd -0x68(%r30), p064a
202
dnl fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
204
dnl add ma000, s000, s000
205
dnl add,dc ma064, climb, climb
208
dnl sub r000, s000, s000
209
dnl sub,db %r0, climb, climb
210
dnl sub %r0, climb, climb
213
dnl add p032a1, p032a2, m032
214
dnl add,dc %r0, %r0, m096
216
dnl depd,z m032, 31, 32, ma000
217
dnl extrd,u m032, 31, 32, ma064
219
dnl addib,<> -1, %r5, L(loop0)
220
dnl depd m096, 31, 32, ma064
186
C xmpyu %fr8R, %fr4L, %fr22
187
C xmpyu %fr8L, %fr4R, %fr23
188
C ldd -0x78(%r30), p032a1
189
C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
191
C xmpyu %fr8R, %fr4R, %fr24
192
C xmpyu %fr8L, %fr4L, %fr25
193
C ldd -0x70(%r30), p032a2
194
C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
197
C add climb, p000a, s000
198
C ldd -0x80(%r30), p000a
199
C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
201
C add,dc p064a, %r0, climb
203
C ldd -0x68(%r30), p064a
204
C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
206
C add ma000, s000, s000
207
C add,dc ma064, climb, climb
210
C sub r000, s000, s000
211
C sub,db %r0, climb, climb
212
C sub %r0, climb, climb
215
C add p032a1, p032a2, m032
216
C add,dc %r0, %r0, m096
218
C depd,z m032, 31, 32, ma000
219
C extrd,u m032, 31, 32, ma064
221
C addib,<> -1, %r5, L(loop0)
222
C depd m096, 31, 32, ma064
223
225
xmpyu %fr8R, %fr4L, %fr22
224
226
xmpyu %fr8L, %fr4R, %fr23
281
283
cmpib,>= 4, n, L(done)
284
dnl 4-way unrolled code.
288
define(`p032a1',`%r1') dnl
289
define(`p032a2',`%r19') dnl
290
define(`p096b1',`%r20') dnl
291
define(`p096b2',`%r21') dnl
292
define(`p160c1',`%r22') dnl
293
define(`p160c2',`%r29') dnl
294
define(`p224d1',`%r31') dnl
295
define(`p224d2',`%r3') dnl
297
define(`m032',`%r4') dnl
298
define(`m096',`%r5') dnl
299
define(`m160',`%r6') dnl
300
define(`m224',`%r7') dnl
301
define(`m288',`%r8') dnl
303
define(`p000a',`%r1') dnl
304
define(`p064a',`%r19') dnl
305
define(`p064b',`%r20') dnl
306
define(`p128b',`%r21') dnl
307
define(`p128c',`%r22') dnl
308
define(`p192c',`%r29') dnl
309
define(`p192d',`%r31') dnl
310
define(`p256d',`%r3') dnl
312
define(`s000',`%r10') dnl
313
define(`s064',`%r11') dnl
314
define(`s128',`%r12') dnl
315
define(`s192',`%r13') dnl
317
define(`ma000',`%r9') dnl
318
define(`ma064',`%r4') dnl
319
define(`ma128',`%r5') dnl
320
define(`ma192',`%r6') dnl
321
define(`ma256',`%r7') dnl
323
define(`r000',`%r1') dnl
324
define(`r064',`%r19') dnl
325
define(`r128',`%r20') dnl
326
define(`r192',`%r21') dnl
286
C 4-way unrolled code.
290
define(`p032a1',`%r1') C
291
define(`p032a2',`%r19') C
292
define(`p096b1',`%r20') C
293
define(`p096b2',`%r21') C
294
define(`p160c1',`%r22') C
295
define(`p160c2',`%r29') C
296
define(`p224d1',`%r31') C
297
define(`p224d2',`%r3') C
299
define(`m032',`%r4') C
300
define(`m096',`%r5') C
301
define(`m160',`%r6') C
302
define(`m224',`%r7') C
303
define(`m288',`%r8') C
305
define(`p000a',`%r1') C
306
define(`p064a',`%r19') C
307
define(`p064b',`%r20') C
308
define(`p128b',`%r21') C
309
define(`p128c',`%r22') C
310
define(`p192c',`%r29') C
311
define(`p192d',`%r31') C
312
define(`p256d',`%r3') C
314
define(`s000',`%r10') C
315
define(`s064',`%r11') C
316
define(`s128',`%r12') C
317
define(`s192',`%r13') C
319
define(`ma000',`%r9') C
320
define(`ma064',`%r4') C
321
define(`ma128',`%r5') C
322
define(`ma192',`%r6') C
323
define(`ma256',`%r7') C
325
define(`r000',`%r1') C
326
define(`r064',`%r19') C
327
define(`r128',`%r20') C
328
define(`r192',`%r21') C
328
330
std %r6, -0xe8(%r30)
329
331
std %r7, -0xe0(%r30)