1
dnl Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
2
dnl result in a second limb vector.
4
dnl Copyright 2000, 2001 Free Software Foundation, Inc.
6
dnl This file is part of the GNU MP Library.
8
dnl The GNU MP Library is free software; you can redistribute it and/or modify
9
dnl it under the terms of the GNU Lesser General Public License as published
10
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11
dnl your option) any later version.
13
dnl The GNU MP Library is distributed in the hope that it will be useful, but
14
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16
dnl License for more details.
18
dnl You should have received a copy of the GNU Lesser General Public License
19
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20
dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
dnl MA 02111-1307, USA.
23
include(`../config.m4')
31
C This code runs at 2.25 cycles/limb on EV6.
33
C This code was written in close cooperation with ev6 pipeline expert
34
C Steve Root. Any errors are tege's fault, though.
39
C code for n > 8 code for (n mod 8)
40
C code for (n div 8) feed-in code
44
C Some notes about unrolled loop:
46
C r1-r8 multiplies and workup
47
C r21-r28 multiplies and workup
50
C r20,r29,r13-r15 scramble
52
C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
53
C put-the-carry-into-hi. The idea is that these branches are very rarely
54
C taken, and since a non-taken branch consumes no resources, that is better
57
C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
58
C add NEXT cycle #09 which feeds a store in NEXT cycle #02
60
C The code could use some further work:
61
C 1. Speed up really small multiplies. The default alpha/mul_1.asm code is
62
C faster than this for size < 3.
63
C 2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
65
C 3. Consider using 4-way unrolling, even if that runs slower.
66
C 4. Reduce register usage. In particular, try to avoid using r29.
74
C First multiply loop: handles one limb per iteration.
C Register roles, as given by the existing per-line comments:
C   r16 = res_ptr, r17 = s1_ptr, r18 = size (remaining limbs),
C   r2 = current s1 limb, r3 = low product word, r4 = cy_limb,
C   r0 = high product word / carry bit.
C r19 is the second operand of every mulq/umulh, i.e. the single limb the
C vector is multiplied by.
C NOTE(review): the embedded original line numbers skip values in this
C region (82, 84, 92, 95-96, 102, 104...).  No store of r3, no loop-back
C branch to $Lopa, no $Le1a label and no return are visible here -- confirm
C against the complete file before changing anything.
ldq r2,0(r17) C r2 = s1_limb
75
lda r18,-1(r18) C size--
76
mulq r2,r19,r3 C r3 = prod_low
77
C r31 always reads as zero, so this sets r4 = 0.
bic r31,r31,r4 C clear cy_limb
78
umulh r2,r19,r0 C r0 = prod_high
79
beq r18,$Le1a C jump if size was == 1
80
C Second limb is fetched at offset 8; s1_ptr itself has not been advanced yet.
ldq r2,8(r17) C r2 = s1_limb
81
lda r18,-1(r18) C size--
83
beq r18,$Le2a C jump if size was == 2
85
C Loop body: multiply the limb already in r2, fold in the previous high word
C and carry, and fetch the following limb.
$Lopa: mulq r2,r19,r3 C r3 = prod_low
86
addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
87
lda r18,-1(r18) C size--
88
umulh r2,r19,r4 C r4 = cy_limb
89
C Offset 16 because r17 still trails the limb being loaded by two limbs;
C it is advanced by one limb on the next line.
ldq r2,16(r17) C r2 = s1_limb
90
lda r17,8(r17) C s1_ptr++
91
addq r3,r0,r3 C r3 = cy_limb + prod_low
93
C Unsigned "sum < addend" is 1 exactly when the addition above wrapped.
cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
94
lda r16,8(r16) C res_ptr++
97
C Tail for the last limb: same arithmetic as the loop body, without the
C load and pointer updates.
$Le2a: mulq r2,r19,r3 C r3 = prod_low
98
addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
99
umulh r2,r19,r4 C r4 = cy_limb
100
addq r3,r0,r3 C r3 = cy_limb + prod_low
101
cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
103
C Final high word plus carry ends up in r0.
addq r4,r0,r0 C cy_limb = prod_high + cy
120
C Split the size and run the feed-in loop for the (size mod 8) leading limbs.
C After the first two instructions r20 = size & 7 and r18 = size >> 3.
C The loop mirrors the $Lopa loop but counts in r20, post-increments r17
C after every load, and keeps the running high word in r21 (r0 holds the
C carry bit).  r21 is later added to the first low product of the unrolled
C code (addq r21, r22, r13).
C NOTE(review): the embedded original line numbers skip values in this
C region (136, 147, 150-151, 157, 160-161, 163-164).  The stores, the
C loop-back branch to $Lopb, and the $Le1b / $L_8_or_more labels are not
C visible here -- confirm against the complete file.
and r18, 7, r20 C count for the first loop, 0-7
121
srl r18, 3, r18 C count for unrolled loop
123
beq r20, $L_8_or_more C skip first loop
126
ldq r2,0(r17) C r2 = s1_limb
127
lda r17,8(r17) C s1_ptr++
128
lda r20,-1(r20) C size--
129
mulq r2,r19,r3 C r3 = prod_low
130
umulh r2,r19,r21 C r21 = prod_high
131
beq r20,$Le1b C jump if size was == 1
132
C r31 reads as zero, so this clears the carry register r0.
bis r31, r31, r0 C FIXME: shouldn't need this
133
ldq r2,0(r17) C r2 = s1_limb
134
lda r17,8(r17) C s1_ptr++
135
lda r20,-1(r20) C size--
137
lda r16,8(r16) C res_ptr++
138
beq r20,$Le2b C jump if size was == 2
140
C Loop body: multiply the limb in r2, fold in previous high word + carry,
C fetch the next limb.
$Lopb: mulq r2,r19,r3 C r3 = prod_low
141
addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
142
lda r20,-1(r20) C size--
143
umulh r2,r19,r21 C r21 = prod_high
144
ldq r2,0(r17) C r2 = s1_limb
145
lda r17,8(r17) C s1_ptr++
146
addq r3,r0,r3 C r3 = cy_limb + prod_low
148
C Unsigned "sum < addend" is 1 exactly when the addition above wrapped.
cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
149
lda r16,8(r16) C res_ptr++
152
C Tail for the last feed-in limb.
$Le2b: mulq r2,r19,r3 C r3 = prod_low
153
addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
154
umulh r2,r19,r21 C r21 = prod_high
155
addq r3,r0,r3 C r3 = cy_limb + prod_low
156
cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
158
lda r16,8(r16) C res_ptr++
159
C High word plus carry stays in r21 as the carry-in limb for what follows.
addq r21,r0,r21 C cy_limb = prod_high + cy
162
C NOTE(review): second consecutive res_ptr++; presumably this one belongs to
C a separate path (e.g. the $Le1b target) whose label is missing here.
lda r16,8(r16) C res_ptr++
165
C Start-up for the 8-way unrolled code (r18 = number of 8-limb groups).
C The trailing "L0/L1/U0/U1" tags and "#NN" numbers in the existing comments
C presumably name the EV6 issue slot and the schedule cycle of each
C instruction (the file header speaks of "a load in cycle #09").
C Arithmetic pattern used from here on: for consecutive limbs the high word
C of one product (umulh) is added to the low word of the next (mulq), and a
C cmpult on that sum yields the carry bit.
C NOTE(review): the embedded original line numbers skip values in this
C region (164, 168-169, 176-177, 179, 184, 195, 208-210).  The loads that
C fill r9 (and the reloads of r9/r10 before cycles #16/#18) and any branch
C ending the fall-through path are not visible -- confirm against the
C complete file.
C r0 = -1 is consumed by "umulh r0, r18, r18" in the unrolled loop.
lda r0, -1(r31) C put -1 in r0, for tricky loop control
166
C Bias s1_ptr by -32 so the loads below can use offsets 40/48/56.
lda r17, -32(r17) C L1 bookkeeping
167
lda r18, -1(r18) C decrement count
170
ldq r10, 40(r17) C L1
171
mulq r9, r19, r22 C U1 #07
172
ldq r11, 48(r17) C L1
173
umulh r9, r19, r23 C U1 #08
174
ldq r12, 56(r17) C L1
175
mulq r10, r19, r24 C U1 #09
178
lda r17, 64(r17) C L1 bookkeeping
180
umulh r10, r19, r25 C U1 #11
181
mulq r11, r19, r26 C U1 #12
182
umulh r11, r19, r27 C U1 #13
183
mulq r12, r19, r28 C U1 #14
185
umulh r12, r19, r1 C U1 #15
186
ldq r11, 16(r17) C L1
187
mulq r9, r19, r2 C U1 #16
188
ldq r12, 24(r17) C L1
189
umulh r9, r19, r3 C U1 #17
190
C r21 is the carry-in limb; r13 becomes the first result word.
addq r21, r22, r13 C L1 mov
191
mulq r10, r19, r4 C U1 #18
192
addq r23, r24, r22 C L0 sum 2 mul's
193
cmpult r13, r21, r14 C L1 carry from sum
194
C r18 was decremented above, so r18 > 0 here means at least two 8-limb
C groups remain, i.e. 16 or more limbs.
bgt r18, $L_16_or_more
196
C Fall-through: exactly one 8-limb group is left.
cmpult r22, r24, r24 C U0 carry from sum
197
umulh r10, r19, r5 C U1 #02
198
addq r25, r26, r23 C U0 sum 2 mul's
199
mulq r11, r19, r6 C U1 #03
200
cmpult r23, r26, r25 C U0 carry from sum
201
umulh r11, r19, r7 C U1 #04
202
addq r27, r28, r28 C U0 sum 2 mul's
203
mulq r12, r19, r8 C U1 #05
204
cmpult r28, r27, r15 C L0 carry from sum
205
lda r16, 32(r16) C L1 bookkeeping
206
C Adding r31 (zero): there is no carry-in from a previous group yet.
addq r13, r31, r13 C U0 start carry cascade
207
umulh r12, r19, r21 C U1 #06
211
C ---------------------------------------------------------------
213
C First pass through the unrolled schedule (labels carry a "w" suffix);
C presumably the $L_16_or_more target, whose label line is not visible.
C Same instruction pattern as the steady-state loop, except that the carry
C cascade starts from zero (r31), res_ptr advances by 32 instead of 64, and
C there is no loop-counter update.
C The retNw labels are presumably the points the fixNw carry fix-up stubs
C return to; the conditional branches into those stubs are not visible.
C NOTE(review): the embedded original line numbers skip values in this
C region (214-215, 226, 231, 233, 236, 238, 241, 243, 245-246, 250, 255,
C 260, 268-269, 271, 274, 276, 279, 281, 284) -- confirm against the
C complete file.
cmpult r22, r24, r24 C U0 carry from sum
216
umulh r10, r19, r5 C U1 #02
217
addq r25, r26, r23 C U0 sum 2 mul's
218
mulq r11, r19, r6 C U1 #03
219
cmpult r23, r26, r25 C U0 carry from sum
220
umulh r11, r19, r7 C U1 #04
221
addq r27, r28, r28 C U0 sum 2 mul's
222
mulq r12, r19, r8 C U1 #05
223
cmpult r28, r27, r15 C L0 carry from sum
224
lda r16, 32(r16) C L1 bookkeeping
225
C Adding r31 (zero): no carry-in from a previous group on the first pass.
addq r13, r31, r13 C U0 start carry cascade
227
umulh r12, r19, r21 C U1 #06
228
C Disabled: with a zero carry-in the first word needs no fix-up.
C beq r13, fix0w C U0
229
C Each retNw line adds the carry bit of the previous word into the next sum.
ret0w: addq r22, r14, r26 C L0
230
ldq r10, 40(r17) C L1
232
mulq r9, r19, r22 C U1 #07
234
ret1w: addq r23, r24, r27 C L0
235
ldq r11, 48(r17) C L1
237
umulh r9, r19, r23 C U1 #08
239
ret2w: addq r28, r25, r28 C L0
240
ldq r12, 56(r17) C L1
242
mulq r10, r19, r24 C U1 #09
244
ret3w: addq r1, r2, r20 C L0 sum 2 mul's
247
addq r3, r4, r2 C L0 #10 2 mul's
248
lda r17, 64(r17) C L1 bookkeeping
249
cmpult r20, r1, r29 C U0 carry from sum
251
umulh r10, r19, r25 C U1 #11
252
cmpult r2, r4, r4 C U0 carry from sum
253
C r16 was advanced by 32 above, so -32..-8 address the first four result
C limbs of this group.
stq r13, -32(r16) C L0
254
stq r26, -24(r16) C L1
256
mulq r11, r19, r26 C U1 #12
257
addq r5, r6, r14 C U0 sum 2 mul's
258
stq r27, -16(r16) C L0
259
stq r28, -8(r16) C L1
261
umulh r11, r19, r27 C U1 #13
262
cmpult r14, r6, r3 C U0 carry from sum
263
C could do cross-jumping here:
264
C bra $L_middle_of_unrolled_loop
265
mulq r12, r19, r28 C U1 #14
266
addq r7, r3, r5 C L0 eat carry
267
addq r20, r15, r20 C U0 carry cascade
270
umulh r12, r19, r1 C U1 #15
272
ret4w: addq r2, r29, r6 C L0
273
ldq r11, 16(r17) C L1
275
mulq r9, r19, r2 C U1 #16
277
ret5w: addq r14, r4, r7 C L0
278
ldq r12, 24(r17) C L1
280
umulh r9, r19, r3 C U1 #17
282
ret6w: addq r5, r8, r8 C L0 sum 2
283
C Start of the next group's first word: previous top high word (r21) plus
C the new first low product (r22).
addq r21, r22, r13 C L1 sum 2 mul's
285
mulq r10, r19, r4 C U1 #18
286
addq r23, r24, r22 C L0 sum 2 mul's
287
cmpult r13, r21, r14 C L1 carry from sum
289
C ---------------------------------------------------------------
292
C Steady-state body of the 8-way unrolled loop.
C Loads (ldq), multiplies (mulq/umulh), sums of adjacent product words
C (addq), carry detection (cmpult) and stores (stq) of different limbs are
C interleaved; instruction order is part of the hand-made schedule.
C The retN labels are presumably the return points of the fixN carry fix-up
C stubs; the conditional branches into those stubs are not visible.
C NOTE(review): the embedded original line numbers skip values in this
C region (290-291, 295-296, 299-301, 304-306, 311, 316, 318, 321, 323, 326,
C 328, 331, 333, 335-336, 341, 346, 351, 360-361, 363, 366, 368, 371, 373,
C 376, 380).  The loop label, the loop-back branch, the loads of r9/r10 and
C four of the eight stores per iteration are not visible -- confirm against
C the complete file.
C With r0 = -1 (2^64-1, set by "lda r0, -1(r31)" before the loop) the high
C 64 bits of r0*r18 = r18*2^64 - r18 equal r18-1 for r18 >= 1, so this
C decrements the group counter using the multiplier instead of an integer
C ALU slot.
umulh r0, r18, r18 C U1 #01 decrement r18!
293
cmpult r8, r5, r29 C L0 carry from last bunch
294
cmpult r22, r24, r24 C U0 carry from sum
297
umulh r10, r19, r5 C U1 #02
298
addq r25, r26, r23 C U0 sum 2 mul's
302
mulq r11, r19, r6 C U1 #03
303
cmpult r23, r26, r25 C U0 carry from sum
307
umulh r11, r19, r7 C U1 #04
308
C bis with r31 as destination is a no-op; these two pad the schedule.
bis r31, r31, r31 C L0 st slosh
309
bis r31, r31, r31 C L1 st slosh
310
addq r27, r28, r28 C U0 sum 2 mul's
312
mulq r12, r19, r8 C U1 #05
313
cmpult r28, r27, r15 C L0 carry from sum
314
lda r16, 64(r16) C L1 bookkeeping
315
C r29 is the carry out of the previous group's last word.
addq r13, r29, r13 C U0 start carry cascade
317
umulh r12, r19, r21 C U1 #06
319
C Each retN line adds the carry bit of the previous word into the next sum.
ret0: addq r22, r14, r26 C L0
320
ldq r10, 40(r17) C L1
322
mulq r9, r19, r22 C U1 #07
324
ret1: addq r23, r24, r27 C L0
325
ldq r11, 48(r17) C L1
327
umulh r9, r19, r23 C U1 #08
329
ret2: addq r28, r25, r28 C L0
330
ldq r12, 56(r17) C L1
332
mulq r10, r19, r24 C U1 #09
334
ret3: addq r1, r2, r20 C L0 sum 2 mul's
337
addq r3, r4, r2 C L0 #10 2 mul's
338
C No-op filling an otherwise empty multiplier slot.
bis r31, r31, r31 C U1 mul hole
339
lda r17, 64(r17) C L1 bookkeeping
340
cmpult r20, r1, r29 C U0 carry from sum
342
umulh r10, r19, r25 C U1 #11
343
cmpult r2, r4, r4 C U0 carry from sum
344
C Four result limbs are written relative to the already advanced res_ptr.
stq r13, -32(r16) C L0
345
stq r26, -24(r16) C L1
347
mulq r11, r19, r26 C U1 #12
348
addq r5, r6, r14 C U0 sum 2 mul's
349
stq r27, -16(r16) C L0
350
stq r28, -8(r16) C L1
352
umulh r11, r19, r27 C U1 #13
353
bis r31, r31, r31 C L0 st slosh
354
bis r31, r31, r31 C L1 st slosh
355
cmpult r14, r6, r3 C U0 carry from sum
356
C Named entry point; the only visible reference is the commented-out
C "bra $L_middle_of_unrolled_loop" in the first-pass code.
$L_middle_of_unrolled_loop:
357
mulq r12, r19, r28 C U1 #14
358
addq r7, r3, r5 C L0 eat carry
359
addq r20, r15, r20 C U0 carry cascade
362
umulh r12, r19, r1 C U1 #15
364
ret4: addq r2, r29, r6 C L0
365
ldq r11, 16(r17) C L1
367
mulq r9, r19, r2 C U1 #16
369
ret5: addq r14, r4, r7 C L0
370
ldq r12, 24(r17) C L1
372
umulh r9, r19, r3 C U1 #17
374
ret6: addq r5, r8, r8 C L0 sum 2
375
C First word of the next group: previous top high word (r21) plus the new
C first low product (r22).
addq r21, r22, r13 C L1 sum 2 mul's
377
mulq r10, r19, r4 C U1 #18
378
addq r23, r24, r22 C L0 sum 2 mul's
379
cmpult r13, r21, r14 C L1 carry from sum
381
C ---------------------------------------------------------------
383
C Wind-down pass (labels carry a "c" suffix): finishes the products already
C in flight.  Unlike the loop body it issues no ldq, no loop-counter update
C and no multiplies beyond cycle #06.
C The retNc labels are presumably the return points of the fixNc carry
C fix-up stubs; the branches into those stubs are not visible.
C NOTE(review): the embedded original line numbers skip values in this
C region (382, 385, 388-390, 393-395, 398, 403, 405, 407, 409, 411, 425,
C 427, 429, 432-449).  The remaining stores, the computation of the return
C value and the return itself are not visible -- confirm against the
C complete file.
cmpult r8, r5, r29 C L0 carry from last bunch
384
cmpult r22, r24, r24 C U0 carry from sum
386
umulh r10, r19, r5 C U1 #02
387
addq r25, r26, r23 C U0 sum 2 mul's
391
mulq r11, r19, r6 C U1 #03
392
cmpult r23, r26, r25 C U0 carry from sum
396
umulh r11, r19, r7 C U1 #04
397
addq r27, r28, r28 C U0 sum 2 mul's
399
mulq r12, r19, r8 C U1 #05
400
cmpult r28, r27, r15 C L0 carry from sum
401
lda r16, 64(r16) C L1 bookkeeping
402
C r29 is the carry out of the previous group's last word.
addq r13, r29, r13 C U0 start carry cascade
404
umulh r12, r19, r21 C U1 #06
406
C Each retNc line adds the carry bit of the previous word into the next sum.
ret0c: addq r22, r14, r26 C L0
408
ret1c: addq r23, r24, r27 C L0
410
ret2c: addq r28, r25, r28 C L0
412
ret3c: addq r1, r2, r20 C L0 sum 2 mul's
413
addq r3, r4, r2 C L0 #10 2 mul's
414
lda r17, 64(r17) C L1 bookkeeping
415
cmpult r20, r1, r29 C U0 carry from sum
416
cmpult r2, r4, r4 C U0 carry from sum
417
C Four result limbs are written relative to the already advanced res_ptr.
stq r13, -32(r16) C L0
418
stq r26, -24(r16) C L1
419
addq r5, r6, r14 C U0 sum 2 mul's
420
stq r27, -16(r16) C L0
421
stq r28, -8(r16) C L1
422
cmpult r14, r6, r3 C U0 carry from sum
423
addq r7, r3, r5 C L0 eat carry
424
addq r20, r15, r20 C U0 carry cascade
426
ret4c: addq r2, r29, r6 C L0
428
ret5c: addq r14, r4, r7 C L0
430
ret6c: addq r5, r8, r8 C L0 sum 2
431
cmpult r8, r5, r29 C L0 carry from last bunch
450
C Carry fix-up entry points, one per carry step of each of the three copies
C of the unrolled schedule ("w" = first pass, no suffix = loop, "c" =
C wind-down).  The file header describes these as rarely taken "br fixup
C code".
C Each visible instruction ORs the previous word's carry bit into the
C current word's carry register (bis = bitwise OR), so that a carry
C propagates through this word to the next one; fix6/fix6c instead add
C the carry (r4) directly into the high word r5.
C NOTE(review): only one instruction per entry point is visible and the
C embedded original line numbers advance by two between them, so each stub
C presumably has a second line (most likely the branch back to the matching
C retN label) that is missing here.  No fix4w/fix5w/fix6w entries are
C visible either -- confirm against the complete file.
C fix0w: bis r14, r29, r14 C join carries
452
fix1w: bis r24, r14, r24 C join carries
454
fix2w: bis r25, r24, r25 C join carries
456
fix3w: bis r15, r25, r15 C join carries
458
fix0: bis r14, r29, r14 C join carries
460
fix1: bis r24, r14, r24 C join carries
462
fix2: bis r25, r24, r25 C join carries
464
fix3: bis r15, r25, r15 C join carries
466
fix4: bis r29, r15, r29 C join carries
468
fix5: bis r4, r29, r4 C join carries
470
fix6: addq r5, r4, r5 C can't carry twice!
472
fix0c: bis r14, r29, r14 C join carries
474
fix1c: bis r24, r14, r24 C join carries
476
fix2c: bis r25, r24, r25 C join carries
478
fix3c: bis r15, r25, r15 C join carries
480
fix4c: bis r29, r15, r29 C join carries
482
fix5c: bis r4, r29, r4 C join carries
484
fix6c: addq r5, r4, r5 C can't carry twice!