1
dnl SPARC v9 64-bit mpn_addmul_2 -- Multiply an n limb number with 2-limb
2
dnl number and add the result to an n-limb vector.
4
dnl Copyright 2002, 2003 Free Software Foundation, Inc.
6
dnl This file is part of the GNU MP Library.
8
dnl The GNU MP Library is free software; you can redistribute it and/or modify
9
dnl it under the terms of the GNU Lesser General Public License as published
10
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11
dnl your option) any later version.
13
dnl The GNU MP Library is distributed in the hope that it will be useful, but
14
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16
dnl License for more details.
18
dnl You should have received a copy of the GNU Lesser General Public License
19
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write
20
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21
dnl Boston, MA 02110-1301, USA.
23
include(`../config.m4')
29
C Algorithm: We use 16 floating-point multiplies per limb product, with the
30
C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
31
C split into 32-bit pieces. We sum four 48-bit partial products using
32
C floating-point add, then convert the resulting four 50-bit quantities and
33
C transfer them to the integer unit.
35
C Possible optimizations:
36
C 1. Align the stack area where we transfer the four 50-bit product-sums
37
C to a 32-byte boundary. That would minimize the cache collision.
38
C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
39
C be to align the area to map to the area immediately before up?)
40
C 2. Perform two of the fp->int conversions with integer instructions. We
41
C can get almost ten free IEU slots, if we clean up bookkeeping and the
42
C silly carry-limb code.
43
C 3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb code.
46
C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
47
C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
53
C Instruction classification (as per UltraSPARC functional units).
54
C Assuming silly carry code is fixed. Includes bookkeeping.
56
C mpn_addmul_X mpn_mul_X
58
C ========== ==========
66
C TOTAL IEU 17 17 16 16
69
C IEU cycles 8.5 8.5 8 8
70
C MEM cycles 12 12 10 10
71
C ISSUE cycles 12 16 11.25 15.25
72
C FPU cycles 10 18 10 18
73
C cycles/loop 12 18 12 18
74
C cycles/limb 12 9 12 9
85
C Declare %g2 and %g3 as scratch registers.
REGISTER(%g2,#scratch)
86
REGISTER(%g3,#scratch)
96
C Symbolic names for the registers used by mpn_addmul_2.
C
C p000..p112b: fp registers receiving the fmuld partial products. The two
C highest pieces exist in an a and a b copy: the a copies are written by the
C u00 products and the b copies by the u32 products.
C NOTE(review): the numeric suffix presumably is the bit offset of the piece
C within the result -- confirm.
define(`p000', `%f8') define(`p016',`%f10')
97
define(`p032',`%f12') define(`p048',`%f14')
98
define(`p064',`%f16') define(`p080',`%f18')
99
define(`p096a',`%f20') define(`p112a',`%f22')
100
define(`p096b',`%f56') define(`p112b',`%f58')
102
C out000, out016: fp registers written to the stack transfer area with std.
define(`out000',`%f0') define(`out016',`%f6')
104
C v000..v112: the eight pieces of the 2-limb v operand, one per fp register.
define(`v000',`%f24') define(`v016',`%f26')
105
define(`v032',`%f28') define(`v048',`%f30')
106
define(`v064',`%f44') define(`v080',`%f46')
107
define(`v096',`%f48') define(`v112',`%f50')
109
C u00, u32: double registers multiplied against the v pieces.
define(`u00',`%f32') define(`u32', `%f34')
111
C a000..a080: destinations of the faddd partial-product sums.
define(`a000',`%f36') define(`a016',`%f38')
112
define(`a032',`%f40') define(`a048',`%f42')
113
define(`a064',`%f60') define(`a080',`%f62')
115
C u00_hi/u00_lo and u32_hi/u32_lo are the single-precision halves of the
C double registers %f2 and %f4. The _lo halves are loaded from up[i] and the
C _hi halves are loaded once during initialization.
C NOTE(review): the conversion from these pairs into u00/u32 is not visible
C in this listing -- confirm against the complete file.
define(`u00_hi',`%f2') define(`u32_hi',`%f4')
116
define(`u00_lo',`%f3') define(`u32_lo',`%f5')
119
C Integer registers. i00/i16 (and rlimb in the final stage) receive values
C read back from the stack transfer area; r00/r32 receive the low and high
C 32 bits of rp[i]; xffffffff and xffff hold %g4 shifted right by 32 and 48.
C NOTE(review): the function also uses the name cy, whose define is not among
C the visible lines -- confirm it is defined in the complete file.
define(`rlimb',`%g3')
120
define(`i00',`%l0') define(`i16',`%l1')
121
define(`r00',`%l2') define(`r32',`%l3')
122
define(`xffffffff',`%l7')
123
define(`xffff',`%o0')
126
PROLOGUE(mpn_addmul_2)
C Per the file header, mpn_addmul_2 multiplies an n-limb number by a 2-limb
C number and adds the result to an n-limb vector.
C
C Register roles as they appear in the instructions below:
C   %i0  pointer used for the rp[i] loads
C   %i1  pointer used for the up[i] loads, advanced by 8 per limb
C   %i2  limb counter, decremented by 1 per limb and tested with brnz
C   %i3  pointer used for the vp[0] and vp[1] loads
C   [%sp+2223+0 .. +63]  stack area used to move values between the fp and
C                        integer register files (std followed by ldx)
C
C NOTE(review): this listing does not show the register-window save, the
C setup of %g4 and %asi, the define of cy, the labels .L_2_or_more and
C .L_3_or_more, the branch closing .Loop, or the return sequence. Confirm
C those against the complete file before relying on the notes below.
128
C Initialization. (1) Split v operand into eight 16-bit chunks and store them
129
C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
130
C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
131
C This code could be better scheduled.
138
srlx %g4, 32, xffffffff C store mask in register `xffffffff'
139
C Load v in 2-byte steps (offsets 0..14 from %i3) through %asi, one piece
C per fp register v000..v112.
ldda [%i3+6] %asi, v000
140
ldda [%i3+4] %asi, v016
141
ldda [%i3+2] %asi, v032
142
ldda [%i3+0] %asi, v048
144
ldda [%i3+14] %asi, v064
146
ldda [%i3+12] %asi, v080
148
ldda [%i3+10] %asi, v096
150
ldda [%i3+8] %asi, v112
159
C Second v setup sequence: read vp[0] and vp[1] into integer registers,
C store %g2/%g3 to eight stack slots and reload each slot into v000..v112.
C NOTE(review): as listed, the ldd instructions below overwrite the values
C the ldda instructions above put in v000..v112, and the code deriving
C %g2/%g3 from %l0/%l1 is not visible. The two sequences look like
C alternative initializations -- confirm which one is assembled.
ldx [%i3+0], %l0 C vp[0]
160
srlx %g4, 48, xffff C store mask in register `xffff'
161
ldx [%i3+8], %l1 C vp[1]
164
stx %g2, [%sp+2223+0]
167
stx %g3, [%sp+2223+8]
170
stx %g2, [%sp+2223+16]
172
stx %g3, [%sp+2223+24]
174
stx %g2, [%sp+2223+32]
177
stx %g3, [%sp+2223+40]
180
stx %g2, [%sp+2223+48]
182
stx %g3, [%sp+2223+56]
184
srlx %g4, 32, xffffffff C store mask in register `xffffffff'
186
ldd [%sp+2223+0], v000
187
ldd [%sp+2223+8], v016
188
ldd [%sp+2223+16], v032
189
ldd [%sp+2223+24], v048
191
ldd [%sp+2223+32], v064
193
ldd [%sp+2223+40], v080
195
ldd [%sp+2223+48], v096
197
ldd [%sp+2223+56], v112
199
ld [%sp+2223+0], u00_hi C zero u00_hi
201
ld [%sp+2223+0], u32_hi C zero u32_hi
205
C Initialization done.
209
C Bias rp by -8: the loop reads the low word at [%i0+4+8], then advances
C %i0 by 8 and reads the high word at [%i0+0].
add %i0, -8, %i0 C BOOKKEEPING
211
C Start software pipeline.
213
ld [%i1+4], u00_lo C read low 32 bits of up[i]
216
ld [%i1+0], u32_lo C read high 32 bits of up[i]
217
C Start the eight u00 * v products for the first limb.
fmuld u00, v000, a000
218
fmuld u00, v016, a016
219
fmuld u00, v032, a032
220
fmuld u00, v048, a048
221
add %i2, -1, %i2 C BOOKKEEPING
222
fmuld u00, v064, p064
223
add %i1, 8, %i1 C BOOKKEEPING
225
fmuld u00, v080, p080
226
fmuld u00, v096, p096a
227
C Branch if more limbs remain. The fmuld after the branch sits in its delay
C slot and is issued on both paths.
brnz,pt %i2, .L_2_or_more
228
fmuld u00, v112, p112a
230
C Fall-through path when %i2 reached zero after the first decrement:
C start the u32 products and sum the partial products with faddd.
.L1: fdtox a000, out000
231
fmuld u32, v000, p000
233
fmuld u32, v016, p016
235
fmuld u32, v032, p032
237
fmuld u32, v048, p048
238
std out000, [%sp+2223+16]
239
faddd p000, a032, a000
240
fmuld u32, v064, p064
241
std out016, [%sp+2223+24]
243
faddd p016, a048, a016
244
fmuld u32, v080, p080
245
faddd p032, a064, a032
246
fmuld u32, v096, p096b
247
faddd p048, a080, a048
248
fmuld u32, v112, p112b
252
faddd p064, p096a, a064
253
faddd p080, p112a, a080
254
std out000, [%sp+2223+0]
256
std out016, [%sp+2223+8]
259
C NOTE(review): presumably the .L_2_or_more entry; the label itself is not
C visible in this listing. Loads the next up[i] and issues the u32 products
C of the previous limb followed by the u00 products of the new one.
ld [%i1+4], u00_lo C read low 32 bits of up[i]
261
fmuld u32, v000, p000
263
fmuld u32, v016, p016
265
fmuld u32, v032, p032
267
fmuld u32, v048, p048
268
std out000, [%sp+2223+16]
269
faddd p000, a032, a000
270
fmuld u32, v064, p064
271
std out016, [%sp+2223+24]
273
faddd p016, a048, a016
274
fmuld u32, v080, p080
275
faddd p032, a064, a032
276
fmuld u32, v096, p096b
277
faddd p048, a080, a048
278
fmuld u32, v112, p112b
280
ld [%i1+0], u32_lo C read high 32 bits of up[i]
282
fmuld u00, v000, p000
284
fmuld u00, v016, p016
285
faddd p064, p096a, a064
286
fmuld u00, v032, p032
287
faddd p080, p112a, a080
288
fmuld u00, v048, p048
289
add %i2, -1, %i2 C BOOKKEEPING
290
std out000, [%sp+2223+0]
291
faddd p000, a032, a000
292
fmuld u00, v064, p064
293
add %i1, 8, %i1 C BOOKKEEPING
294
std out016, [%sp+2223+8]
296
faddd p016, a048, a016
297
fmuld u00, v080, p080
298
faddd p032, a064, a032
299
fmuld u00, v096, p096a
300
faddd p048, a080, a048
301
C Branch if more limbs remain; the fmuld below is in the delay slot.
brnz,pt %i2, .L_3_or_more
302
fmuld u00, v112, p112a
311
C . |_______i00__| 50
312
C |_______i16__| . 50
318
C Main loop body. Each pass issues the eight u32 products followed by the
C eight u00 products, reads back the stack slots into i00/i16, loads both
C halves of rp[i], and advances %i0 and %i1 by 8 while decrementing %i2.
C NOTE(review): the branch that closes the loop is not visible here.
.Loop: ld [%i1+4], u00_lo C read low 32 bits of up[i]
319
and %g2, xffffffff, %g2
321
fmuld u32, v000, p000
323
lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
326
fmuld u32, v016, p016
329
ldx [%sp+2223+16], i00
330
faddd p064, p096b, a064
331
fmuld u32, v032, p032
333
add %g4, cy, cy C new cy
334
ldx [%sp+2223+24], i16
335
faddd p080, p112b, a080
336
fmuld u32, v048, p048
339
std out000, [%sp+2223+16]
340
faddd p000, a032, a000
341
fmuld u32, v064, p064
344
add %i0, 8, %i0 C BOOKKEEPING
345
std out016, [%sp+2223+24]
350
faddd p016, a048, a016
351
fmuld u32, v080, p080
355
faddd p032, a064, a032
356
fmuld u32, v096, p096b
360
faddd p048, a080, a048
361
fmuld u32, v112, p112b
363
ld [%i1+0], u32_lo C read high 32 bits of up[i]
364
and %g2, xffffffff, %g2
366
fmuld u00, v000, p000
368
lduw [%i0+0], r32 C read high 32 bits of rp[i]
371
fmuld u00, v016, p016
374
ldx [%sp+2223+0], i00
375
faddd p064, p096a, a064
376
fmuld u00, v032, p032
378
add %g4, cy, cy C new cy
379
ldx [%sp+2223+8], i16
380
faddd p080, p112a, a080
381
fmuld u00, v048, p048
383
add %i2, -1, %i2 C BOOKKEEPING
384
std out000, [%sp+2223+0]
385
faddd p000, a032, a000
386
fmuld u00, v064, p064
389
add %i1, 8, %i1 C BOOKKEEPING
390
std out016, [%sp+2223+8]
395
faddd p016, a048, a016
396
fmuld u00, v080, p080
400
faddd p032, a064, a032
401
fmuld u00, v096, p096a
404
faddd p048, a080, a048
406
fmuld u00, v112, p112a
410
C Wind-down, first stage: the u32 products of the last limb are issued as in
C the first half of the loop body, but no further up[i] is loaded and no u00
C products are started in the second half.
.Lend: and %g2, xffffffff, %g2
412
fmuld u32, v000, p000
413
lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
416
fmuld u32, v016, p016
418
ldx [%sp+2223+16], i00
419
faddd p064, p096b, a064
420
fmuld u32, v032, p032
421
add %g4, cy, cy C new cy
422
ldx [%sp+2223+24], i16
423
faddd p080, p112b, a080
424
fmuld u32, v048, p048
425
std out000, [%sp+2223+16]
426
faddd p000, a032, a000
427
fmuld u32, v064, p064
429
add %i0, 8, %i0 C BOOKKEEPING
430
std out016, [%sp+2223+24]
433
faddd p016, a048, a016
434
fmuld u32, v080, p080
437
faddd p032, a064, a032
438
fmuld u32, v096, p096b
440
faddd p048, a080, a048
441
fmuld u32, v112, p112b
443
and %g2, xffffffff, %g2
445
lduw [%i0+0], r32 C read high 32 bits of rp[i]
449
ldx [%sp+2223+0], i00
450
faddd p064, p096a, a064
451
add %g4, cy, cy C new cy
452
ldx [%sp+2223+8], i16
453
faddd p080, p112a, a080
454
std out000, [%sp+2223+0]
456
std out016, [%sp+2223+8]
464
C Wind-down, second stage: no products are started; only the stack
C transfers and the rp[i] loads continue.
.L_wd2: and %g2, xffffffff, %g2
466
lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
470
ldx [%sp+2223+16], i00
471
add %g4, cy, cy C new cy
472
ldx [%sp+2223+24], i16
473
std out000, [%sp+2223+16]
475
add %i0, 8, %i0 C BOOKKEEPING
476
std out016, [%sp+2223+24]
483
and %g2, xffffffff, %g2
485
lduw [%i0+0], r32 C read high 32 bits of rp[i]
489
ldx [%sp+2223+0], i00
490
add %g4, cy, cy C new cy
491
ldx [%sp+2223+8], i16
492
std out000, [%sp+2223+0]
494
std out016, [%sp+2223+8]
502
C Wind-down, final stage: the remaining stack slots are read back; no rp[i]
C loads are visible from here on.
.L_wd3: and %g2, xffffffff, %g2
507
ldx [%sp+2223+16], rlimb
508
add %g4, cy, cy C new cy
509
ldx [%sp+2223+24], i16
510
std out000, [%sp+2223+16]
511
add %i0, 8, %i0 C BOOKKEEPING
512
std out016, [%sp+2223+24]
519
and %g2, xffffffff, %g2
522
ldx [%sp+2223+0], rlimb
523
add %g4, cy, cy C new cy
524
ldx [%sp+2223+8], i16
531
and %g2, xffffffff, %g2
534
ldx [%sp+2223+16], i00
535
add %g4, cy, cy C new cy
536
ldx [%sp+2223+24], i16
542
EPILOGUE(mpn_addmul_2)