1
dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
2
dnl subtract the result from a second limb vector.
4
dnl Copyright 1998, 2000, 2001 Free Software Foundation, Inc.
6
dnl This file is part of the GNU MP Library.
8
dnl The GNU MP Library is free software; you can redistribute it and/or modify
9
dnl it under the terms of the GNU Lesser General Public License as published
10
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11
dnl your option) any later version.
13
dnl The GNU MP Library is distributed in the hope that it will be useful, but
14
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16
dnl License for more details.
18
dnl You should have received a copy of the GNU Lesser General Public License
19
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20
dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
dnl MA 02111-1307, USA.
23
include(`../config.m4')
25
C Algorithm: We use two floating-point multiplies per limb product, with the
26
C invariant v operand split into two 16-bit pieces, and the u operand split
27
C into 32-bit pieces. We convert the two 48-bit products and transfer them to
C the integer unit.
30
C Speed: 7 cycles/limb on UltraSPARC-1/2.
32
C Possible optimizations:
33
C 1. Combine 32-bit memory operations into 64-bit operations. Since we're
34
C memory bandwidth limited, this could save 1.5 cycles/limb.
35
C 2. Unroll the inner loop. Since we already use alternate temporary areas,
36
C it is very straightforward to unroll, using an exit branch midway.
37
C Unrolling would allow deeper scheduling which could improve speed for L2
C cache case.
39
C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es
40
C aren't sufficiently apart-scheduled with just two temp areas.
41
C 4. Do some cross-jumping to save about 1/2 the code size.
42
C 5. Specialize for particular v values. If its upper 16 bits are zero, we
43
C could save many operations.
54
C mpn_submul_1 for SPARC v9 with 32-bit limbs.
C
C NOTE(review): this listing looks incomplete. The bare numeric lines appear
C to be leftover line numbers from an extraction step, and the gaps in that
C numbering suggest that instructions are missing. No stores, no return and
C no definitions of the branch targets .L_two_or_more, .L_three_or_more,
C .L_four_or_more and .L_five_or_more are present here. Verify against the
C upstream file before assembling.
C
C Register roles as used by the instructions visible below:
C   %o0  rp pointer: rp[i] is read with lduw, pointer advanced by 4
C   %o1  up pointer: up[i] is read into %f11, pointer advanced by 4
C   %o5  scratch area derived from %sp+160, aligned to 32 bytes; xor with 16
C        switches between its two 16-byte halves
C   %g2  p16 partial product, loaded from the scratch area
C   %g1  p0 partial product, loaded from the scratch area
C   %g4  p = p0 + (p16 << 16), then rp[i] - p
C   %g3  carry (borrow) passed from one limb to the next
C   %g5  rp[i]
PROLOGUE(mpn_submul_1)
56
sethi %hi(0xffff), %g1 C upper bits of the constant 0xffff
58
or %g1, %lo(0xffff), %g1 C %g1 = 0xffff (presumably a 16-bit mask for splitting v; the use is not shown)
66
ld [%sp+104], %f10 C zero f10 (NOTE(review): the store that zeroes [%sp+104] is not shown)
70
define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe
72
add %sp, 160, %o5 C point in scratch area
73
and %o5, -32, %o5 C align at 0 (mod 32) in scratch area
76
ld [%o1], %f11 C read up[i]
77
add %o1, 4, %o1 C up++
78
bne,pt %icc, .L_two_or_more C NOTE(review): the instruction that sets %icc is not shown
80
.L_1: fmuld %f2, %f8, %f16 C double-precision multiply; presumably the single-limb path
86
ldx [%o5+16], %g2 C p16
87
ldx [%o5+24], %g1 C p0
88
lduw [%o0], %g5 C read rp[i]
89
sllx %g2, 16, %g4 C (p16 << 16)
90
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
91
sub %g5, %g4, %g4 C p = rp[i] - p
93
srlx %g4, 32, %g3 C new cy
101
ld [%o1], %f11 C read up[i]
104
add %o1, 4, %o1 C up++
105
bne,pt %icc, .L_three_or_more C NOTE(review): the instruction that sets %icc is not shown
107
.L_2: fdtox %f16, %f14 C convert the double product to a 64-bit integer
117
lduw [%o0], %g5 C read rp[i]
118
ldx [%o5+16], %g2 C p16
119
ldx [%o5+24], %g1 C p0
120
sllx %g2, 16, %g4 C (p16 << 16) * crossjmp pt
121
ldx [%o5+0], %g2 C p16
122
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
123
ldx [%o5+8], %g1 C p0
124
sub %g5, %g4, %g4 C p = rp[i] - p
126
srlx %g4, 32, %g3 C new cy
127
lduw [%o0+4], %g5 C read rp[i]
129
sllx %g2, 16, %g4 C (p16 << 16)
130
srl %g3, 0, %g3 C zero most significant 32 bits
131
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
132
add %g3, %g4, %g4 C p += cy
133
sub %g5, %g4, %g4 C p = rp[i] - p
135
srlx %g4, 32, %g3 C new cy
143
ld [%o1], %f11 C read up[i]
150
add %o1, 4, %o1 C up++
151
bne,pt %icc, .L_four_or_more C NOTE(review): the instruction that sets %icc is not shown
153
.L_3: fdtox %f16, %f14 C convert the double product to a 64-bit integer
160
ldx [%o5+16], %g2 C p16
162
ldx [%o5+24], %g1 C p0
165
lduw [%o0], %g5 C read rp[i]
166
sllx %g2, 16, %g4 C (p16 << 16)
167
ldx [%o5+0], %g2 C p16
168
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
169
ldx [%o5+8], %g1 C p0
170
sub %g5, %g4, %g4 C p = rp[i] - p
172
srlx %g4, 32, %g3 C new cy
173
lduw [%o0+4], %g5 C read rp[i]
175
sllx %g2, 16, %g4 C (p16 << 16)
176
ldx [%o5+16], %g2 C p16
177
srl %g3, 0, %g3 C zero most significant 32 bits
178
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
179
ldx [%o5+24], %g1 C p0
180
add %g3, %g4, %g4 C p += cy
181
sub %g5, %g4, %g4 C p = rp[i] - p
183
srlx %g4, 32, %g3 C new cy
184
lduw [%o0+8], %g5 C read rp[i]
186
sllx %g2, 16, %g4 C (p16 << 16)
187
srl %g3, 0, %g3 C zero most significant 32 bits
188
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
189
add %g3, %g4, %g4 C p += cy
190
sub %g5, %g4, %g4 C p = rp[i] - p
192
srlx %g4, 32, %g3 C new cy
200
ld [%o1], %f11 C read up[i]
207
add %o1, 4, %o1 C up++
208
bne,pt %icc, .L_five_or_more C NOTE(review): the instruction that sets %icc is not shown
210
.L_4: fdtox %f16, %f14 C convert the double product to a 64-bit integer
211
ldx [%o5+16], %g2 C p16
213
ldx [%o5+24], %g1 C p0
218
add %o1, 4, %o1 C up++
219
lduw [%o0], %g5 C read rp[i]
221
sllx %g2, 16, %g4 C (p16 << 16)
222
ldx [%o5+0], %g2 C p16
224
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
225
ldx [%o5+8], %g1 C p0
227
sub %g5, %g4, %g4 C p = rp[i] - p
230
srlx %g4, 32, %g3 C new cy
231
lduw [%o0+4], %g5 C read rp[i]
233
sllx %g2, 16, %g4 C (p16 << 16)
234
ldx [%o5+16], %g2 C p16
235
srl %g3, 0, %g3 C zero most significant 32 bits
236
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
237
ldx [%o5+24], %g1 C p0
238
add %g3, %g4, %g4 C p += cy
239
sub %g5, %g4, %g4 C p = rp[i] - p
241
srlx %g4, 32, %g3 C new cy
242
lduw [%o0+8], %g5 C read rp[i]
244
sllx %g2, 16, %g4 C (p16 << 16)
245
ldx [%o5+0], %g2 C p16
246
srl %g3, 0, %g3 C zero most significant 32 bits
247
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
248
ldx [%o5+8], %g1 C p0
249
add %g3, %g4, %g4 C p += cy
250
sub %g5, %g4, %g4 C p = rp[i] - p
252
srlx %g4, 32, %g3 C new cy
253
lduw [%o0+12], %g5 C read rp[i]
255
sllx %g2, 16, %g4 C (p16 << 16)
256
srl %g3, 0, %g3 C zero most significant 32 bits
257
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
258
add %g3, %g4, %g4 C p += cy
259
sub %g5, %g4, %g4 C p = rp[i] - p
261
srlx %g4, 32, %g3 C new cy
269
ld [%o1], %f11 C read up[i]
271
ldx [%o5+16], %g2 C p16
273
ldx [%o5+24], %g1 C p0
278
add %o1, 4, %o1 C up++
279
lduw [%o0], %g5 C read rp[i]
287
.Loop: sub %g0, %g3, %g3 C cy = 0 - cy (negate the carry from the previous limb)
289
ld [%o1], %f11 C read up[i]
292
sllx %g2, 16, %g4 C (p16 << 16)
293
add %o0, 4, %o0 C rp++
294
ldx [%o5+0], %g2 C p16
297
srl %g3, 0, %g3 C zero most significant 32 bits
298
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
299
ldx [%o5+8], %g1 C p0
303
add %g3, %g4, %g4 C p += cy
308
sub %g5, %g4, %g4 C p = rp[i] - p
312
xor %o5, 16, %o5 C alternate scratch variables
313
add %o1, 4, %o1 C up++
317
srlx %g4, 32, %g3 C new cy
318
lduw [%o0], %g5 C read rp[i]
C NOTE(review): the instructions below use fixed offsets from %o0 (+4, +8,
C +12, +16) and no longer advance the pointers; presumably they are the
C wind-down for the limbs still in flight after the loop. The loop-closing
C branch and the labels for these tails are not present in this listing.
326
sllx %g2, 16, %g4 C (p16 << 16)
327
ldx [%o5+0], %g2 C p16
329
srl %g3, 0, %g3 C zero most significant 32 bits
330
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
331
ldx [%o5+8], %g1 C p0
332
add %g4, %g3, %g4 C p += cy
335
sub %g5, %g4, %g4 C p = rp[i] - p
340
srlx %g4, 32, %g3 C new cy
341
lduw [%o0+4], %g5 C read rp[i]
345
sllx %g2, 16, %g4 C (p16 << 16)
346
ldx [%o5+0], %g2 C p16
348
srl %g3, 0, %g3 C zero most significant 32 bits
349
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
350
ldx [%o5+8], %g1 C p0
351
add %g3, %g4, %g4 C p += cy
353
sub %g5, %g4, %g4 C p = rp[i] - p
357
srlx %g4, 32, %g3 C new cy
358
lduw [%o0+8], %g5 C read rp[i]
361
sllx %g2, 16, %g4 C (p16 << 16)
362
ldx [%o5+0], %g2 C p16
363
srl %g3, 0, %g3 C zero most significant 32 bits
364
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
365
ldx [%o5+8], %g1 C p0
366
add %g3, %g4, %g4 C p += cy
367
sub %g5, %g4, %g4 C p = rp[i] - p
370
srlx %g4, 32, %g3 C new cy
371
lduw [%o0+12], %g5 C read rp[i]
374
sllx %g2, 16, %g4 C (p16 << 16)
375
ldx [%o5+0], %g2 C p16
376
srl %g3, 0, %g3 C zero most significant 32 bits
377
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
378
ldx [%o5+8], %g1 C p0
379
add %g3, %g4, %g4 C p += cy
380
sub %g5, %g4, %g4 C p = rp[i] - p
382
srlx %g4, 32, %g3 C new cy
383
lduw [%o0+16], %g5 C read rp[i]
386
sllx %g2, 16, %g4 C (p16 << 16)
387
srl %g3, 0, %g3 C zero most significant 32 bits
388
add %g1, %g4, %g4 C p = p0 + (p16 << 16)
389
add %g3, %g4, %g4 C p += cy
390
sub %g5, %g4, %g4 C p = rp[i] - p
392
srlx %g4, 32, %g3 C new cy
397
EPILOGUE(mpn_submul_1)