16
16
dnl License for more details.
18
18
dnl You should have received a copy of the GNU Lesser General Public License
19
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20
dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
dnl MA 02111-1307, USA.
19
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write
20
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21
dnl Boston, MA 02110-1301, USA.
23
23
include(`../config.m4')
31
dnl This code runs at 5.4 cycles/limb on EV5, and 2.1 cycles/limb on EV6.
33
dnl This code was written in close cooperation with ev6 pipeline expert
34
dnl Steve Root. Any errors are tege's fault, though.
42
dnl sustains 8 subtracts in 17 cycles !
43
dnl (from the d_cache)
45
dnl pair loads and stores where possible
46
dnl store pairs oct-aligned where possible
47
dnl (didn't need it here)
48
dnl stores are delayed every third cycle
49
dnl loads and stores are delayed by fills
50
dnl U stays still, put code there where possible
51
dnl (note alternation of U1 and U0)
52
dnl L moves because of loads and stores
53
dnl note dampers in L to limit damage
54
dnl note, load ahead of time where possible
56
dnl this odd-looking optimization expects
57
dnl that we have random bits in our data, so
58
dnl that a pure zero result is unlikely. So we
59
dnl penalize the unlikely case to help the
35
C cy r20 (for mpn_sub_nc)
38
C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
39
C Use multi-pronged feed-in.
40
C Perform additional micro-tuning
42
C This code was written in cooperation with ev6 pipeline expert Steve Root.
44
C Pair loads and stores where possible
45
C Store pairs oct-aligned where possible (didn't need it here)
46
C Stores are delayed every third cycle
47
C Loads and stores are delayed by fills
48
C U stays still, put code there where possible (note alternation of U1 and U0)
49
C L moves because of loads and stores
50
C Note dampers in L to limit damage
52
C This odd-looking optimization expects that we have random bits in our
53
C data, so that a pure zero result is unlikely. So we penalize the unlikely
54
C case to help the common case.
56
define(`u0', `r0') define(`u1', `r3')
57
define(`v0', `r1') define(`v1', `r4')
59
define(`cy0', `r20') define(`cy1', `r21')
61
MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
63
67
PROLOGUE(mpn_sub_n)
69
lda r19, -8(r19) C L1 move counter
74
ldq r0, 0(r17) C L0 get next ones
76
ldq r3, 8(r17) C L0 get next ones
78
ldq r6, 16(r17) C L0 get next ones
81
ldq r9, 24(r17) C L0 get next ones
84
subq r0, r1, r2 C U1 sub two data
86
cmpult r0, r1, r20 C U1 did it borrow
88
ldq r0, 32(r17) C L0 get next ones
91
subq r3, r4, r5 C U0 sub two data
93
cmpult r3, r4, r21 C U0 did it borrow
94
ldq r3, 40(r17) C L0 get next ones
97
subq r6, r7, r8 C U1 sub two data
98
subq r5, r20, r24 C U0 borrow from last
101
cmpult r6, r7, r22 C U1 did it borrow
102
beq r5, $fix5w C U0 fix exact zero
103
$ret5w: ldq r6, 48(r17) C L0 get next ones
106
bis r31, r31, r31 C L damp out
107
subq r8, r21, r25 C U1 borrow from last
108
bis r31, r31, r31 C L moves in L !
109
subq r9, r10, r11 C U0 sub two data
111
beq r8, $fix6w C U1 fix exact zero
112
$ret6w: cmpult r9, r10, r23 C U0 did it borrow
113
ldq r9, 56(r17) C L0 get next ones
114
ldq r10, 56(r18) C L1
116
lda r17, 64(r17) C L0 move pointer
117
bis r31, r31, r31 C U
118
lda r18, 64(r18) C L1 move pointer
120
lda r19, -8(r19) C L1 move counter
68
bis r31, r31, cy0 C clear carry in
69
$entry: cmpult r19, 5, r22 C L1 move counter
70
ldq u1, 0(r17) C L0 get next ones
74
ldq u0, 8(r17) C L0 get next ones
76
subq u1, v1, r5 C U0 sub two data
78
cmpult u1, v1, r23 C U0 did it borrow
79
ldq u1, 16(r17) C L0 get next ones
82
subq u0, v0, r8 C U1 sub two data
83
subq r5, cy0, r24 C U0 borrow in
85
cmpult u0, v0, r22 C U1 did it borrow
86
beq r5, $fix5f C U0 fix exact zero
87
$ret5f: ldq u0, 24(r17) C L0 get next ones
90
subq r8, r23, r25 C U1 borrow from last
91
subq u1, v1, r7 C U0 sub two data
93
beq r8, $fix6f C U1 fix exact zero
94
$ret6f: cmpult u1, v1, r23 C U0 did it borrow
95
ldq u1, 32(r17) C L0 get next ones
98
lda r17, 40(r17) C L0 move pointer
99
lda r18, 40(r18) C L1 move pointer
102
lda r19, -13(r19) C L1 move counter
103
blt r19, $Lend C U1 loop control
123
106
C Main loop. 8-way unrolled.
126
subq r0, r1, r2 C U1 sub two data
108
$Loop: subq u0, v0, r2 C U1 sub two data
127
109
stq r24, 8(r16) C L0 put an answer
128
subq r11, r22, r24 C U0 borrow from last
110
subq r7, r22, r24 C U0 borrow from last
129
111
stq r25, 16(r16) C L1 pair
131
cmpult r0, r1, r20 C U1 did it borrow
132
beq r11, $fix7 C U0 fix exact 0
133
$ret7: ldq r0, 0(r17) C L0 get next ones
113
cmpult u0, v0, cy1 C U1 did it borrow
114
beq r7, $fix7 C U0 fix exact 0
115
$ret7: ldq u0, 0(r17) C L0 get next ones
136
118
bis r31, r31, r31 C L damp out
137
119
subq r2, r23, r25 C U1 borrow from last
138
120
bis r31, r31, r31 C L moves in L !
139
subq r3, r4, r5 C U0 sub two data
121
subq u1, v1, r5 C U0 sub two data
141
123
beq r2, $fix0 C U1 fix exact zero
142
$ret0: cmpult r3, r4, r21 C U0 did it borrow
143
ldq r3, 8(r17) C L0 get next ones
124
$ret0: cmpult u1, v1, cy0 C U0 did it borrow
125
ldq u1, 8(r17) C L0 get next ones
146
subq r6, r7, r8 C U1 sub two data
128
subq u0, v0, r8 C U1 sub two data
147
129
stq r24, 24(r16) C L0 store pair
148
subq r5, r20, r24 C U0 borrow from last
130
subq r5, cy1, r24 C U0 borrow from last
149
131
stq r25, 32(r16) C L1
151
cmpult r6, r7, r22 C U1 did it borrow
133
cmpult u0, v0, r22 C U1 did it borrow
152
134
beq r5, $fix1 C U0 fix exact zero
153
$ret1: ldq r6, 16(r17) C L0 get next ones
135
$ret1: ldq u0, 16(r17) C L0 get next ones
156
138
lda r16, 64(r16) C L0 move pointer
157
subq r8, r21, r25 C U1 borrow from last
139
subq r8, cy0, r25 C U1 borrow from last
158
140
lda r19, -8(r19) C L1 move counter
159
subq r9, r10, r11 C U0 sub two data
141
subq u1, v1, r7 C U0 sub two data
161
143
beq r8, $fix2 C U1 fix exact zero
162
$ret2: cmpult r9, r10, r23 C U0 did it borrow
163
ldq r9, 24(r17) C L0 get next ones
164
ldq r10, 24(r18) C L1
144
$ret2: cmpult u1, v1, r23 C U0 did it borrow
145
ldq u1, 24(r17) C L0 get next ones
166
subq r0, r1, r2 C U1 sub two data
148
subq u0, v0, r2 C U1 sub two data
167
149
stq r24, -24(r16) C L0 put an answer
168
subq r11, r22, r24 C U0 borrow from last
150
subq r7, r22, r24 C U0 borrow from last
169
151
stq r25, -16(r16) C L1 pair
171
cmpult r0, r1, r20 C U1 did it borrow
172
beq r11, $fix3 C U0 fix exact 0
173
$ret3: ldq r0, 32(r17) C L0 get next ones
153
cmpult u0, v0, cy1 C U1 did it borrow
154
beq r7, $fix3 C U0 fix exact 0
155
$ret3: ldq u0, 32(r17) C L0 get next ones
176
158
bis r31, r31, r31 C L damp out
177
159
subq r2, r23, r25 C U1 borrow from last
178
160
bis r31, r31, r31 C L moves in L !
179
subq r3, r4, r5 C U0 sub two data
161
subq u1, v1, r5 C U0 sub two data
181
163
beq r2, $fix4 C U1 fix exact zero
182
$ret4: cmpult r3, r4, r21 C U0 did it borrow
183
ldq r3, 40(r17) C L0 get next ones
164
$ret4: cmpult u1, v1, cy0 C U0 did it borrow
165
ldq u1, 40(r17) C L0 get next ones
186
subq r6, r7, r8 C U1 sub two data
168
subq u0, v0, r8 C U1 sub two data
187
169
stq r24, -8(r16) C L0 store pair
188
subq r5, r20, r24 C U0 borrow from last
170
subq r5, cy1, r24 C U0 borrow from last
189
171
stq r25, 0(r16) C L1
191
cmpult r6, r7, r22 C U1 did it borrow
173
cmpult u0, v0, r22 C U1 did it borrow
192
174
beq r5, $fix5 C U0 fix exact zero
193
$ret5: ldq r6, 48(r17) C L0 get next ones
175
$ret5: ldq u0, 48(r17) C L0 get next ones
196
bis r31, r31, r31 C L damp out
197
subq r8, r21, r25 C U1 borrow from last
198
bis r31, r31, r31 C L moves in L !
199
subq r9, r10, r11 C U0 sub two data
178
ldl r31, 256(r17) C L0 prefetch
179
subq r8, cy0, r25 C U1 borrow from last
180
ldl r31, 256(r18) C L1 prefetch
181
subq u1, v1, r7 C U0 sub two data
201
183
beq r8, $fix6 C U1 fix exact zero
202
$ret6: cmpult r9, r10, r23 C U0 did it borrow
203
ldq r9, 56(r17) C L0 get next ones
204
ldq r10, 56(r18) C L1
184
$ret6: cmpult u1, v1, r23 C U0 did it borrow
185
ldq u1, 56(r17) C L0 get next ones
206
188
lda r17, 64(r17) C L0 move pointer
207
189
bis r31, r31, r31 C U
209
191
bge r19, $Loop C U1 loop control
210
192
C ==== main loop end
213
subq r0, r1, r2 C U1 sub two data
194
$Lend: subq u0, v0, r2 C U1 sub two data
214
195
stq r24, 8(r16) C L0 put an answer
215
subq r11, r22, r24 C U0 borrow from last
196
subq r7, r22, r24 C U0 borrow from last
216
197
stq r25, 16(r16) C L1 pair
218
cmpult r0, r1, r20 C U1 did it borrow
219
beq r11, $fix7c C U0 fix exact 0
221
subq r2, r23, r25 C U1 borrow from last
222
subq r3, r4, r5 C U0 sub two data
198
cmpult u0, v0, cy1 C U1 did it borrow
199
beq r7, $fix7c C U0 fix exact 0
200
$ret7c: subq r2, r23, r25 C U1 borrow from last
201
subq u1, v1, r5 C U0 sub two data
224
202
beq r2, $fix0c C U1 fix exact zero
225
$ret0c: cmpult r3, r4, r21 C U0 did it borrow
227
subq r6, r7, r8 C U1 sub two data
203
$ret0c: cmpult u1, v1, cy0 C U0 did it borrow
228
204
stq r24, 24(r16) C L0 store pair
229
subq r5, r20, r24 C U0 borrow from last
205
subq r5, cy1, r24 C U0 borrow from last
230
206
stq r25, 32(r16) C L1
232
cmpult r6, r7, r22 C U1 did it borrow
233
207
beq r5, $fix1c C U0 fix exact zero
235
lda r16, 64(r16) C L0 move pointer
236
subq r8, r21, r25 C U1 borrow from last
237
subq r9, r10, r11 C U0 sub two data
239
beq r8, $fix2c C U1 fix exact zero
240
$ret2c: cmpult r9, r10, r23 C U0 did it borrow
242
stq r24, -24(r16) C L0 put an answer
243
subq r11, r22, r24 C U0 borrow from last
244
stq r25, -16(r16) C L1 pair
246
beq r11, $fix3c C U0 fix exact 0
248
stq r24, -8(r16) C L0 store pair
208
$ret1c: stq r24, 40(r16) C L0 put an answer
209
lda r16, 48(r16) C L0 move pointer
261
$Loop0: subq r0, r1, r2 C main sub
262
cmpult r0, r1, r8 C compute bw from last sub
265
subq r2, r23, r20 C borrow sub
221
$Loop0: subq u1, v1, r2 C main sub
222
cmpult u1, v1, r8 C compute bw from last sub
225
subq r2, cy0, r5 C borrow sub
269
cmpult r2, r23, r23 C compute bw from last sub
229
cmpult r2, cy0, cy0 C compute bw from last sub
270
230
lda r19, -1(r19) C decr loop cnt
271
bis r8, r23, r23 C combine bw from the two subs
231
bis r8, cy0, cy0 C combine bw from the two subs
274
$Lend0: subq r0, r1, r2 C main sub
275
cmpult r0, r1, r8 C compute bw from last sub
276
subq r2, r23, r20 C borrow sub
277
cmpult r2, r23, r23 C compute bw from last sub
279
bis r8, r23, r23 C combine bw from the two subs
282
lda r0, 0(r23) C copy borrow into return register
291
$fix5w: bis r21, r20, r21 C bring forward borrow
293
$fix6w: bis r22, r21, r22 C bring forward borrow
295
$fix0: bis r20, r23, r20 C bring forward borrow
234
$Lend0: subq u1, v1, r2 C main sub
235
subq r2, cy0, r5 C borrow sub
236
cmpult u1, v1, r8 C compute bw from last sub
237
cmpult r2, cy0, cy0 C compute bw from last sub
239
bis r8, cy0, r0 C combine bw from the two subs
243
$Lret: lda r0, 0(cy0) C copy borrow into return register
246
$fix5f: bis r23, cy0, r23 C bring forward borrow
248
$fix6f: bis r22, r23, r22 C bring forward borrow
250
$fix0: bis cy1, r23, cy1 C bring forward borrow
297
$fix1: bis r21, r20, r21 C bring forward borrow
252
$fix1: bis cy0, cy1, cy0 C bring forward borrow
299
$fix2: bis r22, r21, r22 C bring forward borrow
254
$fix2: bis r22, cy0, r22 C bring forward borrow
301
256
$fix3: bis r23, r22, r23 C bring forward borrow
303
$fix4: bis r20, r23, r20 C bring forward borrow
258
$fix4: bis cy1, r23, cy1 C bring forward borrow
305
$fix5: bis r20, r21, r21 C bring forward borrow
260
$fix5: bis cy1, cy0, cy0 C bring forward borrow
307
$fix6: bis r22, r21, r22 C bring forward borrow
262
$fix6: bis r22, cy0, r22 C bring forward borrow
309
264
$fix7: bis r23, r22, r23 C bring forward borrow
311
$fix0c: bis r20, r23, r20 C bring forward borrow
266
$fix0c: bis cy1, r23, cy1 C bring forward borrow
313
$fix1c: bis r21, r20, r21 C bring forward borrow
268
$fix1c: bis cy0, cy1, cy0 C bring forward borrow
315
$fix2c: bis r22, r21, r22 C bring forward borrow
317
$fix3c: bis r23, r22, r23 C bring forward borrow
319
270
$fix7c: bis r23, r22, r23 C bring forward borrow