1
dnl Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
2
dnl and store difference in a third limb vector.
4
dnl Copyright 2000 Free Software Foundation, Inc.
6
dnl This file is part of the GNU MP Library.
8
dnl The GNU MP Library is free software; you can redistribute it and/or modify
9
dnl it under the terms of the GNU Lesser General Public License as published
10
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11
dnl your option) any later version.
13
dnl The GNU MP Library is distributed in the hope that it will be useful, but
14
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16
dnl License for more details.
18
dnl You should have received a copy of the GNU Lesser General Public License
19
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20
dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
dnl MA 02111-1307, USA.
23
include(`../config.m4')
31
dnl This code runs at 5.4 cycles/limb on EV5, and 2.1 cycles/limb on EV6.
33
dnl This code was written in close cooperation with ev6 pipeline expert
34
dnl Steve Root. Any errors are tege's fault, though.
42
dnl sustains 8 subtracts in 17 cycles !
43
dnl (from the d_cache)
45
dnl pair loads and stores where possible
46
dnl store pairs oct-aligned where possible
47
dnl (didn't need it here)
48
dnl stores are delayed every third cycle
49
dnl loads and stores are delayed by fills
50
dnl U stays still, put code there where possible
51
dnl (note alternation of U1 and U0)
52
dnl L moves because of loads and stores
53
dnl note dampers in L to limit damage
54
dnl note, load ahead of time where possible
56
dnl this odd-looking optimization expects
57
dnl that we're having random bits in our data, so
58
dnl that a pure zero result is unlikely. so we
59
dnl penalize the unlikely case to help the
69
C Feed-in for the unrolled code: issue the first loads and the first partial
C subtractions so that the code which follows finds results already in flight.
C
C Register use visible in this section:
C   r17       base address for the minuend loads (into r0, r3, r6, r9)
C   r18       base address for the subtrahend load into r10; the loads that
C             fill r1, r4 and r7 are not visible in this excerpt
C   r19       limb counter, stepped down by 8
C   r2,r5,r8,r11   raw differences (minuend - subtrahend, no borrow applied)
C   r20-r23   0/1 borrow flags produced by cmpult for those four differences
C   r24, r25  borrow-adjusted differences
C
C At the end of this section r24 and r25 hold the borrow-adjusted differences
C for the limbs loaded from 8(r17) and 16(r17); r11, r22 and r23 carry the
C state for the limb loaded from 24(r17).
C
C "bis r31, r31, r31" only writes r31 and is used here as a filler
C instruction for scheduling (see the "damp"/"moves in L" remarks).
lda r19, -8(r19) C L1 move counter
74
ldq r0, 0(r17) C L0 get next ones
76
ldq r3, 8(r17) C L0 get next ones
78
ldq r6, 16(r17) C L0 get next ones
81
ldq r9, 24(r17) C L0 get next ones
84
subq r0, r1, r2 C U1 sub two data
86
cmpult r0, r1, r20 C U1 did it borrow
88
ldq r0, 32(r17) C L0 get next ones
91
subq r3, r4, r5 C U0 sub two data
93
cmpult r3, r4, r21 C U0 did it borrow
94
ldq r3, 40(r17) C L0 get next ones
97
subq r6, r7, r8 C U1 sub two data
98
subq r5, r20, r24 C U0 borrow from last
101
cmpult r6, r7, r22 C U1 did it borrow
102
C If the raw difference r5 is non-zero, subtracting the 0/1 borrow r20 cannot
C wrap, so r21 alone is this limb's borrow-out.  Only a raw difference of
C exactly zero lets the incoming borrow pass through; $fix5w handles that by
C ORing r20 into r21.
beq r5, $fix5w C U0 fix exact zero
103
$ret5w: ldq r6, 48(r17) C L0 get next ones
106
bis r31, r31, r31 C L damp out
107
subq r8, r21, r25 C U1 borrow from last
108
bis r31, r31, r31 C L moves in L !
109
subq r9, r10, r11 C U0 sub two data
111
C Same exact-zero rule for r8: $fix6w ORs r21 into r22.
beq r8, $fix6w C U1 fix exact zero
112
$ret6w: cmpult r9, r10, r23 C U0 did it borrow
113
ldq r9, 56(r17) C L0 get next ones
114
ldq r10, 56(r18) C L1
116
C Both source pointers move on by 64 bytes (eight 8-byte limbs).
lda r17, 64(r17) C L0 move pointer
117
bis r31, r31, r31 C U
118
lda r18, 64(r18) C L1 move pointer
120
lda r19, -8(r19) C L1 move counter
123
C Main loop. 8-way unrolled.
C
C Each pass advances r16, r17 and r18 by 64 bytes, steps r19 down by 8, and
C repeats while r19 is still >= 0 (the bge at the bottom).
C
C The loop is software-pipelined: the first two stores write r24/r25 values
C that were computed before this pass began, and the loads near the bottom
C fetch operands that are consumed at the top of the next pass.
C
C Stores visible here go to 8, 16, 24 and 32(r16) before r16 is bumped and to
C -24, -16 and -8(r16) after it; an eighth store per pass is not visible in
C this excerpt.  Likewise only the r10 loads from r18 are visible; the loads
C that fill r1, r4 and r7 are not.
C
C Borrow handling: each limb gets a raw difference (subq), a borrow flag from
C the raw operands (cmpult), and a second subq that removes the previous
C limb's borrow flag.  No compare follows that second subq: when the raw
C difference is non-zero, removing a 0/1 borrow cannot wrap.  The beq on the
C raw difference catches the exact-zero case and goes to a $fixN stub, which
C ORs the previous borrow flag into this limb's flag.
126
subq r0, r1, r2 C U1 sub two data
127
stq r24, 8(r16) C L0 put an answer
128
subq r11, r22, r24 C U0 borrow from last
129
stq r25, 16(r16) C L1 pair
131
cmpult r0, r1, r20 C U1 did it borrow
132
beq r11, $fix7 C U0 fix exact 0
133
$ret7: ldq r0, 0(r17) C L0 get next ones
136
bis r31, r31, r31 C L damp out
137
subq r2, r23, r25 C U1 borrow from last
138
bis r31, r31, r31 C L moves in L !
139
subq r3, r4, r5 C U0 sub two data
141
beq r2, $fix0 C U1 fix exact zero
142
$ret0: cmpult r3, r4, r21 C U0 did it borrow
143
ldq r3, 8(r17) C L0 get next ones
146
subq r6, r7, r8 C U1 sub two data
147
stq r24, 24(r16) C L0 store pair
148
subq r5, r20, r24 C U0 borrow from last
149
stq r25, 32(r16) C L1
151
cmpult r6, r7, r22 C U1 did it borrow
152
beq r5, $fix1 C U0 fix exact zero
153
$ret1: ldq r6, 16(r17) C L0 get next ones
156
C r16 is bumped here, in the middle of the pass; the stores that follow use
C negative displacements from the new r16.
lda r16, 64(r16) C L0 move pointer
157
subq r8, r21, r25 C U1 borrow from last
158
lda r19, -8(r19) C L1 move counter
159
subq r9, r10, r11 C U0 sub two data
161
beq r8, $fix2 C U1 fix exact zero
162
$ret2: cmpult r9, r10, r23 C U0 did it borrow
163
ldq r9, 24(r17) C L0 get next ones
164
ldq r10, 24(r18) C L1
166
subq r0, r1, r2 C U1 sub two data
167
stq r24, -24(r16) C L0 put an answer
168
subq r11, r22, r24 C U0 borrow from last
169
stq r25, -16(r16) C L1 pair
171
cmpult r0, r1, r20 C U1 did it borrow
172
beq r11, $fix3 C U0 fix exact 0
173
$ret3: ldq r0, 32(r17) C L0 get next ones
176
bis r31, r31, r31 C L damp out
177
subq r2, r23, r25 C U1 borrow from last
178
bis r31, r31, r31 C L moves in L !
179
subq r3, r4, r5 C U0 sub two data
181
beq r2, $fix4 C U1 fix exact zero
182
$ret4: cmpult r3, r4, r21 C U0 did it borrow
183
ldq r3, 40(r17) C L0 get next ones
186
subq r6, r7, r8 C U1 sub two data
187
stq r24, -8(r16) C L0 store pair
188
subq r5, r20, r24 C U0 borrow from last
191
cmpult r6, r7, r22 C U1 did it borrow
192
beq r5, $fix5 C U0 fix exact zero
193
$ret5: ldq r6, 48(r17) C L0 get next ones
196
bis r31, r31, r31 C L damp out
197
subq r8, r21, r25 C U1 borrow from last
198
bis r31, r31, r31 C L moves in L !
199
subq r9, r10, r11 C U0 sub two data
201
beq r8, $fix6 C U1 fix exact zero
202
$ret6: cmpult r9, r10, r23 C U0 did it borrow
203
ldq r9, 56(r17) C L0 get next ones
204
ldq r10, 56(r18) C L1
206
lda r17, 64(r17) C L0 move pointer
207
bis r31, r31, r31 C U
208
lda r18, 64(r18) C L1 move pointer
209
C r19 was already stepped down by 8 earlier in this pass.
bge r19, $Loop C U1 loop control
213
C Wind-down: the same subtract / store / exact-zero sequence as the main
C loop body, but with no ldq instructions and no counter update, and with
C the exact-zero branches going to the separate $fixNc stubs.
C
C Visible stores go to 8, 16, 24 and 32(r16), then, after r16 is advanced by
C 64, to -24, -16 and -8(r16).  Some lines of this section (for example the
C $ret1c/$ret3c/$ret7c return points) are not visible in this excerpt.
subq r0, r1, r2 C U1 sub two data
214
stq r24, 8(r16) C L0 put an answer
215
subq r11, r22, r24 C U0 borrow from last
216
stq r25, 16(r16) C L1 pair
218
cmpult r0, r1, r20 C U1 did it borrow
219
beq r11, $fix7c C U0 fix exact 0
221
subq r2, r23, r25 C U1 borrow from last
222
subq r3, r4, r5 C U0 sub two data
224
beq r2, $fix0c C U1 fix exact zero
225
$ret0c: cmpult r3, r4, r21 C U0 did it borrow
227
subq r6, r7, r8 C U1 sub two data
228
stq r24, 24(r16) C L0 store pair
229
subq r5, r20, r24 C U0 borrow from last
230
stq r25, 32(r16) C L1
232
cmpult r6, r7, r22 C U1 did it borrow
233
beq r5, $fix1c C U0 fix exact zero
235
lda r16, 64(r16) C L0 move pointer
236
subq r8, r21, r25 C U1 borrow from last
237
subq r9, r10, r11 C U0 sub two data
239
beq r8, $fix2c C U1 fix exact zero
240
$ret2c: cmpult r9, r10, r23 C U0 did it borrow
242
stq r24, -24(r16) C L0 put an answer
243
subq r11, r22, r24 C U0 borrow from last
244
stq r25, -16(r16) C L1 pair
246
beq r11, $fix3c C U0 fix exact 0
248
stq r24, -8(r16) C L0 store pair
261
C Simple loop body handling one limb per pass (r19 is stepped down by 1).
C r23 is the running 0/1 borrow: it is the borrow-in for the second subq and
C is overwritten with the borrow-out.  Unlike the unrolled code, the
C borrow-out is computed in full here: r8 is the borrow from r0 - r1, the
C second cmpult gives the borrow from removing r23, and the two are ORed.
C The result limb is left in r20.  The loads, the store and the loop branch
C are not visible in this excerpt.
$Loop0: subq r0, r1, r2 C main sub
262
cmpult r0, r1, r8 C compute bw from last sub
265
subq r2, r23, r20 C borrow sub
269
cmpult r2, r23, r23 C compute bw from last sub
270
lda r19, -1(r19) C decr loop cnt
271
bis r8, r23, r23 C combine bw from the two subs
274
C Final limb: the same two-step subtract and borrow merge as $Loop0, without
C the counter update.  The result limb is in r20 and the final borrow in
C r23, which is then copied to r0 as the function's return value.  The store
C of r20 and the return instruction are not visible in this excerpt.
$Lend0: subq r0, r1, r2 C main sub
275
cmpult r0, r1, r8 C compute bw from last sub
276
subq r2, r23, r20 C borrow sub
277
cmpult r2, r23, r23 C compute bw from last sub
279
bis r8, r23, r23 C combine bw from the two subs
282
C lda with a zero displacement acts as a register move: r0 = r23.
lda r0, 0(r23) C copy borrow into return register
291
C Out-of-line fix-ups for the exact-zero case.
C
C Each stub is the target of a "beq" taken when a raw limb difference is
C zero.  It ORs the previous limb's borrow flag into the current limb's
C borrow flag, so that an incoming borrow passes through a zero limb.  The
C flags rotate through r20 -> r21 -> r22 -> r23 -> r20.
C
C   $fix5w/$fix6w  used by the feed-in code
C   $fix0..$fix7   used by the main loop
C   $fixNc         used by the wind-down code
C
C $fix5 names its two source registers in the opposite order from its
C siblings; bis is a bitwise OR, so the result is the same.
C
C The instruction that returns from each stub to its matching $retN label is
C not visible in this excerpt.
$fix5w: bis r21, r20, r21 C bring forward borrow
293
$fix6w: bis r22, r21, r22 C bring forward borrow
295
$fix0: bis r20, r23, r20 C bring forward borrow
297
$fix1: bis r21, r20, r21 C bring forward borrow
299
$fix2: bis r22, r21, r22 C bring forward borrow
301
$fix3: bis r23, r22, r23 C bring forward borrow
303
$fix4: bis r20, r23, r20 C bring forward borrow
305
$fix5: bis r20, r21, r21 C bring forward borrow
307
$fix6: bis r22, r21, r22 C bring forward borrow
309
$fix7: bis r23, r22, r23 C bring forward borrow
311
$fix0c: bis r20, r23, r20 C bring forward borrow
313
$fix1c: bis r21, r20, r21 C bring forward borrow
315
$fix2c: bis r22, r21, r22 C bring forward borrow
317
$fix3c: bis r23, r22, r23 C bring forward borrow
319
$fix7c: bis r23, r22, r23 C bring forward borrow