16
16
dnl License for more details.
18
18
dnl You should have received a copy of the GNU Lesser General Public License
19
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20
dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
dnl MA 02111-1307, USA.
19
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write
20
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21
dnl Boston, MA 02110-1301, USA.
23
23
include(`../config.m4')
31
dnl This code runs at 5.4 cycles/limb on EV5, and 2.1 cycles/limb on EV6.
33
dnl This code was written in close cooperation with ev6 pipeline expert
34
dnl Steve Root. Any errors are tege's fault, though.
42
dnl sustains 8 adds in 17 cycles !
43
dnl (from the d_cache)
45
dnl pair loads and stores where possible
46
dnl store pairs oct-aligned where possible
47
dnl (didn't need it here)
48
dnl stores are delayed every third cycle
49
dnl loads and stores are delayed by fills
50
dnl U stays still, put code there where possible
51
dnl (note alternation of U1 and U0)
52
dnl L moves because of loads and stores
53
dnl note dampers in L to limit damage
54
dnl note, load ahead of time where possible
56
dnl this odd-looking optimization expects
57
dnl that we are having random bits in our data, so
58
dnl that a pure zero result is unlikely. So we
59
dnl penalize the unlikely case to help the common case.
35
C cy r20 (for mpn_add_nc)
38
C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
39
C Use multi-pronged feed-in.
40
C Perform additional micro-tuning
42
C This code was written in cooperation with ev6 pipeline expert Steve Root.
44
C Pair loads and stores where possible
45
C Store pairs oct-aligned where possible (didn't need it here)
46
C Stores are delayed every third cycle
47
C Loads and stores are delayed by fills
48
C U stays still, put code there where possible (note alternation of U1 and U0)
49
C L moves because of loads and stores
50
C Note dampers in L to limit damage
52
C This odd-looking optimization expects that we are having random bits in our
53
C data, so that a pure zero result is unlikely. So we penalize the unlikely
54
C case to help the common case.
56
define(`u0', `r0') define(`u1', `r3')
57
define(`v0', `r1') define(`v1', `r4')
59
define(`cy0', `r20') define(`cy1', `r21')
61
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)
63
67
PROLOGUE(mpn_add_n)
69
lda r19, -8(r19) C L1 move counter
74
ldq r0, 0(r17) C L0 get next ones
76
ldq r3, 8(r17) C L0 get next ones
78
ldq r6, 16(r17) C L0 get next ones
81
ldq r9, 24(r17) C L0 get next ones
84
addq r0, r1, r2 C U1 add two data
86
cmpult r2, r1, r20 C U1 did it carry
88
ldq r0, 32(r17) C L0 get next ones
91
addq r3, r4, r5 C U0 add two data
93
cmpult r5, r4, r21 C U0 did it carry
94
ldq r3, 40(r17) C L0 get next ones
97
addq r6, r7, r8 C U1 add two data
98
addq r5, r20, r5 C U0 carry from last
101
cmpult r8, r7, r22 C U1 did it carry
102
beq r5, $fix5w C U0 fix exact zero
103
$ret5w: ldq r6, 48(r17) C L0 get next ones
106
bis r31, r31, r31 C L damp out
107
addq r8, r21, r8 C U1 carry from last
108
bis r31, r31, r31 C L moves in L !
109
addq r9, r10, r11 C U0 add two data
111
beq r8, $fix6w C U1 fix exact zero
112
$ret6w: cmpult r11, r10, r23 C U0 did it carry
113
ldq r9, 56(r17) C L0 get next ones
114
ldq r10, 56(r18) C L1
116
lda r17, 64(r17) C L0 move pointer
117
bis r31, r31, r31 C U
118
lda r18, 64(r18) C L1 move pointer
120
lda r19, -8(r19) C L1 move counter
68
bis r31, r31, cy0 C clear carry in
69
$entry: cmpult r19, 5, r22 C L1 move counter
70
ldq u1, 0(r17) C L0 get next ones
74
ldq u0, 8(r17) C L0 get next ones
76
addq u1, v1, r5 C U0 add two data
78
cmpult r5, v1, r23 C U0 did it carry
79
ldq u1, 16(r17) C L0 get next ones
82
addq u0, v0, r8 C U1 add two data
83
addq r5, cy0, r5 C U0 carry in
85
cmpult r8, v0, r22 C U1 did it carry
86
beq r5, $fix5f C U0 fix exact zero
87
$ret5f: ldq u0, 24(r17) C L0 get next ones
90
addq r8, r23, r8 C U1 carry from last
91
addq u1, v1, r7 C U0 add two data
93
beq r8, $fix6f C U1 fix exact zero
94
$ret6f: cmpult r7, v1, r23 C U0 did it carry
95
ldq u1, 32(r17) C L0 get next ones
98
lda r17, 40(r17) C L0 move pointer
99
lda r18, 40(r18) C L1 move pointer
102
lda r19, -13(r19) C L1 move counter
103
blt r19, $Lend C U1 loop control
123
106
C Main loop. 8-way unrolled.
126
addq r0, r1, r2 C U1 add two data
127
addq r11, r22, r11 C U0 add in carry
108
$Loop: addq u0, v0, r2 C U1 add two data
109
addq r7, r22, r7 C U0 add in carry
128
110
stq r5, 8(r16) C L0 put an answer
129
111
stq r8, 16(r16) C L1 pair
131
cmpult r2, r1, r20 C U1 did it carry
132
beq r11, $fix7 C U0 fix exact 0
133
$ret7: ldq r0, 0(r17) C L0 get next ones
113
cmpult r2, v0, cy1 C U1 did it carry
114
beq r7, $fix7 C U0 fix exact 0
115
$ret7: ldq u0, 0(r17) C L0 get next ones
136
118
bis r31, r31, r31 C L damp out
137
119
addq r2, r23, r2 C U1 carry from last
138
120
bis r31, r31, r31 C L moves in L !
139
addq r3, r4, r5 C U0 add two data
121
addq u1, v1, r5 C U0 add two data
141
123
beq r2, $fix0 C U1 fix exact zero
142
$ret0: cmpult r5, r4, r21 C U0 did it carry
143
ldq r3, 8(r17) C L0 get next ones
124
$ret0: cmpult r5, v1, cy0 C U0 did it carry
125
ldq u1, 8(r17) C L0 get next ones
146
addq r6, r7, r8 C U1 add two data
147
addq r5, r20, r5 C U0 carry from last
148
stq r11, 24(r16) C L0 store pair
128
addq u0, v0, r8 C U1 add two data
129
addq r5, cy1, r5 C U0 carry from last
130
stq r7, 24(r16) C L0 store pair
149
131
stq r2, 32(r16) C L1
151
cmpult r8, r7, r22 C U1 did it carry
133
cmpult r8, v0, r22 C U1 did it carry
152
134
beq r5, $fix1 C U0 fix exact zero
153
$ret1: ldq r6, 16(r17) C L0 get next ones
135
$ret1: ldq u0, 16(r17) C L0 get next ones
156
138
lda r16, 64(r16) C L0 move pointer
157
addq r8, r21, r8 C U1 carry from last
139
addq r8, cy0, r8 C U1 carry from last
158
140
lda r19, -8(r19) C L1 move counter
159
addq r9, r10, r11 C U0 add two data
141
addq u1, v1, r7 C U0 add two data
161
143
beq r8, $fix2 C U1 fix exact zero
162
$ret2: cmpult r11, r10, r23 C U0 did it carry
163
ldq r9, 24(r17) C L0 get next ones
164
ldq r10, 24(r18) C L1
144
$ret2: cmpult r7, v1, r23 C U0 did it carry
145
ldq u1, 24(r17) C L0 get next ones
166
addq r0, r1, r2 C U1 add two data
167
addq r11, r22, r11 C U0 add in carry
148
addq u0, v0, r2 C U1 add two data
149
addq r7, r22, r7 C U0 add in carry
168
150
stq r5, -24(r16) C L0 put an answer
169
151
stq r8, -16(r16) C L1 pair
171
cmpult r2, r1, r20 C U1 did it carry
172
beq r11, $fix3 C U0 fix exact 0
173
$ret3: ldq r0, 32(r17) C L0 get next ones
153
cmpult r2, v0, cy1 C U1 did it carry
154
beq r7, $fix3 C U0 fix exact 0
155
$ret3: ldq u0, 32(r17) C L0 get next ones
176
158
bis r31, r31, r31 C L damp out
177
159
addq r2, r23, r2 C U1 carry from last
178
160
bis r31, r31, r31 C L moves in L !
179
addq r3, r4, r5 C U0 add two data
161
addq u1, v1, r5 C U0 add two data
181
163
beq r2, $fix4 C U1 fix exact zero
182
$ret4: cmpult r5, r4, r21 C U0 did it carry
183
ldq r3, 40(r17) C L0 get next ones
164
$ret4: cmpult r5, v1, cy0 C U0 did it carry
165
ldq u1, 40(r17) C L0 get next ones
186
addq r6, r7, r8 C U1 add two data
187
addq r5, r20, r5 C U0 carry from last
188
stq r11, -8(r16) C L0 store pair
168
addq u0, v0, r8 C U1 add two data
169
addq r5, cy1, r5 C U0 carry from last
170
stq r7, -8(r16) C L0 store pair
189
171
stq r2, 0(r16) C L1
191
cmpult r8, r7, r22 C U1 did it carry
173
cmpult r8, v0, r22 C U1 did it carry
192
174
beq r5, $fix5 C U0 fix exact zero
193
$ret5: ldq r6, 48(r17) C L0 get next ones
175
$ret5: ldq u0, 48(r17) C L0 get next ones
196
bis r31, r31, r31 C L damp out
197
addq r8, r21, r8 C U1 carry from last
198
bis r31, r31, r31 C L moves in L !
199
addq r9, r10, r11 C U0 add two data
178
ldl r31, 256(r17) C L0 prefetch
179
addq r8, cy0, r8 C U1 carry from last
180
ldl r31, 256(r18) C L1 prefetch
181
addq u1, v1, r7 C U0 add two data
201
183
beq r8, $fix6 C U1 fix exact zero
202
$ret6: cmpult r11, r10, r23 C U0 did it carry
203
ldq r9, 56(r17) C L0 get next ones
204
ldq r10, 56(r18) C L1
184
$ret6: cmpult r7, v1, r23 C U0 did it carry
185
ldq u1, 56(r17) C L0 get next ones
206
188
lda r17, 64(r17) C L0 move pointer
207
189
bis r31, r31, r31 C U
209
191
bge r19, $Loop C U1 loop control
210
192
C ==== main loop end
213
addq r0, r1, r2 C U1 add two data
214
addq r11, r22, r11 C U0 add in carry
194
$Lend: addq u0, v0, r2 C U1 add two data
195
addq r7, r22, r7 C U0 add in carry
215
196
stq r5, 8(r16) C L0 put an answer
216
197
stq r8, 16(r16) C L1 pair
218
cmpult r2, r1, r20 C U1 did it carry
219
beq r11, $fix7c C U0 fix exact 0
221
addq r2, r23, r2 C U1 carry from last
222
addq r3, r4, r5 C U0 add two data
198
cmpult r2, v0, cy1 C U1 did it carry
199
beq r7, $fix7c C U0 fix exact 0
200
$ret7c: addq r2, r23, r2 C U1 carry from last
201
addq u1, v1, r5 C U0 add two data
224
202
beq r2, $fix0c C U1 fix exact zero
225
$ret0c: cmpult r5, r4, r21 C U0 did it carry
227
addq r6, r7, r8 C U1 add two data
228
addq r5, r20, r5 C U0 carry from last
229
stq r11, 24(r16) C L0 store pair
203
$ret0c: cmpult r5, v1, cy0 C U0 did it carry
204
addq r5, cy1, r5 C U0 carry from last
205
stq r7, 24(r16) C L0 store pair
230
206
stq r2, 32(r16) C L1
232
cmpult r8, r7, r22 C U1 did it carry
233
207
beq r5, $fix1c C U0 fix exact zero
235
lda r16, 64(r16) C L0 move pointer
236
addq r8, r21, r8 C U1 carry from last
237
addq r9, r10, r11 C U0 add two data
239
beq r8, $fix2c C U1 fix exact zero
240
$ret2c: cmpult r11, r10, r23 C U0 did it carry
242
addq r11, r22, r11 C U0 add in carry
243
stq r5, -24(r16) C L0 put an answer
244
stq r8, -16(r16) C L1 pair
246
beq r11, $fix3c C U0 fix exact 0
248
stq r11, -8(r16) C L0 store pair
208
$ret1c: stq r5, 40(r16) C L0 put an answer
209
lda r16, 48(r16) C L0 move pointer
261
$Loop0: addq r0, r1, r2 C main add
263
cmpult r2, r1, r8 C compute cy from last add
265
addq r2, r23, r20 C carry add
221
$Loop0: addq u1, v1, r2 C main add
222
cmpult r2, v1, r8 C compute cy from last add
225
addq r2, cy0, r5 C carry add
269
cmpult r20, r2, r23 C compute cy from last add
229
cmpult r5, r2, cy0 C compute cy from last add
270
230
lda r19, -1(r19) C decr loop cnt
271
bis r8, r23, r23 C combine cy from the two adds
231
bis r8, cy0, cy0 C combine cy from the two adds
274
$Lend0: addq r0, r1, r2 C main add
275
addq r2, r23, r20 C carry add
276
cmpult r2, r1, r8 C compute cy from last add
277
cmpult r20, r2, r23 C compute cy from last add
279
bis r8, r23, r23 C combine cy from the two adds
282
lda r0, 0(r23) C copy carry into return register
291
$fix5w: bis r21, r20, r21 C bring forward carry
293
$fix6w: bis r22, r21, r22 C bring forward carry
295
$fix0: bis r20, r23, r20 C bring forward carry
234
$Lend0: addq u1, v1, r2 C main add
235
addq r2, cy0, r5 C carry add
236
cmpult r2, v1, r8 C compute cy from last add
237
cmpult r5, r2, cy0 C compute cy from last add
239
bis r8, cy0, r0 C combine cy from the two adds
243
$Lret: lda r0, 0(cy0) C copy carry into return register
246
$fix5f: bis r23, cy0, r23 C bring forward carry
248
$fix6f: bis r22, r23, r22 C bring forward carry
250
$fix0: bis cy1, r23, cy1 C bring forward carry
297
$fix1: bis r21, r20, r21 C bring forward carry
252
$fix1: bis cy0, cy1, cy0 C bring forward carry
299
$fix2: bis r22, r21, r22 C bring forward carry
254
$fix2: bis r22, cy0, r22 C bring forward carry
301
256
$fix3: bis r23, r22, r23 C bring forward carry
303
$fix4: bis r20, r23, r20 C bring forward carry
258
$fix4: bis cy1, r23, cy1 C bring forward carry
305
$fix5: bis r20, r21, r21 C bring forward carry
260
$fix5: bis cy1, cy0, cy0 C bring forward carry
307
$fix6: bis r22, r21, r22 C bring forward carry
262
$fix6: bis r22, cy0, r22 C bring forward carry
309
264
$fix7: bis r23, r22, r23 C bring forward carry
311
$fix0c: bis r20, r23, r20 C bring forward carry
266
$fix0c: bis cy1, r23, cy1 C bring forward carry
313
$fix1c: bis r21, r20, r21 C bring forward carry
268
$fix1c: bis cy0, cy1, cy0 C bring forward carry
315
$fix2c: bis r22, r21, r22 C bring forward carry
317
$fix3c: bis r23, r22, r23 C bring forward carry
319
270
$fix7c: bis r23, r22, r23 C bring forward carry