1
dnl Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and
2
dnl store sum in a third limb vector.
4
dnl Copyright 2000 Free Software Foundation, Inc.
6
dnl This file is part of the GNU MP Library.
8
dnl The GNU MP Library is free software; you can redistribute it and/or modify
9
dnl it under the terms of the GNU Lesser General Public License as published
10
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11
dnl your option) any later version.
13
dnl The GNU MP Library is distributed in the hope that it will be useful, but
14
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16
dnl License for more details.
18
dnl You should have received a copy of the GNU Lesser General Public License
19
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20
dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
dnl MA 02111-1307, USA.
23
include(`../config.m4')
31
dnl This code runs at 5.4 cycles/limb on EV5, and 2.1 cycles/limb on EV6.
33
dnl This code was written in close cooperation with ev6 pipeline expert
34
dnl Steve Root. Any errors are tege's fault, though.
42
dnl sustains 8 adds in 17 cycles !
43
dnl (from the d_cache)
45
dnl pair loads and stores where possible
46
dnl store pairs oct-aligned where possible
47
dnl (didn't need it here)
48
dnl stores are delayed every third cycle
49
dnl loads and stores are delayed by fills
50
dnl U stays still, put code there where possible
51
dnl (note alternation of U1 and U0)
52
dnl L moves because of loads and stores
53
dnl note dampers in L to limit damage
54
dnl note, load ahead of time where possible
56
dnl this odd-looking optimization expects
57
dnl that we're having random bits in our data, so
58
dnl that a pure zero result is unlikely. so we
59
dnl penalize the unlikely case to help the common case.
69
C Pipeline wind-up: start the first group of limbs before the main loop.
C Register use visible in this section:
C   r17      pointer the ldq into r0/r3/r6/r9 reads from (advanced by 64)
C   r18      pointer the ldq into r10 reads from (advanced by 64)
C   r19      counter, decremented by 8 at a time
C   r2/r5/r8/r11    sums of r0+r1, r3+r4, r6+r7, r9+r10
C   r20/r21/r22/r23 carry out of each of those four adds
C Each raw carry is sum < addend (cmpult).  The carry from the previous
C limb is then added into the next sum.  If that sum ends up exactly zero
C the code branches to a $fix handler, which ORs the incoming carry into
C the carry of the current limb.
C NOTE(review): the loads of r1, r4, r7 and the first load of r10 are not
C visible in this excerpt; presumably they come from r18 -- confirm.
lda r19, -8(r19) C L1 move counter
74
ldq r0, 0(r17) C L0 get next ones
76
ldq r3, 8(r17) C L0 get next ones
78
ldq r6, 16(r17) C L0 get next ones
81
ldq r9, 24(r17) C L0 get next ones
84
C First limb: no carry-in is added to r2 here.
addq r0, r1, r2 C U1 add two data
86
cmpult r2, r1, r20 C U1 did it carry
88
ldq r0, 32(r17) C L0 get next ones
91
addq r3, r4, r5 C U0 add two data
93
cmpult r5, r4, r21 C U0 did it carry
94
ldq r3, 40(r17) C L0 get next ones
97
addq r6, r7, r8 C U1 add two data
98
addq r5, r20, r5 C U0 carry from last
101
cmpult r8, r7, r22 C U1 did it carry
102
beq r5, $fix5w C U0 fix exact zero
103
$ret5w: ldq r6, 48(r17) C L0 get next ones
106
C bis r31, r31, r31 writes the zero register, so it has no effect on state;
C the existing comments mark these as scheduling padding.
bis r31, r31, r31 C L damp out
107
addq r8, r21, r8 C U1 carry from last
108
bis r31, r31, r31 C L moves in L !
109
addq r9, r10, r11 C U0 add two data
111
beq r8, $fix6w C U1 fix exact zero
112
$ret6w: cmpult r11, r10, r23 C U0 did it carry
113
ldq r9, 56(r17) C L0 get next ones
114
ldq r10, 56(r18) C L1
116
C Advance both load pointers by 64 bytes (eight 8-byte limbs).
lda r17, 64(r17) C L0 move pointer
117
bis r31, r31, r31 C U
118
lda r18, 64(r18) C L1 move pointer
120
lda r19, -8(r19) C L1 move counter
123
C Main loop. 8-way unrolled.
C Each pass handles two groups of four limbs.  For every limb the pattern is:
C   addq   a, b, sum         raw sum
C   cmpult sum, b, cy        cy = 1 if the raw add wrapped
C   addq   sum, cy_prev, sum add the carry from the previous limb
C   beq    sum, $fixN        a zero result may hide a wrap; patch cy in $fixN
C Carry chain as written: r23 -> r2, r20 -> r5, r21 -> r8, r22 -> r11.
C Results are stored relative to r16, which is advanced by 64 in mid-loop,
C so the second group of stores uses negative offsets.
C r19 is decremented by 8 per pass and the loop repeats while r19 >= 0.
C NOTE(review): the $Loop label, the stores of r2 and most loads through
C r18 are not visible in this excerpt.
126
addq r0, r1, r2 C U1 add two data
127
addq r11, r22, r11 C U0 add in carry
128
stq r5, 8(r16) C L0 put an answer
129
stq r8, 16(r16) C L1 pair
131
cmpult r2, r1, r20 C U1 did it carry
132
beq r11, $fix7 C U0 fix exact 0
133
$ret7: ldq r0, 0(r17) C L0 get next ones
136
bis r31, r31, r31 C L damp out
137
addq r2, r23, r2 C U1 carry from last
138
bis r31, r31, r31 C L moves in L !
139
addq r3, r4, r5 C U0 add two data
141
beq r2, $fix0 C U1 fix exact zero
142
$ret0: cmpult r5, r4, r21 C U0 did it carry
143
ldq r3, 8(r17) C L0 get next ones
146
addq r6, r7, r8 C U1 add two data
147
addq r5, r20, r5 C U0 carry from last
148
stq r11, 24(r16) C L0 store pair
151
cmpult r8, r7, r22 C U1 did it carry
152
beq r5, $fix1 C U0 fix exact zero
153
$ret1: ldq r6, 16(r17) C L0 get next ones
156
C r16 moves forward here; later stores in this pass use -24/-16/-8(r16).
lda r16, 64(r16) C L0 move pointer
157
addq r8, r21, r8 C U1 carry from last
158
lda r19, -8(r19) C L1 move counter
159
addq r9, r10, r11 C U0 add two data
161
beq r8, $fix2 C U1 fix exact zero
162
$ret2: cmpult r11, r10, r23 C U0 did it carry
163
ldq r9, 24(r17) C L0 get next ones
164
ldq r10, 24(r18) C L1
166
C Second group of four limbs; same pattern as the first group.
addq r0, r1, r2 C U1 add two data
167
addq r11, r22, r11 C U0 add in carry
168
stq r5, -24(r16) C L0 put an answer
169
stq r8, -16(r16) C L1 pair
171
cmpult r2, r1, r20 C U1 did it carry
172
beq r11, $fix3 C U0 fix exact 0
173
$ret3: ldq r0, 32(r17) C L0 get next ones
176
bis r31, r31, r31 C L damp out
177
addq r2, r23, r2 C U1 carry from last
178
bis r31, r31, r31 C L moves in L !
179
addq r3, r4, r5 C U0 add two data
181
beq r2, $fix4 C U1 fix exact zero
182
$ret4: cmpult r5, r4, r21 C U0 did it carry
183
ldq r3, 40(r17) C L0 get next ones
186
addq r6, r7, r8 C U1 add two data
187
addq r5, r20, r5 C U0 carry from last
188
stq r11, -8(r16) C L0 store pair
191
cmpult r8, r7, r22 C U1 did it carry
192
beq r5, $fix5 C U0 fix exact zero
193
$ret5: ldq r6, 48(r17) C L0 get next ones
196
bis r31, r31, r31 C L damp out
197
addq r8, r21, r8 C U1 carry from last
198
bis r31, r31, r31 C L moves in L !
199
addq r9, r10, r11 C U0 add two data
201
beq r8, $fix6 C U1 fix exact zero
202
$ret6: cmpult r11, r10, r23 C U0 did it carry
203
ldq r9, 56(r17) C L0 get next ones
204
ldq r10, 56(r18) C L1
206
C Advance both load pointers by 64 bytes for the next pass.
lda r17, 64(r17) C L0 move pointer
207
bis r31, r31, r31 C U
208
lda r18, 64(r18) C L1 move pointer
209
bge r19, $Loop C U1 loop control
213
C Pipeline wind-down: finish the limbs already loaded when the main loop
C exits.  Same add / cmpult / add-carry / beq-on-zero pattern as the loop,
C but with no further ldq and with the $fix*c handlers.
C After this section r23 holds the carry out of the last add (r9+r10),
C possibly updated by $fix3c.
C NOTE(review): the return labels for $fix7c, $fix1c and $fix3c and the
C stores of r2 are not visible in this excerpt.
addq r0, r1, r2 C U1 add two data
214
addq r11, r22, r11 C U0 add in carry
215
stq r5, 8(r16) C L0 put an answer
216
stq r8, 16(r16) C L1 pair
218
cmpult r2, r1, r20 C U1 did it carry
219
beq r11, $fix7c C U0 fix exact 0
221
addq r2, r23, r2 C U1 carry from last
222
addq r3, r4, r5 C U0 add two data
224
beq r2, $fix0c C U1 fix exact zero
225
$ret0c: cmpult r5, r4, r21 C U0 did it carry
227
addq r6, r7, r8 C U1 add two data
228
addq r5, r20, r5 C U0 carry from last
229
stq r11, 24(r16) C L0 store pair
232
cmpult r8, r7, r22 C U1 did it carry
233
beq r5, $fix1c C U0 fix exact zero
235
C r16 moves forward by 64; the remaining stores use negative offsets.
lda r16, 64(r16) C L0 move pointer
236
addq r8, r21, r8 C U1 carry from last
237
addq r9, r10, r11 C U0 add two data
239
beq r8, $fix2c C U1 fix exact zero
240
$ret2c: cmpult r11, r10, r23 C U0 did it carry
242
addq r11, r22, r11 C U0 add in carry
243
stq r5, -24(r16) C L0 put an answer
244
stq r8, -16(r16) C L1 pair
246
beq r11, $fix3c C U0 fix exact 0
248
stq r11, -8(r16) C L0 store pair
261
C Simple path, one limb per step, with the running carry kept in r23.
C Per limb:
C   r2  = r0 + r1            raw sum
C   r8  = (r2 < r1)          carry out of the raw sum
C   r20 = r2 + r23           sum including carry-in
C   r23 = (r20 < r2)         carry out of the carry-in add
C   r23 = r8 | r23           combined carry out for the next limb
C $Lend0 repeats the same steps once more without decrementing r19, then
C copies the final carry from r23 into r0.
C NOTE(review): the loads of r0/r1, the store of r20 and the loop branch
C are not visible in this excerpt.
$Loop0: addq r0, r1, r2 C main add
263
cmpult r2, r1, r8 C compute cy from last add
265
addq r2, r23, r20 C carry add
269
cmpult r20, r2, r23 C compute cy from last add
270
lda r19, -1(r19) C decr loop cnt
271
bis r8, r23, r23 C combine cy from the two adds
274
$Lend0: addq r0, r1, r2 C main add
275
addq r2, r23, r20 C carry add
276
cmpult r2, r1, r8 C compute cy from last add
277
cmpult r20, r2, r23 C compute cy from last add
279
bis r8, r23, r23 C combine cy from the two adds
282
C lda with a zero displacement is a register copy: r0 = r23.
lda r0, 0(r23) C copy carry into return register
291
C Out-of-line handlers for the beq-on-zero branches above.
C They are reached when a sum plus its carry-in came out exactly zero.
C Each one ORs the carry-in that was just added into the carry of the
C current limb.  If the carry-in was 0 this changes nothing; if it was 1,
C the increment wrapped and the carry out becomes 1.
C $fix5 lists its source operands in the opposite order from $fix1 and
C $fix5w; the OR result is the same.
C NOTE(review): the branches back to the matching $ret labels are not
C visible in this excerpt.
$fix5w: bis r21, r20, r21 C bring forward carry
293
$fix6w: bis r22, r21, r22 C bring forward carry
295
$fix0: bis r20, r23, r20 C bring forward carry
297
$fix1: bis r21, r20, r21 C bring forward carry
299
$fix2: bis r22, r21, r22 C bring forward carry
301
$fix3: bis r23, r22, r23 C bring forward carry
303
$fix4: bis r20, r23, r20 C bring forward carry
305
$fix5: bis r20, r21, r21 C bring forward carry
307
$fix6: bis r22, r21, r22 C bring forward carry
309
$fix7: bis r23, r22, r23 C bring forward carry
311
$fix0c: bis r20, r23, r20 C bring forward carry
313
$fix1c: bis r21, r20, r21 C bring forward carry
315
$fix2c: bis r22, r21, r22 C bring forward carry
317
$fix3c: bis r23, r22, r23 C bring forward carry
319
$fix7c: bis r23, r22, r23 C bring forward carry