/* Optimized version of the standard memset() function.

   Copyright (c) 2002 Hewlett-Packard Co/CERN
	Sverre Jarp <Sverre.Jarp@cern.ch>

   The algorithm is fairly straightforward: set byte by byte until we
   get to a 16B-aligned address, then loop on 128B chunks using an
   early store as prefetching, then loop on 32B chunks, then clear remaining
   words, finally clear remaining bytes.

   Since a stf.spill f0 can store 16B in one go, we use this instruction
   to get peak speed when value = 0.  */
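
/* For illustration only (this block is commentary, not assembled code):
   the overall strategy corresponds roughly to the C sketch below. The
   name memset_sketch and the plain-C loops are ours, not part of this
   file; the real code replicates value with mux1/setf.sig and fills the
   head with st8/st4/st2/st1 rather than byte-by-byte.

	#include <stddef.h>
	#include <stdint.h>

	void *memset_sketch(void *dest, int value, size_t cnt)
	{
		unsigned char *p = dest;
		uint64_t w = (uint8_t)value * 0x0101010101010101ULL;

		while (cnt > 0 && ((uintptr_t)p & 15) != 0) {
			*p++ = (unsigned char)value;	// head: up to 16B alignment
			cnt--;
		}
		while (cnt >= 128) {			// 128B (line-sized) chunks
			for (int i = 0; i < 16; i++)
				((uint64_t *)p)[i] = w;
			p += 128;
			cnt -= 128;
		}
		while (cnt >= 32) {			// 32B chunks
			for (int i = 0; i < 4; i++)
				((uint64_t *)p)[i] = w;
			p += 32;
			cnt -= 32;
		}
		while (cnt >= 8) {			// remaining words
			*(uint64_t *)p = w;
			p += 8;
			cnt -= 8;
		}
		while (cnt > 0) {			// remaining bytes
			*p++ = (unsigned char)value;
			cnt--;
		}
		return dest;
	}  */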
#include <asm/asmmacro.h>

// This routine uses only scratch predicate registers (p6 - p15)
#define p_scr		p6	// default register for same-cycle branches

#define LSIZE_SH	7	// shift amount

	alloc	tmp = ar.pfs, 3, 0, 0, 0
	mov	ret0 = dest			// return value
	cmp.ne	p_nz, p_zr = value, r0		// use stf.spill if value is zero
	cmp.eq	p_scr, p0 = cnt, r0
	and	ptr2 = -(MIN1+1), dest		// aligned address
	and	tmp = MIN1, dest		// prepare to check for correct alignment
	tbit.nz	p_y, p_n = dest, 0		// Do we have an odd address? (M_B_U)
	mux1	value = value, @brcst		// create 8 identical bytes in word
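	// Note: mux1 @brcst copies the low byte of value into all 8 byte
	// lanes; in C terms (illustrative), value = (value & 0xff) *
	// 0x0101010101010101.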
(p_scr)	br.ret.dpnt.many rp			// return immediately if count = 0

	cmp.ne	p_unalgn, p0 = tmp, r0		//
	sub	bytecnt = (MIN1+1), tmp		// NB: # of bytes to move is 1 higher than loopcnt
	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)

(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment
(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
(p_y)	add	cnt = -8, cnt			//
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?

(p_y)	st8	[ptr2] = value, -4		//
(p_n)	add	ptr2 = 4, ptr2			//
(p_yy)	add	cnt = -4, cnt			//
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?

(p_yy)	st4	[ptr2] = value, -2		//
(p_nn)	add	ptr2 = 2, ptr2			//
	mov	tmp = LINE_SIZE+1		// for compare
(p_y)	add	cnt = -2, cnt			//
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?

	setf.sig fvalue = value			// transfer value to FLP side
(p_y)	st2	[ptr2] = value, -1		//
(p_n)	add	ptr2 = 1, ptr2			//

(p_yy)	st1	[ptr2] = value			//
	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task?
(p_yy)	add	cnt = -1, cnt			//
(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
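	// The sequence above peels off the unaligned head by binary
	// decomposition: bits 3..0 of bytecnt each select at most one
	// st8, st4, st2 or st1, and the MIN1P1HALF bias on ptr2 together
	// with the negative post-increments walks the stores down from
	// the 16B boundary toward dest.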
	shr.u	linecnt = cnt, LSIZE_SH
(p_zr)	br.cond.dptk.many .l1b			// Jump to use stf.spill

	TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
	and	tmp = -(LINE_SIZE), cnt		// compute end of range
	mov	ptr9 = ptr1			// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder

	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value

(p_scr)	add	loopcnt = -1, linecnt		//
	add	ptr2 = 8, ptr1			// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1		// first address beyond total range

	add	tmp = -1, linecnt		// next loop count
	mov.i	ar.lc = loopcnt			//

.pref_l1a:
	stf8	[ptr9] = fvalue, 128		// Do stores one cache line apart
	br.cloop.dptk.few .pref_l1a
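	// The single-instruction loop above touches each upcoming line
	// with one stf8, up to PREF_AHEAD lines in front of the fill loop
	// below; the early store acts as a prefetch, so the back-to-back
	// stores in .l1ax hit lines that are already in cache.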
.l1ax:
	add	ptr0 = 16, ptr2			// Two stores in parallel
	stf8	[ptr2] = fvalue, 8
	stf8	[ptr0] = fvalue, 8
	stf8	[ptr2] = fvalue, 24
	stf8	[ptr0] = fvalue, 24
	stf8	[ptr2] = fvalue, 8
	stf8	[ptr0] = fvalue, 8
	stf8	[ptr2] = fvalue, 24
	stf8	[ptr0] = fvalue, 24
	stf8	[ptr2] = fvalue, 8
	stf8	[ptr0] = fvalue, 8
	stf8	[ptr2] = fvalue, 24
	stf8	[ptr0] = fvalue, 24
	stf8	[ptr2] = fvalue, 8
	stf8	[ptr0] = fvalue, 32
	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
	stf8	[ptr2] = fvalue, 24
(p_scr)	stf8	[ptr9] = fvalue, 128
	br.cloop.dptk.few .l1ax
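	// Each .l1ax pass fills one 128B line: the prefetch store already
	// wrote bytes 0-7, and the fifteen stf8 above (through ptr2 and
	// ptr0, kept 16B apart, with alternating 8/24 post-increments)
	// cover bytes 8-127 at two stores per cycle.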
	cmp.le	p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many .fraction_of_line	// Branch no. 2
	br.cond.dpnt.many .move_bytes_from_alignment	// Branch no. 3

.l1b:	// ------------------------------------ //  L1B: store ahead into cache lines; fill later
	and	tmp = -(LINE_SIZE), cnt		// compute end of range
	mov	ptr9 = ptr1			// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder

	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value

(p_scr)	add	loopcnt = -1, linecnt
	add	ptr2 = 16, ptr1			// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1		// first address beyond total range

	add	tmp = -1, linecnt		// next loop count
	mov.i	ar.lc = loopcnt

.pref_l1b:
	stf.spill [ptr9] = f0, 128		// Do stores one cache line apart
	br.cloop.dptk.few .pref_l1b

.l1bx:
	add	ptr0 = 16, ptr2			// Two stores in parallel
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 64
	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
	stf.spill [ptr2] = f0, 32
(p_scr)	stf.spill [ptr9] = f0, 128
	br.cloop.dptk.few .l1bx
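	// Same scheme for the value = 0 case: stf.spill of f0 writes 16
	// zero bytes per store, so the seven spills above plus the 16B
	// prefetch spill cover the whole 128B line with half as many
	// stores as the stf8 variant.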
	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	//

.fraction_of_line:
	add	ptr2 = 16, ptr1
	shr.u	loopcnt = cnt, 5		// loopcnt = cnt / 32

	cmp.eq	p_scr, p0 = loopcnt, r0
	add	loopcnt = -1, loopcnt
(p_scr)	br.cond.dpnt.many .store_words

	and	cnt = 0x1f, cnt			// compute the remaining cnt
	mov.i	ar.lc = loopcnt

.l2:	// ------------------------------------ //  L2A: store 32B in 2 cycles
	stf8	[ptr1] = fvalue, 8
	stf8	[ptr2] = fvalue, 8
	stf8	[ptr1] = fvalue, 24
	stf8	[ptr2] = fvalue, 24
	br.cloop.dptk.many .l2
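	// Four 8B stores per pass through two pointers kept 16B apart:
	// ptr1 covers offsets 0 and 8, ptr2 covers 16 and 24, and both
	// advance 32B per iteration, i.e. 32B in 2 cycles as the label
	// comment says.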
.store_words:
	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch

	stf8	[ptr1] = fvalue, 8		// store
	cmp.le	p_y, p_n = 16, cnt
	add	cnt = -8, cnt			// subtract

(p_y)	stf8	[ptr1] = fvalue, 8		// store
(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt
(p_y)	add	cnt = -8, cnt			// subtract

(p_yy)	stf8	[ptr1] = fvalue, 8
(p_yy)	add	cnt = -8, cnt			// subtract

.move_bytes_from_alignment:
	cmp.eq	p_scr, p0 = cnt, r0
	tbit.nz.unc p_y, p0 = cnt, 2		// should we terminate with a st4 ?
(p_scr)	br.cond.dpnt.few .restore_and_exit

(p_y)	st4	[ptr1] = value, 4
	tbit.nz.unc p_yy, p0 = cnt, 1		// should we terminate with a st2 ?

(p_yy)	st2	[ptr1] = value, 2
	tbit.nz.unc p_y, p0 = cnt, 0		// should we terminate with a st1 ?

(p_y)	st1	[ptr1] = value
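	// ar.lc is a preserved (callee-saved) application register in the
	// ia64 software conventions, so the exit path below restores the
	// value saved in the prologue before returning.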
.restore_and_exit:
	mov.i	ar.lc = save_lc
	br.ret.sptk.many rp
.move_bytes_unaligned:
	.pred.rel "mutex",p_y, p_n
	.pred.rel "mutex",p_yy, p_nn
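	// The .pred.rel "mutex" annotations assert that p_y/p_n and
	// p_yy/p_nn are never simultaneously true, so the assembler may
	// schedule the predicated instructions below in the same
	// instruction group without flagging a dependency violation.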
(p_n)	cmp.le	p_yy, p_nn = 4, cnt
(p_y)	cmp.le	p_yy, p_nn = 5, cnt
(p_n)	add	ptr2 = 2, ptr1
(p_y)	add	ptr2 = 3, ptr1
(p_y)	st1	[ptr1] = value, 1	// fill 1 (odd-aligned) byte [15, 14 (or less) left]
(p_y)	add	cnt = -1, cnt

(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
	add	ptr3 = ptr1, cnt	// prepare last store
	mov.i	ar.lc = save_lc

(p_yy)	st2	[ptr1] = value, 4	// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4	// fill 2 (aligned) bytes [11, 10 (or less) left]
(p_yy)	add	cnt = -4, cnt

(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
	add	ptr3 = -1, ptr3		// last store
	tbit.nz	p_scr, p0 = cnt, 1	// will there be a st2 at the end ?

(p_y)	st2	[ptr1] = value, 4	// fill 2 (aligned) bytes
(p_y)	st2	[ptr2] = value, 4	// fill 2 (aligned) bytes [7, 6 (or less) left]
(p_y)	add	cnt = -4, cnt

(p_yy)	st2	[ptr1] = value, 4	// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4	// fill 2 (aligned) bytes [3, 2 (or less) left]
	tbit.nz	p_y, p0 = cnt, 0	// will there be a st1 at the end ?
(p_yy)	add	cnt = -4, cnt

(p_scr)	st2	[ptr1] = value		// fill 2 (aligned) bytes
(p_y)	st1	[ptr3] = value		// fill last byte (using ptr3)
	br.ret.sptk.many rp
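	// The final st1 goes through ptr3, which was computed above as
	// ptr1 + cnt - 1, i.e. the address of the last byte of the region,
	// so the odd trailing byte is covered no matter which of the st2
	// pairs above executed.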