/* Optimized version of the standard memcpy() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
   An assembly implementation of the algorithm used by the generic C
   version from glibc.  The case when source and dest are aligned is
   treated separately, for extra performance.

   In this form, memcpy assumes little endian mode.  For big endian mode,
   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
   and the order of r[MEMLAT] and r[MEMLAT+1] must be reverted in the
42
#define LFETCH_DIST 500
44
#define ALIGN_UNROLL_no 4 // no. of elements
45
#define ALIGN_UNROLL_sh 2 // (shift amount)
48
#define Nrot ((4*(MEMLAT+2) + 7) & ~7)
91
#elif defined(USE_INT)
105
#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
106
/* Manually force proper loop-alignment. Note: be sure to
107
double-check the code-layout after making any changes to
109
# define ALIGN(n) { nop 0 }
111
# define ALIGN(n) .align n
114
#if defined(USE_LFETCH)
115
#define LOOP(shift) \
119
(p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
120
(p[0]) lfetch.nt1 [ptr1], 16 ; \
123
(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
124
(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
127
(p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
128
(p[0]) lfetch.nt1 [ptr2], 16 ; \
131
(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
132
(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
133
br.ctop.sptk.many .loop##shift \
136
br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
139
#define LOOP(shift) \
143
(p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
146
(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
147
(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
150
(p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
153
(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
154
(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
155
br.ctop.sptk.many .loop##shift \
158
br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
166
alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
167
.rotr r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
169
.rotf fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
170
mov ret0 = in0 // return tmp2 = dest
172
movi0 saved_pr = pr // save the predicate registers
174
and tmp4 = 7, in0 // check if destination is aligned
175
mov dest = in0 // dest
179
cmp.eq p_scr, p0 = in2, r0 // if (len == 0)
180
.save ar.lc, saved_lc
181
movi0 saved_lc = ar.lc // save the loop counter
183
cmp.ge p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRESH
186
(p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
187
(p_few) br.cond.dpnt.many .copy_bytes // Branch no. 2: copy byte by byte
190
#if defined(USE_LFETCH)
194
shr.u elemcnt = len, 3 // elemcnt = len / 8
196
cmp.eq p_scr, p0 = tmp4, r0 // is destination aligned?
197
sub loopcnt = 7, tmp4 //
198
(p_scr) br.cond.dptk.many .dest_aligned
201
ld1 tmp2 = [src], 1 //
202
sub len = len, loopcnt, 1 // reduce len
203
movi0 ar.lc = loopcnt //
205
cmp.ne p_scr, p0 = 0, loopcnt // avoid loading beyond end-point
208
.l0: // ---------------------------- // L0: Align src on 8-byte boundary
210
st1 [dest] = tmp2, 1 //
211
(p_scr) ld1 tmp2 = [src], 1 //
213
cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
214
add loopcnt = -1, loopcnt
215
br.cloop.dptk.few .l0 //
220
and tmp4 = 7, src // ready for alignment check
221
shr.u elemcnt = len, 3 // elemcnt = len / 8
224
cmp.ne p_scr, p0 = tmp4, r0 // is source also aligned
225
tbit.nz p_xtr, p_nxtr = src, 3 // prepare a separate move if src
226
} { .mib // is not 16B aligned
227
add ptr2 = LFETCH_DIST, dest // prefetch address
228
add ptr1 = LFETCH_DIST, src
229
(p_scr) br.cond.dptk.many .src_not_aligned
232
// The optimal case, when dest, and src are aligned
236
.pred.rel "mutex",p_xtr,p_nxtr
237
(p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
238
(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify
239
movi0 pr.rot = 1 << 16 // set rotating predicates
241
(p_scr) br.cond.dpnt.many .copy_full_words
245
(p_xtr) load tempreg = [src], 8
246
(p_xtr) add elemcnt = -1, elemcnt
247
movi0 ar.ec = MEMLAT + 1 // set the epilog counter
250
(p_xtr) add len = -8, len //
251
add asrc = 16, src // one bank apart (for USE_INT)
252
shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
255
add loopcnt = -1, loopcnt
256
(p_xtr) store [dest] = tempreg, 8 // copy the "extra" word
261
movi0 ar.lc = loopcnt // set the loop counter
264
#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
270
.l1: // ------------------------------- // L1: Everything a multiple of 8
272
#if defined(USE_LFETCH)
273
(p[0]) lfetch.nt1 [ptr2],32
275
(p[0]) ldfp8 the_r[0],the_q[0] = [src], 16
276
(p[0]) add len = -32, len
278
(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
279
(p[MEMLAT]) store [adest] = the_s[MEMLAT], 8
282
#if defined(USE_LFETCH)
283
(p[0]) lfetch.nt1 [ptr1],32
285
(p[0]) ldfp8 the_s[0], the_t[0] = [src], 16
287
(p[MEMLAT]) store [dest] = the_q[MEMLAT], 24
288
(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
289
br.ctop.dptk.many .l1
291
#elif defined(USE_INT)
292
.l1: // ------------------------------- // L1: Everything a multiple of 8
294
(p[0]) load the_r[0] = [src], 8
295
(p[0]) load the_q[0] = [asrc], 8
296
(p[0]) add len = -32, len
298
(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
299
(p[MEMLAT]) store [adest] = the_q[MEMLAT], 8
302
(p[0]) load the_s[0] = [src], 24
303
(p[0]) load the_t[0] = [asrc], 24
305
(p[MEMLAT]) store [dest] = the_s[MEMLAT], 24
306
(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
307
#if defined(USE_LFETCH)
310
(p[0]) lfetch.nt1 [ptr2],32
311
(p[0]) lfetch.nt1 [ptr1],32
313
br.ctop.dptk.many .l1
319
cmp.gt p_scr, p0 = 8, len //
320
shr.u elemcnt = len, 3 //
321
(p_scr) br.cond.dpnt.many .copy_bytes
324
load tempreg = [src], 8
325
add loopcnt = -1, elemcnt //
328
cmp.ne p_scr, p0 = 0, loopcnt //
329
mov ar.lc = loopcnt //
332
.l2: // ------------------------------- // L2: Max 4 words copied separately
334
store [dest] = tempreg, 8
335
(p_scr) load tempreg = [src], 8 //
338
cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
339
add loopcnt = -1, loopcnt
340
br.cloop.dptk.few .l2
345
cmp.eq p_scr, p0 = len, r0 // is len == 0 ?
346
add loopcnt = -1, len // len--;
347
(p_scr) br.cond.spnt .restore_and_exit
351
movi0 ar.lc = loopcnt
352
cmp.ne p_scr, p0 = 0, loopcnt // avoid load beyond end-point
355
.l3: // ------------------------------- // L3: Final byte move
358
(p_scr) ld1 tmp2 = [src], 1
360
cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
361
add loopcnt = -1, loopcnt
362
br.cloop.dptk.few .l3
367
movi0 pr = saved_pr, -1 // restore the predicate registers
370
movi0 ar.lc = saved_lc // restore the loop counter
377
cmp.gt p_scr, p0 = 16, len
378
and sh1 = 7, src // sh1 = src % 8
379
shr.u loopcnt = len, 4 // element-cnt = len / 16
381
add tmp4 = @ltoff(.table), gp
382
add tmp3 = @ltoff(.loop56), gp
383
(p_scr) br.cond.dpnt.many .copy_bytes // do byte by byte if too few
386
and asrc = -8, src // asrc = (-8) -- align src for loop
387
add loopcnt = -1, loopcnt // loopcnt--
388
shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
390
ld8 ptable = [tmp4] // ptable = &table
391
ld8 ploop56 = [tmp3] // ploop56 = &loop56
392
and tmp2 = -16, len // tmp2 = len & -OPSIZ
395
add tmp3 = ptable, sh1 // tmp3 = &table + sh1
396
add src = src, tmp2 // src += len & (-16)
397
movi0 ar.lc = loopcnt // set LC
400
ld8 tmp4 = [tmp3] // tmp4 = loop offset
401
sub len = len, tmp2 // len -= len & (-16)
402
movi0 ar.ec = MEMLAT + 2 // one more pass needed
405
ld8 s[1] = [asrc], 8 // preload
406
sub loopaddr = ploop56,tmp4 // loopadd = &loop56 - loop offset
407
movi0 pr.rot = 1 << 16 // set rotating predicates
412
br b6 // jump to the appropriate loop
423
libc_hidden_builtin_def (memcpy)
428
data8 0 // dummy entry
429
data8 .loop56 - .loop8
430
data8 .loop56 - .loop16
431
data8 .loop56 - .loop24
432
data8 .loop56 - .loop32
433
data8 .loop56 - .loop40
434
data8 .loop56 - .loop48
435
data8 .loop56 - .loop56