56
54
__insn_prefetch (prefetch);
57
55
prefetch += CHIP_L2_LINE_SIZE ();
58
prefetch = (prefetch > src1_end) ? prefetch : src1;
56
prefetch = (prefetch < src1_end) ? prefetch : src1;
61
59
/* Copy bytes until dst is word-aligned. */
62
for (; (uintptr_t) dst1 & (sizeof (word_t) - 1); n--)
60
for (; (uintptr_t) dst1 & (sizeof (op_t) - 1); n--)
65
63
/* 8-byte pointer to destination memory. */
66
dst8 = (word_t *) dst1;
68
if (__builtin_expect ((uintptr_t) src1 & (sizeof (word_t) - 1), 0))
66
if (__builtin_expect ((uintptr_t) src1 & (sizeof (op_t) - 1), 0))
70
/* Misaligned copy. Copy 8 bytes at a time, but don't bother
72
TODO: Consider prefetching and using wh64 as well. */
74
/* Create an aligned src8. */
75
const word_t *__restrict src8 =
76
(const word_t *) ((uintptr_t) src1 & -sizeof (word_t));
80
for (; n >= sizeof (word_t); n -= sizeof (word_t))
83
a = __insn_dblalign (a, b, src1);
68
/* Misaligned copy. Use glibc's _wordcopy_fwd_dest_aligned, but
69
inline it to avoid prologue/epilogue. TODO: Consider
70
prefetching and using wh64 as well. */
73
long int dstp = (long int) dst1;
74
long int srcp = (long int) src1;
75
long int len = n / OPSIZ;
77
/* Save the initial source pointer so we know the number of
78
bytes to shift for merging two unaligned results. */
81
/* Make SRCP aligned by rounding it down to the beginning of the
82
`op_t' it points in the middle of. */
88
a1 = ((op_t *) srcp)[0];
89
a2 = ((op_t *) srcp)[1];
94
a0 = ((op_t *) srcp)[0];
95
a1 = ((op_t *) srcp)[1];
100
if (OP_T_THRES <= 3 * OPSIZ && len == 0)
102
a3 = ((op_t *) srcp)[0];
103
a0 = ((op_t *) srcp)[1];
108
a2 = ((op_t *) srcp)[0];
109
a3 = ((op_t *) srcp)[1];
112
if (OP_T_THRES <= 3 * OPSIZ && len == 0)
114
goto do4; /* No-op. */
120
a0 = ((op_t *) srcp)[0];
121
a2 = __insn_dblalign (a2, a3, srci);
122
((op_t *) dstp)[0] = a2;
126
a1 = ((op_t *) srcp)[0];
127
a3 = __insn_dblalign (a3, a0, srci);
128
((op_t *) dstp)[0] = a3;
132
a2 = ((op_t *) srcp)[0];
133
a0 = __insn_dblalign (a0, a1, srci);
134
((op_t *) dstp)[0] = a0;
138
a3 = ((op_t *) srcp)[0];
139
a1 = __insn_dblalign (a1, a2, srci);
140
((op_t *) dstp)[0] = a1;
147
/* This is the right position for do0. Please don't move
150
((op_t *) dstp)[0] = __insn_dblalign (a2, a3, srci);
91
b = ((const char *) src8 <= src1_end) ? *src8 : 0;
93
/* Final source bytes to write to trailing partial word, if any. */
94
final = __insn_dblalign (a, b, src1);
156
a0 = ((const char *) srcp <= src1_end) ? ((op_t *) srcp)[0] : 0;
158
final = __insn_dblalign (a3, a0, srci);
159
dst8 = (op_t *)(dstp + OPSIZ);
98
163
/* Aligned copy. */
100
const word_t *__restrict src8 = (const word_t *) src1;
165
const op_t *__restrict src8 = (const op_t *) src1;
102
167
/* src8 and dst8 are both word-aligned. */
103
168
if (n >= CHIP_L2_LINE_SIZE ())
105
170
/* Copy until 'dst' is cache-line-aligned. */
106
171
for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1);
107
n -= sizeof (word_t))
108
173
*dst8++ = *src8++;
110
/* If copying to self, return. The test is cheap enough
111
that we do it despite the fact that the memcpy() contract
112
doesn't require us to support overlapping dst and src.
113
This is the most common case of overlap, and any close
114
overlap will cause corruption due to the wh64 below.
115
This case is particularly important since the compiler
116
will emit memcpy() calls for aggregate copies even if it
117
can't prove that src != dst. */
118
if (__builtin_expect (dst8 == src8, 0))
121
175
for (; n >= CHIP_L2_LINE_SIZE ();)
125
/* Prefetch and advance to next line to prefetch, but
126
don't go past the end. */
127
__insn_prefetch (prefetch);
128
prefetch += CHIP_L2_LINE_SIZE ();
129
prefetch = (prefetch > src1_end) ? prefetch :
132
/* Copy an entire cache line. Manually unrolled to
133
avoid idiosyncrasies of compiler unrolling. */
134
#define COPY_WORD(offset) ({ dst8[offset] = src8[offset]; n -= 8; })
177
op_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
179
/* Prefetch and advance to next line to prefetch, but
180
don't go past the end. */
181
__insn_prefetch (prefetch);
182
prefetch += CHIP_L2_LINE_SIZE ();
183
prefetch = (prefetch < src1_end) ? prefetch :
186
/* Do all the loads before wh64. This is necessary if
187
[src8, src8+7] and [dst8, dst8+7] share the same
188
cache line and dst8 <= src8, as can be the case when
189
called from memmove, or with code tested on x86 whose
190
memcpy always works with forward copies. */
143
213
#if CHIP_L2_LINE_SIZE() != 64
144
214
# error "Fix code that assumes particular L2 cache line size."
147
dst8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
148
src8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
152
for (; n >= sizeof (word_t); n -= sizeof (word_t))
218
for (; n >= sizeof (op_t); n -= sizeof (op_t))
153
219
*dst8++ = *src8++;
155
221
if (__builtin_expect (n == 0, 1))