1
/* Copyright (c) 2009 CodeSourcery, Inc.
4
* Redistribution and use in source and binary forms, with or without
5
* modification, are permitted provided that the following conditions are met:
6
* * Redistributions of source code must retain the above copyright
7
* notice, this list of conditions and the following disclaimer.
8
* * Redistributions in binary form must reproduce the above copyright
9
* notice, this list of conditions and the following disclaimer in the
10
* documentation and/or other materials provided with the distribution.
11
* * Neither the name of CodeSourcery nor the
12
* names of its contributors may be used to endorse or promote products
13
* derived from this software without specific prior written permission.
15
* THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY
16
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18
* DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY
19
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Standard operations for word-sized values.  WORD_TYPE must be defined
   before either macro is expanded. */

/* Lvalue designating the WORD_TYPE object that lives OFFSET bytes past
   ADDRESS.  The caller is responsible for ADDRESS + OFFSET being suitably
   aligned for WORD_TYPE.  The expansion is fully parenthesized so it can be
   used inside larger expressions. */
#define WORD_REF(ADDRESS, OFFSET) \
  (*((WORD_TYPE*)((char*)(ADDRESS) + (OFFSET))))

/* Copy a single WORD_TYPE from IN + OFFSET to OUT + OFFSET, where OFFSET is
   measured in bytes.  Evaluates OFFSET twice, so it must not have side
   effects. */
#define WORD_COPY(OUT, IN, OFFSET) \
  (WORD_REF(OUT, OFFSET) = WORD_REF(IN, OFFSET))
/* Select the widest unit the bulk copy loops move at a time.  Each branch
   defines:
     WORD_TYPE       - the C type used for one load/store,
     WORD_SIZE       - sizeof(WORD_TYPE) in bytes (used in #if tests and as
                       a byte stride, so it must be a plain integer),
     MAYBE_PREFETCH  - a read prefetch hint, or nothing. */

/* On processors with NEON, we use 128-bit vectors.  Also,
   we need to include arm_neon.h to use these. */
#if defined(__ARM_NEON__)
#include <arm_neon.h>

#define WORD_TYPE uint8x16_t
#define WORD_SIZE 16
#define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On ARM processors with 64-bit ldrd instructions, we use those,
   except on Cortex-M* where benchmarking has shown them to
   be slower. */
#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
  || defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6)
#define WORD_TYPE uint64_t
#define WORD_SIZE 8
#define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On everything else, we use 32-bit loads and stores, and
   do not use prefetching. */
#else
#define WORD_TYPE uint32_t
#define WORD_SIZE 4
#define MAYBE_PREFETCH(IN)
#endif
/* On all ARM platforms, 'SHORTWORD' is a 32-bit value. */
#define SHORTWORD_TYPE uint32_t
#define SHORTWORD_SIZE 4 /* sizeof(SHORTWORD_TYPE), in bytes */

/* Lvalue designating the SHORTWORD that lives OFFSET bytes past ADDRESS.
   The caller is responsible for ADDRESS + OFFSET being suitably aligned for
   SHORTWORD_TYPE.  The expansion is fully parenthesized so it can be used
   inside larger expressions. */
#define SHORTWORD_REF(ADDRESS, OFFSET) \
  (*((SHORTWORD_TYPE*)((char*)(ADDRESS) + (OFFSET))))

/* Copy a single SHORTWORD from IN + OFFSET to OUT + OFFSET, where OFFSET is
   measured in bytes.  Evaluates OFFSET twice, so it must not have side
   effects. */
#define SHORTWORD_COPY(OUT, IN, OFFSET) \
  (SHORTWORD_REF(OUT, OFFSET) = SHORTWORD_REF(IN, OFFSET))
/* Shifting directionality depends on endianness.

   SHORTWORD_SHIFT(IN0, IN1, OFFSET) yields the SHORTWORD that starts OFFSET
   bytes into IN0, where IN0 and IN1 are unsigned SHORTWORD values loaded
   from two consecutive SHORTWORD-sized memory locations (IN0 from the lower
   address).  OFFSET must be in the range 1 .. SHORTWORD_SIZE - 1: with an
   OFFSET of 0 the second shift count equals the full width of the type,
   which is undefined behavior.  The expansion is fully parenthesized so it
   can be used inside larger expressions. */
#if defined(__ARMEB__) \
  || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) \
      && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
/* Big-endian: the byte at the lowest address is the most significant, so
   the leading OFFSET bytes of IN0 are dropped by shifting left. */
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
  (((IN0) << ((OFFSET)*8)) | ((IN1) >> (SHORTWORD_SIZE*8 - (OFFSET)*8)))
#else
/* Little-endian: the byte at the lowest address is the least significant,
   so the leading OFFSET bytes of IN0 are dropped by shifting right. */
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
  (((IN0) >> ((OFFSET)*8)) | ((IN1) << (SHORTWORD_SIZE*8 - (OFFSET)*8)))
#endif
void *memcpy(void *OUT, const void *IN, size_t N)
85
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
86
const char* OUT_end = (char*)OUT + N;
87
while ((char*)OUT < OUT_end) {
88
*((char*)OUT) = *((char*)IN);
95
/* Handle short strings and immediately return. */
96
if (__builtin_expect(N < SHORTWORD_SIZE, 1)) {
99
((char*)OUT)[i] = ((char*)IN)[i];
105
const char* OUT_end = (char*)OUT + N;
107
/* Align OUT to SHORTWORD_SIZE. */
108
while ((uintptr_t)OUT % SHORTWORD_SIZE != 0) {
109
*(char*) (OUT++) = *(char*) (IN++);
112
if ((uintptr_t) IN % SHORTWORD_SIZE == 0) {
114
#if WORD_SIZE > SHORTWORD_SIZE
115
/* Align OUT to WORD_SIZE in steps of SHORTWORD_SIZE. */
116
if (__builtin_expect(OUT_end - (char*)OUT >= WORD_SIZE, 0)) {
117
while ((uintptr_t)OUT % WORD_SIZE != 0) {
118
SHORTWORD_COPY(OUT, IN, 0);
119
OUT += SHORTWORD_SIZE;
120
IN += SHORTWORD_SIZE;
123
if ((uintptr_t) IN % WORD_SIZE == 0) {
124
#endif /* WORD_SIZE > SHORTWORD_SIZE */
126
#if defined(__ARM_NEON__)
127
/* Testing on Cortex-A8 indicates that the following idiom
128
produces faster assembly code when doing vector copies,
129
but not when doing regular copies. */
131
N = OUT_end - (char*)OUT;
132
MAYBE_PREFETCH(IN + 64);
133
MAYBE_PREFETCH(IN + 128);
134
MAYBE_PREFETCH(IN + 192);
136
MAYBE_PREFETCH(IN + 256);
137
MAYBE_PREFETCH(IN + 320);
138
MAYBE_PREFETCH(IN + 384);
139
MAYBE_PREFETCH(IN + 448);
140
MAYBE_PREFETCH(IN + 512);
141
MAYBE_PREFETCH(IN + 576);
142
MAYBE_PREFETCH(IN + 640);
143
MAYBE_PREFETCH(IN + 704);
144
/* We phrase the loop condition in this way so that the
145
i + WORD_SIZE * 16 value can be reused to increment i. */
146
while (i + WORD_SIZE * 16 <= N - 640) {
147
MAYBE_PREFETCH(IN + 768);
148
MAYBE_PREFETCH(IN + 832);
149
MAYBE_PREFETCH(IN + 896);
150
MAYBE_PREFETCH(IN + 960);
151
WORD_COPY(OUT, IN, i);
152
WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
153
WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
154
WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
155
WORD_COPY(OUT, IN, i + WORD_SIZE * 4);
156
WORD_COPY(OUT, IN, i + WORD_SIZE * 5);
157
WORD_COPY(OUT, IN, i + WORD_SIZE * 6);
158
WORD_COPY(OUT, IN, i + WORD_SIZE * 7);
159
WORD_COPY(OUT, IN, i + WORD_SIZE * 8);
160
WORD_COPY(OUT, IN, i + WORD_SIZE * 9);
161
WORD_COPY(OUT, IN, i + WORD_SIZE * 10);
162
WORD_COPY(OUT, IN, i + WORD_SIZE * 11);
163
WORD_COPY(OUT, IN, i + WORD_SIZE * 12);
164
WORD_COPY(OUT, IN, i + WORD_SIZE * 13);
165
WORD_COPY(OUT, IN, i + WORD_SIZE * 14);
166
WORD_COPY(OUT, IN, i + WORD_SIZE * 15);
170
while (i + WORD_SIZE * 16 <= N) {
171
WORD_COPY(OUT, IN, i);
172
WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
173
WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
174
WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
175
WORD_COPY(OUT, IN, i + WORD_SIZE * 4);
176
WORD_COPY(OUT, IN, i + WORD_SIZE * 5);
177
WORD_COPY(OUT, IN, i + WORD_SIZE * 6);
178
WORD_COPY(OUT, IN, i + WORD_SIZE * 7);
179
WORD_COPY(OUT, IN, i + WORD_SIZE * 8);
180
WORD_COPY(OUT, IN, i + WORD_SIZE * 9);
181
WORD_COPY(OUT, IN, i + WORD_SIZE * 10);
182
WORD_COPY(OUT, IN, i + WORD_SIZE * 11);
183
WORD_COPY(OUT, IN, i + WORD_SIZE * 12);
184
WORD_COPY(OUT, IN, i + WORD_SIZE * 13);
185
WORD_COPY(OUT, IN, i + WORD_SIZE * 14);
186
WORD_COPY(OUT, IN, i + WORD_SIZE * 15);
189
while (i + WORD_SIZE * 4 <= N) {
190
WORD_COPY(OUT, IN, i);
191
WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
192
WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
193
WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
196
while (i + WORD_SIZE <= N) {
197
WORD_COPY(OUT, IN, i);
202
#else /* not defined(__ARM_NEON__) */
203
/* Note: 16-times unrolling is about 20% faster than 4-times
204
unrolling on both ARM Cortex-A8 and Cortex-M3. */
205
MAYBE_PREFETCH(IN + 64);
206
MAYBE_PREFETCH(IN + 128);
207
MAYBE_PREFETCH(IN + 192);
208
while (OUT_end - (char*)OUT >= WORD_SIZE * 16) {
209
MAYBE_PREFETCH(IN + 256);
210
MAYBE_PREFETCH(IN + 320);
211
WORD_COPY(OUT, IN, 0);
212
WORD_COPY(OUT, IN, WORD_SIZE * 1);
213
WORD_COPY(OUT, IN, WORD_SIZE * 2);
214
WORD_COPY(OUT, IN, WORD_SIZE * 3);
215
WORD_COPY(OUT, IN, WORD_SIZE * 4);
216
WORD_COPY(OUT, IN, WORD_SIZE * 5);
217
WORD_COPY(OUT, IN, WORD_SIZE * 6);
218
WORD_COPY(OUT, IN, WORD_SIZE * 7);
219
WORD_COPY(OUT, IN, WORD_SIZE * 8);
220
WORD_COPY(OUT, IN, WORD_SIZE * 9);
221
WORD_COPY(OUT, IN, WORD_SIZE * 10);
222
WORD_COPY(OUT, IN, WORD_SIZE * 11);
223
WORD_COPY(OUT, IN, WORD_SIZE * 12);
224
WORD_COPY(OUT, IN, WORD_SIZE * 13);
225
WORD_COPY(OUT, IN, WORD_SIZE * 14);
226
WORD_COPY(OUT, IN, WORD_SIZE * 15);
227
OUT += WORD_SIZE * 16;
228
IN += WORD_SIZE * 16;
230
while (WORD_SIZE * 4 <= OUT_end - (char*)OUT) {
231
WORD_COPY(OUT, IN, 0);
232
WORD_COPY(OUT, IN, WORD_SIZE * 1);
233
WORD_COPY(OUT, IN, WORD_SIZE * 2);
234
WORD_COPY(OUT, IN, WORD_SIZE * 3);
235
OUT += WORD_SIZE * 4;
238
while (WORD_SIZE <= OUT_end - (char*)OUT) {
239
WORD_COPY(OUT, IN, 0);
243
#endif /* not defined(__ARM_NEON__) */
245
#if WORD_SIZE > SHORTWORD_SIZE
246
} else { /* if IN is not WORD_SIZE aligned */
247
while (SHORTWORD_SIZE * 4 <= OUT_end - (char*)OUT) {
248
SHORTWORD_COPY(OUT, IN, 0);
249
SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 1);
250
SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 2);
251
SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 3);
252
OUT += SHORTWORD_SIZE * 4;
253
IN += SHORTWORD_SIZE * 4;
255
} /* end if IN is not WORD_SIZE aligned */
256
} /* end if N >= WORD_SIZE */
258
while (SHORTWORD_SIZE <= OUT_end - (char*)OUT) {
259
SHORTWORD_COPY(OUT, IN, 0);
260
OUT += SHORTWORD_SIZE;
261
IN += SHORTWORD_SIZE;
263
#endif /* WORD_SIZE > SHORTWORD_SIZE */
265
} else { /* if IN is not SHORTWORD_SIZE aligned */
266
ptrdiff_t misalign = (uintptr_t)IN % SHORTWORD_SIZE;
268
SHORTWORD_TYPE temp1, temp2;
269
temp1 = SHORTWORD_REF(IN, -misalign);
271
/* Benchmarking indicates that unrolling this loop doesn't
272
produce a measurable performance improvement on ARM. */
273
while (SHORTWORD_SIZE <= OUT_end - (char*)OUT) {
274
IN += SHORTWORD_SIZE;
275
temp2 = SHORTWORD_REF(IN, -misalign);
276
SHORTWORD_REF(OUT, 0) = SHORTWORD_SHIFT(temp1, temp2, misalign);
278
OUT += SHORTWORD_SIZE;
281
} /* end if IN is not SHORTWORD_SIZE aligned */
283
while ((char*)OUT < OUT_end) {
284
*((char*)OUT) = *((char*)IN);