/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

#if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY)

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)
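
/* (reconstructed: section directives and the function prologue were elided
 * in this copy; memcpy must return its destination, so r0 is saved with lr) */
        .text
        .fpu    neon

ENTRY(memcpy)
        stmfd       sp!, {r0, lr}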

        /* start preloading as early as possible */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16 bytes to copy? (needed for alignment below) */
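        /* (assumed: branch to the small-copy tail at label 5 when fewer than
         * 16 bytes remain) */
        cmp         r2, #16
        blo         5f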

        /* align destination to half cache-line for the write-buffer */
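        /* (assumed: r3 = bytes needed to reach a 16-byte boundary; skip the
         * fix-up copies when the destination is already aligned) */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         0f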

        /* copy up to 15 bytes (count in r3) */
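        /* (reconstructed: copy 0 to 3 bytes according to the low bits of r3,
         * then test bits 2 and 3 for the 4- and 8-byte copies below) */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        movs        ip, r3, lsl #29
        bge         1f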
        // copies 4 bytes, destination 32-bits aligned
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
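        /* (assumed label glue: skip the 8-byte copy below when bit 3 of the
         * count is clear) */
1:      bcc         2f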
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 64 bytes to copy */
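        /* (assumed: pre-decrement the count; branch to the fix-up at label 2
         * when fewer than 64 bytes remain) */
        subs        r2, r2, #64
        blo         2f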

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld         [r1, #(CACHE_LINE_SIZE*2)]
        pld         [r1, #(CACHE_LINE_SIZE*3)]
        pld         [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(PREFETCH_DISTANCE)]
        vst1.8      {d0 - d3}, [r0, :128]!
        vst1.8      {d4 - d7}, [r0, :128]!
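        /* (reconstructed loop control; the subtraction may originally sit
         * between the loads and stores, which is semantically equivalent) */
        subs        r2, r2, #64
        bhs         1b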

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
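        /* (assumed: undo the last subtraction of 64, then pre-decrement by 32
         * for the loop below) */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f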

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        vst1.8      {d0 - d3}, [r0, :128]!
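        /* (reconstructed loop control for the 32-byte loop) */
        subs        r2, r2, #32
        bhs         3b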

4:      /* less than 32 left */
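        /* (assumed: restore the count, then test bit 4 for a 16-byte chunk) */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f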
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0, :128]!

5:      /* copy up to 15 bytes (count in r2) */
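        /* (reconstructed: bit 3 of the count selects an 8-byte chunk; bit 2
         * feeds the 4-byte copy below) */
        movs        ip, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f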
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs        ip, r2, lsl #31
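        /* (reconstructed final byte copies and epilogue: the movs above puts
         * bit 0 in N and bit 1 in C; restore r0/lr and return) */
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        ldmfd       sp!, {r0, lr}
        bx          lr
END(memcpy)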

#else   /* __ARM_ARCH__ < 7 */

        /*
         * Optimized memcpy() for ARM.
         *
         * note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         */
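
/* (reconstructed: the ENTRY directive for this variant was elided here) */
ENTRY(memcpy)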
        /* The stack must always be 64-bits aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        stmfd       sp!, {r0, r4, lr}
        /* Making room for r5-r11 which will be spilled later */
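        /* (assumed: reserve 28 bytes, matching the seven registers spilled
         * with stmea below) */
        sub         sp, sp, #28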

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
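        /* (reconstructed preloads; PLD is the conditional-prefetch macro from
         * <machine/cpu-features.h>) */
        PLD         (r0, #0)
        PLD         (r1, #0)
        PLD         (r1, #32)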

        /* it simplifies things to take care of len<4 early */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
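        /* (assumed: materialize the offset in r3 and skip the alignment
         * copies when the source is already word-aligned) */
        rsb         r3, r1, #0
        ands        r3, r3, #3
        beq         src_aligned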

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31
        sub         r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
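        /* (reconstructed: interleaved byte copies driven by the flags from
         * the movs above, then the label targeted by the beq above) */
        ldrmib      r3, [r1], #1
        ldrcsb      r4, [r1], #1
        ldrcsb      r12, [r1], #1
        strmib      r3, [r0], #1
        strcsb      r4, [r0], #1
        strcsb      r12, [r0], #1

src_aligned: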

        /* see if src and dst are aligned together (congruent) */
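        /* (assumed congruence test: the XOR exposes differing low bits) */
        eor         r12, r0, r1
        tst         r12, #3
        bne         non_congruent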

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         congruent_aligned32
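        /* (assumed clamp: never align-copy more than the remaining count) */
        cmp         r3, r2
        andhi       r3, r2, #0x1C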

        /* conditionally copies 0 to 7 words (length in r3) */
        movs        r12, r3, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /* 8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        tst         r3, #0x4
        ldrne       r10, [r1], #4           /* 4 bytes */
        strne       r10, [r0], #4
        sub         r2, r2, r3

congruent_aligned32:
        /*
         * here source is aligned to 32 bytes.
         */
        subs        r2, r2, #32
        blo         less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the write-buffer will
         * start dumping its content into memory.
         *
         * While all this is going, we then load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
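        /* (assumed: round r1 down to a cache line, then point 64 bytes ahead) */
        bic         r12, r1, #0x1F
        add         r12, r12, #64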

1:      ldmia       r1!, {r4-r11}
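        /* (reconstructed: preload the next line and pre-decrement the count;
         * the subs below is the guard referenced by the NOTE that follows) */
        PLD         (r12, #64)
        subs        r2, r2, #32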

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for ARM9 preload will not be safely guarded by the preceding subs.
        // When it is safely guarded, the only way to get a SIGSEGV here
        // is a caller that overstates the length.
        ldrhi       r3, [r12], #32          /* cheap ARM9 preload */
        stmia       r0!, {r4-r11}
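        /* (reconstructed: loop while at least 32 bytes remain, then restore
         * the count for the tail code) */
        bhs         1b

        add         r2, r2, #32

less_than_32_left: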
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed the code below takes
         * about 16 cycles)
         */
        tst         r2, #0x1F
        beq         1f

        /* conditionally copies 0 to 31 bytes */
        movs        r12, r2, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /* 8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        movs        r12, r2, lsl #30
        ldrcs       r3, [r1], #4            /* 4 bytes */
        ldrmih      r4, [r1], #2            /* 2 bytes */
        strcs       r3, [r0], #4
        strmih      r4, [r0], #2
        tst         r2, #0x1
        ldrneb      r3, [r1]                /* last byte */
        strneb      r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd       sp!, {r5-r11}
        ldmfd       sp!, {r0, r4, lr}
        bx          lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb         r5, r0, #0
        and         r5, r5, #3              /* r5 = # bytes in partial words */
        mov         r12, r5, lsl #3         /* r12 = right */
        rsb         lr, r12, #32            /* lr = left */

        /* read the first word */
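        /* (assumed: prime the shift queue with the first source word) */
        ldr         r3, [r1], #4
        sub         r2, r2, #4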

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
         */
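        /* (reconstructed: store up to 3 bytes from r3, shifting the queue,
         * then re-check the remaining count for the branch below) */
        movs        r5, r5, lsl #31
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8

        cmp         r2, #4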
        blo         partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
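        /* (reconstructed alignment loop: copy a word at a time until the
         * destination reaches a cache-line boundary) */
1:      tst         r0, #0x1c
        beq         2f
        ldr         r5, [r1], #4
        sub         r2, r2, #4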
        orr         r4, r3, r5, lsl lr
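        /* (reconstruction continued: advance the queue and loop) */
        mov         r3, r5, lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b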
        blo         partial_word_tail

        /* copy 32 bytes at a time */
2:      subs        r2, r2, #32
        blo         less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to 50% of the
         * performance hit.
         */
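
        /* (reconstructed dispatch: r12 holds the right-shift amount computed
         * above; each of the three shift cases gets its own unrolled loop) */
        cmp         r12, #24
        beq         loop24
        cmp         r12, #8
        beq         loop8

loop16:
        ldr         r12, [r1], #4
1:      mov         r4, r12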
        ldmia       r1!, {r5, r6, r7, r8, r9, r10, r11}
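        /* (assumed loop head continuation: preload, pre-decrement the count,
         * and speculatively fetch the next word while bytes remain) */
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4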
        orr         r3, r3, r4, lsl #16
        mov         r4, r4, lsr #16
        orr         r4, r4, r5, lsl #16
        mov         r5, r5, lsr #16
        orr         r5, r5, r6, lsl #16
        mov         r6, r6, lsr #16
        orr         r6, r6, r7, lsl #16
        mov         r7, r7, lsr #16
        orr         r7, r7, r8, lsl #16
        mov         r8, r8, lsr #16
        orr         r8, r8, r9, lsl #16
        mov         r9, r9, lsr #16
        orr         r9, r9, r10, lsl #16
        mov         r10, r10, lsr #16
        orr         r10, r10, r11, lsl #16
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov         r3, r11, lsr #16
        bhs         1b
        b           less_than_thirtytwo
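
loop8:
        /* (reconstructed head, mirroring loop16 above) */
        ldr         r12, [r1], #4
1:      mov         r4, r12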
        ldmia       r1!, {r5, r6, r7, r8, r9, r10, r11}
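        /* (assumed, as in loop16) */
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4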
        orr         r3, r3, r4, lsl #24
        mov         r4, r4, lsr #8
        orr         r4, r4, r5, lsl #24
        mov         r5, r5, lsr #8
        orr         r5, r5, r6, lsl #24
        mov         r6, r6, lsr #8
        orr         r6, r6, r7, lsl #24
        mov         r7, r7, lsr #8
        orr         r7, r7, r8, lsl #24
        mov         r8, r8, lsr #8
        orr         r8, r8, r9, lsl #24
        mov         r9, r9, lsr #8
        orr         r9, r9, r10, lsl #24
        mov         r10, r10, lsr #8
        orr         r10, r10, r11, lsl #24
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov         r3, r11, lsr #8
        bhs         1b
        b           less_than_thirtytwo
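
loop24:
        /* (reconstructed head, mirroring loop16 above) */
        ldr         r12, [r1], #4
1:      mov         r4, r12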
        ldmia       r1!, {r5, r6, r7, r8, r9, r10, r11}
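        /* (assumed, as in loop16) */
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4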
        orr         r3, r3, r4, lsl #8
        mov         r4, r4, lsr #24
        orr         r4, r4, r5, lsl #8
        mov         r5, r5, lsr #24
        orr         r5, r5, r6, lsl #8
        mov         r6, r6, lsr #24
        orr         r6, r6, r7, lsl #8
        mov         r7, r7, lsr #24
        orr         r7, r7, r8, lsl #8
        mov         r8, r8, lsr #24
        orr         r8, r8, r9, lsl #8
        mov         r9, r9, lsr #24
        orr         r9, r9, r10, lsl #8
        mov         r10, r10, lsr #24
        orr         r10, r10, r11, lsl #8
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
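        /* (reconstructed tail of loop24 and the label all three loops reach) */
        mov         r3, r11, lsr #24
        bhs         1b

less_than_thirtytwo: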
        /* copy the last 0 to 31 bytes of the source */
        rsb         r12, lr, #32            /* we corrupted r12, recompute it */
        add         r2, r2, #32
        cmp         r2, #4
        blo         partial_word_tail
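
        /* (reconstructed word-at-a-time loop feeding the shift queue) */
1:      ldr         r5, [r1], #4
        sub         r2, r2, #4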
        orr         r4, r3, r5, lsl lr
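        /* (reconstruction continued: advance the queue and loop) */
        mov         r3, r5, lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b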

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs        r5, lr, lsl #(31-3)
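        /* (reconstructed: flush the 0 to 3 leftover bytes of the shift queue) */
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1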

        /* Refill spilled registers from the stack. Don't update sp. */
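        /* (assumed: the load mirrors the stmea spill above) */
        ldmfd       sp, {r5-r11}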

copy_last_3_and_return:
        movs        r2, r2, lsl #31         /* copy remaining 0, 1, 2 or 3 bytes */
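        /* (reconstructed conditional copies: N selects 1 byte, C selects 2) */
        ldrmib      r2, [r1], #1
        ldrcsb      r3, [r1], #1
        ldrcsb      r12, [r1]
        strmib      r2, [r0], #1
        strcsb      r3, [r0], #1
        strcsb      r12, [r0]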

        /* we're done! restore sp and spilled registers and return */
        add         sp, sp, #28
        ldmfd       sp!, {r0, r4, lr}
        bx          lr
END(memcpy)

#endif    /* __ARM_ARCH__ < 7 */