2
* Copyright (C) 2008 The Android Open Source Project
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions are met:
8
* * Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* * Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions and the following disclaimer in
12
* the documentation and/or other materials provided with the distribution.
15
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
* Copyright (c) 2013 ARM Ltd
30
* All rights reserved.
32
* Redistribution and use in source and binary forms, with or without
33
* modification, are permitted provided that the following conditions are met:
35
* 1. Redistributions of source code must retain the above copyright
36
* notice, this list of conditions and the following disclaimer.
37
* 2. Redistributions in binary form must reproduce the above copyright
38
* notice, this list of conditions and the following disclaimer in the
39
* documentation and/or other materials provided with the distribution.
40
* 3. The name of the company may not be used to endorse or promote
41
* products derived from this software without specific prior written permission.
44
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
45
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
46
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
48
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
49
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
50
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
51
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
52
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
53
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
56
/* Prototype: void *memcpy (void *dst, const void *src, size_t count). */
58
// This version is tuned for the Cortex-A15 processor.
64
#define CACHE_LINE_SIZE 64
67
.type memcpy,%function
70
// Assumes that n >= 0, and dst, src are valid pointers.
71
// For any sizes less than 832 use the neon code that doesn't
72
// care about the src alignment. This avoids any checks
73
// for src alignment, and offers the best improvement since
74
// smaller sized copies are dominated by the overhead of
75
// the pre and post main loop.
76
// For larger copies, if src and dst cannot both be aligned to
77
// word boundaries, use the neon code.
78
// For all other copies, align dst to a double word boundary
79
// and copy using LDRD/STRD instructions.
81
// Save registers (r0 holds the return value):
82
// optimized push {r0, lr}.
84
pld [r1, #(CACHE_LINE_SIZE*16)]
88
blo copy_less_than_16_unknown_align
93
copy_unknown_alignment:
94
// Unknown alignment of src and dst.
95
// Assumes that the first few bytes have already been prefetched.
97
// Align destination to 128 bits. The mainloop store instructions
98
// require this alignment or they will throw an exception.
103
// Copy up to 15 bytes (count in r3).
118
// Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
119
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
120
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
122
// Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
124
vst1.8 {d0}, [r0, :64]!
126
2: // Make sure we have at least 64 bytes to copy.
130
1: // The main loop copies 64 bytes at a time.
131
vld1.8 {d0 - d3}, [r1]!
132
vld1.8 {d4 - d7}, [r1]!
133
pld [r1, #(CACHE_LINE_SIZE*4)]
135
vst1.8 {d0 - d3}, [r0, :128]!
136
vst1.8 {d4 - d7}, [r0, :128]!
139
2: // Fix-up the remaining count and make sure we have >= 32 bytes left.
143
// Copy 32 bytes. These cache lines were already preloaded.
144
vld1.8 {d0 - d3}, [r1]!
146
vst1.8 {d0 - d3}, [r0, :128]!
147
3: // Less than 32 left.
150
beq copy_less_than_16_unknown_align
151
// Copies 16 bytes, destination 128 bits aligned.
152
vld1.8 {d0, d1}, [r1]!
153
vst1.8 {d0, d1}, [r0, :128]!
155
copy_less_than_16_unknown_align:
156
// Copy up to 15 bytes (count in r2).
162
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
163
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
165
2: // Copy 0 to 4 bytes.
179
// If src and dst cannot both be aligned to a word boundary,
180
// use the unaligned copy version.
183
bne copy_unknown_alignment
185
// To try and improve performance, stack layout changed,
186
// i.e., not keeping the stack looking like users expect
187
// (highest numbered register at highest address).
188
// TODO: Add debug frame directives.
189
// We don't need exception unwind directives, because the code below
190
// does not throw any exceptions and does not call any other functions.
191
// Generally, newlib functions like this lack debug information for assembler source.
194
strd r4, r5, [sp, #-8]!
196
strd r6, r7, [sp, #-8]!
198
strd r8, r9, [sp, #-8]!
200
// Optimized for already aligned dst code.
202
bne dst_not_word_aligned
205
// Align the destination buffer to 8 bytes, to make sure double
206
// loads and stores don't cross a cache line boundary,
207
// as they are then more expensive even if the data is in the cache
208
// (require two load/store issue cycles instead of one).
209
// If only one of the buffers is not 8 bytes aligned,
210
// then it's more important to align dst than src,
211
// because there is more penalty for stores
212
// than loads that cross a cacheline boundary.
213
// This check and realignment are only done if there are >= 832 bytes to copy.
216
// Dst is word aligned, but check if it is already double word aligned.
223
1: // Can only get here if > 64 bytes to copy, so there is no need to check r2.
226
2: // Every loop iteration copies 64 bytes.
227
.irp offset, #0, #8, #16, #24, #32
228
ldrd r4, r5, [r1, \offset]
229
strd r4, r5, [r0, \offset]
232
ldrd r4, r5, [r1, #40]
233
ldrd r6, r7, [r1, #48]
234
ldrd r8, r9, [r1, #56]
236
// Keep the pld as far from the next load as possible.
237
// The amount to prefetch was determined experimentally using
238
// large sizes, and verifying the prefetch size does not affect
239
// the smaller copies too much.
240
// WARNING: If the ldrd and strd instructions get too far away
241
// from each other, performance suffers. Three loads
242
// in a row is the best tradeoff.
243
pld [r1, #(CACHE_LINE_SIZE*16)]
244
strd r4, r5, [r0, #40]
245
strd r6, r7, [r0, #48]
246
strd r8, r9, [r0, #56]
253
// Fix-up the remaining count and make sure we have >= 32 bytes left.
257
// Copy 32 bytes. These cache lines were already preloaded.
258
.irp offset, #0, #8, #16, #24
259
ldrd r4, r5, [r1, \offset]
260
strd r4, r5, [r0, \offset]
265
4: // Less than 32 left.
271
ldrd r4, r5, [r1, \offset]
272
strd r4, r5, [r0, \offset]
277
5: // Copy up to 15 bytes (count in r2).
281
ldrd r4, r5, [r1], #8
282
strd r4, r5, [r0], #8
287
2: // Copy 0 to 4 bytes.
298
// Restore registers: optimized pop {r0, pc}
299
ldrd r8, r9, [sp], #8
300
ldrd r6, r7, [sp], #8
301
ldrd r4, r5, [sp], #8
304
dst_not_word_aligned:
305
// Align dst to word.
322
// Src is guaranteed to be at least word aligned by this point.
325
.size memcpy, .-memcpy