4
@ Copyright (c) 2010-2011, Linaro Limited
7
@ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
9
@ * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
10
@ * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
11
@ * Neither the name of Linaro Limited nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
13
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15
@ Written by Dave Gilbert <david.gilbert@linaro.org>
17
@ This memcpy routine is optimised on a Cortex-A9 and should work on all ARMv7
1
/* Copyright (c) 2010-2011, Linaro Limited
4
Redistribution and use in source and binary forms, with or without
5
modification, are permitted provided that the following conditions
8
* Redistributions of source code must retain the above copyright
9
notice, this list of conditions and the following disclaimer.
11
* Redistributions in binary form must reproduce the above copyright
12
notice, this list of conditions and the following disclaimer in the
13
documentation and/or other materials provided with the distribution.
15
* Neither the name of Linaro Limited nor the names of its
16
contributors may be used to endorse or promote products derived
17
from this software without specific prior written permission.
19
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
Written by Dave Gilbert <david.gilbert@linaro.org>
33
This memcpy routine is optimised on a Cortex-A9 and should work on
34
all ARMv7 processors. */
20
36
@ 2011-09-01 david.gilbert@linaro.org
21
37
@ Extracted from local git 2f11b436
23
42
@ this lets us check a flag in a 00/ff byte easily in either endianness
25
44
#define CHARTSTMASK(c) 1<<(31-(c*8))
27
46
#define CHARTSTMASK(c) 1<<(c*8)
32
51
@ ---------------------------------------------------------------------------
37
56
.type memcpy,%function
43
@ Overlaps of source/dest not allowed according to spec
44
@ Note this routine relies on v7 misaligned loads/stores
46
mov r12, r0 @ stash original r0
48
blt 10f @ take the small copy case separately
62
@ Overlaps of source/dest not allowed according to spec
63
@ Note this routine relies on v7 misaligned loads/stores
65
mov r12, r0 @ stash original r0
67
blt 10f @ take the small copy case separately
50
@ test for either source or destination being misaligned
51
@ (We only rely on word align)
52
@ TODO: Test for co-misalignment
56
bne 30f @ misaligned case
69
@ test for either source or destination being misaligned
70
@ (We only rely on word align)
71
@ TODO: Test for co-misalignment
75
bne 30f @ misaligned case
59
@ at this point we are word (or better) aligned and have at least
60
@ 32 bytes to play with
61
push {r3,r4,r5,r6,r7,r8,r10,r11}
78
@ at this point we are word (or better) aligned and have at least
79
@ 32 bytes to play with
80
push {r3,r4,r5,r6,r7,r8,r10,r11}
63
ldmia r1!,{r3,r4,r5,r6,r7,r8,r10,r11}
67
stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
82
ldmia r1!,{r3,r4,r5,r6,r7,r8,r10,r11}
86
stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
70
pop {r3,r4,r5,r6,r7,r8,r10,r11}
71
@ We are now down to less than 32 bytes
72
cbz r2,15f @ quick exit for the case where we copied a multiple of 32
89
pop {r3,r4,r5,r6,r7,r8,r10,r11}
90
@ We are now down to less than 32 bytes
91
cbz r2,15f @ quick exit for the case where we copied a multiple of 32
74
93
10: @ small copies (not necessarily aligned - note might be slightly more than 32bytes)
95
mov r0,r12 @ restore r0
114
mov r0,r12 @ restore r0
98
117
30: @ non-aligned - at least 32 bytes to play with
99
@ On v7 we're allowed to do ldr's and str's from arbitrary alignments
100
@ but not ldrd/strd or ldm/stm
101
@ Note Neon is often a better choice misaligned using vld1
118
@ On v7 we're allowed to do ldr's and str's from arbitrary alignments
119
@ but not ldrd/strd or ldm/stm
120
@ Note Neon is often a better choice misaligned using vld1
103
@ copy a byte at a time until the point where we have an aligned destination
104
@ we know we have enough bytes to go to know we won't run out in this phase
122
@ copy a byte at a time until the point where we have an aligned destination
123
@ we know we have enough bytes to go to know we won't run out in this phase
115
cmp r2,#32 @ Lets get back to knowing we have 32 bytes to play with
118
@ Now the store address is aligned
134
cmp r2,#32 @ Lets get back to knowing we have 32 bytes to play with
137
@ Now the store address is aligned
120
push {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
121
and r6,r1,#3 @ how misaligned we are
123
cbz r6, 100f @ Go there if we're actually aligned
124
bge 120f @ And here if it's aligned on 2 or 3 byte
125
@ Note might be worth splitting to bgt and a separate beq
126
@ if the branches are well separated
139
push {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
140
and r6,r1,#3 @ how misaligned we are
142
cbz r6, 100f @ Go there if we're actually aligned
143
bge 120f @ And here if it's aligned on 2 or 3 byte
144
@ Note might be worth splitting to bgt and a separate beq
145
@ if the branches are well separated
128
@ At this point dest is aligned, source is 1 byte forward
147
@ At this point dest is aligned, source is 1 byte forward
130
ldr r3,[r1] @ Misaligned load - but it gives the first 4 bytes to store
131
sub r2,r2,#3 @ Number of bytes left in whole words we can load
132
add r1,r1,#3 @ To aligned load address
133
bic r3,r3,#0xff000000
149
ldr r3,[r1] @ Misaligned load - but it gives the first 4 bytes to store
150
sub r2,r2,#3 @ Number of bytes left in whole words we can load
151
add r1,r1,#3 @ To aligned load address
152
bic r3,r3,#0xff000000
136
ldmia r1!,{r5,r6,r7,r8}
146
ldmia r1!,{r10,r11,r12,r14}
154
orr r10,r10,r12,lsl#24
156
orr r11,r11,r14,lsl#24
157
stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
162
@ Deal with the stragglers
165
pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
155
ldmia r1!,{r5,r6,r7,r8}
165
ldmia r1!,{r10,r11,r12,r14}
173
orr r10,r10,r12,lsl#24
175
orr r11,r11,r14,lsl#24
176
stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
181
@ Deal with the stragglers
184
pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
168
187
100: @ Dest and source aligned - must have been originally co-misaligned
169
@ Fallback to main aligned case if still big enough
170
pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
171
b 4b @ Big copies (32 bytes or more)
188
@ Fallback to main aligned case if still big enough
189
pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
190
b 4b @ Big copies (32 bytes or more)
173
192
120: @ Dest is aligned, source is align+2 or 3
174
bgt 130f @ Now split off for 3 byte offset
193
bgt 130f @ Now split off for 3 byte offset
177
sub r2,r2,#2 @ Number of bytes left in whole words we can load
178
add r1,r1,#2 @ To aligned load address
196
sub r2,r2,#2 @ Number of bytes left in whole words we can load
197
add r1,r1,#2 @ To aligned load address
181
ldmia r1!,{r5,r6,r7,r8}
191
ldmia r1!,{r10,r11,r12,r14}
199
orr r10,r10,r12,lsl#16
201
orr r11,r11,r14,lsl#16
202
stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
207
@ Deal with the stragglers
210
pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
200
ldmia r1!,{r5,r6,r7,r8}
210
ldmia r1!,{r10,r11,r12,r14}
218
orr r10,r10,r12,lsl#16
220
orr r11,r11,r14,lsl#16
221
stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
226
@ Deal with the stragglers
229
pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
213
232
130: @ Dest is aligned, source is align+3
215
sub r2,r2,#1 @ Number of bytes left in whole words we can load
216
add r1,r1,#1 @ To aligned load address
234
sub r2,r2,#1 @ Number of bytes left in whole words we can load
235
add r1,r1,#1 @ To aligned load address
219
ldmia r1!,{r5,r6,r7,r8}
229
ldmia r1!,{r10,r11,r12,r14}
237
orr r10,r10,r12,lsl#8
239
orr r11,r11,r14,lsl#8
240
stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
245
@ Deal with the stragglers
248
pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
238
ldmia r1!,{r5,r6,r7,r8}
248
ldmia r1!,{r10,r11,r12,r14}
256
orr r10,r10,r12,lsl#8
258
orr r11,r11,r14,lsl#8
259
stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
264
@ Deal with the stragglers
267
pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}