/*
 * Cortex-A9 variant — split from the bionic reference code into
 * A15 and A9 versions by Will Newton.
 */
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This code assumes it is running on a processor that supports all arm v7
 * instructions, that supports neon instructions, and that has a 32 byte
 * cache line.
 */
35 |
.text
|
|
36 |
.fpu neon |
|
37 |
||
38 |
#define CACHE_LINE_SIZE 32
|
|
39 |
||
40 |
.globl memcpy |
|
41 |
.type memcpy,%function |
|
42 |
memcpy: |
|
43 |
.fnstart
|
|
44 |
.save {r0, lr} |
|
45 |
/* start preloading as early as possible */ |
|
46 |
pld [r1, #(CACHE_LINE_SIZE * 0)] |
|
47 |
stmfd sp!, {r0, lr} |
|
48 |
pld [r1, #(CACHE_LINE_SIZE * 2)] |
|
49 |
||
50 |
// Check so divider is at least 16 bytes, needed for alignment code. |
|
51 |
cmp r2, #16 |
|
52 |
blo 5f |
|
53 |
||
54 |
||
55 |
/* check if buffers are aligned. If so, run arm-only version */ |
|
56 |
eor r3, r0, r1 |
|
57 |
ands r3, r3, #0x3 |
|
58 |
beq 11f |
|
59 |
||
60 |
/* Check the upper size limit for Neon unaligned memory access in memcpy */ |
|
61 |
cmp r2, #224 |
|
62 |
blo 3f |
|
63 |
||
64 |
/* align destination to 16 bytes for the write-buffer */ |
|
65 |
rsb r3, r0, #0 |
|
66 |
ands r3, r3, #0xF |
|
67 |
beq 3f |
|
68 |
||
69 |
/* copy up to 15-bytes (count in r3) */ |
|
70 |
sub r2, r2, r3 |
|
71 |
movs ip, r3, lsl #31 |
|
72 |
ldrmib lr, [r1], #1 |
|
73 |
strmib lr, [r0], #1 |
|
74 |
ldrcsb ip, [r1], #1 |
|
75 |
ldrcsb lr, [r1], #1 |
|
76 |
strcsb ip, [r0], #1 |
|
77 |
strcsb lr, [r0], #1 |
|
78 |
movs ip, r3, lsl #29 |
|
79 |
bge 1f |
|
80 |
// copies 4 bytes, destination 32-bits aligned |
|
81 |
vld1.32 {d0[0]}, [r1]! |
|
82 |
vst1.32 {d0[0]}, [r0, :32]! |
|
83 |
1: bcc 2f |
|
84 |
// copies 8 bytes, destination 64-bits aligned |
|
85 |
vld1.8 {d0}, [r1]! |
|
86 |
vst1.8 {d0}, [r0, :64]! |
|
87 |
2: |
|
88 |
/* preload immediately the next cache line, which we may need */ |
|
89 |
pld [r1, #(CACHE_LINE_SIZE * 0)] |
|
90 |
pld [r1, #(CACHE_LINE_SIZE * 2)] |
|
91 |
3: |
|
92 |
/* make sure we have at least 64 bytes to copy */ |
|
93 |
subs r2, r2, #64 |
|
94 |
blo 2f |
|
95 |
||
96 |
/* preload all the cache lines we need */ |
|
97 |
pld [r1, #(CACHE_LINE_SIZE * 4)] |
|
98 |
pld [r1, #(CACHE_LINE_SIZE * 6)] |
|
99 |
||
100 |
1: /* The main loop copies 64 bytes at a time */ |
|
101 |
vld1.8 {d0 - d3}, [r1]! |
|
102 |
vld1.8 {d4 - d7}, [r1]! |
|
103 |
pld [r1, #(CACHE_LINE_SIZE * 6)] |
|
104 |
subs r2, r2, #64 |
|
105 |
vst1.8 {d0 - d3}, [r0]! |
|
106 |
vst1.8 {d4 - d7}, [r0]! |
|
107 |
bhs 1b |
|
108 |
||
109 |
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */ |
|
110 |
add r2, r2, #64 |
|
111 |
subs r2, r2, #32 |
|
112 |
blo 4f |
|
113 |
||
114 |
3: /* 32 bytes at a time. These cache lines were already preloaded */ |
|
115 |
vld1.8 {d0 - d3}, [r1]! |
|
116 |
subs r2, r2, #32 |
|
117 |
vst1.8 {d0 - d3}, [r0]! |
|
118 |
bhs 3b |
|
119 |
||
120 |
4: /* less than 32 left */ |
|
121 |
add r2, r2, #32 |
|
122 |
tst r2, #0x10 |
|
123 |
beq 5f |
|
124 |
// copies 16 bytes, 128-bits aligned |
|
125 |
vld1.8 {d0, d1}, [r1]! |
|
126 |
vst1.8 {d0, d1}, [r0]! |
|
127 |
5: /* copy up to 15-bytes (count in r2) */ |
|
128 |
movs ip, r2, lsl #29 |
|
129 |
bcc 1f |
|
130 |
vld1.8 {d0}, [r1]! |
|
131 |
vst1.8 {d0}, [r0]! |
|
132 |
1: bge 2f |
|
133 |
vld1.32 {d0[0]}, [r1]! |
|
134 |
vst1.32 {d0[0]}, [r0]! |
|
135 |
2: movs ip, r2, lsl #31 |
|
136 |
ldrmib r3, [r1], #1 |
|
137 |
ldrcsb ip, [r1], #1 |
|
138 |
ldrcsb lr, [r1], #1 |
|
139 |
strmib r3, [r0], #1 |
|
140 |
strcsb ip, [r0], #1 |
|
141 |
strcsb lr, [r0], #1 |
|
142 |
||
143 |
ldmfd sp!, {r0, lr} |
|
144 |
bx lr |
|
145 |
11: |
|
146 |
/* Simple arm-only copy loop to handle aligned copy operations */ |
|
147 |
stmfd sp!, {r4, r5, r6, r7, r8} |
|
148 |
pld [r1, #(CACHE_LINE_SIZE * 4)] |
|
149 |
||
150 |
/* Check alignment */ |
|
151 |
rsb r3, r1, #0 |
|
152 |
ands r3, #3 |
|
153 |
beq 2f |
|
154 |
||
155 |
/* align source to 32 bits. We need to insert 2 instructions between |
|
156 |
* a ldr[b|h] and str[b|h] because byte and half-word instructions |
|
157 |
* stall 2 cycles. |
|
158 |
*/
|
|
159 |
movs r12, r3, lsl #31 |
|
160 |
sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */ |
|
161 |
ldrmib r3, [r1], #1 |
|
162 |
ldrcsb r4, [r1], #1 |
|
163 |
ldrcsb r5, [r1], #1 |
|
164 |
strmib r3, [r0], #1 |
|
165 |
strcsb r4, [r0], #1 |
|
166 |
strcsb r5, [r0], #1 |
|
167 |
||
168 |
2: |
|
169 |
subs r2, r2, #64 |
|
170 |
blt 4f |
|
171 |
||
172 |
3: /* Main copy loop, copying 64 bytes at a time */ |
|
173 |
pld [r1, #(CACHE_LINE_SIZE * 8)] |
|
174 |
ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr} |
|
175 |
stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr} |
|
176 |
ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr} |
|
177 |
stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr} |
|
178 |
subs r2, r2, #64 |
|
179 |
bge 3b |
|
180 |
||
181 |
4: /* Check if there are > 32 bytes left */ |
|
182 |
adds r2, r2, #64 |
|
183 |
subs r2, r2, #32 |
|
184 |
blt 5f |
|
185 |
||
186 |
/* Copy 32 bytes */ |
|
187 |
ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr} |
|
188 |
stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr} |
|
189 |
subs r2, #32 |
|
190 |
||
191 |
5: /* Handle any remaining bytes */ |
|
192 |
adds r2, #32 |
|
193 |
beq 6f |
|
194 |
||
195 |
movs r12, r2, lsl #28 |
|
196 |
ldmcsia r1!, {r3, r4, r5, r6} /* 16 bytes */ |
|
197 |
ldmmiia r1!, {r7, r8} /* 8 bytes */ |
|
198 |
stmcsia r0!, {r3, r4, r5, r6} |
|
199 |
stmmiia r0!, {r7, r8} |
|
200 |
movs r12, r2, lsl #30 |
|
201 |
ldrcs r3, [r1], #4 /* 4 bytes */ |
|
202 |
ldrmih r4, [r1], #2 /* 2 bytes */ |
|
203 |
strcs r3, [r0], #4 |
|
204 |
strmih r4, [r0], #2 |
|
205 |
tst r2, #0x1 |
|
206 |
ldrneb r3, [r1] /* last byte */ |
|
207 |
strneb r3, [r0] |
|
208 |
6: |
|
209 |
ldmfd sp!, {r4, r5, r6, r7, r8} |
|
210 |
ldmfd sp!, {r0, pc} |
|
211 |
.fnend
|
|
212 |
.size memcpy, .-memcpy |