/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

    /* Prototype: void *memcpy (void *dst, const void *src, size_t count).  */

        // This version is tuned for the Cortex-A15 processor.

        .text
        .syntax unified
        .fpu    neon

#define CACHE_LINE_SIZE 64

        .globl memcpy
        .type memcpy,%function
memcpy:
        .fnstart
        // Assumes that n >= 0 and that dst and src are valid pointers.
        // For any size of less than 832 bytes, use the NEON code, which
        // does not care about src alignment. This avoids any checks for
        // src alignment and offers the best improvement, since smaller
        // copies are dominated by the overhead of the code before and
        // after the main loop.
        // For larger copies, if src and dst cannot both be aligned to
        // word boundaries, use the NEON code.
        // For all other copies, align dst to a double word boundary
        // and copy using LDRD/STRD instructions.
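
        // In outline, the dispatch above is roughly (a C sketch; the
        // label names are illustrative, not part of this source):
        //
        //   if (n < 16)                 goto byte_tail;
        //   if (n < 832)                goto neon_copy;
        //   if (((dst ^ src) & 3) != 0) goto neon_copy;
        //   /* otherwise fall through to the aligned LDRD/STRD path */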

        // Save registers (r0 holds the return value):
        // optimized push {r0, lr}.
        .save   {r0, lr}
        pld     [r1, #(CACHE_LINE_SIZE*16)]
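        // The pld above starts prefetching CACHE_LINE_SIZE*16 = 1024 bytes
        // ahead of src, before the first access.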
        push    {r0, lr}

        cmp     r2, #16
        blo     copy_less_than_16_unknown_align

        cmp     r2, #832
        bge     check_alignment

copy_unknown_alignment:
        // Unknown alignment of src and dst.
        // Assumes that the first few bytes have already been prefetched.

        // Align destination to 128 bits. The main loop store instructions
        // require this alignment or they will raise an alignment fault.
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
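        // r3 = (-dst) & 0xF: the number of bytes needed to reach the
        // next 16-byte boundary (0 if dst is already aligned).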
        beq         2f

        // Copy up to 15 bytes (count in r3).
        sub         r2, r2, r3
        movs        ip, r3, lsl #31
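        // lsl #31 puts bit 0 of the count into N and bit 1 into C, so the
        // conditional copies below move 1 byte (mi) and then 2 bytes (cs).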

        itt         mi
        ldrbmi      lr, [r1], #1
        strbmi      lr, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1
        ldrbcs      lr, [r1], #1
        strbcs      ip, [r0], #1
        strbcs      lr, [r0], #1

        movs        ip, r3, lsl #29
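        // Likewise, lsl #29 puts bit 2 into N and bit 3 into C; bge
        // (V is clear on these paths, so N clear) skips the 4-byte copy,
        // and bcc skips the 8-byte copy.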
        bge         1f
        // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc         2f
        // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!

2:      // Make sure we have at least 64 bytes to copy.
        subs        r2, r2, #64
        blo         2f

1:      // The main loop copies 64 bytes at a time.
        vld1.8      {d0  - d3},   [r1]!
        vld1.8      {d4  - d7},   [r1]!
        pld         [r1, #(CACHE_LINE_SIZE*4)]
        subs        r2, r2, #64
        vst1.8      {d0  - d3},   [r0, :128]!
        vst1.8      {d4  - d7},   [r0, :128]!
        bhs         1b

2:      // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds        r2, r2, #32
        blo         3f

        // Copy 32 bytes. These cache lines were already preloaded.
        vld1.8      {d0 - d3},  [r1]!
        sub         r2, r2, #32
        vst1.8      {d0 - d3},  [r0, :128]!
3:      // Less than 32 bytes left.
        add         r2, r2, #32
        tst         r2, #0x10
        beq         copy_less_than_16_unknown_align
        // Copies 16 bytes, destination 128 bits aligned.
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0, :128]!

copy_less_than_16_unknown_align:
        // Copy up to 15 bytes (count in r2).
        movs        ip, r2, lsl #29
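        // Same bit-test trick as above: C holds bit 3 of the count
        // (8-byte copy) and N holds bit 2 (4-byte copy).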
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!

2:      // Copy 0 to 4 bytes.
        lsls        r2, r2, #31
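        // After lsls #31, ne means bit 0 was set (one stray byte) and
        // cs means bit 1 was set (two more bytes).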
        itt         ne
        ldrbne      lr, [r1], #1
        strbne      lr, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1
        ldrbcs      lr, [r1]
        strbcs      ip, [r0], #1
        strbcs      lr, [r0]

        pop         {r0, pc}

check_alignment:
        // If src and dst cannot both be aligned to a word boundary,
        // use the unaligned copy version.
        eor     r3, r0, r1
        ands    r3, r3, #0x3
        bne     copy_unknown_alignment

        // To try to improve performance, the stack layout is changed,
        // i.e., it does not keep the layout users expect
        // (highest-numbered register at the highest address).
        // TODO: Add debug frame directives.
        // We don't need exception unwind directives, because the code below
        // does not throw any exceptions and does not call any other functions.
        // Generally, newlib functions like this lack debug information for
        // assembler source.
        .save   {r4, r5}
        strd    r4, r5, [sp, #-8]!
        .save   {r6, r7}
        strd    r6, r7, [sp, #-8]!
        .save   {r8, r9}
        strd    r8, r9, [sp, #-8]!

        // The code below is optimized for an already word-aligned dst.
        ands    ip, r0, #3
        bne     dst_not_word_aligned

word_aligned:
        // Align the destination buffer to 8 bytes, to make sure double
        // loads and stores don't cross a cache line boundary,
        // as they are then more expensive even if the data is in the cache
        // (they require two load/store issue cycles instead of one).
        // If only one of the buffers is not 8-byte aligned,
        // then it's more important to align dst than src,
        // because there is more penalty for stores
        // than for loads that cross a cache line boundary.
        // This check and realignment are only done if there are at least
        // 832 bytes to copy.

        // Dst is word aligned, but check if it is already double word aligned.
        ands    r3, r0, #4
        beq     1f
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        sub     r2, #4

1:      // We can only get here with more than 64 bytes to copy, so there
        // is no need to check r2 first.
        sub     r2, #64
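        // Biasing the count by 64 up front lets the loop below test
        // "another 64-byte block remains" straight from the subs flags.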

2:      // Every loop iteration copies 64 bytes.
        .irp    offset, #0, #8, #16, #24, #32
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
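        // The .irp above expands to five ldrd/strd pairs (offsets 0 to 32),
        // copying the first 40 bytes of each 64-byte block.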

        ldrd    r4, r5, [r1, #40]
        ldrd    r6, r7, [r1, #48]
        ldrd    r8, r9, [r1, #56]

        // Keep the pld as far from the next load as possible.
        // The prefetch distance was determined experimentally using
        // large sizes, while verifying that it does not affect the
        // smaller copies too much.
        // WARNING: If the ldrd and strd instructions get too far away
        //          from each other, performance suffers. Three loads
        //          in a row is the best tradeoff.
        pld     [r1, #(CACHE_LINE_SIZE*16)]
        strd    r4, r5, [r0, #40]
        strd    r6, r7, [r0, #48]
        strd    r8, r9, [r0, #56]

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     2b

        // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds    r2, r2, #32
        blo     4f

        // Copy 32 bytes. These cache lines were already preloaded.
        .irp    offset, #0, #8, #16, #24
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #32
        add     r0, r0, #32
        sub     r2, r2, #32
4:      // Less than 32 bytes left.
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // Copy 16 bytes.
        .irp    offset, #0, #8
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #16
        add     r0, r0, #16

5:      // Copy up to 15 bytes (count in r2).
        movs    ip, r2, lsl #29
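        // As in the NEON tail: C holds bit 3 of the count, N holds bit 2,
        // and the lsls #31 below tests bits 1 and 0.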
        bcc     1f
        // Copy 8 bytes.
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
1:      bge     2f
        // Copy 4 bytes.
        ldr     r4, [r1], #4
        str     r4, [r0], #4
2:      // Copy 0 to 4 bytes.
        lsls    r2, r2, #31
        itt     ne
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

        // Restore registers: an optimized pop {r0, pc}.
        ldrd    r8, r9, [sp], #8
        ldrd    r6, r7, [sp], #8
        ldrd    r4, r5, [sp], #8
        pop     {r0, pc}

dst_not_word_aligned:
        // Align dst to a word boundary.
        rsb     ip, ip, #4
        cmp     ip, #2
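        // ip = 4 - (dst & 3) is 1, 2, or 3 bytes to copy: gt copies a byte
        // only when ip == 3, ge when ip >= 2, and the last byte always runs.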

        itt     gt
        ldrbgt  lr, [r1], #1
        strbgt  lr, [r0], #1

        itt     ge
        ldrbge  lr, [r1], #1
        strbge  lr, [r0], #1

        ldrb    lr, [r1], #1
        strb    lr, [r0], #1

        sub     r2, r2, ip

        // Src is guaranteed to be at least word aligned by this point.
        b       word_aligned
        .fnend
        .size memcpy, .-memcpy