/*
 * From linaro-toolchain-dev cortex-strings/trunk
 * (rev 103 by Will Newton: "Split bionic reference code into A15 and A9
 * versions.")
 */
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This code assumes it is running on a processor that supports all arm v7
 * instructions, that supports neon instructions, and that has a 32 byte
 * cache line.
 */
        .text
        .fpu    neon

#define CACHE_LINE_SIZE     32

/*
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * AAPCS (32-bit ARM):
 *   In:   r0 = dst, r1 = src, r2 = byte count
 *   Out:  r0 = original dst (saved on entry with {r0, lr}, restored on exit)
 *   Uses: r3, ip(r12), lr as scratch on the NEON path; the word-aligned
 *         ARM-only path (label 11) additionally saves/restores r4-r8.
 *
 * Strategy (as implemented below):
 *   - counts < 16 go straight to the byte/halfword/word tail (label 5);
 *   - mutually word-aligned buffers take an ldmia/stmia-only path (label 11);
 *   - otherwise a NEON path copies 64/32/16-byte chunks, aligning the
 *     destination to 16 bytes first when n >= 224, with pld prefetches
 *     issued in CACHE_LINE_SIZE (32-byte) strides.
 *
 * NOTE(review): conditional mnemonics use pre-UAL "divided" syntax
 * (ldrmib/strcsb/ldmcsia...); assemble in divided-syntax mode.
 */
	.globl memcpy
	.type memcpy,%function
memcpy:
	.fnstart
        .save       {r0, lr}
        /* start preloading as early as possible */
        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        stmfd       sp!, {r0, lr}
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        // Check so divider is at least 16 bytes, needed for alignment code.
        cmp         r2, #16
        blo         5f


        /* check if buffers are aligned. If so, run arm-only version */
        eor         r3, r0, r1
        ands        r3, r3, #0x3
        beq         11f

        /* Check the upper size limit for Neon unaligned memory access in memcpy */
        cmp         r2, #224
        blo         3f

        /* align destination to 16 bytes for the write-buffer */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         3f

        /* copy up to 15-bytes (count in r3) */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31         /* C = bit1, N = bit0 of r3 */
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        movs        ip, r3, lsl #29         /* C = bit3, N = bit2 of r3 */
        bge         1f
        // copies 4 bytes, destination 32-bits aligned
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:
        /* preload immediately the next cache line, which we may need */
        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
3:
        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need */
        pld         [r1, #(CACHE_LINE_SIZE * 4)]
        pld         [r1, #(CACHE_LINE_SIZE * 6)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 6)]
        subs        r2, r2, #64
        vst1.8      {d0 - d3}, [r0]!
        vst1.8      {d4 - d7}, [r0]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0]!
5:      /* copy up to 15-bytes (count in r2) */
        movs        ip, r2, lsl #29         /* C = bit3, N = bit2 of r2 */
        bcc         1f
        vld1.8      {d0}, [r1]!             /* 8 bytes */
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld1.32     {d0[0]}, [r1]!          /* 4 bytes */
        vst1.32     {d0[0]}, [r0]!
2:      movs        ip, r2, lsl #31         /* C = bit1, N = bit0 of r2 */
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        ldmfd       sp!, {r0, lr}           /* restore original dst into r0 */
        bx          lr
11:
        /* Simple arm-only copy loop to handle aligned copy operations */
        stmfd       sp!, {r4, r5, r6, r7, r8}
        pld         [r1, #(CACHE_LINE_SIZE * 4)]

        /* Check alignment */
        rsb         r3, r1, #0
        ands        r3, #3
        beq         2f

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31
        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib      r3, [r1], #1
        ldrcsb      r4, [r1], #1
        ldrcsb      r5, [r1], #1
        strmib      r3, [r0], #1
        strcsb      r4, [r0], #1
        strcsb      r5, [r0], #1

2:
        subs        r2, r2, #64
        blt         4f

3:      /* Main copy loop, copying 64 bytes at a time */
        pld         [r1, #(CACHE_LINE_SIZE * 8)]
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, r2, #64
        bge         3b

4:      /* Check if there are > 32 bytes left */
        adds        r2, r2, #64
        subs        r2, r2, #32
        blt         5f

        /* Copy 32 bytes */
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, #32

5:      /* Handle any remaining bytes */
        adds        r2, #32
        beq         6f

        movs        r12, r2, lsl #28        /* C = bit4, N = bit3 of r2 */
        ldmcsia     r1!, {r3, r4, r5, r6}   /* 16 bytes */
        ldmmiia     r1!, {r7, r8}           /*  8 bytes */
        stmcsia     r0!, {r3, r4, r5, r6}
        stmmiia     r0!, {r7, r8}
        movs        r12, r2, lsl #30        /* C = bit2, N = bit1 of r2 */
        ldrcs       r3, [r1], #4            /*  4 bytes */
        ldrmih      r4, [r1], #2            /*  2 bytes */
        strcs       r3, [r0], #4
        strmih      r4, [r0], #2
        tst         r2, #0x1
        ldrneb      r3, [r1]                /*  last byte  */
        strneb      r3, [r0]
6:
        ldmfd       sp!, {r4, r5, r6, r7, r8}
        ldmfd       sp!, {r0, pc}           /* return original dst */
	.fnend
	.size memcpy, .-memcpy