/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif

/*
 * Optimized memcmp() for Cortex-A9.
 */
	.text
	.globl memcmp
	.type memcmp,%function
memcmp:
	.fnstart
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where the buffers are the same
         * (the zero-length case is handled by the byte loop at 10: below)
         */
        cmp         r0, r1
        moveq       r0, #0
        bxeq        lr

        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes, this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp        r2, #(8+4)
        bmi        10f
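
/* Rough shape of the routine in C, for orientation (an assumed
 * reconstruction, not part of the original source; "byte_loop" and
 * "noncongruent" are hypothetical names for labels 10: and 5:/4: below):
 *
 *     if (a == b) return 0;
 *     if (n < 12) goto byte_loop;              // too small to amortize setup
 *     // with NEON: consume 32-byte chunks first, then re-check n < 12
 *     align a to a word boundary, comparing the odd leading bytes;
 *     if (((uintptr_t)a ^ (uintptr_t)b) & 3)   // different in-word offsets
 *         goto noncongruent;
 *     compare word at a time, then finish the tail byte by byte;
 */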
/*
 * Neon optimization
 * Comparing 32 bytes at a time
 */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs        r2, r2, #32
        blo         3f

        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Start subtracting the values and merge results */
        vsub.i8     q0, q2
        vsub.i8     q1, q3
        vorr        q2, q0, q1
        vorr        d4, d5
        vmov        r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs        r3, ip
        bne         2f
        subs        r2, r2, #32
        bhs         1b
        b           3f
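
/* Illustrative C equivalent of the NEON test above (a sketch, not part of
 * the original source): vsub.i8 yields a non-zero byte exactly where the
 * inputs differ, and the vorr/vmov/orrs funnel reduces the 32 lanes to one
 * scalar that can be tested cheaply in core registers.
 *
 *     unsigned char d, any = 0;
 *     for (int i = 0; i < 32; i++) {
 *         d = a[i] - b[i];            // vsub.i8 q0,q2 / q1,q3
 *         any |= d;                   // vorr q2 / vorr d4 / vmov / orrs
 *     }
 *     if (any) goto find_half;        // bne 2f
 */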
2:
        /* Check if the difference was in the first or last 16 bytes */
        sub         r0, #32
        vorr        d0, d1
        sub         r1, #32
        vmov        r3, ip, d0
        orrs        r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt        eq
        subeq       r2, #16
        addeq       r0, #16
        addeq       r1, #16
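
/* Illustrative C equivalent of the rewind above (a sketch, not part of the
 * original source): q0 (d0/d1) still holds the differences of the first 16
 * bytes, so testing it tells us which half of the chunk to re-scan.
 *
 *     a -= 32; b -= 32;                  // back to the failing 32-byte chunk
 *     if (first_16_bytes_equal) {        // orrs result was zero
 *         n -= 16; a += 16; b += 16;     // only the second half can differ
 *     }
 *     // fall through to the scalar code with a small remaining count
 */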

3:      /* fix-up the remaining count */
        add         r2, r2, #32

        cmp        r2, #(8+4)
        bmi        10f
#endif

        .save {r4, lr}
        /* save registers */
        stmfd       sp!, {r4, lr}

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b
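
/* Illustrative C equivalent of the alignment step above (a sketch, not part
 * of the original source; a and b stand for r4 and r1, n for r2):
 *
 *     size_t off = (size_t)(0 - (uintptr_t)a) & 3;   // rsb/ands: -src & 3
 *     n -= off;
 *     while (off--) {
 *         int d = *a++ - *b++;                       // unsigned byte loads
 *         if (d != 0) return d;
 *     }
 */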

0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f
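
/* Illustrative C equivalent of the congruence test above (a sketch, not
 * part of the original source): word-at-a-time comparison only works when
 * both pointers share the same offset within a word.
 *
 *     if ((((uintptr_t)a ^ (uintptr_t)b) & 3) != 0)
 *         goto noncongruent;                         // label 5: below
 */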

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f

0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b
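
/* Illustrative C equivalent of one turn of the loop above (a sketch, not
 * part of the original source): the next word of b is always loaded one
 * iteration early, alternating between ip and lr, which hides the load-use
 * latency but can read one word past the last compared position; that is
 * what the 32+4 guard protects against.
 *
 *     uint32_t next = *(const uint32_t *)b;          // ldr ip, [r1]
 *     ...
 *     uint32_t wa = *(const uint32_t *)a; a += 4;    // ldr r0, [r4], #4
 *     uint32_t wb = next;
 *     next = *(const uint32_t *)(b + 4); b += 4;     // ldr lr, [r1, #4]!
 *     if (wa != wb) goto word_differs;               // label 2: below
 */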

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

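/* Illustrative C view of label 2: above (a sketch, not part of the original
 * source): when a word differs, both pointers are rewound and the word is
 * redone byte by byte, because memcmp must return the difference of the
 * first mismatching bytes, not of whole words.
 *
 *     a -= 4; b -= 4;
 *     for (int i = 0; i < 4; i++) {
 *         int d = *a++ - *b++;
 *         if (d != 0) return d;
 *     }
 */
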
10:     /* process less than 12 bytes */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr
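
/* Illustrative C equivalent of the small-block path above (a sketch, not
 * part of the original source): below 12 bytes a plain byte loop beats any
 * alignment setup, and it also picks up the n == 0 case.
 *
 *     if (n == 0) return 0;
 *     do {
 *         int d = *a++ - *b++;
 *         if (d != 0) return d;
 *     } while (--n);
 *     return 0;
 */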

5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bit aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b
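
/* Illustrative C equivalent of one step of the loop above (a sketch, not
 * part of the original source; assumes little-endian byte order, as the
 * lsr/lsl pair does): two aligned loads from b are spliced into the 32-bit
 * value an unaligned load at b would have returned.
 *
 *     uint32_t prev = *(const uint32_t *)b_aligned;  // b rounded down by 2
 *     uint32_t next = *(const uint32_t *)(b_aligned + 4);
 *     uint32_t wb   = (prev >> 16) | (next << 16);   // mov lsr / orr lsl
 *     uint32_t wa   = *(const uint32_t *)a;
 *     if (wa != wb) goto fixup;                      // label 7: above
 *     prev = next;                                   // slide the window
 */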


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right shift amount
        // r6 = left shift amount
        // r7 = scratch

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
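
/* Illustrative C equivalent of one step of the loop above (a sketch, not
 * part of the original source; little-endian assumed): the same splicing as
 * the halfword case, generalized to a variable shift for offsets 1 and 3.
 *
 *     unsigned rs = 8 * ((uintptr_t)b & 3);             // r5: right shift
 *     unsigned ls = 32 - rs;                            // r6: left shift
 *     uint32_t prev = *(const uint32_t *)((uintptr_t)b & ~(uintptr_t)3);
 *     ...
 *     uint32_t next = *(const uint32_t *)b; b += 4;     // aligned load
 *     uint32_t wb   = (prev >> rs) | (next << ls);      // spliced word of b
 *     prev = next;
 */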
	.fnend
	.size memcmp, .-memcmp