/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif
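
/* Note (expository, not in the original source): CACHE_LINE_SIZE only
 * scales the pld offsets below, so on both 32- and 64-byte-line cores
 * the loops prefetch roughly two cache lines ahead of the addresses
 * currently being loaded.
 */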

/*
 * Optimized memcmp() for Cortex-A9.
 */

        .text
        .globl  memcmp
        .type   memcmp, %function
memcmp:
        pld     [r0, #(CACHE_LINE_SIZE * 0)]
        pld     [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where length is 0 or the buffers are the same */

        pld     [r1, #(CACHE_LINE_SIZE * 0)]
        pld     [r1, #(CACHE_LINE_SIZE * 1)]
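
/* C-level sketch (illustrative only, not part of the original) of what
 * this prologue guards against:
 *
 *   int memcmp(const void *a, const void *b, size_t n)
 *   {
 *       if (n == 0 || a == b)
 *           return 0;            // trivially equal, skip all the work
 *       ...
 *   }
 */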
        /* make sure we have at least 8+4 bytes: this simplifies things below
         * and avoids some overhead for small blocks
         */
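
/* Why 8+4 (expository note): word-aligning the first pointer can consume
 * up to 3 bytes, and the word loops below effectively read one 32-bit
 * word ahead, so requiring 12 bytes up front keeps every load inside
 * both buffers.
 */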
        /* Comparing 32 bytes at a time */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
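
/* Expository note: this path is built only when NEON is available and
 * unaligned vector access is acceptable; vld1.8 (used below with no
 * alignment qualifier) tolerates arbitrarily aligned addresses, so no
 * pointer fixup is needed before the vector loop.
 */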
        /* preload all the cache lines we need */
        pld     [r0, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
1:      /* The main loop compares 32 bytes at a time */
        vld1.8  {d0 - d3}, [r0]!
        pld     [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        /* Start subtracting the values and merge results */

        /* Check if there are any differences among the 32 bytes */

        /* Check if the difference was in the first or last 16 bytes */

        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
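
/* Illustrative C/NEON-intrinsics sketch of one 32-byte iteration (names
 * and structure are mine, not from this file; the real code keeps the two
 * 16-byte halves separate so it knows how far to rewind):
 *
 *   #include <arm_neon.h>
 *   uint8x16_t a0 = vld1q_u8(p0), a1 = vld1q_u8(p0 + 16);
 *   uint8x16_t b0 = vld1q_u8(p1), b1 = vld1q_u8(p1 + 16);
 *   uint8x16_t d0 = veorq_u8(a0, b0), d1 = veorq_u8(a1, b1);
 *   uint64x2_t m  = vreinterpretq_u64_u8(vorrq_u8(d0, d1));
 *   if ((vgetq_lane_u64(m, 0) | vgetq_lane_u64(m, 1)) != 0)
 *       ...   // some byte differs: rewind 16 or 32 bytes, finish bytewise
 */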
3:      /* fix up the remaining count */
        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        /* align first pointer to word boundary */
        /* align first pointer */
0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process
         */
        /* see if the pointers are congruent */
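
/* Expository note: "congruent" here means the two pointers share the same
 * address modulo 4 after the first was word-aligned, i.e. (p0 ^ p1) & 3
 * is 0.  Offset 0 takes the fast word loop below, offset 2 the halfword
 * path at label 5, and offsets 1 and 3 the generic shift path at label 4.
 */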
        /* congruent case: 32 bytes per iteration.
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */
        subs    r2, r2, #(32 + 4)
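
/* Rough C equivalent of the congruent loop (expository sketch; register
 * allocation and unrolling differ in the assembly below):
 *
 *   uint32_t w = *src++;                 // one word of read-ahead
 *   while (n >= 32 + 4) {
 *       for (int i = 0; i < 8; i++) {    // 8 words = 32 bytes
 *           uint32_t a = w;
 *           w = *src++;
 *           if (a != *dst++) goto bytewise;
 *       }
 *       n -= 32;
 *   }
 */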
0:      pld     [r4, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        /* do we have at least 4 bytes left? */
1:      adds    r2, r2, #(32 - 4 + 4)
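
/* Expository note: r2 was biased by -(32 + 4) before the loop and by -32
 * per iteration, so adding 32 - 4 + 4 = 32 here leaves r2 = bytes_left - 4;
 * the flags from adds therefore answer "are at least 4 bytes left?".
 */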
        /* finish off 4 bytes at a time */
        /* finish off the remaining bytes */
2:      /* the last 4 bytes are different, restart them */
        /* process the last few bytes */
9:      /* restore registers and return */
10:     /* process less than 12 bytes */
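
/* The sub-12-byte path is a plain bytewise compare, conceptually:
 *
 *   while (n--) {
 *       int d = *a++ - *b++;
 *       if (d) return d;      // sign of the first differing byte pair
 *   }
 *   return 0;
 */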
5:      /*************** non-congruent case ***************/
        /* here, offset is 2 (16-bit aligned, special-cased) */
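
/* Conceptually (expository sketch), the halfword-offset case rebuilds an
 * aligned word from two adjacent words of the misaligned buffer:
 *
 *   uint32_t cur  = *p1++;                       // p1 rounded down to a word
 *   uint32_t word = (cur >> 16) | (*p1 << 16);   // spans the two words
 *
 * and compares "word" against the aligned buffer one word at a time.
 */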
        /* make sure we have at least 16 bytes to process */
        /* align the unaligned pointer */
6:      pld     [r1, #(CACHE_LINE_SIZE * 2)]
        pld     [r4, #(CACHE_LINE_SIZE * 2)]
        orr     ip, ip, lr, lsl #16
        /* the eq-conditional copies below repeat the halfword merge while
         * the words still compare equal; after the first mismatch the
         * flags are ne and the remaining conditional ops fall through
         */
        moveq   ip, lr, lsr #16
        orreq   ip, ip, lr, lsl #16
        moveq   ip, lr, lsr #16
        orreq   ip, ip, lr, lsl #16
        moveq   ip, lr, lsr #16
        orreq   ip, ip, lr, lsl #16
        /* finish off the remaining bytes */
7:      /* fix up the 2 pointers and fall through... */
4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd   sp!, {r5, r6, r7}
        mov     r5, r0, lsl #3          /* r5 = right shift = 8 * offset */
        rsb     r6, r5, #32             /* r6 = left shift = 32 - right shift */
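
/* Expository sketch: with a byte offset of 1 or 3, each aligned word is
 * reassembled with variable shifts instead of the fixed 16-bit ones:
 *
 *   uint32_t word = (prev >> r5) | (next << r6);   // r5 + r6 == 32
 */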
        /* align the unaligned pointer */
6:      mov     ip, r7, lsr r5
        orr     ip, ip, r7, lsl r6
        orreq   ip, ip, r7, lsl r6
        sub     r1, r1, r6, lsr #3
        ldmfd   sp!, {r5, r6, r7}
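
/* Expository note: r6 is the left-shift amount in bits, so r6 >> 3 is the
 * number of bytes of the last word that were loaded but not consumed; the
 * sub above rewinds r1 by that amount before the bytewise tail.
 */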
        /* finish off the remaining bytes */
7:      /* fix up the 2 pointers and fall through... */
        sub     r1, r1, r6, lsr #3
        ldmfd   sp!, {r5, r6, r7}
        .size   memcmp, .-memcmp