Revision 103 by Will Newton: Split bionic reference code into A15 and A9 versions.
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE 32
#else
#define CACHE_LINE_SIZE 64
#endif

/*
 * Optimized memcmp() for Cortex-A9.
 */
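/*
 * AAPCS arguments: r0 = first buffer, r1 = second buffer, r2 = byte count.
 * The result (the signed difference of the first pair of mismatching bytes,
 * or 0 if the buffers are equal) is returned in r0.
 */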
        .text
        .globl  memcmp
        .type   memcmp,%function
memcmp:
        .fnstart
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where length is 0 or the buffers are the same */
        cmp         r0, r1
        moveq       r0, #0
        bxeq        lr

        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes, this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         10f
/*
 * Neon optimization
 * Comparing 32 bytes at a time
 */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs        r2, r2, #32
        blo         3f

        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

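        /* vsub.i8 leaves a zero lane wherever the two buffers match, so
         * OR-merging every lane down to a single doubleword and moving it
         * to core registers yields a non-zero value iff any byte differs. */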
        /* Start subtracting the values and merge results */
        vsub.i8     q0, q2
        vsub.i8     q1, q3
        vorr        q2, q0, q1
        vorr        d4, d5
        vmov        r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs        r3, ip
        bne         2f
        subs        r2, r2, #32
        bhs         1b
        b           3f
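        /* A difference was found: rewind both pointers (by 32 bytes, or only
         * 16 if the first half matched) so the scalar code below can locate
         * the differing byte and compute the signed result. */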
2:
        /* Check if the difference was in the first or last 16 bytes */
        sub         r0, #32
        vorr        d0, d1
        sub         r1, #32
        vmov        r3, ip, d0
        orrs        r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt        eq
        subeq       r2, #16
        addeq       r0, #16
        addeq       r1, #16

3:      /* fix-up the remaining count */
        add         r2, r2, #32

        cmp         r2, #(8+4)
        bmi         10f
#endif

        .save {r4, lr}
        /* save registers */
        stmfd       sp!, {r4, lr}

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f

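        /* The unrolled loop keeps one word of read-ahead, alternating between
         * ip and lr: each load from r1 fetches the word for the next compare
         * while eors checks the word loaded on the previous step. */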
0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

10:     /* process less than 12 bytes */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr

5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

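        /* Each pass rebuilds r1's halfword-offset words from two aligned
         * loads: the high half of the previous word (lsr #16) is combined
         * with the low half of the next one (lsl #16), then compared against
         * the aligned word loaded from r4. */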
6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right shift amount
        // r6 = left shift amount
        // r7 = scratch (read-ahead word from r1)

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

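        /* Same merge as the halfword case, but with variable shifts:
         * r5 = 8 * offset and r6 = 32 - 8 * offset, so each unaligned word
         * from r1 is assembled from the tails of two aligned loads. */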
6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
        .fnend
        .size memcmp, .-memcmp