/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

	.global memcmp
	.type memcmp, %function
	.text

/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 32 bytes at a time
 * (2) The loads are scheduled in a way that they won't stall
 */
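
/*
 * Register usage in the code below:
 *   r0 - first source pointer on entry, then scratch; holds the return value
 *   r1 - second source pointer
 *   r2 - number of bytes left to compare
 *   r4 - working copy of the first pointer (r0 is needed for the result)
 *   r3, ip, lr (plus r5-r7 in the misaligned path) - scratch
 */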

memcmp:
        .fnstart
        PLD         [r0, #0]
        PLD         [r1, #0]

        /* take care of the case where length is 0 or the buffers are the same */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        .save {r4, lr}
        /* save registers */
        stmfd       sp!, {r4, lr}

        PLD         [r0, #32]
        PLD         [r1, #32]

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

         mov        r4, r0

         /* make sure we have at least 8+4 bytes, this simplifies things below
          * and avoids some overhead for small blocks
          */
         cmp        r2, #(8+4)
         bmi        8f

        /* align first pointer to word boundary
         * offset = -src & 3
         */
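        /* e.g. src = 0x1003 gives offset 1, src = 0x1001 gives offset 3 */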
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f
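
        /* in the unrolled loop below, ip and lr take turns holding the word
         * from r1 that was loaded one step earlier, so each eors compares
         * against data whose load was issued well before it is needed
         * (optimization (2) in the header comment)
         */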

0:      PLD         [r4, #64]
        PLD         [r1, #64]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
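        /* r2 now holds the number of bytes left minus 4 (the word read
         * ahead); the loop below keeps that bias and label 4 restores it
         */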
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr
        .fnend





5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
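        /* each word of the second buffer is rebuilt as (lr >> 16) | (next << 16),
         * i.e. the top half of the previous aligned word followed by the bottom
         * half of the next one (little-endian byte order assumed)
         */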

6:      PLD         [r1, #64]
        PLD         [r4, #64]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2
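        /* r1 was word-aligned and read one word ahead; step it back 2 bytes
         * so it points at the next unread byte of the second buffer
         */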
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right shift amount (offset * 8)
        // r6 = left shift amount (32 - r5)
        // r7 = scratch (read-ahead word from r1)

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */
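        /* e.g. with offset 1: r5 = 8, r6 = 24, and each compared word is
         * (previous >> 8) | (next << 24), i.e. four consecutive bytes of
         * the misaligned buffer (little-endian byte order assumed)
         */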

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        sub         r1, r1, r6, lsr #3
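        /* r6 >> 3 is (4 - offset) bytes: this rewinds r1 from its read-ahead
         * position back to the next unread byte of the second buffer
         */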
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b