~linaro-toolchain-dev/cortex-strings/trunk : contents of src/aarch64/strcmp.S at revision 123

~linaro-toolchain-dev/cortex-strings/trunk : (revision 123)

/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define result		x0

/* Internal variables.  */
#define data1		x2
#define data1w		w2
#define data2		x3
#define data2w		w3
#define has_nul		x4
#define diff		x5
#define syndrome	x6
#define tmp1		x7
#define tmp2		x8
#define tmp3		x9
#define zeroones	x10
#define pos		x11

	/* Start of performance-critical section  -- one 64B cache line.  */
def_fn strcmp p2align=6
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	b.ne	.Lmisaligned8
	ands	tmp1, src1, #7
	b.ne	.Lmutual_align
	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
.Lloop_aligned:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned:
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	orr	syndrome, diff, has_nul
	cbz	syndrome, .Lloop_aligned
	/* End of performance-critical section  -- one 64B cache line.  */

#ifndef	__AARCH64EB__
	rev	syndrome, syndrome
	rev	data1, data1
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#else
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

.Lmutual_align:
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that preceed the start point.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
	ldr	data1, [src1], #8
	neg	tmp1, tmp1		/* Bits to alignment -64.  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#endif
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	.Lstart_realigned

.Lmisaligned8:
	/* We can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	.Lmisaligned8
	sub	result, data1, data2
	ret

89 by Michael Hope Add aarch64 implementations of memcpy, memset and strcmp to	1	/* Copyright (c) 2012, Linaro Limited
	2	All rights reserved.
	3
	4	Redistribution and use in source and binary forms, with or without
	5	modification, are permitted provided that the following conditions are met:
	6	* Redistributions of source code must retain the above copyright
	7	notice, this list of conditions and the following disclaimer.
	8	* Redistributions in binary form must reproduce the above copyright
	9	notice, this list of conditions and the following disclaimer in the
	10	documentation and/or other materials provided with the distribution.
	11	* Neither the name of the Linaro nor the
	12	names of its contributors may be used to endorse or promote products
	13	derived from this software without specific prior written permission.
	14
	15	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	16	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	17	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	18	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	19	HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	20	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	21	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	22	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	23	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	24	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	25	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
	26
	27	/* Assumptions:
	28	*
	29	* ARMv8-a, AArch64
	30	*/
	31
	32	.macro def_fn f p2align=0
	33	.text
	34	.p2align \p2align
	35	.global \f
	36	.type \f, %function
	37	\f:
	38	.endm
	39
	40	#define REP8_01 0x0101010101010101
	41	#define REP8_7f 0x7f7f7f7f7f7f7f7f
	42	#define REP8_80 0x8080808080808080
	43
	44	/* Parameters and result. */
	45	#define src1 x0
	46	#define src2 x1
	47	#define result x0
	48
	49	/* Internal variables. */
	50	#define data1 x2
	51	#define data1w w2
	52	#define data2 x3
	53	#define data2w w3
	54	#define has_nul x4
	55	#define diff x5
	56	#define syndrome x6
	57	#define tmp1 x7
	58	#define tmp2 x8
	59	#define tmp3 x9
	60	#define zeroones x10
	61	#define pos x11
	62
	63	/* Start of performance-critical section -- one 64B cache line. */
	64	def_fn strcmp p2align=6
65	eor tmp1, src1, src2
66	mov zeroones, #REP8_01
67	tst tmp1, #7
68	b.ne .Lmisaligned8
69	ands tmp1, src1, #7
70	b.ne .Lmutual_align
71	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
72	(=> (X - 1) & ~(X \| 0x7f)) is non-zero iff a byte is zero, and
73	can be done in parallel across the entire word. */
74	.Lloop_aligned:
75	ldr data1, [src1], #8
76	ldr data2, [src2], #8
77	.Lstart_realigned:
78	sub tmp1, data1, zeroones
79	orr tmp2, data1, #REP8_7f
80	eor diff, data1, data2 /* Non-zero if differences found. */
81	bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
82	orr syndrome, diff, has_nul
83	cbz syndrome, .Lloop_aligned
84	/* End of performance-critical section -- one 64B cache line. */
85
86	#ifndef __AARCH64EB__
87	rev syndrome, syndrome
88	rev data1, data1
89	/* The MS-non-zero bit of the syndrome marks either the first bit
90	that is different, or the top bit of the first zero byte.
91	Shifting left now will bring the critical information into the
92	top bits. */
93	clz pos, syndrome
94	rev data2, data2
95	lsl data1, data1, pos
96	lsl data2, data2, pos
97	/* But we need to zero-extend (char is unsigned) the value and then
98	perform a signed 32-bit subtraction. */
99	lsr data1, data1, #56
100	sub result, data1, data2, lsr #56
101	ret
102	#else
103	/* For big-endian we cannot use the trick with the syndrome value
104	as carry-propagation can corrupt the upper bits if the trailing
105	bytes in the string contain 0x01. */
106	/* However, if there is no NUL byte in the dword, we can generate
107	the result directly. We can't just subtract the bytes as the
108	MSB might be significant. */
109	cbnz has_nul, 1f
110	cmp data1, data2
111	cset result, ne
112	cneg result, result, lo
113	ret
114	1:
115	/* Re-compute the NUL-byte detection, using a byte-reversed value. */
116	rev tmp3, data1
117	sub tmp1, tmp3, zeroones
118	orr tmp2, tmp3, #REP8_7f
119	bic has_nul, tmp1, tmp2
120	rev has_nul, has_nul
121	orr syndrome, diff, has_nul
122	clz pos, syndrome
123	/* The MS-non-zero bit of the syndrome marks either the first bit
124	that is different, or the top bit of the first zero byte.
125	Shifting left now will bring the critical information into the
126	top bits. */
127	lsl data1, data1, pos
128	lsl data2, data2, pos
129	/* But we need to zero-extend (char is unsigned) the value and then
130	perform a signed 32-bit subtraction. */
131	lsr data1, data1, #56
132	sub result, data1, data2, lsr #56
133	ret
134	#endif
135
136	.Lmutual_align:
137	/* Sources are mutually aligned, but are not currently at an
138	alignment boundary. Round down the addresses and then mask off
139	the bytes that preceed the start point. */
140	bic src1, src1, #7
141	bic src2, src2, #7
142	lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
143	ldr data1, [src1], #8
144	neg tmp1, tmp1 /* Bits to alignment -64. */
145	ldr data2, [src2], #8
146	mov tmp2, #~0
147	#ifdef __AARCH64EB__
148	/* Big-endian. Early bytes are at MSB. */
149	lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
150	#else
151	/* Little-endian. Early bytes are at LSB. */
152	lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
153	#endif
154	orr data1, data1, tmp2
155	orr data2, data2, tmp2
156	b .Lstart_realigned
157
158	.Lmisaligned8:
159	/* We can do better than this. */
160	ldrb data1w, [src1], #1
161	ldrb data2w, [src2], #1
162	cmp data1w, #1
163	ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
164	b.eq .Lmisaligned8
165	sub result, data1, data2
166	ret