/* ~linaro-toolchain-dev/cortex-strings/trunk
   96.1.2 by Marcus Shawcroft
   Add AArch64 optimised strnlen.  */
/* strnlen - calculate the length of a string with limit.

   Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

/* Register aliases used throughout this file.

   Arguments and results:
     srcin     x0   in:  address of the string.
     len       x0   out: result.  Shares x0 with srcin, so writing len
		    destroys srcin.
     limit     x1   in:  maximum number of bytes to examine.

   Locals and temporaries:
     src       x2   16-byte-aligned read pointer, post-incremented by LDP.
     data1     x3   first Dword of the Qword currently being examined.
     data2     x4   second Dword of the Qword currently being examined.
     data2a    x5   masked copy of data2, used only in .Lmisaligned.
     has_nul1  x6   NUL syndrome computed from data1.
     has_nul2  x7   NUL syndrome computed from data2.
     tmp1-tmp4 x8-x11  scratch.
     zeroones  x12  holds REP8_01.
     pos       x13  leading-zero count of the byte-reversed syndrome.
     limit_wd  x14  number of Qwords to examine, minus one.  */

/* Arguments and results.  */
#define srcin		x0
#define len		x0
#define limit		x1

/* Locals and temporaries.  */
#define src		x2
#define data1		x3
#define data2		x4
#define data2a		x5
#define has_nul1	x6
#define has_nul2	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define tmp4		x11
#define zeroones	x12
#define pos		x13
#define limit_wd	x14

/* def_fn f [p2align=0]
   Switch to the .text section, align the location counter to
   2^p2align bytes, and define \f at that location as a global symbol
   of type %function.  With the default p2align of 0 no padding is
   inserted.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* 64-bit constants with the same byte value replicated in all eight
   byte lanes.  REP8_01 and REP8_7f feed the NUL-detection arithmetic;
   REP8_80 is not referenced by the code in this section.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

66
	.text
67
	.p2align	6
68
.Lstart:
69
	/* Pre-pad to ensure critical loop begins an icache line.  */
70
	.rep 7
71
	nop
72
	.endr
73
	/* Put this code here to avoid wasting more space with pre-padding.  */
74
.Lhit_limit:
75
	mov	len, limit
76
	ret
77
/* strnlen
   In:   srcin (x0) = address of the string.
	 limit (x1) = maximum number of bytes to examine.
   Out:  len (x0)   = number of bytes before the first NUL, or limit if
		      no NUL is found within the first limit bytes.
   Uses: x2-x14 as scratch (see the register #defines) and the
	 condition flags.  The stack is not touched.
   Data is read with 16-byte-aligned LDPs starting at srcin rounded
   down to a multiple of 16, so bytes before srcin and after the
   limit that lie in the same aligned Qwords are loaded (and then
   ignored).  */
def_fn strnlen
	cbz	limit, .Lhit_limit
	mov	zeroones, #REP8_01
	bic	src, srcin, #15		/* Round down to a Qword boundary.  */
	ands	tmp1, srcin, #15	/* Offset of srcin within its Qword.  */
	b.ne	.Lmisaligned
	/* Calculate the number of full and partial words -1.  */
	sub	limit_wd, limit, #1	/* Limit != 0, so no underflow.  */
	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords.  */

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	/* The inner loop deals with two Dwords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */

96
	/* Start of critial section -- keep to one 64Byte cache line.  */
97
.Lloop:
98
	ldp	data1, data2, [src], #16
99
.Lrealigned:
100
	sub	tmp1, data1, zeroones
101
	orr	tmp2, data1, #REP8_7f
102
	sub	tmp3, data2, zeroones
103
	orr	tmp4, data2, #REP8_7f
104
	bic	has_nul1, tmp1, tmp2
105
	bic	has_nul2, tmp3, tmp4
106
	subs	limit_wd, limit_wd, #1
107
	orr	tmp1, has_nul1, has_nul2
98.1.2 by Marcus Shawcroft
This patch fixes an issue in the AArch64 strnlen implementation which
108
	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000  */
96.1.2 by Marcus Shawcroft
Add AArch64 optimised strnlen.
109
	b.eq	.Lloop
110
	/* End of critical section -- keep to one 64Byte cache line.  */
111
112
	orr	tmp1, has_nul1, has_nul2
113
	cbz	tmp1, .Lhit_limit	/* No null in final Qword.  */
114
115
	/* We know there's a null in the final Qword.  The easiest thing
116
	   to do now is work out the length of the string and return
117
	   MIN (len, limit).  */
118
119
	sub	len, src, srcin
120
	cbz	has_nul1, .Lnul_in_data2
121
#ifdef __AARCH64EB__
122
	mov	data2, data1
123
#endif
124
	sub	len, len, #8
125
	mov	has_nul2, has_nul1
126
.Lnul_in_data2:
127
#ifdef __AARCH64EB__
128
	/* For big-endian, carry propagation (if the final byte in the
129
	   string is 0x01) means we cannot use has_nul directly.  The
130
	   easiest way to get the correct byte is to byte-swap the data
131
	   and calculate the syndrome a second time.  */
132
	rev	data2, data2
133
	sub	tmp1, data2, zeroones
134
	orr	tmp2, data2, #REP8_7f
135
	bic	has_nul2, tmp1, tmp2
136
#endif
137
	sub	len, len, #8
138
	rev	has_nul2, has_nul2
139
	clz	pos, has_nul2
140
	add	len, len, pos, lsr #3		/* Bits to bytes.  */
141
	cmp	len, limit
142
	csel	len, len, limit, ls		/* Return the lower value.  */
143
	ret
144
.Lmisaligned:
	/* Deal with a partial first word.
	   We're doing two things in parallel here;
	   1) Calculate the number of words (but avoiding overflow if
	      limit is near ULONG_MAX) - to do this we need to work out
	      limit + tmp1 - 1 as a 65-bit value before shifting it;
	   2) Load and mask the initial data words - we force the bytes
	      before the ones we are interested in to 0xff - this ensures
	      early bytes will not hit any zero detection.
	   On entry tmp1 holds srcin & 15, which is non-zero here.  */
	sub	limit_wd, limit, #1
	neg	tmp4, tmp1
	/* The flags set here are consumed by the CSINV/CSEL at the end of
	   this block; nothing in between writes the flags.  */
	cmp	tmp1, #8

	/* Split limit - 1 into its low 4 bits (tmp3) and the rest
	   (limit_wd), so that adding tmp1 cannot overflow 64 bits.  */
	and	tmp3, limit_wd, #15
	lsr	limit_wd, limit_wd, #4
	mov	tmp2, #~0

	ldp	data1, data2, [src], #16
	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
	add	tmp3, tmp3, tmp1

	/* Build in tmp2 a mask of 0xff bytes covering the (tmp1 mod 8)
	   earliest byte lanes of a Dword; for tmp1 == 8 the shift count
	   is zero and the mask is all-ones.  */
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp4	/* Shift count is (tmp4 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp4	/* Shift count is (tmp4 & 63).  */
#endif
	/* limit_wd = (limit - 1 + tmp1) >> 4, i.e. the Qword count minus
	   one measured from the aligned address in src.  */
	add	limit_wd, limit_wd, tmp3, lsr #4

	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2

	/* tmp1 <= 8: the bytes to ignore lie only in data1, so keep the
	   masked data1 and the unmasked data2.
	   tmp1 > 8:  all of data1 precedes the string, so make it all-ones
	   and use the masked copy of data2.  */
	csinv	data1, data1, xzr, le
	csel	data2, data2, data2a, le
	b	.Lrealigned
	.size	strnlen, . - .Lstart	/* Include pre-padding in size.  */