~linaro-toolchain-dev/cortex-strings/trunk

96 by Marcus Shawcroft
Add AArch64 optimized strlen.
/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

/* Symbolic register names used by strlen.  These are plain preprocessor
   aliases for AArch64 general-purpose registers x0-x12.  */

/* Arguments and results.  srcin (incoming string pointer) and len
   (returned length) both alias x0.  */
#define srcin		x0
#define len		x0

/* Locals and temporaries.  */
#define src		x1	/* Current load address.  */
#define data1		x2	/* First Dword of the current block.  */
#define data2		x3	/* Second Dword of the current block.  */
#define data2a		x4	/* Masked copy of data2 (misaligned entry).  */
#define has_nul1	x5	/* NUL syndrome for data1.  */
#define has_nul2	x6	/* NUL syndrome for data2.  */
#define tmp1		x7
#define tmp2		x8
#define tmp3		x9
#define tmp4		x10
#define zeroones	x11	/* Holds REP8_01.  */
#define pos		x12	/* Bit position of the NUL marker.  */

/* def_fn f [p2align=N]

   Begin the definition of function \f: switch to .text, align to a
   2^N-byte boundary (N defaults to 0), export \f as a global symbol of
   type %function, and emit its entry label.  The caller is expected to
   supply the matching .size directive after the function body.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* 64-bit constants made of one byte value repeated in all eight bytes.
   REP8_01 and REP8_7f are the operands of the per-byte NUL test in
   strlen; REP8_80 is not referenced by the code visible here.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/* strlen -- AArch64 string length.

   In:       x0 (srcin) = address of the string.
   Out:      x0 (len)   = number of bytes before the first NUL byte.
   Clobbers: x1-x12 and the condition flags.  The stack is not touched.

   The string is read 16 bytes at a time with ldp, starting from srcin
   rounded down to a 16-byte boundary.  When srcin is not 16-byte
   aligned, the bytes of the first block that precede the string are
   forced to 0xff in .Lmisaligned so they can never be taken for the
   terminator.  Both little-endian and big-endian (__AARCH64EB__) data
   layouts are handled.  */

	/* Start of critical section -- keep to one 64Byte cache line.  */
def_fn strlen p2align=6
	mov	zeroones, #REP8_01
	bic	src, srcin, #15		/* src = srcin rounded down to 16.  */
	ands	tmp1, srcin, #15	/* tmp1 = offset inside that block.  */
	b.ne	.Lmisaligned
	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	/* The inner loop deals with two Dwords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */
.Lloop:
	ldp	data1, data2, [src], #16
.Lrealigned:
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4
	/* If has_nul2 is zero, test has_nul1 as well; otherwise force
	   "not equal" so the loop exits.  */
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
	b.eq	.Lloop
	/* End of critical section -- keep to one 64Byte cache line.  */

	/* src now points just past the 16-byte block holding the NUL.  */
	sub	len, src, srcin
	cbz	has_nul1, .Lnul_in_data2
	/* NUL is in the first Dword: step back an extra 8 bytes and use
	   data1's syndrome (and, for big-endian, data1 itself) below.  */
#ifdef __AARCH64EB__
	mov	data2, data1
#endif
	sub	len, len, #8
	mov	has_nul2, has_nul1
.Lnul_in_data2:
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	rev	data2, data2
	sub	tmp1, data2, zeroones
	orr	tmp2, data2, #REP8_7f
	bic	has_nul2, tmp1, tmp2
#endif
	sub	len, len, #8
	/* Byte-reverse so the earliest string byte is most significant,
	   then count leading zero bits up to the first NUL marker.  */
	rev	has_nul2, has_nul2
	clz	pos, has_nul2
	add	len, len, pos, lsr #3		/* Bits to bytes.  */
	ret

	/* Entry for srcin not 16-byte aligned; tmp1 = srcin & 15 (1..15).
	   Load the enclosing aligned block and set every byte that lies
	   before srcin to 0xff, then join the main loop body.  */
.Lmisaligned:
	cmp	tmp1, #8		/* Flags are consumed by csinv/csel.  */
	neg	tmp1, tmp1
	ldp	data1, data2, [src], #16
	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
	mov	tmp2, #~0
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#endif
	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2
	/* Offset <= 8: the mask applies to data1 and data2 is used as
	   loaded.  Offset > 8: data1 lies wholly before the string, so
	   make it all-ones and use the masked copy of data2.  */
	csinv	data1, data1, xzr, le
	csel	data2, data2, data2a, le
	b	.Lrealigned

	.size	strlen, . - strlen