/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

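/* For reference, the invocation below expands (per the def_fn macro above) to:
 *	.text
 *	.p2align 6
 *	.global memcpy
 *	.type memcpy, %function
 * memcpy:
 * so the entry point starts on a 64-byte boundary.  */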
def_fn memcpy p2align=6

	mov	dst, dstin
	cmp	count, #64
	b.ge	.Lcpy_not_short
	cmp	count, #15
	b.le	.Ltail15tiny

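	/* The compares above split the copy into three size classes:
	 *   count <= 15        -> .Ltail15tiny (exact, byte-granular tail)
	 *   16 <= count <= 63  -> fall through into .Ltail63
	 *   count >= 64        -> .Lcpy_not_short (align src, then bulk copy)  */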
	/* Deal with small copies quickly by dropping straight into the
	 * exit block.  */
.Ltail63:
	/* Copy up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]

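	/* Illustrative example (an assumed count of 55): tmp1 = 55 & 0x30 = 48,
	 * so src and dst advance by 48 and all three ldp/stp pairs above run,
	 * copying bytes 0-47.  .Ltail15 below then copies the 16 bytes ending
	 * at byte 54, re-writing bytes 39-47 with the same data.  The
	 * overlapping store is safe because every path into this tail has at
	 * least 16 bytes of destination at or below the final byte; copies
	 * shorter than 16 bytes go to .Ltail15tiny instead.  */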
.Ltail15:
	ands	count, count, #15
	beq	1f
	add	src, src, count
	ldp	A_l, A_h, [src, #-16]
	add	dst, dst, count
	stp	A_l, A_h, [dst, #-16]
1:
	ret

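	/* .Ltail15tiny is reached only when the whole copy is under 16 bytes,
	 * so the overlapping trick above cannot be used.  It tests bits 3..0
	 * of count and copies exactly 8, 4, 2 and 1 bytes for each set bit;
	 * e.g. an assumed count of 13 (0b1101) copies 8 + 4 + 1 bytes and
	 * never touches byte 13 or beyond.  */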
.Ltail15tiny:
	/* Copy up to 15 bytes of data.  Does not assume additional data
	   being copied.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

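	/* .Lcpy_not_short is reached only when count >= 64.  It first copies a
	 * full 16-byte pair from the (possibly unaligned) src, then advances
	 * src and dst, and reduces count, by only tmp2 = (-src) & 15, the
	 * distance to the next 16-byte boundary.  For an assumed src ending in
	 * ...9, tmp2 = 7, so 9 of the 16 bytes just written are rewritten by
	 * later stores; with count >= 64 the 16-byte access cannot overrun
	 * either buffer.  */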
.Lcpy_not_short:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Copy more data than needed; it's faster than jumping
	 * around copying sub-Quadword quantities.  We know that
	 * it can't overrun.  */
	ldp	A_l, A_h, [src]
	add	src, src, tmp2
	stp	A_l, A_h, [dst]
	add	dst, dst, tmp2
	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
2:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/* Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63
	ret

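	/* In the 64..127-byte path above, count already has 128 subtracted and
	 * is therefore negative, but its low 6 bits still equal the number of
	 * bytes left after the 64-byte copy, which is all .Ltail63 looks at
	 * (count & 0x30 and count & 15).  The tst with #0x3f skips the tail
	 * entirely when exactly 64 bytes remained.  */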
	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lcpy_body_large:
	/* There are at least 128 bytes to copy.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
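	/* The loads above pull in the first 64 bytes and leave src pointing 16
	 * bytes before the next unread block; dst was likewise pre-biased by
	 * -16.  Each iteration of the loop below can therefore store the
	 * previously loaded block and load the next one using fixed offsets
	 * #16..#64, with a single post-update (the #64 writeback) per pointer.  */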
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
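	/* Loop exit: A_l..D_h still hold the last 64-byte block that was loaded
	 * but not yet stored.  Drain it, advance dst past that block while
	 * undoing the pre-bias on both pointers, and hand any remaining
	 * count & 0x3f bytes to the shared .Ltail63 code.  */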
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63
	ret