/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

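/* For reference, the invocation below expands (per the def_fn macro above) to:
 *	.text
 *	.p2align 6
 *	.global memcpy
 *	.type memcpy, %function
 * memcpy:
 * so the entry point starts on a 64-byte boundary.  */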
def_fn memcpy p2align=6

	mov	dst, dstin
	cmp	count, #64
	b.ge	.Lcpy_not_short
	cmp	count, #15
	b.le	.Ltail15tiny

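	/* The compares above split the copy into three size classes:
	 *   count <= 15        -> .Ltail15tiny (exact, byte-granular tail)
	 *   16 <= count <= 63  -> fall through into .Ltail63
	 *   count >= 64        -> .Lcpy_not_short (align src, then bulk copy)  */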
	/* Deal with small copies quickly by dropping straight into the
	 * exit block.  */
.Ltail63:
	/* Copy up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]

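	/* Illustrative example (an assumed count of 55): tmp1 = 55 & 0x30 = 48,
	 * so src and dst advance by 48 and all three ldp/stp pairs above run,
	 * copying bytes 0-47.  .Ltail15 below then copies the 16 bytes ending
	 * at byte 54, re-writing bytes 39-47 with the same data.  The
	 * overlapping store is safe because every path into this tail has at
	 * least 16 bytes of destination at or below the final byte; copies
	 * shorter than 16 bytes go to .Ltail15tiny instead.  */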
.Ltail15:
	ands	count, count, #15
	beq	1f
	add	src, src, count
	ldp	A_l, A_h, [src, #-16]
	add	dst, dst, count
	stp	A_l, A_h, [dst, #-16]
1:
	ret

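	/* .Ltail15tiny is reached only when the whole copy is under 16 bytes,
	 * so the overlapping trick above cannot be used.  It tests bits 3..0
	 * of count and copies exactly 8, 4, 2 and 1 bytes for each set bit;
	 * e.g. an assumed count of 13 (0b1101) copies 8 + 4 + 1 bytes and
	 * never touches byte 13 or beyond.  */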
.Ltail15tiny:
	/* Copy up to 15 bytes of data.  Does not assume additional data
	   being copied.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

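	/* .Lcpy_not_short is reached only when count >= 64.  It first copies a
	 * full 16-byte pair from the (possibly unaligned) src, then advances
	 * src and dst, and reduces count, by only tmp2 = (-src) & 15, the
	 * distance to the next 16-byte boundary.  For an assumed src ending in
	 * ...9, tmp2 = 7, so 9 of the 16 bytes just written are rewritten by
	 * later stores; with count >= 64 the 16-byte access cannot overrun
	 * either buffer.  */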
.Lcpy_not_short:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Copy more data than needed; it's faster than jumping
	 * around copying sub-Quadword quantities.  We know that
	 * it can't overrun.  */
	ldp	A_l, A_h, [src]
	add	src, src, tmp2
	stp	A_l, A_h, [dst]
	add	dst, dst, tmp2
	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
2:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/* Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63
	ret

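	/* In the 64..127-byte path above, count already has 128 subtracted and
	 * is therefore negative, but its low 6 bits still equal the number of
	 * bytes left after the 64-byte copy, which is all .Ltail63 looks at
	 * (count & 0x30 and count & 15).  The tst with #0x3f skips the tail
	 * entirely when exactly 64 bytes remained.  */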
	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lcpy_body_large:
	/* There are at least 128 bytes to copy.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
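	/* The loads above pull in the first 64 bytes and leave src pointing 16
	 * bytes before the next unread block; dst was likewise pre-biased by
	 * -16.  Each iteration of the loop below can therefore store the
	 * previously loaded block and load the next one using fixed offsets
	 * #16..#64, with a single post-update (the #64 writeback) per pointer.  */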
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
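	/* Loop exit: A_l..D_h still hold the last 64-byte block that was loaded
	 * but not yet stored.  Drain it, advance dst past that block while
	 * undoing the pre-bias on both pointers, and hand any remaining
	 * count & 0x3f bytes to the shared .Ltail63 code.  */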
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63
	ret