/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif
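
/* ALIGN wraps the NEON address-alignment qualifier: for example,
   "vst1.8 {d0-d3}, [ALIGN (dst, 64)]!" below expands to "[dst:64]!"
   (a 64-bit alignment hint) on current GAS, or to the old "[dst,:64]!"
   spelling when BROKEN_ASM_NEON_ALIGN is defined.  */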

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
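	/* Copy one 64-byte line: each store writes data loaded earlier,
	   and each register freed by a store is refilled at once, with
	   \vreg reloaded from prefetch_lines cache lines ahead so the
	   loads double as prefetches of the lines still to come.  */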
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
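
	/* As cpy_line_vfp, but without the look-ahead reload of \vreg:
	   used to wind the prefetch pipeline down over the final
	   prefetch_lines lines without reading past the source.  */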
	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm
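
/* Entry point.  p2align=6 places memcpy on a 64-byte boundary, a full
   cache line on Cortex-A15.  */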
def_fn memcpy p2align=6

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
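	/* Computed branch: PC reads as this instruction's address plus
	   PC_OFFSET, and each vld1/vst1 pair below is two 4-byte
	   instructions copying one doubleword, so the add skips exactly
	   the pairs that are not needed for (count & 0x38) bytes.  */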
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
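	/* Each word here costs an LDR/STR pair, 8 bytes of code per 4
	   bytes of data, hence the halved constant above and the LSL #1
	   here.  DST and SRC were pre-advanced so the pairs can use
	   fixed negative offsets without writeback.  */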
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

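	/* 0-3 bytes remain.  LSLS #31 shifts bit 1 of COUNT into the
	   carry flag and leaves bit 0 as bit 31 of the result, so CS
	   selects the trailing halfword copy and NE the trailing byte.  */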
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
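	/* The low three bits of DST are shifted to the top of TMP2, so
	   Z means already aligned.  After the RSBS, bits 31:29 hold the
	   number of bytes needed to align (also subtracted from COUNT
	   via the LSR #29): MI selects a word copy, then after the
	   LSLS #2, CS a halfword and NE a byte.  */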
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	.Ltail63aligned

	cmp	tmp2, #512
	bge	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
#ifdef USE_VFP
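	/* 64 bytes per iteration, ping-ponging between d0 and d1 so that
	   each store writes data loaded a few instructions earlier,
	   hiding the load latency.  */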
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead. */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]
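	/* d3-d7 now hold one doubleword from each of the next
	   prefetch_lines (five) 64-byte lines; the line-copy macros
	   store each one and refill it from further ahead.  */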

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS (software modulo scheduling, i.e.
	   software-pipelined) loop to maximize the I/O bandwidth of the
	   core.  We don't have enough spare registers to synthesise
	   prefetching, so use PLD operations.  */
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
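	/* Software-pipelined body: each half stores the 32 bytes loaded
	   by the previous half while loading the next 32, moving 64
	   bytes per iteration.  B, C and D are callee-saved, hence the
	   spills to the frame above.  */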
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

.Lcpy_notaligned:
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE
	bmi	.Ltail63unaligned
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
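	/* DST is now 64-bit aligned but SRC is not: VLD1.8 places no
	   alignment requirement on the loads, while the stores assert
	   64-bit alignment via the ALIGN hint.  */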
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bpl	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
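	/* DST is 64-bit aligned but SRC is not: load single words,
	   relying on the unaligned-access assumption above, and store
	   to the aligned destination with STRD.  */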
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	.Ltail63unaligned
	bx	lr

	.size	memcpy, . - memcpy