/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses
    LDRD/STRD support unaligned word accesses

 */

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif
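
/* Note: FRAME_SIZE is 32 in the non-NEON configurations because the
   GP-register copy loops below spill the B/C/D register pairs to
   [sp, #8..#24]; a NEON build only ever spills tmp2, so 4 bytes suffice.  */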

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
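	/* Copy one 64-byte line using d0-d2 plus the caller-supplied \vreg.
	   The second vldr of \vreg reads (prefetch_lines * 64 - 32) bytes
	   ahead of the current position, so data for a later line is already
	   in flight before it is needed (see .Lcpy_body_long).  */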
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

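	/* As cpy_line_vfp, but with no read-ahead load of \vreg: used to
	   drain the pipeline on the final lines, where reading ahead would
	   run past the end of the source data.  */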
	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

def_fn memcpy p2align=6

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
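	/* Computed branch: tmp1 ends up as 52 - (count & 0x38).  In ARM
	   state the PC reads as the current instruction plus 8, so the add
	   lands on the right vld1/vst1 pair below; each pair copies 8
	   bytes, so larger residues enter the sequence earlier.  */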
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1
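	/* Each 4 bytes still to copy costs one 8-byte ldr/str pair below,
	   so the byte offset in tmp1 is doubled with lsl #1.  */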

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

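	/* Shifting count left by 31 puts bit 1 into C and leaves the result
	   non-zero iff bit 0 was set: CS copies a trailing halfword, NE the
	   final byte.  */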
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 32-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring DST into full 64-bit alignment.  */
	lsls	tmp2, dst, #29
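	/* Z is set iff DST was already 64-bit aligned.  After the rsbs,
	   tmp2 lsr #29 is the byte count needed to align DST: MI selects a
	   word copy, then after lsls #2, CS a halfword and NE a final
	   byte.  */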
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	.Ltail63aligned

	cmp	tmp2, #512
	bge	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
#ifdef USE_VFP
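	/* Copy 64 bytes per iteration, alternating d0 and d1 so that each
	   vstr consumes a value loaded two instructions earlier and the
	   loads stay ahead of the stores.  */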
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 32-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

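	/* Prime the pipeline: d3-d7 each hold the first double-word of one
	   of the next prefetch_lines (5) 64-byte lines, and d0-d2 the next
	   three double-words of the first line.  */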
	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
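	/* "SMS" here presumably means software modulo scheduling, i.e. a
	   software-pipelined loop: each iteration stores the four register
	   pairs loaded on the previous iteration while loading the next
	   four, so loads and stores proceed in parallel.  */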
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

.Lcpy_notaligned:
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE
	bmi	.Ltail63unaligned
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
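	/* DST is now 64-bit aligned, so the stores can carry the :64
	   alignment hint (via the ALIGN macro) while the loads remain
	   unaligned; two 32-byte groups stay in flight in d0-d3/d4-d7.  */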
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bpl	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
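	/* DST is 64-bit aligned here but SRC has unknown alignment, so
	   loads use plain LDR (relying on unaligned-access support) while
	   stores use STRD to the aligned destination.  */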
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	.Ltail63unaligned
	bx	lr

	.size	memcpy, . - memcpy