/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses
    LDRD/STRD support unaligned word accesses

 */

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif
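
/* Note: FRAME_SIZE is 32 in the non-NEON configurations because the
   GP-register copy loops below spill the B/C/D register pairs to
   [sp, #8..#24]; a NEON build only ever spills tmp2, so 4 bytes suffice.  */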

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
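	/* Copy one 64-byte line using d0-d2 plus the caller-supplied \vreg.
	   The second vldr of \vreg reads (prefetch_lines * 64 - 32) bytes
	   ahead of the current position, so data for a later line is already
	   in flight before it is needed (see .Lcpy_body_long).  */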
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

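	/* As cpy_line_vfp, but with no read-ahead load of \vreg: used to
	   drain the pipeline on the final lines, where reading ahead would
	   run past the end of the source data.  */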
	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

def_fn memcpy p2align=6

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
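	/* Computed branch: tmp1 ends up as 52 - (count & 0x38).  In ARM
	   state the PC reads as the current instruction plus 8, so the add
	   lands on the right vld1/vst1 pair below; each pair copies 8
	   bytes, so larger residues enter the sequence earlier.  */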
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1
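	/* Each 4 bytes still to copy costs one 8-byte ldr/str pair below,
	   so the byte offset in tmp1 is doubled with lsl #1.  */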

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

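	/* Shifting count left by 31 puts bit 1 into C and leaves the result
	   non-zero iff bit 0 was set: CS copies a trailing halfword, NE the
	   final byte.  */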
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 32-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring DST into full 64-bit alignment.  */
	lsls	tmp2, dst, #29
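	/* Z is set iff DST was already 64-bit aligned.  After the rsbs,
	   tmp2 lsr #29 is the byte count needed to align DST: MI selects a
	   word copy, then after lsls #2, CS a halfword and NE a final
	   byte.  */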
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	.Ltail63aligned

	cmp	tmp2, #512
	bge	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
#ifdef USE_VFP
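	/* Copy 64 bytes per iteration, alternating d0 and d1 so that each
	   vstr consumes a value loaded two instructions earlier and the
	   loads stay ahead of the stores.  */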
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 32-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

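	/* Prime the pipeline: d3-d7 each hold the first double-word of one
	   of the next prefetch_lines (5) 64-byte lines, and d0-d2 the next
	   three double-words of the first line.  */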
	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
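	/* "SMS" here presumably means software modulo scheduling, i.e. a
	   software-pipelined loop: each iteration stores the four register
	   pairs loaded on the previous iteration while loading the next
	   four, so loads and stores proceed in parallel.  */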
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

.Lcpy_notaligned:
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE
	bmi	.Ltail63unaligned
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
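	/* DST is now 64-bit aligned, so the stores can carry the :64
	   alignment hint (via the ALIGN macro) while the loads remain
	   unaligned; two 32-byte groups stay in flight in d0-d3/d4-d7.  */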
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bpl	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
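	/* DST is 64-bit aligned here but SRC has unknown alignment, so
	   loads use plain LDR (relying on unaligned-access support) while
	   stores use STRD to the aligned destination.  */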
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	.Ltail63unaligned
	bx	lr

	.size	memcpy, . - memcpy