/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif
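
/* ALIGN wraps the NEON address-alignment qualifier: for example,
   "vst1.8 {d0-d3}, [ALIGN (dst, 64)]!" below expands to "[dst:64]!"
   (a 64-bit alignment hint) on current GAS, or to the old "[dst,:64]!"
   spelling when BROKEN_ASM_NEON_ALIGN is defined.  */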

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
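	/* Copy one 64-byte line: each store writes data loaded earlier,
	   and each register freed by a store is refilled at once, with
	   \vreg reloaded from prefetch_lines cache lines ahead so the
	   loads double as prefetches of the lines still to come.  */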
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
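
	/* As cpy_line_vfp, but without the look-ahead reload of \vreg:
	   used to wind the prefetch pipeline down over the final
	   prefetch_lines lines without reading past the source.  */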
	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm
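
/* Entry point.  p2align=6 places memcpy on a 64-byte boundary, a full
   cache line on Cortex-A15.  */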
def_fn memcpy p2align=6

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
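	/* Computed branch: PC reads as this instruction's address plus
	   PC_OFFSET, and each vld1/vst1 pair below is two 4-byte
	   instructions copying one doubleword, so the add skips exactly
	   the pairs that are not needed for (count & 0x38) bytes.  */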
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
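	/* Each word here costs an LDR/STR pair, 8 bytes of code per 4
	   bytes of data, hence the halved constant above and the LSL #1
	   here.  DST and SRC were pre-advanced so the pairs can use
	   fixed negative offsets without writeback.  */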
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

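	/* 0-3 bytes remain.  LSLS #31 shifts bit 1 of COUNT into the
	   carry flag and leaves bit 0 as bit 31 of the result, so CS
	   selects the trailing halfword copy and NE the trailing byte.  */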
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
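	/* The low three bits of DST are shifted to the top of TMP2, so
	   Z means already aligned.  After the RSBS, bits 31:29 hold the
	   number of bytes needed to align (also subtracted from COUNT
	   via the LSR #29): MI selects a word copy, then after the
	   LSLS #2, CS a halfword and NE a byte.  */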
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	.Ltail63aligned

	cmp	tmp2, #512
	bge	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
#ifdef USE_VFP
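	/* 64 bytes per iteration, ping-ponging between d0 and d1 so that
	   each store writes data loaded a few instructions earlier,
	   hiding the load latency.  */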
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead. */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]
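	/* d3-d7 now hold one doubleword from each of the next
	   prefetch_lines (five) 64-byte lines; the line-copy macros
	   store each one and refill it from further ahead.  */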

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS (software modulo scheduling, i.e.
	   software-pipelined) loop to maximize the I/O bandwidth of the
	   core.  We don't have enough spare registers to synthesise
	   prefetching, so use PLD operations.  */
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
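	/* Software-pipelined body: each half stores the 32 bytes loaded
	   by the previous half while loading the next 32, moving 64
	   bytes per iteration.  B, C and D are callee-saved, hence the
	   spills to the frame above.  */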
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

.Lcpy_notaligned:
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE
	bmi	.Ltail63unaligned
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
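	/* DST is now 64-bit aligned but SRC is not: VLD1.8 places no
	   alignment requirement on the loads, while the stores assert
	   64-bit alignment via the ALIGN hint.  */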
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bpl	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
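	/* DST is 64-bit aligned but SRC is not: load single words,
	   relying on the unaligned-access assumption above, and store
	   to the aligned destination with STRD.  */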
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	.Ltail63unaligned
	bx	lr

	.size	memcpy, . - memcpy