~linaro-toolchain-dev/cortex-strings/trunk

« back to all changes in this revision

Viewing changes to src/linaro-a9/memcpy-hybrid.S

Committer: Will Newton
Date: 2013-03-26 10:19:35 UTC
Revision ID: will.newton@linaro.org-20130326101935-c4i81dht78p9voqf

Integrate NEON/VFP/ARM optimised memcpy implementation.
Add --with-vfp configure option to allow testing VFP code.

files removed:
src/linaro-a9/memcpy-hybrid.S

files modified:
Makefile.am

configure.ac

src/linaro-a9/memcpy.S

Show diffs side-by-side

added added

removed removed

src/linaro-a9/memcpy-hybrid.S

/* Redistribution and use in source and binary forms, with or without

modification, are permitted provided that the following conditions

are met:

* Redistributions of source code must retain the above copyright

notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright

notice, this list of conditions and the following disclaimer in the

documentation and/or other materials provided with the distribution.

* Neither the name of Linaro Limited nor the names of its

contributors may be used to endorse or promote products derived

from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Written by Dave Gilbert <david.gilbert@linaro.org>

This memcpy routine is optimised on a Cortex-A9 and should work on

all ARMv7 processors with NEON. */

@ 2011-09-01 david.gilbert@linaro.org

@ Extracted from local git 2f11b436

@ Use the unified assembler syntax (one instruction syntax for ARM and Thumb).
.syntax unified

@ Assemble for the ARMv7-A architecture.
.arch armv7-a

@ CHARTSTMASK(c): mask with a single bit set inside byte number c of a word.
@ This lets us check a flag in a 00/ff byte easily in either endianness.
@ Both the argument and the whole expansion are parenthesised so that an
@ expression argument such as CHARTSTMASK(n+1), or an operator written next
@ to the macro call, cannot regroup the shift.
#ifdef __ARMEB__
#define CHARTSTMASK(c) (1<<(31-((c)*8)))
#else
#define CHARTSTMASK(c) (1<<((c)*8))
#endif

@ ---------------------------------------------------------------------------
@ void *memcpy(void *dest, const void *source, size_t count)
@
@ In:      r0 = dest
@          r1 = source
@          r2 = count (treated as an unsigned byte count)
@ Out:     r0 = dest (the value passed in)
@ Scratch: r3, r12, d0-d3, flags.  r3-r8, r10 and r11 are used by the
@          ldm/stm loop but are pushed before it and popped after it.
@
@ Overlaps of source/dest not allowed according to spec.
@ Note this routine relies on v7 misaligned loads/stores.
@
@ Strategy:
@   count below 32 ................ word/halfword/byte tail copy (label 10)
@   both pointers word aligned .... 32 bytes per pass with ldm/stm (label 5),
@                                   or NEON when count is 128 KiB or more
@   pointers misaligned differently NEON 32 bytes per pass (label 35)
@   pointers misaligned identically byte copy until dest is 8 byte aligned,
@                                   then rejoin the aligned path (label 50)
@
@ NOTE(review): the label-only lines "4:" and "5:" were missing from the
@ extracted text although "b 4b" and "bge 5b" refer to them.  Label 5 is the
@ head of the ldm/stm loop.  Label 4 has to come before the push so that the
@ pop after the loop stays balanced; its exact original position should be
@ confirmed against the upstream file.
@ ---------------------------------------------------------------------------
        .text
        .thumb

        .thumb_func
        .align  2
        .p2align 4,,15
        .global memcpy
        .type   memcpy,%function
memcpy:
        pld     [r1]
        mov     r12, r0                 @ stash original r0
        cmp     r2, #32
        blo     10f                     @ take the small copy case separately

        @ test for either source or destination being misaligned
        @ (We only rely on word align)
        tst     r0, #3
        it      eq
        tsteq   r1, #3
        bne     30f                     @ misaligned case

4:
        @ at this point we are word (or better) aligned and have at least
        @ 32 bytes to play with

        @ If it is a huge copy, try Neon
        cmp     r2, #128*1024
        bhs     35f                     @ Sharing general non-aligned case here, aligned could be faster

        push    {r3,r4,r5,r6,r7,r8,r10,r11}
5:                                      @ r2 = bytes remaining, at least 32 here
        ldmia   r1!, {r3,r4,r5,r6,r7,r8,r10,r11}
        sub     r2, r2, #32
        pld     [r1, #96]
        cmp     r2, #32                 @ flags survive the stmia below
        stmia   r0!, {r3,r4,r5,r6,r7,r8,r10,r11}
        bhs     5b

        pop     {r3,r4,r5,r6,r7,r8,r10,r11}

        @ We are now down to less than 32 bytes
        cbz     r2, 15f                 @ quick exit for the case where we copied a multiple of 32

10:     @ small copies (not necessarily aligned - note might be slightly more than 32bytes)
        cmp     r2, #4
        blo     12f

11:                                     @ copy one word per pass while 4 or more bytes remain
        sub     r2, r2, #4
        cmp     r2, #4                  @ flags survive the ldr/str below
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        bhs     11b

12:                                     @ 0 to 3 bytes left: bit 1 then bit 0 of the count
        tst     r2, #2
        itt     ne
        ldrhne  r3, [r1], #2
        strhne  r3, [r0], #2

        tst     r2, #1
        itt     ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

15:     @ exit
        mov     r0, r12                 @ restore r0
        bx      lr

        .align  2
        .p2align 4,,15
30:     @ non-aligned - at least 32 bytes to play with
        @ Test for co-misalignment
        eor     r3, r0, r1
        tst     r3, #3
        beq     50f

        @ Use Neon for misaligned
35:                                     @ r2 = bytes remaining, at least 32 here
        vld1.8  {d0,d1,d2,d3}, [r1]!
        sub     r2, r2, #32
        cmp     r2, #32                 @ flags survive the pld/vst1 below
        pld     [r1, #96]
        vst1.8  {d0,d1,d2,d3}, [r0]!
        bhs     35b
        b       10b                     @ TODO: Probably a bad idea to switch to ARM at this point

        .align  2
        .p2align 4,,15
50:     @ Co-misaligned
        @ At this point we have got at least 32 bytes
51:                                     @ byte copy until dest is 8 byte aligned
        ldrb    r3, [r1], #1
        sub     r2, r2, #1
        strb    r3, [r0], #1
        tst     r0, #7
        bne     51b

        cmp     r2, #32
        blo     10b                     @ fewer than 32 bytes left: finish with the tail copy
        b       4b                      @ otherwise rejoin the aligned path

        .size   memcpy, . - memcpy

Older »