~linaro-toolchain-dev/cortex-strings/trunk

« back to all changes in this revision

Viewing changes to src/linaro-a9/memcpy.S

Committer: Dr. David Alan Gilbert
Date: 2011-09-08 17:20:49 UTC
Revision ID: david.gilbert@linaro.org-20110908172049-sykgo503kal106w0

spaces->tabs, use C style comments for the big top comment

files modified:
src/linaro-a9/memchr.S

src/linaro-a9/memcpy-hybrid.S

src/linaro-a9/memcpy.S

src/linaro-a9/memset.S

src/linaro-a9/strchr.S

src/linaro-a9/strlen.S

Show diffs side-by-side

added added

removed removed

src/linaro-a9/memcpy.S

.syntax unified

.arch armv7-a

@ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

@ * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

@ * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

@ * Neither the name of Linaro Limited nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

@ Written by Dave Gilbert <david.gilbert@linaro.org>

@ This memcpy routine is optimised on a Cortex-A9 and should work on all ARMv7

@ processors.

/* Redistribution and use in source and binary forms, with or without

modification, are permitted provided that the following conditions

are met:

* Redistributions of source code must retain the above copyright

notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright

notice, this list of conditions and the following disclaimer in the

documentation and/or other materials provided with the distribution.

* Neither the name of Linaro Limited nor the names of its

contributors may be used to endorse or promote products derived

from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Written by Dave Gilbert <david.gilbert@linaro.org>

This memcpy routine is optimised on a Cortex-A9 and should work on

all ARMv7 processors. */

@ 2011-09-01 david.gilbert@linaro.org

@ Extracted from local git 2f11b436

.syntax unified

.arch armv7-a

@ CHARTSTMASK(c) expands to a one-bit mask selecting a bit inside byte
@ number c of a loaded word: bit (31 - 8*c) when __ARMEB__ is defined,
@ bit (8*c) otherwise.  This lets us check a flag in a 00/ff byte easily
@ in either endianness.
@ The argument and the complete expansion are parenthesised so the macro
@ stays correct when c is an expression or when the result is used inside
@ a larger expression.
#ifdef __ARMEB__
#define CHARTSTMASK(c) (1<<(31-((c)*8)))
#else
#define CHARTSTMASK(c) (1<<((c)*8))
#endif

	.text
	.thumb

@ ---------------------------------------------------------------------------
@ void *memcpy(void *dest, const void *source, size_t count)
@
@ In:	r0 = dest
@	r1 = source
@	r2 = count
@ Out:	r0 = dest (the entry value, kept in r12 while r0 is advanced)
@ Uses:	r1, r2, r3, r12 and the flags are modified.  r4-r8, r10, r11 and
@	r14 are pushed before the block loops use them and popped afterwards.
@
@ Overlaps of source/dest not allowed according to spec
@ Note this routine relies on v7 misaligned loads/stores
@
@ Local labels:
@	4/5	 word-aligned source and dest, 32 bytes per iteration
@	10/11/12 tail: whole words, then a halfword, then a byte
@	15	 exit
@	30/31	 misaligned entry: byte copy until dest is 8-byte aligned
@	35	 dispatch on (source & 3)
@	100	 source turned out aligned as well -> back to 4
@	110/112	 source is word aligned + 1
@	120/122	 source is word aligned + 2
@	130/132	 source is word aligned + 3
@
@ NOTE(review): every test on count uses a signed condition (blt/bge), so a
@ count with bit 31 set takes the small-copy path at 10 and copies at most
@ 3 bytes.  The loops at 112/122/132 rely on r2 going briefly negative, so
@ this cannot be fixed by simply switching to unsigned conditions.
@ NOTE(review): the lsl/lsr merge order in 112/122/132 looks specific to
@ little-endian byte order - confirm before building with __ARMEB__.
	.thumb_func
	.global	memcpy
	.type	memcpy,%function
memcpy:
	pld	[r1]
	mov	r12, r0		@ stash original r0
	cmp	r2,#32
	blt	10f		@ take the small copy case separately

	@ test for either source or destination being misaligned
	@ (We only rely on word align)
	@ TODO: Test for co-misalignment
	tst	r0,#3
	it	eq
	tsteq	r1,#3
	bne	30f		@ misaligned case

4:
	@ at this point we are word (or better) aligned and have at least
	@ 32 bytes to play with
	push	{r3,r4,r5,r6,r7,r8,r10,r11}
5:
	@ copy 32 bytes per pass; the flags from the cmp survive the stmia
	ldmia	r1!,{r3,r4,r5,r6,r7,r8,r10,r11}
	pld	[r1,#96]
	sub	r2,r2,#32
	cmp	r2,#32
	stmia	r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
	bge	5b

	pop	{r3,r4,r5,r6,r7,r8,r10,r11}
	@ We are now down to less than 32 bytes
	cbz	r2,15f		@ quick exit for the case where we copied a multiple of 32

10:	@ small copies (not necessarily aligned - note might be slightly more than 32bytes)
	cmp	r2,#4
	blt	12f
11:
	@ one word per pass while at least 4 bytes remain
	sub	r2,r2,#4
	cmp	r2,#4
	ldr	r3, [r1],#4
	str	r3, [r0],#4
	bge	11b
12:
	@ 0-3 bytes remain: bit 1 of r2 selects a halfword, bit 0 a byte
	tst	r2,#2
	itt	ne
	ldrhne	r3, [r1],#2
	strhne	r3, [r0],#2

	tst	r2,#1
	itt	ne
	ldrbne	r3, [r1],#1
	strbne	r3, [r0],#1

15:	@ exit
	mov	r0,r12		@ restore r0
	bx	lr

30:	@ non-aligned - at least 32 bytes to play with
	@ On v7 we're allowed to do ldr's and str's from arbitrary alignments
	@ but not ldrd/strd or ldm/stm
	@ Note Neon is often a better choice misaligned using vld1

	@ copy a byte at a time until the point where we have an aligned destination
	@ we know we have enough bytes to go to know we won't run out in this phase
	tst	r0,#7
	beq	35f

31:
	ldrb	r3,[r1],#1
	sub	r2,r2,#1
	strb	r3,[r0],#1
	tst	r0,#7
	bne	31b

	cmp	r2,#32		@ Get back to knowing we have 32 bytes to play with
	blt	11b

	@ Now the store address is aligned
35:
	push	{r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
	and	r6,r1,#3	@ how misaligned we are
	cmp	r6,#2
	cbz	r6, 100f	@ Go there if we're actually aligned
	bge	120f		@ And here if it's aligned on 2 or 3 byte
				@ Note might be worth splitting to bgt and a separate beq
				@ if the branches are well separated

	@ At this point dest is aligned, source is 1 byte forward
110:
	ldr	r3,[r1]		@ Misaligned load - but it gives the first 4 bytes to store
	sub	r2,r2,#3	@ Number of bytes left in whole words we can load
	add	r1,r1,#3	@ To aligned load address
	bic	r3,r3,#0xff000000	@ top byte is supplied by the next aligned word

112:
	@ r3 carries 3 pending bytes into each pass; every output word is the
	@ previous word shifted right by 8 merged with the next one shifted left by 24
	ldmia	r1!,{r5,r6,r7,r8}
	sub	r2,r2,#32
	cmp	r2,#32
	pld	[r1,#96]

	orr	r3,r3,r5,lsl#24
	mov	r4,r5,lsr#8
	mov	r5,r6,lsr#8
	orr	r4,r4,r6,lsl#24
	mov	r6,r7,lsr#8
	ldmia	r1!,{r10,r11,r12,r14}
	orr	r5,r5,r7,lsl#24
	mov	r7,r8,lsr#8
	orr	r6,r6,r8,lsl#24
	mov	r8,r10,lsr#8
	orr	r7,r7,r10,lsl#24
	mov	r10,r11,lsr#8
	orr	r8,r8,r11,lsl#24
	orr	r10,r10,r12,lsl#24
	mov	r11,r12,lsr#8
	orr	r11,r11,r14,lsl#24
	stmia	r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
	mov	r3,r14,lsr#8	@ leftover bytes seed the next pass

	bge	112b		@ flags still come from the cmp above

	@ Deal with the stragglers
	add	r2,r2,#3	@ undo the adjustments made at 110 so the
	sub	r1,r1,#3	@ bytes pending in r3 are re-read by the tail
	pop	{r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
	b	10b

100:	@ Dest and source aligned - must have been originally co-misaligned
	@ Fallback to main aligned case if still big enough
	pop	{r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
	b	4b		@ Big copies (32 bytes or more)

120:	@ Dest is aligned, source is align+2 or 3
	bgt	130f		@ Now split off for 3 byte offset

	ldrh	r3,[r1]
	sub	r2,r2,#2	@ Number of bytes left in whole words we can load
	add	r1,r1,#2	@ To aligned load address

122:
	@ same scheme as 112 with 2 pending bytes: shifts are 16/16
	ldmia	r1!,{r5,r6,r7,r8}
	sub	r2,r2,#32
	cmp	r2,#32
	pld	[r1,#96]

	orr	r3,r3,r5,lsl#16
	mov	r4,r5,lsr#16
	mov	r5,r6,lsr#16
	orr	r4,r4,r6,lsl#16
	mov	r6,r7,lsr#16
	ldmia	r1!,{r10,r11,r12,r14}
	orr	r5,r5,r7,lsl#16
	orr	r6,r6,r8,lsl#16
	mov	r7,r8,lsr#16
	orr	r7,r7,r10,lsl#16
	mov	r8,r10,lsr#16
	orr	r8,r8,r11,lsl#16
	mov	r10,r11,lsr#16
	orr	r10,r10,r12,lsl#16
	mov	r11,r12,lsr#16
	orr	r11,r11,r14,lsl#16
	stmia	r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
	mov	r3,r14,lsr#16	@ leftover bytes seed the next pass

	bge	122b		@ flags still come from the cmp above

	@ Deal with the stragglers
	add	r2,r2,#2
	sub	r1,r1,#2
	pop	{r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
	b	10b

130:	@ Dest is aligned, source is align+3
	ldrb	r3,[r1]
	sub	r2,r2,#1	@ Number of bytes left in whole words we can load
	add	r1,r1,#1	@ To aligned load address

132:
	@ same scheme as 112 with 1 pending byte: shifts are 8/24
	ldmia	r1!,{r5,r6,r7,r8}
	sub	r2,r2,#32
	cmp	r2,#32
	pld	[r1,#96]

	orr	r3,r3,r5,lsl#8
	mov	r4,r5,lsr#24
	mov	r5,r6,lsr#24
	orr	r4,r4,r6,lsl#8
	mov	r6,r7,lsr#24
	ldmia	r1!,{r10,r11,r12,r14}
	orr	r5,r5,r7,lsl#8
	mov	r7,r8,lsr#24
	orr	r6,r6,r8,lsl#8
	mov	r8,r10,lsr#24
	orr	r7,r7,r10,lsl#8
	orr	r8,r8,r11,lsl#8
	mov	r10,r11,lsr#24
	orr	r10,r10,r12,lsl#8
	mov	r11,r12,lsr#24
	orr	r11,r11,r14,lsl#8
	stmia	r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
	mov	r3,r14,lsr#24	@ leftover byte seeds the next pass

	bge	132b		@ flags still come from the cmp above

	@ Deal with the stragglers
	add	r2,r2,#1
	sub	r1,r1,#1
	pop	{r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
	b	10b

250

269

Older »