~linaro-toolchain-dev/cortex-strings/trunk

« back to all changes in this revision

Viewing changes to reference/bionic-a15/memcpy.S

Committer: Will Newton
Date: 2013-04-30 14:31:08 UTC
Revision ID: will.newton@linaro.org-20130430143108-ww31c741wek8dnus

Split bionic reference code into A15 and A9 versions.

files added:
reference/bionic-a15

reference/bionic-a15/memcmp.S

reference/bionic-a15/memcpy.S

reference/bionic-a15/memset.S

reference/bionic-a15/strcmp.S

reference/bionic-a15/strcpy.S

reference/bionic-a15/strlen.c

reference/bionic-a9

reference/bionic-a9/memcmp.S

reference/bionic-a9/memcpy.S

reference/bionic-a9/memset.S

reference/bionic-a9/strcmp.S

reference/bionic-a9/strcpy.S

reference/bionic-a9/strlen.c

files removed:
reference/bionic

reference/bionic/memcmp.S

reference/bionic/memcpy.S

reference/bionic/memset.S

reference/bionic/strcmp.S

reference/bionic/strcpy.S

reference/bionic/strlen.c

files modified:
Makefile.am

Show diffs side-by-side

added added

removed removed

reference/bionic-a15/memcpy.S

* Redistribution and use in source and binary forms, with or without

* modification, are permitted provided that the following conditions

* are met:

* * Redistributions of source code must retain the above copyright

* notice, this list of conditions and the following disclaimer.

* * Redistributions in binary form must reproduce the above copyright

* notice, this list of conditions and the following disclaimer in

* the documentation and/or other materials provided with the

* distribution.

* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS

* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED

* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,

* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT

* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

* SUCH DAMAGE.

* Redistribution and use in source and binary forms, with or without

* modification, are permitted provided that the following conditions

* are met:

* 1. Redistributions of source code must retain the above copyright

* notice, this list of conditions and the following disclaimer.

* 2. Redistributions in binary form must reproduce the above copyright

* notice, this list of conditions and the following disclaimer in the

* documentation and/or other materials provided with the distribution.

* 3. The name of the company may not be used to endorse or promote

* products derived from this software without specific prior written

* permission.

* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED

* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF

* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.

* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED

* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/* Prototype: void *memcpy (void *dst, const void *src, size_t count). */

// This version is tuned for the Cortex-A15 processor.

/*
 * Register usage in the code below:
 *   r0      dst cursor. The incoming dst is pushed at entry and popped
 *           back into r0 on every return path (it is the return value).
 *   r1      src cursor.
 *   r2      bytes remaining; temporarily biased by -64 / -32 around the
 *           bulk-copy loops and fixed up afterwards.
 *   r3, ip  scratch.
 *   lr      scratch; saved at entry, the function returns via pop {.., pc}.
 *   d0-d7   NEON scratch (unknown-alignment path).
 *   r4-r9   LDRD/STRD scratch (word-aligned path only); saved on the stack
 *           before use and restored before returning.
 */

        .text
        .syntax unified
        .fpu    neon

#define CACHE_LINE_SIZE 64

        .globl  memcpy
        .type   memcpy,%function
memcpy:
        .fnstart
        // Assumes that n >= 0, and dst, src are valid pointers.
        // For any sizes less than 832 use the neon code that doesn't
        // care about the src alignment. This avoids any checks
        // for src alignment, and offers the best improvement since
        // smaller sized copies are dominated by the overhead of
        // the pre and post main loop.
        // For larger copies, if src and dst cannot both be aligned to
        // word boundaries, use the neon code.
        // For all other copies, align dst to a double word boundary
        // and copy using LDRD/STRD instructions.

        // Save registers (r0 holds the return value):
        // optimized push {r0, lr}.
        .save   {r0, lr}
        pld     [r1, #(CACHE_LINE_SIZE*16)]
        push    {r0, lr}

        cmp     r2, #16
        blo     copy_less_than_16_unknown_align

        cmp     r2, #832
        bge     check_alignment

copy_unknown_alignment:
        // Unknown alignment of src and dst.
        // Assumes that the first few bytes have already been prefetched.

        // Align destination to 128 bits. The mainloop store instructions
        // require this alignment or they will throw an exception.
        rsb     r3, r0, #0
        ands    r3, r3, #0xF            // r3 = bytes needed to 16-byte align dst
        beq     2f

        // Copy up to 15 bytes (count in r3).
        sub     r2, r2, r3
        movs    ip, r3, lsl #31         // N = bit 0 of r3, C = bit 1 of r3

        itt     mi
        ldrbmi  lr, [r1], #1
        strbmi  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1], #1
        strbcs  ip, [r0], #1
        strbcs  lr, [r0], #1

        movs    ip, r3, lsl #29         // N = bit 2 of r3, C = bit 3 of r3
        bge     1f
        // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!

2:      // Make sure we have at least 64 bytes to copy.
        subs    r2, r2, #64
        blo     2f

1:      // The main loop copies 64 bytes at a time.
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(CACHE_LINE_SIZE*4)]
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        bhs     1b

2:      // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds    r2, r2, #32
        blo     3f

        // 32 bytes. These cache lines were already preloaded.
        vld1.8  {d0 - d3}, [r1]!
        sub     r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
3:      // Less than 32 left.
        add     r2, r2, #32
        tst     r2, #0x10
        beq     copy_less_than_16_unknown_align
        // Copies 16 bytes, destination 128 bits aligned.
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

copy_less_than_16_unknown_align:
        // Copy up to 15 bytes (count in r2).
        // Only the low four bits of r2 are examined from here on.
        movs    ip, r2, lsl #29         // N = bit 2 of r2, C = bit 3 of r2
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!

2:      // Copy 0 to 3 bytes.
        lsls    r2, r2, #31             // result != 0 iff bit 0 set, C = bit 1
        itt     ne
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

        pop     {r0, pc}

check_alignment:
        // If src and dst cannot both be aligned to a word boundary,
        // use the unaligned copy version.
        eor     r3, r0, r1
        ands    r3, r3, #0x3
        bne     copy_unknown_alignment

        // To try and improve performance, stack layout changed,
        // i.e., not keeping the stack looking like users expect
        // (highest numbered register at highest address).
        // TODO: Add debug frame directives.
        // We don't need exception unwind directives, because the code below
        // does not throw any exceptions and does not call any other functions.
        // Generally, newlib functions like this lack debug information for
        // assembler source.
        .save   {r4, r5}
        strd    r4, r5, [sp, #-8]!
        .save   {r6, r7}
        strd    r6, r7, [sp, #-8]!
        .save   {r8, r9}
        strd    r8, r9, [sp, #-8]!

        // Optimized for already aligned dst code.
        ands    ip, r0, #3              // ip = dst misalignment within a word
        bne     dst_not_word_aligned

word_aligned:
        // Align the destination buffer to 8 bytes, to make sure double
        // loads and stores don't cross a cache line boundary,
        // as they are then more expensive even if the data is in the cache
        // (require two load/store issue cycles instead of one).
        // If only one of the buffers is not 8 bytes aligned,
        // then it's more important to align dst than src,
        // because there is more penalty for stores
        // than loads that cross a cacheline boundary.
        // This check and realignment are only done if there is >= 832
        // bytes to copy.

        // Dst is word aligned, but check if it is already double word aligned.
        ands    r3, r0, #4
        beq     1f
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        sub     r2, #4

1:      // Can only get here if > 64 bytes to copy, so don't do check r2.
        sub     r2, #64

2:      // Every loop iteration copies 64 bytes.
        .irp    offset, #0, #8, #16, #24, #32
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr

        ldrd    r4, r5, [r1, #40]
        ldrd    r6, r7, [r1, #48]
        ldrd    r8, r9, [r1, #56]

        // Keep the pld as far from the next load as possible.
        // The amount to prefetch was determined experimentally using
        // large sizes, and verifying the prefetch size does not affect
        // the smaller copies too much.
        // WARNING: If the ldrd and strd instructions get too far away
        //          from each other, performance suffers. Three loads
        //          in a row is the best tradeoff.
        pld     [r1, #(CACHE_LINE_SIZE*16)]
        strd    r4, r5, [r0, #40]
        strd    r6, r7, [r0, #48]
        strd    r8, r9, [r0, #56]

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     2b

        // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds    r2, r2, #32
        blo     4f

        // Copy 32 bytes. These cache lines were already preloaded.
        .irp    offset, #0, #8, #16, #24
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #32
        add     r0, r0, #32
        sub     r2, r2, #32
4:      // Less than 32 left.
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // Copy 16 bytes.
        .irp    offset, #0, #8
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #16
        add     r0, r0, #16

5:      // Copy up to 15 bytes (count in r2).
        // Only the low four bits of r2 are examined from here on.
        movs    ip, r2, lsl #29         // N = bit 2 of r2, C = bit 3 of r2
        bcc     1f
        // Copy 8 bytes.
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
1:      bge     2f
        // Copy 4 bytes.
        ldr     r4, [r1], #4
        str     r4, [r0], #4
2:      // Copy 0 to 3 bytes.
        lsls    r2, r2, #31             // result != 0 iff bit 0 set, C = bit 1
        itt     ne
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

        // Restore registers: optimized pop {r0, pc}
        ldrd    r8, r9, [sp], #8
        ldrd    r6, r7, [sp], #8
        ldrd    r4, r5, [sp], #8
        pop     {r0, pc}

dst_not_word_aligned:
        // Align dst to word.
        // ip = 4 - (dst & 3), i.e. 1..3 bytes to copy; the conditional
        // pairs below copy ip - 1 of them and the final pair copies one.
        rsb     ip, ip, #4
        cmp     ip, #2

        itt     gt
        ldrbgt  lr, [r1], #1
        strbgt  lr, [r0], #1

        itt     ge
        ldrbge  lr, [r1], #1
        strbge  lr, [r0], #1

        ldrb    lr, [r1], #1
        strb    lr, [r0], #1

        sub     r2, r2, ip

        // Src is guaranteed to be at least word aligned by this point.
        b       word_aligned
        .fnend
        .size   memcpy, .-memcpy

Older »