 * Copyright (C) 2008 The Android Open Source Project
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
//#define HAVE_32_BYTE_CACHE_LINE
#if defined(__ARM_NEON__)
.type memcpy, %function
#ifdef HAVE_32_BYTE_CACHE_LINE
/* a prefetch distance of 2 cache-lines */
#define CACHE_LINE_SIZE 32
#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*2)
/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE 64
#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
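/* Illustrative sketch of what the NEON path below does, in C-like pseudo-code.
 * copy64() and prefetch() are hypothetical helpers, not functions that exist
 * in this file:
 *
 *     for (; len >= 64; len -= 64) {
 *         prefetch(src + PREFETCH_DISTANCE);   // request data 4 lines ahead
 *         copy64(dst, src);  src += 64;  dst += 64;
 *     }
 *
 * With a 64-byte line, PREFETCH_DISTANCE is 256 bytes, so each cache line is
 * requested roughly four iterations before it is consumed.
 */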
/* start preloading as early as possible */
pld [r1, #(CACHE_LINE_SIZE*0)]
pld [r1, #(CACHE_LINE_SIZE*1)]
/* do we have at least 16-bytes to copy (needed for alignment below) */
/* align destination to cache-line for the write-buffer */
/* copy up to 15-bytes (count in r3) */
// copies 4 bytes, destination 32-bits aligned
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
// copies 8 bytes, destination 64-bits aligned
vst1.8 {d0}, [r0, :64]!
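/* The two conditional copies above are rungs of an alignment ladder: each
 * rung raises the guaranteed alignment of the destination until it is
 * 16-byte aligned, which the ":128" store hints in the main loop require.
 * Rough C-like sketch, where r3 is taken to be the 0-15 byte count from the
 * "copy up to 15-bytes" step above (an assumption about elided lines):
 *
 *     if (r3 & 4) copy4(dst, src);   // vld4.8/vst4.8 single-lane copy
 *     if (r3 & 8) copy8(dst, src);   // vld1.8/vst1.8 of one d register
 *
 * The 1- and 2-byte rungs of the ladder are not shown in this excerpt.
 */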
0: /* immediately preload the next cache line, which we may need */
pld [r1, #(CACHE_LINE_SIZE*0)]
pld [r1, #(CACHE_LINE_SIZE*1)]
#ifdef HAVE_32_BYTE_CACHE_LINE
/* make sure we have at least 32 bytes to copy */
/* preload all the cache lines we need.
* NOTE: the number of pld below depends on PREFETCH_DISTANCE,
* ideally we would increase the distance in the main loop to
* avoid the goofy code below. In practice this doesn't seem to make
* a big difference.
*/
pld [r1, #(PREFETCH_DISTANCE)]
1: /* The main loop copies 32 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
pld [r1, #(PREFETCH_DISTANCE)]
vst1.8 {d0 - d3}, [r0, :128]!
/* make sure we have at least 64 bytes to copy */
/* preload all the cache lines we need.
* NOTE: the number of pld below depends on PREFETCH_DISTANCE,
* ideally we would increase the distance in the main loop to
* avoid the goofy code below. In practice this doesn't seem to make
* a big difference.
*/
pld [r1, #(CACHE_LINE_SIZE*2)]
pld [r1, #(CACHE_LINE_SIZE*3)]
pld [r1, #(PREFETCH_DISTANCE)]
1: /* The main loop copies 64 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
pld [r1, #(PREFETCH_DISTANCE)]
vst1.8 {d0 - d3}, [r0, :128]!
vst1.8 {d4 - d7}, [r0, :128]!
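/* Rough shape of the loop above, in C-like pseudo-code (illustrative only;
 * the count decrement and the backward branch sit on lines not shown here):
 *
 *     do {
 *         load 32 bytes into d0-d3 and 32 more into d4-d7;  src += 64;
 *         prefetch(src + PREFETCH_DISTANCE);
 *         len -= 64;
 *         store d0-d3 and d4-d7 to the 16-byte aligned dst;  dst += 64;
 *     } while (len >= 64);
 */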
2: /* fix up the remaining count and make sure we have >= 32 bytes left */
3: /* 32 bytes at a time. These cache lines were already preloaded */
vld1.8 {d0 - d3}, [r1]!
vst1.8 {d0 - d3}, [r0, :128]!
4: /* less than 32 left */
// copies 16 bytes, 128-bits aligned
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0, :128]!
5: /* copy up to 15-bytes (count in r2) */
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2: movs ip, r2, lsl #31
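/* "movs ip, r2, lsl #31" is the usual flag trick for a 0-3 byte tail:
 * shifting the count left by 31 leaves bit 1 in the carry flag and bit 0 in
 * the sign flag, so the conditional copies that follow (not shown in this
 * excerpt) amount to:
 *
 *     if (len & 2) copy2(dst, src);   // executed when carry is set
 *     if (len & 1) copy1(dst, src);   // executed when the result is negative
 */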
#else /* __ARM_ARCH__ < 7 */
* Optimized memcpy() for ARM.
* note that memcpy() always returns the destination pointer,
* so we have to preserve R0.
/* The stack must always be 64-bit aligned to be compliant with the
* ARM ABI. Since we have to save R0, we might as well save R4,
* which we can use for better pipelining of the reads below.
*/
stmfd sp!, {r0, r4, lr}
/* Making room for r5-r11 which will be spilled later */
// preload the destination because we'll align it to a cache line
// with small writes. Also start the source "pump".
/* it simplifies things to take care of len<4 early */
blo copy_last_3_and_return
/* compute the offset to align the source
* offset = (4-(src&3))&3 = -src & 3
*/
/* align source to 32 bits. We need to insert 2 instructions between
* a ldr[b|h] and str[b|h] because byte and half-word instructions
movs r12, r3, lsl #31
sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
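/* Worked example for the identity quoted above, (4 - (src & 3)) & 3 == -src & 3:
 *     src & 3 == 0  ->  offset 0        src & 3 == 1  ->  offset 3
 *     src & 3 == 2  ->  offset 2        src & 3 == 3  ->  offset 1
 * The "lsl #31" above then exposes offset bit 0 in the sign flag and bit 1 in
 * the carry flag, so the conditional byte copies that follow (elided here)
 * advance src and dst to the next word boundary.
 */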
/* see if src and dst are aligned together (congruent) */
/* Use post-increment mode for stm to spill r5-r11 to the reserved stack
* frame. Don't update sp.
*/
/* align the destination to a cache-line */
beq congruent_aligned32
/* conditionally copies 0 to 7 words (length in r3) */
movs r12, r3, lsl #28
ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */
ldmmiia r1!, {r8, r9} /* 8 bytes */
stmcsia r0!, {r4, r5, r6, r7}
stmmiia r0!, {r8, r9}
ldrne r10,[r1], #4 /* 4 bytes */
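/* Same flag trick one level up: "lsl #28" moves bit 4 of the byte count into
 * the carry flag and bit 3 into the sign flag, so the ldm/stm pairs above copy
 * 16 and/or 8 bytes; the ldrne above handles a 4-byte remainder (its guarding
 * test of bit 2 and the matching strne are on lines not shown here).
 * C-like sketch:
 *     if (n & 16) copy16();
 *     if (n &  8) copy8();
 *     if (n &  4) copy4();
 */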
* here source is aligned to 32 bytes.
blo less_than_32_left
* We preload a cache-line up to 64 bytes ahead. On the 926, this will
* stall only until the requested word is fetched, but the linefill
* continues in the background.
* While the linefill is going, we write our previous cache-line
* into the write-buffer (which should have some free space).
* When the linefill is done, the write-buffer will
* start dumping its content into memory.
* While all this is going, we then load a full cache line into
* 8 registers; this cache line should be in the cache by now
* (or partly in the cache).
* This code should work well regardless of the source/dest alignment.
// Align the preload register to a cache-line because the cpu does
// "critical word first" (the first word requested is loaded first).
1: ldmia r1!, { r4-r11 }
// NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
// for the ARM9 preload will not be safely guarded by the preceding subs.
// When it is safely guarded, the only way to get a SIGSEGV here
// is if the caller overstates the length.
ldrhi r3, [r12], #32 /* cheap ARM9 preload */
stmia r0!, { r4-r11 }
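/* C-like sketch of the loop above (illustrative; the count update and the
 * backward branch are on elided lines):
 *
 *     while (at least 32 bytes remain) {
 *         r4..r11 = 32 bytes loaded from src;   src += 32;
 *         if (length check allows it)
 *             (void)*preload_ptr, preload_ptr += 32;   // dummy load
 *         store r4..r11 to dst;                 dst += 32;
 *     }
 *
 * The dummy load touches the next cache line so the core starts the linefill
 * early; it stands in for pld, as the comments above describe.
 */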
* less than 32 bytes left at this point (length in r2)
/* skip all this if there is nothing to do, which should
* be a common case (if not executed the code below takes
/* conditionally copies 0 to 31 bytes */
movs r12, r2, lsl #28
ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */
ldmmiia r1!, {r8, r9} /* 8 bytes */
stmcsia r0!, {r4, r5, r6, r7}
stmmiia r0!, {r8, r9}
movs r12, r2, lsl #30
ldrcs r3, [r1], #4 /* 4 bytes */
ldrmih r4, [r1], #2 /* 2 bytes */
ldrneb r3, [r1] /* last byte */
/* we're done! restore everything and return */
1: ldmfd sp!, {r5-r11}
ldmfd sp!, {r0, r4, lr}
/********************************************************************/
* here source is aligned to 4 bytes
* but destination is not.
* in the code below r2 is the number of bytes read
* (the number of bytes written is always smaller, because we have
* partial words in the shift queue)
blo copy_last_3_and_return
/* Use post-increment mode for stm to spill r5-r11 to the reserved stack
* frame. Don't update sp.
*/
/* compute shifts needed to align src to dest */
and r5, r5, #3 /* r5 = # bytes in partial words */
mov r12, r5, lsl #3 /* r12 = right */
rsb lr, r12, #32 /* lr = left */
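/* Shift-queue setup: r12 and lr are the two shift amounts, in bits, used to
 * stitch every output word together from two adjacent misaligned source
 * words. C-like sketch (little-endian, r5 as computed above):
 *
 *     right = 8 * r5;
 *     left  = 32 - right;
 *     out   = (w_cur >> right) | (w_next << left);
 *
 * Each output word takes the high part of one source word and the low part
 * of the next, which is what the unrolled loops below do with immediate
 * shifts of 8, 16 and 24.
 */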
/* read the first word */
/* write a partial word (0 to 3 bytes), such that destination
* becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
blo partial_word_tail
/* Align destination to 32 bytes (cache line boundary) */
orr r4, r3, r5, lsl lr
blo partial_word_tail
/* copy 32 bytes at a time */
blo less_than_thirtytwo
/* Use immediate mode for the shifts, because there is an extra cycle
* for register shifts, which could account for up to 50% of
ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
orr r3, r3, r4, lsl #16
orr r4, r4, r5, lsl #16
orr r5, r5, r6, lsl #16
orr r6, r6, r7, lsl #16
orr r7, r7, r8, lsl #16
orr r8, r8, r9, lsl #16
orr r9, r9, r10, lsl #16
mov r10, r10, lsr #16
orr r10, r10, r11, lsl #16
stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
b less_than_thirtytwo
ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
orr r3, r3, r4, lsl #24
orr r4, r4, r5, lsl #24
orr r5, r5, r6, lsl #24
orr r6, r6, r7, lsl #24
orr r7, r7, r8, lsl #24
orr r8, r8, r9, lsl #24
orr r9, r9, r10, lsl #24
orr r10, r10, r11, lsl #24
stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
b less_than_thirtytwo
ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
orr r3, r3, r4, lsl #8
orr r4, r4, r5, lsl #8
orr r5, r5, r6, lsl #8
orr r6, r6, r7, lsl #8
orr r7, r7, r8, lsl #8
orr r8, r8, r9, lsl #8
orr r9, r9, r10, lsl #8
mov r10, r10, lsr #24
orr r10, r10, r11, lsl #8
stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
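/* All three unrolled loops above implement the same recurrence; only the
 * shift pair differs: (16,16), (8,24) or (24,8) depending on the byte offset.
 * The immediates are hard-coded because, as noted earlier, register-specified
 * shifts cost an extra cycle.
 */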
/* copy the last 0 to 31 bytes of the source */
rsb r12, lr, #32 /* we corrupted r12, recompute it */
blo partial_word_tail
orr r4, r3, r5, lsl lr
/* we have a partial word in the input buffer */
movs r5, lr, lsl #(31-3)
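/* lr still holds the left-shift amount from above, i.e. 8 times the number of
 * bytes left in the shift queue (8, 16 or 24). Shifting it left by 28 puts the
 * "8" bit in the sign flag and the "16" bit in the carry flag, so the
 * conditional byte stores that follow (not shown in this excerpt) write out
 * the remaining 1, 2 or 3 bytes:
 *     if (lr & 8)  store 1 byte;
 *     if (lr & 16) store 2 bytes;
 */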
/* Refill spilled registers from the stack. Don't update sp. */
copy_last_3_and_return:
movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */
/* we're done! restore sp and spilled registers and return */
ldmfd sp!, {r0, r4, lr}
#endif /* __ARM_ARCH__ < 7 */