~linaro-toolchain-dev/cortex-strings/trunk

« back to all changes in this revision

Viewing changes to src/reference/bionic/memcpy.S

Committer: Michael Hope
Date: 2010-08-26 22:19:29 UTC
Revision ID: michael.hope@linaro.org-20100826221929-ppeg01mnpx34aqrp

Pulled in the initial versions

files added:

src/reference

src/reference/Makefile.am

src/reference/bionic

src/reference/bionic/memcmp.S

src/reference/bionic/memcmp16.S

src/reference/bionic/memcpy.S

src/reference/bionic/memset.S

src/reference/bionic/strlen.c

src/reference/configure.ac

src/reference/glibc

src/reference/glibc/memcpy.S

src/reference/glibc/memmove.S

src/reference/glibc/memset.S

src/reference/glibc/strlen.S

src/reference/helpers

src/reference/helpers/bounce.c

src/reference/helpers/spawn.c

src/reference/newlib

src/reference/newlib/arm_asm.h

src/reference/newlib/strcmp.c

src/reference/newlib/strcpy.c

src/reference/newlib/strlen.c

src/reference/plain

src/reference/plain/memcpy.c

src/reference/test.py

Show diffs side-by-side

added added

removed removed

src/reference/bionic/memcpy.S

* Redistribution and use in source and binary forms, with or without

* modification, are permitted provided that the following conditions

* are met:

* * Redistributions of source code must retain the above copyright

* notice, this list of conditions and the following disclaimer.

* * Redistributions in binary form must reproduce the above copyright

* notice, this list of conditions and the following disclaimer in

* the documentation and/or other materials provided with the

* distribution.

* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS

* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED

* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,

* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT

* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

* SUCH DAMAGE.

#include <machine/cpu-features.h>

#if defined(__ARM_NEON__)

.text

.fpu neon

.global memcpy

.type memcpy, %function

.align 4

/* a prefetch distance of 4 cache-lines works best experimentally */

#define CACHE_LINE_SIZE 64

#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)

memcpy:

.fnstart

.save {r0, lr}

stmfd sp!, {r0, lr}

/* start preloading as early as possible */

pld [r1, #(CACHE_LINE_SIZE*0)]

pld [r1, #(CACHE_LINE_SIZE*1)]

/* do we have at least 16-bytes to copy (needed for alignment below) */

cmp r2, #16

blo 5f

/* align destination to half cache-line for the write-buffer */

rsb r3, r0, #0

ands r3, r3, #0xF

beq 0f

/* copy up to 15-bytes (count in r3) */

sub r2, r2, r3

movs ip, r3, lsl #31

ldrmib lr, [r1], #1

strmib lr, [r0], #1

ldrcsb ip, [r1], #1

ldrcsb lr, [r1], #1

strcsb ip, [r0], #1

strcsb lr, [r0], #1

movs ip, r3, lsl #29

bge 1f

// copies 4 bytes, destination 32-bits aligned

vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!

vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!

1: bcc 2f

// copies 8 bytes, destination 64-bits aligned

vld1.8 {d0}, [r1]!

vst1.8 {d0}, [r0, :64]!

0: /* preload immediately the next cache line, which we may need */

pld [r1, #(CACHE_LINE_SIZE*0)]

pld [r1, #(CACHE_LINE_SIZE*1)]

/* make sure we have at least 64 bytes to copy */

subs r2, r2, #64

blo 2f

/* preload all the cache lines we need.

* NOTE: the number of pld below depends on PREFETCH_DISTANCE,

* ideally we would increase the distance in the main loop to

* avoid the goofy code below. In practice this doesn't seem to make

* a big difference. */

pld [r1, #(CACHE_LINE_SIZE*2)]

pld [r1, #(CACHE_LINE_SIZE*3)]

pld [r1, #(PREFETCH_DISTANCE)]

100

1: /* The main loop copies 64 bytes at a time */

101

vld1.8 {d0 - d3}, [r1]!

102

vld1.8 {d4 - d7}, [r1]!

103

pld [r1, #(PREFETCH_DISTANCE)]

104

subs r2, r2, #64

105

vst1.8 {d0 - d3}, [r0, :128]!

106

vst1.8 {d4 - d7}, [r0, :128]!

107

bhs 1b

108

109

2: /* fix-up the remaining count and make sure we have >= 32 bytes left */

110

add r2, r2, #64

111

subs r2, r2, #32

112

blo 4f

113

114

3: /* 32 bytes at a time. These cache lines were already preloaded */

115

vld1.8 {d0 - d3}, [r1]!

116

subs r2, r2, #32

117

vst1.8 {d0 - d3}, [r0, :128]!

118

bhs 3b

119

120

4: /* less than 32 left */

121

add r2, r2, #32

122

tst r2, #0x10

123

beq 5f

124

// copies 16 bytes, 128-bits aligned

125

vld1.8 {d0, d1}, [r1]!

126

vst1.8 {d0, d1}, [r0, :128]!

127

128

5: /* copy up to 15-bytes (count in r2) */

129

movs ip, r2, lsl #29

130

bcc 1f

131

vld1.8 {d0}, [r1]!

132

vst1.8 {d0}, [r0]!

133

1: bge 2f

134

vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!

135

vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!

136

2: movs ip, r2, lsl #31

137

ldrmib r3, [r1], #1

138

ldrcsb ip, [r1], #1

139

ldrcsb lr, [r1], #1

140

strmib r3, [r0], #1

141

strcsb ip, [r0], #1

142

strcsb lr, [r0], #1

143

144

ldmfd sp!, {r0, lr}

145

bx lr

146

.fnend

147

148

149

#else /* !defined(__ARM_NEON__) */

150

151

152

.text

153

154

.global memcpy

155

.type memcpy, %function

156

.align 4

157

158

159

/* Optimized memcpy() for ARM.

160

161

* note that memcpy() always returns the destination pointer,

162

* so we have to preserve R0. */

163

164

165

memcpy:

166

/* The stack must always be 64-bits aligned to be compliant with the

167

* ARM ABI. Since we have to save R0, we might as well save R4

168

* which we can use for better pipelining of the reads below */

169

170

.fnstart

171

.save {r0, r4, lr}

172

stmfd sp!, {r0, r4, lr}

173

/* Making room for r5-r11 which will be spilled later */

174

.pad #28

175

sub sp, sp, #28

176

177

// preload the destination because we'll align it to a cache line

178

// with small writes. Also start the source "pump".

179

PLD (r0, #0)

180

PLD (r1, #0)

181

PLD (r1, #32)

182

183

/* it simplifies things to take care of len<4 early */

184

cmp r2, #4

185

blo copy_last_3_and_return

186

187

/* compute the offset to align the source

188

* offset = (4-(src&3))&3 = -src & 3 */

189

190

rsb r3, r1, #0

191

ands r3, r3, #3

192

beq src_aligned

193

194

/* align source to 32 bits. We need to insert 2 instructions between

195

* a ldr[b|h] and str[b|h] because byte and half-word instructions

196

* stall 2 cycles. */

197

198

movs r12, r3, lsl #31

199

sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */

200

ldrmib r3, [r1], #1

201

ldrcsb r4, [r1], #1

202

ldrcsb r12,[r1], #1

203

strmib r3, [r0], #1

204

strcsb r4, [r0], #1

205

strcsb r12,[r0], #1

206

207

src_aligned:

208

209

/* see if src and dst are aligned together (congruent) */

210

eor r12, r0, r1

211

tst r12, #3

212

bne non_congruent

213

214

/* Use post-increment mode for stm to spill r5-r11 to reserved stack

215

* frame. Don't update sp. */

216

217

stmea sp, {r5-r11}

218

219

/* align the destination to a cache-line */

220

rsb r3, r0, #0

221

ands r3, r3, #0x1C

222

beq congruent_aligned32

223

cmp r3, r2

224

andhi r3, r2, #0x1C

225

226

/* conditionally copies 0 to 7 words (length in r3) */

227

movs r12, r3, lsl #28

228

ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */

229

ldmmiia r1!, {r8, r9} /* 8 bytes */

230

stmcsia r0!, {r4, r5, r6, r7}

231

stmmiia r0!, {r8, r9}

232

tst r3, #0x4

233

ldrne r10,[r1], #4 /* 4 bytes */

234

strne r10,[r0], #4

235

sub r2, r2, r3

236

237

congruent_aligned32:

238

239

/* here the destination is aligned to 32 bytes (or fewer than 32 bytes remain). */

240

241

242

cached_aligned32:

243

subs r2, r2, #32

244

blo less_than_32_left

245

246

247

/* We preload a cache-line up to 64 bytes ahead. On the 926, this will

248

* stall only until the requested word is fetched, but the linefill

249

* continues in the background.

250

* While the linefill is going, we write our previous cache-line

251

* into the write-buffer (which should have some free space).

252

* When the linefill is done, the writebuffer will

253

* start dumping its content into memory

254

255

* While all this is going, we then load a full cache line into

256

* 8 registers, this cache line should be in the cache by now

257

* (or partly in the cache).

258

259

* This code should work well regardless of the source/dest alignment. */

260

261

262

263

// Align the preload register to a cache-line because the cpu does

264

// "critical word first" (the first word requested is loaded first).

265

bic r12, r1, #0x1F

266

add r12, r12, #64

267

268

1: ldmia r1!, { r4-r11 }

269

PLD (r12, #64)

270

subs r2, r2, #32

271

272

// NOTE: if r12 is more than 64 ahead of r1, the following ldrhi

273

// for ARM9 preload will not be safely guarded by the preceding subs.

274

// When it is safely guarded the only possibility to have SIGSEGV here

275

// is because the caller overstates the length.

276

ldrhi r3, [r12], #32 /* cheap ARM9 preload */

277

stmia r0!, { r4-r11 }

278

bhs 1b

279

280

add r2, r2, #32

281

282

283

284

285

less_than_32_left:

286

287

/* less than 32 bytes left at this point (length in r2) */

288

289

290

/* skip all this if there is nothing to do, which should

291

* be a common case (if not executed the code below takes

292

* about 16 cycles) */

293

294

tst r2, #0x1F

295

beq 1f

296

297

/* conditionally copies 0 to 31 bytes */

298

movs r12, r2, lsl #28

299

ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */

300

ldmmiia r1!, {r8, r9} /* 8 bytes */

301

stmcsia r0!, {r4, r5, r6, r7}

302

stmmiia r0!, {r8, r9}

303

movs r12, r2, lsl #30

304

ldrcs r3, [r1], #4 /* 4 bytes */

305

ldrmih r4, [r1], #2 /* 2 bytes */

306

strcs r3, [r0], #4

307

strmih r4, [r0], #2

308

tst r2, #0x1

309

ldrneb r3, [r1] /* last byte */

310

strneb r3, [r0]

311

312

/* we're done! restore everything and return */

313

1: ldmfd sp!, {r5-r11}

314

ldmfd sp!, {r0, r4, lr}

315

bx lr

316

317

/********************************************************************/

318

319

non_congruent:

320

321

/* here source is aligned to 4 bytes

322

* but destination is not.

323

324

* in the code below r2 is the number of bytes read

325

* (the number of bytes written is always smaller, because we have

326

* partial words in the shift queue) */

327

328

cmp r2, #4

329

blo copy_last_3_and_return

330

331

/* Use post-increment mode for stm to spill r5-r11 to reserved stack

332

* frame. Don't update sp. */

333

334

stmea sp, {r5-r11}

335

336

/* compute shifts needed to align src to dest */

337

rsb r5, r0, #0

338

and r5, r5, #3 /* r5 = # bytes in partial words */

339

mov r12, r5, lsl #3 /* r12 = right */

340

rsb lr, r12, #32 /* lr = left */

341

342

/* read the first word */

343

ldr r3, [r1], #4

344

sub r2, r2, #4

345

346

/* write a partial word (0 to 3 bytes), such that destination

347

* becomes aligned to 32 bits (r5 = nb of bytes to copy for alignment) */

348

349

movs r5, r5, lsl #31

350

strmib r3, [r0], #1

351

movmi r3, r3, lsr #8

352

strcsb r3, [r0], #1

353

movcs r3, r3, lsr #8

354

strcsb r3, [r0], #1

355

movcs r3, r3, lsr #8

356

357

cmp r2, #4

358

blo partial_word_tail

359

360

/* Align destination to 32 bytes (cache line boundary) */

361

1: tst r0, #0x1c

362

beq 2f

363

ldr r5, [r1], #4

364

sub r2, r2, #4

365

orr r4, r3, r5, lsl lr

366

mov r3, r5, lsr r12

367

str r4, [r0], #4

368

cmp r2, #4

369

bhs 1b

370

blo partial_word_tail

371

372

/* copy 32 bytes at a time */

373

2: subs r2, r2, #32

374

blo less_than_thirtytwo

375

376

/* Use immediate mode for the shifts, because there is an extra cycle

377

* for register shifts, which could account for up to 50% of

378

* performance hit. */

379

380

381

cmp r12, #24

382

beq loop24

383

cmp r12, #8

384

beq loop8

385

386

loop16:

387

ldr r12, [r1], #4

388

1: mov r4, r12

389

ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}

390

PLD (r1, #64)

391

subs r2, r2, #32

392

ldrhs r12, [r1], #4

393

orr r3, r3, r4, lsl #16

394

mov r4, r4, lsr #16

395

orr r4, r4, r5, lsl #16

396

mov r5, r5, lsr #16

397

orr r5, r5, r6, lsl #16

398

mov r6, r6, lsr #16

399

orr r6, r6, r7, lsl #16

400

mov r7, r7, lsr #16

401

orr r7, r7, r8, lsl #16

402

mov r8, r8, lsr #16

403

orr r8, r8, r9, lsl #16

404

mov r9, r9, lsr #16

405

orr r9, r9, r10, lsl #16

406

mov r10, r10, lsr #16

407

orr r10, r10, r11, lsl #16

408

stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}

409

mov r3, r11, lsr #16

410

bhs 1b

411

b less_than_thirtytwo

412

413

loop8:

414

ldr r12, [r1], #4

415

1: mov r4, r12

416

ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}

417

PLD (r1, #64)

418

subs r2, r2, #32

419

ldrhs r12, [r1], #4

420

orr r3, r3, r4, lsl #24

421

mov r4, r4, lsr #8

422

orr r4, r4, r5, lsl #24

423

mov r5, r5, lsr #8

424

orr r5, r5, r6, lsl #24

425

mov r6, r6, lsr #8

426

orr r6, r6, r7, lsl #24

427

mov r7, r7, lsr #8

428

orr r7, r7, r8, lsl #24

429

mov r8, r8, lsr #8

430

orr r8, r8, r9, lsl #24

431

mov r9, r9, lsr #8

432

orr r9, r9, r10, lsl #24

433

mov r10, r10, lsr #8

434

orr r10, r10, r11, lsl #24

435

stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}

436

mov r3, r11, lsr #8

437

bhs 1b

438

b less_than_thirtytwo

439

440

loop24:

441

ldr r12, [r1], #4

442

1: mov r4, r12

443

ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}

444

PLD (r1, #64)

445

subs r2, r2, #32

446

ldrhs r12, [r1], #4

447

orr r3, r3, r4, lsl #8

448

mov r4, r4, lsr #24

449

orr r4, r4, r5, lsl #8

450

mov r5, r5, lsr #24

451

orr r5, r5, r6, lsl #8

452

mov r6, r6, lsr #24

453

orr r6, r6, r7, lsl #8

454

mov r7, r7, lsr #24

455

orr r7, r7, r8, lsl #8

456

mov r8, r8, lsr #24

457

orr r8, r8, r9, lsl #8

458

mov r9, r9, lsr #24

459

orr r9, r9, r10, lsl #8

460

mov r10, r10, lsr #24

461

orr r10, r10, r11, lsl #8

462

stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}

463

mov r3, r11, lsr #24

464

bhs 1b

465

466

467

less_than_thirtytwo:

468

/* copy the last 0 to 31 bytes of the source */

469

rsb r12, lr, #32 /* we corrupted r12, recompute it */

470

add r2, r2, #32

471

cmp r2, #4

472

blo partial_word_tail

473

474

1: ldr r5, [r1], #4

475

sub r2, r2, #4

476

orr r4, r3, r5, lsl lr

477

mov r3, r5, lsr r12

478

str r4, [r0], #4

479

cmp r2, #4

480

bhs 1b

481

482

partial_word_tail:

483

/* we have a partial word in the input buffer */

484

movs r5, lr, lsl #(31-3)

485

strmib r3, [r0], #1

486

movmi r3, r3, lsr #8

487

strcsb r3, [r0], #1

488

movcs r3, r3, lsr #8

489

strcsb r3, [r0], #1

490

491

/* Refill spilled registers from the stack. Don't update sp. */

492

ldmfd sp, {r5-r11}

493

494

copy_last_3_and_return:

495

movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */

496

ldrmib r2, [r1], #1

497

ldrcsb r3, [r1], #1

498

ldrcsb r12,[r1]

499

strmib r2, [r0], #1

500

strcsb r3, [r0], #1

501

strcsb r12,[r0]

502

503

/* we're done! restore sp and spilled registers and return */

504

add sp, sp, #28

505

ldmfd sp!, {r0, r4, lr}

506

bx lr

507

.fnend

508

509

510

#endif /* !defined(__ARM_NEON__) */

Older »