/***************************************************************************
 Copyright (c) 2009, Code Aurora Forum. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
     * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of Code Aurora nor the names of its contributors may
       be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 ***************************************************************************/
/***************************************************************************
 * Neon memmove: Attempts to do a memmove with Neon registers if possible.
 *
 *   dest: The destination buffer
 *   src:  The source buffer
 *   n:    The number of bytes to transfer
 ***************************************************************************/
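/* For reference, a minimal sketch of the C-visible prototype this routine
 * implements (the exact exported symbol name is an assumption here):
 *
 *   void *neon_memmove(void *dest, const void *src, size_t n);
 */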
/* The original code, built with RVCT, used PUSH/POP and VPUSH/VPOP.
 * However, it looks like the 2006 CodeSourcery assembler has issues
 * generating the correct object code for VPOP, resulting in horrific
 * stack crashes. As a result, I've temporarily mapped PUSH->STMDB,
 * POP->LDMIA, VPUSH->VSTMDB, and VPOP->VLDMIA. We can revert this
 * once we update our toolchain.
 *
 * Also, VSHL swaps the source register and the shift-amount register
 * around in 2006-q3. I've written those operands in the "wrong" order
 * here so the object code comes out correct, but we'll need to undo
 * that later...
 */
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	/* The requirements for memmove state that the function should
	 * operate as if data were first copied from the source into a
	 * temporary buffer, and then from that buffer into the
	 * destination. This allows a caller to copy between a source
	 * and a destination that overlap.
	 *
	 * We can't just do byte copies front-to-back automatically, since
	 * there's a good chance we have an overlap (why else would someone
	 * intentionally use memmove?).
	 *
	 * We'll break this into two parts: front-to-back, or back-to-front
	 * copies.
	 */
	blt	neon_front_to_back_copy
	bgt	neon_back_to_front_copy
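	/* A minimal C sketch of the dispatch above, assuming dest arrives
	 * in r0, src in r1, and n in r2 per the AAPCS (the compare that
	 * sets the flags is not shown in this listing; the helper names
	 * are illustrative, not real symbols):
	 *
	 *   void *memmove(void *dest, const void *src, size_t n)
	 *   {
	 *       if (dest < src)
	 *           copy_front_to_back(dest, src, n);
	 *       else if (dest > src)
	 *           copy_back_to_front(dest, src, n);
	 *       return dest;   // dest == src: nothing to do
	 *   }
	 */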
	/* #############################################################
	 * Front to Back copy
	 */
neon_front_to_back_copy:
	/* For small copies, just do a quick byte-by-byte copy. We can
	 * do this for front-to-back copies, aligned or unaligned, since
	 * we're only moving one byte at a time... */
neon_f2b_smallcopy_loop:
	b	neon_f2b_smallcopy_loop
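	/* The body of the loop above is elided in this listing; it
	 * presumably amounts to the classic byte copy:
	 *
	 *   while (n--) *dst++ = *src++;
	 */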
	/* Preload what we can... */
	/* The overlap window size is in r3. */
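	/* Sketch of the routing constraint, assuming r3 holds the
	 * distance between source and destination: the chunk size chosen
	 * per iteration must not exceed min(r3, n), or an iteration's
	 * stores could land on source bytes a later iteration still
	 * needs to read.
	 *
	 *   size_t window = (size_t)(src - dst);     // dst < src here
	 *   size_t route  = window < n ? window : n; // route picks the
	 *                                            // chunk size below
	 */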
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_f2b_check_align:
	/* Check alignment. */
	beq	neon_f2b_source_align_check
neon_f2b_source_align_check:
	bne	neon_f2b_nonaligned

neon_f2b_try_16_align:
	/* If we're copying more than 64 bytes, attempt to align on a
	 * 16-byte boundary. Smaller amounts don't seem to be worth
	 * handling. */
	blt	neon_f2b_align_route
	/* This is where we try for 16-byte alignment. */
	beq	neon_f2b_align_route
	bne	neon_f2b_align_16_4
neon_f2b_align_route:
	/* #############################################################
	 * Front to Back copy - aligned
	 *
	 * Note that we can't just route based on the size in r2. If that's
	 * larger than the overlap window in r3, we could potentially
	 * (and likely!) destroy data we're copying.
	 */
	bge	neon_f2b_copy_128_a
	bge	neon_f2b_copy_32_a
	bge	neon_f2b_copy_16_a
	bge	neon_f2b_copy_8_a
	bge	neon_f2b_copy_4_a
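	/* The compare instructions feeding the branches above are not
	 * shown in this listing; judging by the label names, the routing
	 * amounts to this sketch (route = min(window, size), as noted):
	 *
	 *   if      (route >= 128) goto copy_128_a;
	 *   else if (route >= 32)  goto copy_32_a;
	 *   else if (route >= 16)  goto copy_16_a;
	 *   else if (route >= 8)   goto copy_8_a;
	 *   else if (route >= 4)   goto copy_4_a;
	 *   else                   goto copy_1_a;
	 */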
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_f2b_copy_128_a_loop:
	/* 128 bytes per iteration: load q0-q7 from the source, then
	 * store all eight registers to the destination. */
	vld1.32	{q0,q1}, [r1]!
	vld1.32	{q2,q3}, [r1]!
	vld1.32	{q4,q5}, [r1]!
	vld1.32	{q6,q7}, [r1]!
	vst1.32	{q0,q1}, [r0]!
	vst1.32	{q2,q3}, [r0]!
	vst1.32	{q4,q5}, [r0]!
	vst1.32	{q6,q7}, [r0]!
	bne	neon_f2b_copy_128_a_loop
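	/* Note that q4-q7 alias d8-d15, which are callee-saved under the
	 * AAPCS; saving and restoring them is what the VSTMDB/VLDMIA
	 * (VPUSH/VPOP) workaround described at the top of the file is
	 * about. The prologue itself is not shown in this listing. */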
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	bge	neon_f2b_copy_32_a
	b	neon_f2b_copy_finish_a
neon_f2b_copy_32_a_loop:
	vld1.32	{q0,q1}, [r1]!
	vst1.32	{q0,q1}, [r0]!
	bne	neon_f2b_copy_32_a_loop
neon_f2b_copy_finish_a:
	beq	neon_f2b_copy_8_a
neon_f2b_copy_16_a_loop:
	bne	neon_f2b_copy_16_a_loop
	blt	neon_f2b_copy_4_a
	blt	neon_f2b_copy_1_a
neon_f2b_copy_1_a_loop:
	bne	neon_f2b_copy_1_a_loop
	/* #############################################################
	 * Front to Back copy - unaligned
	 *
	 * For sizes < 8, does it really make sense to do the whole shift
	 * party? Note that we DON'T want to call neon_f2b_copy_1_u,
	 * since we'll end up trying to pop r8-r11, and we DON'T want
	 * to do that.
	 */
	ble	neon_f2b_copy_1_a
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	orrle	r4, r4, r5, lsl #8
	orrlt	r4, r4, r6, lsl #16
	/* r8  = # of bits we copied into the r4 register to align source.
	 * r12 = index counter for each size: we determine how many times
	 *       the given size will go into r2, then count down that
	 *       number of times in r12. */
	blt	neon_f2b_unaligned_route
	beq	neon_f2b_unaligned_route
	blt	neon_f2b_unaligned_route

neon_f2b_align_16_4_u:
	orr	r4, r4, r5, lsl r8
	bne	neon_f2b_align_16_4_u
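	/* A sketch of the shift-merge trick used throughout the unaligned
	 * path, with r8 bits of pending data held in r4 (all names below
	 * are illustrative, not the registers used by the code):
	 *
	 *   uint32_t next = *aligned_src++;          // aligned word load
	 *   uint32_t out  = partial | (next << k);   // k == r8
	 *   partial = next >> (32 - k);              // carry the leftover
	 *   *aligned_dst++ = out;                    // bits to next round
	 */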
neon_f2b_unaligned_route:
	bge	neon_f2b_copy_64_u
	bge	neon_f2b_copy_32_u
	bge	neon_f2b_copy_16_u
	bge	neon_f2b_copy_8_u
	bge	neon_f2b_copy_4_u
	b	neon_f2b_last_bits_u
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_f2b_copy_64_u_loop:
	vld1.32	{q4, q5}, [r1]!
	vld1.32	{q6, q7}, [r1]!
	/* The condition flags (set from the misalignment, not shown in
	 * this listing) select which byte-shift variant realigns the
	 * loaded data before it is stored. */
	bcc	neon_f2b_copy_64_u_b8
	bpl	neon_f2b_copy_64_u_b16
	b	neon_f2b_copy_64_unify
neon_f2b_copy_64_u_b8:
	b	neon_f2b_copy_64_unify
neon_f2b_copy_64_u_b16:
neon_f2b_copy_64_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	vst1.32	{q4, q5}, [r0]!
	vst1.32	{q6, q7}, [r0]!
	bne	neon_f2b_copy_64_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	bge	neon_f2b_copy_32_u
	b	neon_f2b_copy_finish_u
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_f2b_copy_32_u_loop:
	vld1.32	{q0, q1}, [r1]!
	bcc	neon_f2b_copy_32_u_b8
	bpl	neon_f2b_copy_32_u_b16
	b	neon_f2b_copy_32_unify
neon_f2b_copy_32_u_b8:
	b	neon_f2b_copy_32_unify
neon_f2b_copy_32_u_b16:
neon_f2b_copy_32_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	vst1.32	{q0, q1}, [r0]!
	bne	neon_f2b_copy_32_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_f2b_copy_finish_u:
	beq	neon_f2b_copy_8_u
neon_f2b_copy_16_u_loop:
	bcc	neon_f2b_copy_16_u_b8
	bpl	neon_f2b_copy_16_u_b16
	b	neon_f2b_copy_16_unify
neon_f2b_copy_16_u_b8:
	b	neon_f2b_copy_16_unify
neon_f2b_copy_16_u_b16:
neon_f2b_copy_16_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	bne	neon_f2b_copy_16_u_loop
	blt	neon_f2b_copy_4_u
	orr	r4, r4, r6, lsl r8
	orr	r5, r5, r7, lsl r8
	blt	neon_f2b_last_bits_u
	orr	r4, r4, r5, lsl r8
neon_f2b_last_bits_u:
neon_f2b_last_bits_u_loop:
	bne	neon_f2b_last_bits_u_loop
	beq	neon_f2b_finish_u
neon_f2b_copy_1_u_loop:
	bne	neon_f2b_copy_1_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	/* #############################################################
	 * Front to Back copy - finish
	 */
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	/* #############################################################
	 * Back to Front copy
	 */
neon_back_to_front_copy:
	/* Here, we'll want to shift to the end of the buffers. This
	 * actually points us one past where we need to go, but since
	 * we'll pre-decrement throughout, this will be fine. */
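	/* Sketch of the setup just described (the pointer adjustments
	 * themselves are not shown in this listing):
	 *
	 *   dst += n;          // one past the last byte to write
	 *   src += n;          // one past the last byte to read
	 *   ...
	 *   *--dst = *--src;   // pre-decrement walks backward
	 */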
neon_b2f_smallcopy_loop:
	beq	neon_memmove_done
	b	neon_b2f_smallcopy_loop

	/* The minimum of the overlap window size and the copy size
	 * goes into r3, as in the front-to-back path. */
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	/* Check alignment. Since we'll pre-decrement as we step through,
	 * we'll need to make sure we're on word alignment. */
neon_b2f_check_align:
	beq	neon_b2f_source_align_check
neon_b2f_shift_align:
	bne	neon_b2f_shift_align
neon_b2f_source_align_check:
	bne	neon_b2f_nonaligned
neon_b2f_try_16_align:
	/* If we're copying more than 64 bytes, attempt to align on a
	 * 16-byte boundary. Smaller amounts don't seem to be worth
	 * handling. */
	blt	neon_b2f_align_route
	beq	neon_b2f_align_route
	/* In this case, r12 has the number of bytes to roll backward. */
	bne	neon_b2f_align_16_4
neon_b2f_align_route:
	/* #############################################################
	 * Back to Front copy - aligned
	 */
	bge	neon_b2f_copy_128_a
	bge	neon_b2f_copy_32_a
	bge	neon_b2f_copy_8_a
	bge	neon_b2f_copy_4_a
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	/* This irks me. There MUST be a better way to read these in and
	 * scan the register backward instead of making it go forward.
	 * Then we need to do two subtractions... */
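	/* What the grumble above refers to, as a sketch: the pointers are
	 * presumably rolled back by 128 before the loads (not shown), the
	 * post-incremented loads and stores below then walk FORWARD
	 * through the 128-byte block, and two subtractions roll both
	 * pointers back again for the next, earlier block. */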
neon_b2f_copy_128_a_loop:
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	vld1.32	{q4, q5}, [r1]!
	vld1.32	{q6, q7}, [r1]!
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	vst1.32	{q4, q5}, [r0]!
	vst1.32	{q6, q7}, [r0]!
	bne	neon_b2f_copy_128_a_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	bge	neon_b2f_copy_32_a
	b	neon_b2f_copy_finish_a
neon_b2f_copy_32_a_loop:
	vld1.32	{q0,q1}, [r1]
	vst1.32	{q0,q1}, [r0]
	bne	neon_b2f_copy_32_a_loop
neon_b2f_copy_finish_a:
	/* r12 = number of remaining 8-byte chunks (r2 / 8). */
	movs	r12, r2, lsr #0x3
	beq	neon_b2f_copy_4_a
neon_b2f_copy_8_a_loop:
	bne	neon_b2f_copy_8_a_loop
	/* r12 = number of remaining 4-byte chunks (r2 / 4). */
	movs	r12, r2, lsr #0x2
	beq	neon_b2f_copy_1_a
neon_b2f_copy_4_a_loop:
	bne	neon_b2f_copy_4_a_loop
neon_b2f_copy_1_a_loop:
	bne	neon_b2f_copy_1_a_loop
	/* #############################################################
	 * Back to Front copy - unaligned
	 *
	 * For sizes < 8, does it really make sense to do the whole shift
	 * party?
	 */
	ble	neon_b2f_copy_1_a
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	/* r3 = max window size
	 * r4 = overflow bytes
	 * r5 = bytes we're reading into
	 * r6 = # of bytes we're off
	 */
	orr	r4, r5, r4, lsl #8
	/* r10 = # of bits we copied into the r4 register to align source.
	 * r12 = index counter for each size: we determine how many times
	 *       the given size will go into r2, then count down that
	 *       number of times in r12. */
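	/* Mirror of the front-to-back shift-merge, walking backward; the
	 * merges below use LSR because the pending bits now sit at the
	 * TOP of the partial word (names illustrative, k == r10):
	 *
	 *   uint32_t next = *--aligned_src;
	 *   uint32_t out  = partial | (next >> k);
	 *   partial = next << (32 - k);
	 *   *--aligned_dst = out;
	 */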
	bge	neon_b2f_copy_64_u
	bge	neon_b2f_copy_32_u
	bge	neon_b2f_copy_8_u
	bge	neon_b2f_copy_4_u
	b	neon_b2f_last_bits_u
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_b2f_copy_64_u_loop:
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]
	bcc	neon_b2f_copy_64_u_b8
	bpl	neon_b2f_copy_64_u_b16
	b	neon_b2f_copy_64_unify
neon_b2f_copy_64_u_b8:
	b	neon_b2f_copy_64_unify
neon_b2f_copy_64_u_b16:
neon_b2f_copy_64_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]
	bne	neon_b2f_copy_64_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	bge	neon_b2f_copy_32_u
	b	neon_b2f_copy_finish_u
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_b2f_copy_32_u_loop:
	vld1.32	{q0, q1}, [r1]
	bcc	neon_b2f_copy_32_u_b8
	bpl	neon_b2f_copy_32_u_b16
	b	neon_b2f_copy_32_unify
neon_b2f_copy_32_u_b8:
	b	neon_b2f_copy_32_unify
neon_b2f_copy_32_u_b16:
neon_b2f_copy_32_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	vst1.32	{q0, q1}, [r0]
	bne	neon_b2f_copy_32_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_b2f_copy_finish_u:
	/* r12 = number of remaining 8-byte chunks (r2 / 8). */
	movs	r12, r2, lsr #0x3
	beq	neon_b2f_copy_4_u
neon_b2f_copy_8_u_loop:
	orr	r5, r5, r7, lsr r10
	orr	r4, r4, r6, lsr r10
	bne	neon_b2f_copy_8_u_loop
	/* r12 = number of remaining 4-byte chunks (r2 / 4). */
	movs	r12, r2, lsr #0x2
	beq	neon_b2f_last_bits_u
neon_b2f_copy_4_u_loop:
	orr	r5, r5, r6, lsr r10
	bne	neon_b2f_copy_4_u_loop
neon_b2f_last_bits_u:
neon_b2f_last_bits_u_loop:
	bne	neon_b2f_last_bits_u_loop
	beq	neon_b2f_finish_u
neon_b2f_copy_1_u_loop:
	bne	neon_b2f_copy_1_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)