~linaro-toolchain-dev/cortex-strings/trunk

« back to all changes in this revision

Viewing changes to reference/bionic-a15/memcmp.S

Committer: Will Newton
Date: 2013-04-30 14:31:08 UTC
Revision ID: will.newton@linaro.org-20130430143108-ww31c741wek8dnus

Split bionic reference code into A15 and A9 versions.

files added:
reference/bionic-a15

reference/bionic-a15/memcmp.S

reference/bionic-a15/memcpy.S

reference/bionic-a15/memset.S

reference/bionic-a15/strcmp.S

reference/bionic-a15/strcpy.S

reference/bionic-a15/strlen.c

reference/bionic-a9

reference/bionic-a9/memcmp.S

reference/bionic-a9/memcpy.S

reference/bionic-a9/memset.S

reference/bionic-a9/strcmp.S

reference/bionic-a9/strcpy.S

reference/bionic-a9/strlen.c

files removed:
reference/bionic

reference/bionic/memcmp.S

reference/bionic/memcpy.S

reference/bionic/memset.S

reference/bionic/strcmp.S

reference/bionic/strcpy.S

reference/bionic/strlen.c

files modified:
Makefile.am

Show diffs side-by-side

added added

removed removed

reference/bionic-a15/memcmp.S

/*
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef HAVE_32_BYTE_CACHE_LINE

#define CACHE_LINE_SIZE 32

#else

#define CACHE_LINE_SIZE 64

#endif

/*
 * Optimized memcmp() for Cortex-A9.
 *
 * NOTE(review): this copy lives under reference/bionic-a15 but the banner
 * still says Cortex-A9 -- confirm which core it is meant to describe.
 *
 * In:    r0 = first buffer, r1 = second buffer, r2 = number of bytes
 * Out:   r0 = 0 when no difference is found, otherwise
 *             (byte from first buffer) - (byte from second buffer)
 *             for the first pair of bytes that differ
 * Uses:  r1, r2, r3, ip and the flags are overwritten.
 *        r4 and lr are pushed/popped around the word-at-a-time paths;
 *        r5-r7 are pushed/popped around the "offset is 1 or 3" path.
 *        d0-d7 are overwritten when the NEON path is compiled in.
 *
 * Numeric local labels are reused throughout: "Nf" binds to the next
 * definition of N after the branch and "Nb" to the previous one, so the
 * position of every label matters.
 */

        .text

        .globl      memcmp
        .type       memcmp,%function
memcmp:
        .fnstart
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where the buffers are the same
         * (a length of 0 is handled at label 10 below)
         */
        cmp         r0, r1
        moveq       r0, #0
        bxeq        lr

        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes, this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         10f

/*
 * Neon optimization
 * Comparing 32 bytes at a time
 */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        /* r2 is biased by -32 while inside this section */
        subs        r2, r2, #32
        blo         3f

        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Start subtracting the values and merge results */
        vsub.i8     q0, q2
        vsub.i8     q1, q3
        vorr        q2, q0, q1
        vorr        d4, d5
        vmov        r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs        r3, ip
        bne         2f
        subs        r2, r2, #32
        bhs         1b
        b           3f

2:      /* Check if the difference was in the first or last 16 bytes.
         * This label must sit here: without it "bne 2f" above would bind
         * to the later "2:" in the word loop, which pops {r4, lr} that
         * have not been pushed yet on this path.
         */
        sub         r0, #32
        vorr        d0, d1
        sub         r1, #32
        vmov        r3, ip, d0
        orrs        r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt        eq
        subeq       r2, #16
        addeq       r0, #16
        addeq       r1, #16

3:      /* fix-up the remaining count (undo the -32 bias) */
        add         r2, r2, #32

        cmp         r2, #(8+4)
        bmi         10f
#endif

        .save       {r4, lr}
        /* save registers */
        stmfd       sp!, {r4, lr}

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer: compare r3 bytes one at a time */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f

        /* Each step loads the next word of both buffers and compares the
         * first-buffer word against the second-buffer word loaded one step
         * earlier; ip and lr alternate as the read-ahead register.  Every
         * step after the first is predicated on "eq", so the chain stops
         * doing work at the first differing word.
         */
0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

10:     /* process less than 12 bytes (nothing has been pushed here) */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr

5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

        /* ip is assembled from the top half of the previous aligned word
         * and the bottom half of the next one, then compared with the
         * word read from r4
         */
6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = rhs
        // r6 = lhs
        // r7 = scratch

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        /* r6, lsr #3 is the left shift expressed in bytes */
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
        .fnend
        .size       memcmp, .-memcmp

Older »