~linaro-toolchain-dev/cortex-strings/trunk

« back to all changes in this revision

Viewing changes to reference/bionic/memcmp.S

Committer: Will Newton
Date: 2013-04-30 14:31:08 UTC
Revision ID: will.newton@linaro.org-20130430143108-ww31c741wek8dnus

Split bionic reference code into A15 and A9 versions.

files added:
reference/bionic-a15

reference/bionic-a15/memcmp.S

reference/bionic-a15/memcpy.S

reference/bionic-a15/memset.S

reference/bionic-a15/strcmp.S

reference/bionic-a15/strcpy.S

reference/bionic-a15/strlen.c

reference/bionic-a9

reference/bionic-a9/memcmp.S

reference/bionic-a9/memcpy.S

reference/bionic-a9/memset.S

reference/bionic-a9/strcmp.S

reference/bionic-a9/strcpy.S

reference/bionic-a9/strlen.c

files removed:
reference/bionic

reference/bionic/memcmp.S

reference/bionic/memcpy.S

reference/bionic/memset.S

reference/bionic/strcmp.S

reference/bionic/strcpy.S

reference/bionic/strlen.c

files modified:
Makefile.am

Show diffs side-by-side

added added

removed removed

reference/bionic/memcmp.S

* Redistribution and use in source and binary forms, with or without

* modification, are permitted provided that the following conditions

* are met:

* * Redistributions of source code must retain the above copyright

* notice, this list of conditions and the following disclaimer.

* * Redistributions in binary form must reproduce the above copyright

* notice, this list of conditions and the following disclaimer in

* the documentation and/or other materials provided with the

* distribution.

* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS

* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED

* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,

* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT

* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

* SUCH DAMAGE.

.global memcmp

.type memcmp, %function

.text

/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 16 bytes at a time
 * (2) The loads are scheduled in a way they won't stall
 */

/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * Syntax: GNU as, ARM (32-bit), pre-UAL mnemonics (e.g. "eoreqs").
 * In:     r0 = s1, r1 = s2, r2 = n
 * Out:    r0 = 0 when the first n bytes are equal, otherwise
 *              (byte of s1) - (byte of s2) at the first differing position
 * Saved:  r4, lr (always); r5-r7 (odd relative offset path only)
 * Clobb:  r1, r2, r3, ip, flags
 *
 * Register roles after the prologue:
 *     r4 = current position in s1        r1 = current position in s2
 *     r2 = byte count (biased inside the word loops, see each loop)
 *     r0, ip, lr = data scratch (lr is free because it is saved on entry)
 *
 * Strategy:
 *     - fewer than 12 bytes: plain byte loop
 *     - otherwise s1 is first aligned to a word boundary, then
 *         - s2 word aligned too (congruent): 32 bytes per iteration
 *         - s2 off by 2: 16 bytes per iteration, words rebuilt with
 *           16-bit shifts
 *         - s2 off by 1 or 3: 8 bytes per iteration, words rebuilt with
 *           variable shifts
 *     - any word mismatch is re-examined bytewise so the return value is
 *       always a byte difference.
 *
 * The word rebuilding (earlier word shifted right, later word shifted
 * left) matches little-endian byte order.  In the non-congruent paths s2
 * is read with aligned word loads, so bytes just outside [s2, s2 + n) but
 * inside the same aligned words may be read.
 */
memcmp:
        .fnstart
        pld     [r0, #0]
        pld     [r1, #0]

        /* Same pointer or zero length: the buffers compare equal. */
        cmp     r0, r1
        cmpne   r2, #0
        moveq   r0, #0
        bxeq    lr

        /* Save registers. */
        .save   {r4, lr}
        stmfd   sp!, {r4, lr}

        pld     [r0, #32]
        pld     [r1, #32]

        /* r0 is needed for the result, so walk s1 through r4 instead. */
        mov     r4, r0

        /*
         * Require at least 8+4 bytes for the word paths; this simplifies
         * the code below and avoids overhead for small blocks.
         */
        cmp     r2, #(8+4)
        bmi     .Lmemcmp_bytes

        /* r3 = -s1 & 3 = number of bytes needed to word-align s1. */
        rsb     r3, r4, #0
        ands    r3, r3, #3
        beq     .Lmemcmp_lhs_aligned

        /* Compare bytewise until s1 is word aligned. */
        sub     r2, r2, r3
.Lmemcmp_align_lhs:
        ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        subs    r0, r0, ip
        bne     .Lmemcmp_return
        subs    r3, r3, #1
        bne     .Lmemcmp_align_lhs

.Lmemcmp_lhs_aligned:
        /*
         * s1 is word aligned here and at least 4 bytes remain.
         * Check whether s2 has the same alignment (congruent pointers).
         */
        eor     r0, r4, r1
        ands    r0, r0, #3
        bne     .Lmemcmp_noncongruent

        /*
         * Congruent case, 32 bytes per iteration.
         * The loop reads one word of s2 ahead, so it may only run while
         * at least 32+4 bytes remain; otherwise it could read past the
         * buffer.  Inside the loop r2 = remaining - (32 + 4).
         */
        ldr     ip, [r1]
        subs    r2, r2, #(32 + 4)
        bmi     .Lmemcmp_words_check

.Lmemcmp_loop32:
        pld     [r4, #64]
        pld     [r1, #64]
        /*
         * ip and lr alternate as "current" and "next" word of s2; every
         * step after the first only executes while all earlier words
         * matched.
         */
        ldr     r0, [r4], #4
        ldr     lr, [r1, #4]!
        eors    r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        bne     .Lmemcmp_word_differs
        subs    r2, r2, #32
        bhs     .Lmemcmp_loop32

.Lmemcmp_words_check:
        /* Do we have at least 4 bytes left?  Afterwards r2 = remaining - 4. */
        adds    r2, r2, #(32 - 4 + 4)
        bmi     .Lmemcmp_tail_check

        /* Finish off 4 bytes at a time. */
.Lmemcmp_word_loop:
        ldr     r0, [r4], #4
        ldr     ip, [r1], #4
        eors    r0, r0, ip
        bne     .Lmemcmp_word_differs
        subs    r2, r2, #4
        bhs     .Lmemcmp_word_loop

.Lmemcmp_tail_check:
        /* Are we done?  Undo the bias: r2 = bytes remaining (0..3). */
        adds    r2, r2, #4
        moveq   r0, #0
        beq     .Lmemcmp_return

        /* Finish off the remaining bytes. */
        b       .Lmemcmp_bytes

.Lmemcmp_word_differs:
        /* The last 4 bytes are different: step back and redo them bytewise. */
        sub     r4, r4, #4
        sub     r1, r1, #4
        mov     r2, #4

        /* Process the last few bytes; r2 = byte count, must be non-zero. */
.Lmemcmp_bytes:
        ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        /* stall */
        subs    r0, r0, ip
        bne     .Lmemcmp_return
        subs    r2, r2, #1
        bne     .Lmemcmp_bytes

.Lmemcmp_return:
        /* Restore registers and return; r0 already holds the result. */
        ldmfd   sp!, {r4, lr}
        bx      lr

.Lmemcmp_noncongruent:
        /*************** non-congruent case ***************/
        and     r0, r1, #3
        cmp     r0, #2
        bne     .Lmemcmp_offset_odd

        /* Here the offset is 2 (s2 is 16-bit aligned, special cased). */

        /*
         * Make sure we have at least 16 bytes to process; otherwise undo
         * the subtraction and use the byte loop.  Inside the loop
         * r2 = remaining - 16.
         */
        subs    r2, r2, #16
        addmi   r2, r2, #16
        bmi     .Lmemcmp_bytes

        /* Align the unaligned pointer and preload the first word of s2. */
        bic     r1, r1, #3
        ldr     lr, [r1], #4

.Lmemcmp_half_loop:
        pld     [r1, #64]
        pld     [r4, #64]
        /* ip = upper half of the previous s2 word | lower half of the next. */
        mov     ip, lr, lsr #16
        ldr     lr, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, lr, lsl #16
        eors    r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        bne     .Lmemcmp_half_differs
        subs    r2, r2, #16
        bhs     .Lmemcmp_half_loop

        /* r1 ran 2 bytes ahead of the true s2 position; put it back. */
        sub     r1, r1, #2
        /* Are we done? */
        adds    r2, r2, #16
        moveq   r0, #0
        beq     .Lmemcmp_return
        /* Finish off the remaining bytes. */
        b       .Lmemcmp_bytes

.Lmemcmp_half_differs:
        /*
         * Fix up the 2 pointers so they address the mismatching word
         * (s2: 4 bytes of word plus the 2 bytes of read-ahead), then
         * redo it bytewise.
         */
        sub     r1, r1, #(4+2)
        sub     r4, r4, #4
        mov     r2, #4
        b       .Lmemcmp_bytes

.Lmemcmp_offset_odd:
        /*************** offset is 1 or 3 (less optimized) ***************/

        /*
         * Three more scratch registers are needed.  This leaf routine
         * makes no calls while they are on the stack, and they are popped
         * again before every exit from this path.
         */
        stmfd   sp!, {r5, r6, r7}

        /*
         * r5 = right shift (bits) applied to the earlier s2 word
         * r6 = left shift (bits) applied to the later s2 word
         * r7 = most recently loaded aligned word of s2
         */
        mov     r5, r0, lsl #3          /* r5 = right shift = offset * 8 */
        rsb     r6, r5, #32             /* r6 = left shift = 32 - r5 */

        /* Align the unaligned pointer and preload the first word of s2. */
        bic     r1, r1, #3
        ldr     r7, [r1], #4
        /* At least 8 bytes remain here; inside the loop r2 = remaining - 8. */
        sub     r2, r2, #8

.Lmemcmp_odd_loop:
        mov     ip, r7, lsr r5
        ldr     r7, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, r7, lsl r6
        eors    r0, r0, ip
        moveq   ip, r7, lsr r5
        ldreq   r7, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, r7, lsl r6
        eoreqs  r0, r0, ip
        bne     .Lmemcmp_odd_differs
        subs    r2, r2, #8
        bhs     .Lmemcmp_odd_loop

        /* r1 ran (r6 / 8) = 4 - offset bytes ahead of the true s2 position. */
        sub     r1, r1, r6, lsr #3
        ldmfd   sp!, {r5, r6, r7}

        /* Are we done? */
        adds    r2, r2, #8
        moveq   r0, #0
        beq     .Lmemcmp_return

        /* Finish off the remaining bytes. */
        b       .Lmemcmp_bytes

.Lmemcmp_odd_differs:
        /*
         * Fix up the 2 pointers so they address the mismatching word,
         * release r5-r7 (r6 is still needed for the fix-up first), then
         * redo the word bytewise.
         */
        sub     r1, r1, #4
        sub     r1, r1, r6, lsr #3
        sub     r4, r4, #4
        mov     r2, #4
        ldmfd   sp!, {r5, r6, r7}
        b       .Lmemcmp_bytes
        .fnend
        .size   memcmp, . - memcmp

Older »