~linaro-toolchain-dev/cortex-strings/trunk

« back to all changes in this revision

Viewing changes to src/reference/bionic/memcmp.S

Committer: Michael Hope
Date: 2010-08-26 22:19:29 UTC
Revision ID: michael.hope@linaro.org-20100826221929-ppeg01mnpx34aqrp

Pulled in the initial versions

files added:

src/reference

src/reference/Makefile.am

src/reference/bionic

src/reference/bionic/memcmp.S

src/reference/bionic/memcmp16.S

src/reference/bionic/memcpy.S

src/reference/bionic/memset.S

src/reference/bionic/strlen.c

src/reference/configure.ac

src/reference/glibc

src/reference/glibc/memcpy.S

src/reference/glibc/memmove.S

src/reference/glibc/memset.S

src/reference/glibc/strlen.S

src/reference/helpers

src/reference/helpers/bounce.c

src/reference/helpers/spawn.c

src/reference/newlib

src/reference/newlib/arm_asm.h

src/reference/newlib/strcmp.c

src/reference/newlib/strcpy.c

src/reference/newlib/strlen.c

src/reference/plain

src/reference/plain/memcpy.c

src/reference/test.py

Show diffs side-by-side

added added

removed removed

src/reference/bionic/memcmp.S

/*
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>

.text

.global memcmp

.type memcmp, %function

.align 4

/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 32 bytes at a time when both pointers share
 *     the same word alignment (16 bytes at a time when they differ by 2)
 * (2) The loads are scheduled in a way they won't stall
 */

/*
 * memcmp -- compare two byte buffers.
 *
 * In:    r0 = first buffer
 *        r1 = second buffer
 *        r2 = number of bytes to compare
 * Out:   r0 = 0 when the pointers are equal, the length is 0, or no byte
 *             differs; otherwise (byte from first buffer) - (byte from
 *             second buffer) at the first differing position.
 * Uses:  r1, r2, r3 and ip as scratch. r4 and lr are pushed on entry and
 *        popped at label 9; r5-r7 are pushed and popped around the
 *        offset 1/3 path only.
 *
 * Numeric local labels are reused; "Nf"/"Nb" always refer to the nearest
 * definition of N forward/backward.
 */
memcmp:
        .fnstart
        PLD         (r0, #0)
        PLD         (r1, #0)

        /* take care of the case where length is 0 or the buffers are the same */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        .save {r4, lr}
        /* save registers */
        stmfd       sp!, {r4, lr}

        PLD         (r0, #32)
        PLD         (r1, #32)

        /* since r0 hold the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* make sure we have at least 8+4 bytes, this simplify things below
         * and avoid some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         8f

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer: compare r3 bytes one at a time */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b

0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */
        ldr         ip, [r1]                /* read-ahead word of the second buffer */
        subs        r2, r2, #(32 + 4)       /* r2 = bytes left - 36 */
        bmi         1f

        /* ip and lr alternate as the read-ahead register; each step is
         * skipped (condition EQ fails) once a word has differed.
         */
0:      PLD         (r4, #64)
        PLD         (r1, #64)
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)   /* r2 = bytes left - 4 */
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4              /* r2 = bytes left */
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr
        .fnend


5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

        /* each step rebuilds one word of the second buffer from the upper
         * half of the previous aligned word and the lower half of the next
         */
6:      PLD         (r1, #64)
        PLD         (r4, #64)
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2              /* undo the 2-byte read-ahead */
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        /* r5 = right shift amount in bits (8 * offset)
         * r6 = left shift amount in bits (32 - r5)
         * r7 = most recently loaded aligned word of the second buffer
         */

        mov         r5, r0, lsl #3          /* r5 = right shift */
        rsb         r6, r5, #32             /* r6 = left shift */

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8              /* r2 = bytes left - 8 */

        /* 8 bytes (two words) per iteration */
6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        sub         r1, r1, r6, lsr #3      /* undo the (4 - offset)-byte read-ahead */
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b

Older »