/* ~linaro-toolchain-dev/cortex-strings/trunk : revision 94

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses are supported
 *
 */

/* def_fn f [, p2align]
 *
 * Opens the definition of a global function symbol:
 *   - switches to the .text section,
 *   - aligns the location counter to 2^p2align bytes (p2align defaults
 *     to 0, i.e. no extra alignment),
 *   - exports \f with .global and marks it as a function with .type,
 *   - places the label \f at the current location.
 *
 * The macro does not emit a matching .size directive; the user of the
 * macro has to provide one after the function body.
 */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* Register aliases (expanded by the C preprocessor).
 *
 * Every register named here lies in x0-x14, which AAPCS64 treats as
 * caller-saved, so code using these aliases does not need to save or
 * restore them.
 */

/* Parameters and result. */
#define dstin	x0	/* Destination pointer as passed in. */
#define src	x1	/* Source pointer. */
#define count	x2	/* Number of bytes to move. */

/* Scratch registers; the "w" names are the 32-bit views of the same
 * registers, used for sub-64-bit loads, stores and compares. */
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5

/* Working copy of the destination pointer. */
#define dst	x6

/* Four 16-byte data buffers, each a low/high register pair suitable
 * for ldp/stp. */
#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

/* memmove
 *
 * Register roles on entry (see the #defines of dstin/src/count):
 *   dstin (x0)  start of the destination; never written by this code,
 *               so it still holds the destination pointer at every ret
 *   src   (x1)  start of the source
 *   count (x2)  number of bytes to move
 *
 * Strategy:
 *   dstin <  src          -> .Ldownwards: branch to memcpy when
 *                            dstin <= src - 16, otherwise copy in
 *                            ascending address order
 *   dstin >= src + count  -> branch to memcpy (buffers do not overlap)
 *   otherwise             -> copy in descending address order, starting
 *                            one byte past the end of both buffers
 *
 * In both directions the bulk of the data is moved 64 bytes at a time
 * with ldp/stp after SRC has been brought to 16-byte alignment, and the
 * remaining 0-63 bytes are handled by the .Ltail63* / .Ltail15* blocks.
 *
 * Writes src, count, tmp1, tmp2, dst, A_l..D_h and the condition flags.
 * The stack is not used.
 *
 * NOTE(review): count is tested with signed conditions (b.ge / b.le),
 * so a count with bit 63 set is treated as a short move.  Confirm that
 * callers never pass such a value.
 */
def_fn memmove, 6
	cmp	dstin, src
	b.lo	.Ldownwards
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap.  */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	.Lmov_not_short_up

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63up:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15up
	sub	dst, dst, tmp1
	sub	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f		/* Exactly 32 bytes: two 16-byte chunks.  */
	b.lt	2f		/* 16 bytes: one chunk.  */
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
.Ltail15up:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  Each set bit of count[3:0] selects one 8/4/2/1
	 * byte transfer.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	ret

.Lmov_not_short_up:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* There may be fewer than 64 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63up
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_up
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  All four loads are issued before the first store.
	 * Subtracting 128 left the bottom 6 bits of count unchanged, which
	 * is all the tail code looks at.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_up:
	/* There are at least 128 bytes to move.  Pre-load the first 64
	 * bytes; the loop then stores one 64-byte group while loading the
	 * next.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	/* Store the 64 bytes that are still held in registers.  */
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret


.Ldownwards:
	/* For a downwards move we can safely use memcpy provided that
	 * DST is more than 16 bytes away from SRC.  */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	memcpy		/* May overlap, but not critically.  */

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	cmp	count, #64
	b.ge	.Lmov_not_short_down

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63down:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15down
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f		/* Exactly 32 bytes: two 16-byte chunks.  */
	b.lt	2f		/* 16 bytes: one chunk.  */
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
.Ltail15down:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  Each set bit of count[3:0] selects one 8/4/2/1
	 * byte transfer.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lmov_not_short_down:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be fewer than 64 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63down
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_down
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  All four loads are issued before the first store.
	 * Subtracting 128 left the bottom 6 bits of count unchanged, which
	 * is all the tail code looks at.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	/* add does not alter the flags set by tst above.  */
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63down
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_down:
	/* There are at least 128 bytes to move.  Pre-load the first 64
	 * bytes; the loop then stores one 64-byte group while loading the
	 * next.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	/* Store the 64 bytes that are still held in registers, then undo
	 * the 16-byte pre-bias on both pointers before entering the tail.  */
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63down
	ret
	.size	memmove, . - memmove