/* ~linaro-toolchain-dev/cortex-strings/trunk : revision 89

Redistribution and use in source and binary forms, with or without

5

modification, are permitted provided that the following conditions are met:

6

* Redistributions of source code must retain the above copyright

7

notice, this list of conditions and the following disclaimer.

8

* Redistributions in binary form must reproduce the above copyright

9

notice, this list of conditions and the following disclaimer in the

10

documentation and/or other materials provided with the distribution.

11

* Neither the name of the Linaro nor the

12

names of its contributors may be used to endorse or promote products

13

derived from this software without specific prior written permission.

14

15

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

16

"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

17

LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

18

A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

19

HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

20

SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

21

LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

22

DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

23

THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

24

(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

25

OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

26

27

/* Assumptions:

28

*

29

* ARMv8-a, AArch64

30

* Unaligned accesses

31

*

32

*/

33

34

35

/* By default we assume that the DC instruction can be used to zero

36

data blocks more efficiently. In some circumstances this might be

37

unsafe, for example in an asymmetric multiprocessor environment with

38

different DC clear lengths (neither the upper nor lower lengths are

39

safe to use). The feature can be disabled by defining DONT_USE_DC.

40

41

If code may be run in a virtualized environment, then define

42

MAYBE_VIRT. This will cause the code to cache the system register

43

values rather than re-reading them each call. */

44

45

/* Register aliases used by memset.  Only x0-x9 are used, all of which
 * are caller-saved under AAPCS64, so nothing has to be spilled.  The
 * "w" names are the 32-bit views of the matching "x" registers.  */

/* Incoming arguments. */
#define dstin		x0	/* destination pointer; also the return value */
#define val		w1	/* fill value; only the low 8 bits are used */
#define count		x2	/* number of bytes still to set */

/* Scratch registers. */
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4

/* DC ZVA bookkeeping. */
#define zva_len_x	x5	/* ZVA block length in bytes (64-bit view) */
#define zva_len		w5	/* ZVA block length in bytes (32-bit view) */
#define zva_bits_x	x6	/* zva_len_x - 1, used as an alignment mask */

/* Fill pattern and working pointer. */
#define A_l		x7	/* fill byte replicated into all 8 bytes */
#define A_lw		w7	/* low 32 bits of the fill pattern */
#define dst		x8	/* working destination; dstin stays intact */
#define tmp3w		w9	/* extra 32-bit scratch */

60

61

62

/* def_fn NAME [p2align=N]
 *
 * Start the definition of a global function NAME: switch to .text,
 * align the location counter to 2^N bytes (N defaults to 0, i.e. no
 * extra alignment), export the symbol, mark it as a function symbol
 * and emit its label.  The function body follows the macro use.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

69

70

/* void *memset(void *dstin, int val, size_t count)
 *
 * ABI:      AAPCS64, leaf function, no stack use.
 * In:       dstin (x0) = destination
 *           val   (w1) = fill value, only the low 8 bits are used
 *           count (x2) = number of bytes to set; treated as UNSIGNED,
 *                        so every length test uses hs/lo/ls rather
 *                        than the signed ge/lt/le conditions.
 * Out:      x0 = dstin, untouched (all stores go through dst).
 * Clobbers: x2-x9 and the condition flags.
 *
 * Strategy: replicate the fill byte into A_l, then
 *   - count <= 15 : bit-tested 8/4/2/1-byte stores (.Ltail15tiny);
 *   - count <= 63 : 16-byte pair stores plus one overlapping final
 *                   store (.Ltail63 / .Ltail15);
 *   - otherwise   : align dst to 16 bytes and run a 64-byte store
 *                   loop (.Lnot_short), finishing in .Ltail63.
 * A zero fill of 128 bytes or more additionally tries DC ZVA
 * (.Lzero_mem / .Lzero_by_line) unless DONT_USE_DC is defined.
 * With MAYBE_VIRT the decoded DCZID_EL0 result is cached in the
 * 4-byte .bss word .Lcache_clear (0 = not probed yet, bit 31 set =
 * ZVA unavailable, otherwise the block length in bytes).  The cache
 * is accessed with plain loads/stores; every writer stores a value
 * derived from the same system register read.  */
def_fn memset p2align=6

	mov	dst, dstin		/* Preserve return value. */
	ands	A_lw, val, #255		/* Z is set when the fill byte is 0. */
#ifndef DONT_USE_DC
	b.eq	.Lzero_mem
#endif
	/* Replicate the byte into all eight bytes of A_l. */
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
	cmp	count, #64
	b.hs	.Lnot_short		/* Unsigned: count >= 64. */
.Ltail_maybe_tiny:
	cmp	count, #15
	b.ls	.Ltail15tiny		/* Unsigned: count <= 15. */
.Ltail63:
	/* Only bits 5:0 of count are examined from here on, so this code
	 * also works when count has been biased below zero by the loops
	 * further down.  tmp1 is 0, 16, 32 or 48.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

.Ltail15:
	/* The 16 bytes before dst have already been written on every path
	 * that reaches this label, so the last 0..15 bytes are finished
	 * with one store that overlaps what is already set.  */
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
	ret

.Ltail15tiny:
	/* Set up to 15 bytes.  Does not assume earlier memory
	   being set.  Each bit of count selects one store size.  */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	tbz	count, #0, 1f
	strb	A_lw, [dst]
1:
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 * (The block below is exactly 16 instructions.)  */
	.p2align 6
.Lnot_short:
	/* Precondition on every entry: count >= 64 (unsigned). */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15		/* Bytes needed to reach 16-byte alignment. */
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment.  */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be fewer than 64 bytes to go now.  */
	cmp	count, #63
	b.ls	.Ltail63
2:
	sub	dst, dst, #16		/* Pre-bias. */
	sub	count, count, #64	/* count >= 64 here, so no wrap-around. */
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.hs	1b			/* No borrow: another full 64 bytes remain. */
	/* count has wrapped by exactly 64, so its low six bits still hold
	 * the number of bytes left to set.  */
	tst	count, #0x3f
	add	dst, dst, #16		/* Undo the pre-bias. */
	b.ne	.Ltail63
	ret

#ifndef DONT_USE_DC
	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.  */
.Lzero_mem:
	mov	A_l, #0
	cmp	count, #63
	b.ls	.Ltail_maybe_tiny
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	/* Align dst to 16 bytes in the same way as .Lnot_short does. */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.ls	.Ltail63
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.  */
	cmp	count, #128
	b.lo	.Lnot_short
#ifdef MAYBE_VIRT
	/* For efficiency when virtualized, we cache the ZVA capability.  */
	adrp	tmp2, .Lcache_clear
	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
	tbnz	zva_len, #31, .Lnot_short	/* Cached: ZVA unavailable. */
	cbnz	zva_len, .Lzero_by_line		/* Cached: block length. */
	mrs	tmp1, dczid_el0
	tbz	tmp1, #4, 1f			/* Bit 4 clear: DC ZVA permitted. */
	/* ZVA not available.  Remember this for next time.  */
	mov	zva_len, #~0
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
	b	.Lnot_short
1:
	/* Block length in bytes = 4 << DCZID_EL0[3:0]. */
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short	/* Bit 4 set: DC ZVA prohibited. */
	/* Block length in bytes = 4 << DCZID_EL0[3:0]. */
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
#endif

.Lzero_by_line:
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.  */
	cmp	count, zva_len_x
	b.lo	.Lnot_short		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x	/* Bytes up to the next ZVA block boundary. */
	b.eq	1f			/* Already aligned.  */
	/* Not aligned, check that there's enough to zero after alignment.
	 * Fall back to .Lnot_short when tmp1 < 64 or tmp1 < zva_len: if the
	 * first compare yields "lo", CCMP forces NZCV=0b0000 (C clear), which
	 * also reads as "lo".  */
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #0, hs
	b.lo	.Lnot_short
	/* We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.  */
	mov	count, tmp1
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b			/* tmp2 is small and signed on purpose. */
	/* We've overrun a bit, so adjust dst downwards (tmp2 is < 0).  */
	add	dst, dst, tmp2
1:
	sub	count, count, zva_len_x	/* count >= zva_len here, so no wrap-around. */
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.hs	3b			/* No borrow: another full block remains. */
	/* count has wrapped by exactly zva_len; masking recovers the
	 * remainder, which is filled by the generic code (A_l is zero).  */
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
#endif /* DONT_USE_DC */

	.size	memset, . - memset

#if !defined(DONT_USE_DC) && defined(MAYBE_VIRT)
	/* Cached DC ZVA state: 0 = not probed, bit 31 set = unavailable,
	 * otherwise the ZVA block length in bytes.  */
	.bss
	.p2align 2
.Lcache_clear:
	.space 4
#endif