~linaro-toolchain-dev/cortex-strings/trunk : revision 95

1

2

3

4

Redistribution and use in source and binary forms, with or without

5

modification, are permitted provided that the following conditions are met:

6

* Redistributions of source code must retain the above copyright

7

notice, this list of conditions and the following disclaimer.

8

* Redistributions in binary form must reproduce the above copyright

9

notice, this list of conditions and the following disclaimer in the

10

documentation and/or other materials provided with the distribution.

11

* Neither the name of the Linaro nor the

12

names of its contributors may be used to endorse or promote products

13

derived from this software without specific prior written permission.

14

15

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

16

"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

17

LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

18

A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

19

HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

20

SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

21

LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

22

DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

23

THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

24

(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

25

OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

26

27

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

31

32

/* def_fn NAME [P2ALIGN]

   Open the definition of a global function called NAME:
     - switch to the .text section,
     - align to 2^P2ALIGN bytes (P2ALIGN defaults to 0, i.e. no extra
       alignment is requested),
     - export NAME and give it the ELF symbol type %function,
     - emit the label NAME at the current location.

   The macro does not emit a matching .size directive; the code that
   uses it is expected to do that after the function body.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

39

40

/* 64-bit constants with the same byte value replicated in every byte
   lane.  REP8_01 and REP8_7f drive the word-at-a-time NUL test
   (X - REP8_01) & ~(X | REP8_7f).  REP8_80 is not referenced by the
   code visible in this file.  */
#define REP8_01		0x0101010101010101
#define REP8_7f		0x7f7f7f7f7f7f7f7f
#define REP8_80		0x8080808080808080

/* Parameters and result.  */
#define src1		x0	/* First string pointer.  */
#define src2		x1	/* Second string pointer.  */
#define limit		x2	/* Maximum number of bytes to compare.  */
#define result		x0	/* Return value (shares x0 with src1).  */

/* Internal variables.  */
#define data1		x3	/* Current dword/byte loaded from src1.  */
#define data1w		w3	/* 32-bit view of data1.  */
#define data2		x4	/* Current dword/byte loaded from src2.  */
#define data2w		w4	/* 32-bit view of data2.  */
#define has_nul		x5	/* Non-zero if data1 contains a NUL byte.  */
#define diff		x6	/* data1 ^ data2.  */
#define syndrome	x7	/* diff | has_nul.  */
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11	/* Holds REP8_01.  */
#define pos		x12	/* Bit position of the first event.  */
#define limit_wd	x13	/* Limit expressed in dwords.  */
#define mask		x14	/* Mask of bytes beyond the limit.  */
#define endloop		x15	/* Non-zero when the main loop must stop.  */

66

67

/* int strncmp (const char *s1, const char *s2, size_t n)

   In:     src1 (x0) = s1, src2 (x1) = s2, limit (x2) = n.
   Out:    result (x0): zero if the first n bytes (or everything up to
	   and including the first NUL) are equal, otherwise negative or
	   positive according to the first differing bytes compared as
	   unsigned chars.
   Uses:   x3-x15 and the condition flags as scratch; no stack.

   Strategy:
     - n == 0                      -> return 0.
     - s1 and s2 differ in their low three address bits
				   -> byte-by-byte loop (.Lmisaligned8).
     - same alignment, not 8-byte aligned
				   -> round both pointers down, force the
				      bytes before the start to 0xff in
				      both dwords, then join the main loop
				      (.Lmutual_align).
     - both 8-byte aligned         -> main loop, one dword per iteration.

   All limit arithmetic is done so that it cannot wrap, which matters
   for callers passing a very large n such as (size_t) -1.  */

	.text
	.p2align 6
	/* 6 padding instructions + the 10 instructions from the entry
	   point to .Lloop_aligned = 64 bytes, so the loop starts on a
	   cache-line boundary.  */
	.rep 6
	nop	/* Pad so that the loop below fits a cache line.  */
	.endr
def_fn strncmp
	cbz	limit, .Lret0
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	b.ne	.Lmisaligned8
	ands	tmp1, src1, #7
	b.ne	.Lmutual_align
	/* limit_wd = ceil (limit / 8), computed as ((limit - 1) >> 3) + 1
	   so that it cannot wrap; limit is known to be non-zero here.  */
	sub	limit_wd, limit, #1
	lsr	limit_wd, limit_wd, #3
	add	limit_wd, limit_wd, #1

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	/* Start of performance-critical section -- one 64B cache line.  */
.Lloop_aligned:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned:
	subs	limit_wd, limit_wd, #1		/* Z set on the last dword.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq	/* Only tested if no NUL found.  */
	b.eq	.Lloop_aligned
	/* End of performance-critical section -- one 64B cache line.  */

	/* Not reached the limit, must have found the end or a diff.  */
	cbnz	limit_wd, .Lnot_limit

	/* Limit % 8 == 0 => all bytes significant.  */
	ands	limit, limit, #7
	b.eq	.Lnot_limit

	/* Only the first (limit % 8) bytes of this dword count.  Clear
	   the remaining bytes in both operands.  */
	lsl	limit, limit, #3	/* Bytes -> bits.  */
	mov	mask, #~0
#ifdef __AARCH64EB__
	lsr	mask, mask, limit
#else
	lsl	mask, mask, limit
#endif
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

.Lnot_limit:
	orr	syndrome, diff, has_nul

#ifndef __AARCH64EB__
	rev	syndrome, syndrome
	rev	data1, data1
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#else
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, .Lbe_has_nul
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo	/* -1, 0 or 1.  */
	ret
.Lbe_has_nul:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

.Lmutual_align:
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.  On entry tmp1 holds
	   src1 & 7, the number of bytes being stepped back over.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	/* Adjust the limit for the extra bytes, saturating at all-ones
	   instead of wrapping when the caller's limit is close to the
	   maximum.  The flags are not live here.  */
	adds	limit, limit, tmp1
	csinv	limit, limit, xzr, lo
	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
	ldr	data1, [src1], #8
	neg	tmp1, tmp1		/* Bits to alignment -64.  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#endif
	/* limit_wd = ((limit - 1) >> 3) + 1, as on the aligned path.  */
	sub	limit_wd, limit, #1
	orr	data1, data1, tmp2	/* Leading bytes become 0xff in  */
	orr	data2, data2, tmp2	/* both: never NUL, never differ.  */
	lsr	limit_wd, limit_wd, #3
	add	limit_wd, limit_wd, #1
	b	.Lstart_realigned

.Lret0:
	mov	result, #0
	ret

	.p2align 6
.Lmisaligned8:
	/* The pointers can never be co-aligned: compare one byte at a
	   time.  limit is pre-decremented so that the SUBS below borrows
	   (clears C) while processing the last permitted byte.  */
	sub	limit, limit, #1
.Lbyte_loop:
	/* Perhaps we can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1	/* C clear => this was the last byte.  */
	/* If bytes remain, C := (data1 != 0); otherwise NZCV = 0b0000.  */
	ccmp	data1w, #1, #0, cs	/* NZCV = 0b0000.  */
	/* If still going, Z := (data1 == data2); otherwise NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	.Lbyte_loop
	sub	result, data1, data2
	ret
	.size strncmp, . - strncmp