~linaro-toolchain-dev/cortex-strings/trunk

« back to all changes in this revision

Viewing changes to src/aarch64/strnlen.S

Committer: Matthew Gretton-Dann
Date: 2013-01-07 16:59:29 UTC
mfrom: (96.1.3 aarch64-additions-2)
Revision ID: matthew.gretton-dann@linaro.org-20130107165929-2p27azr2amdypgnn

Merge further AArch64 optimised routines.

files added:
src/aarch64/memcmp.S

src/aarch64/strnlen.S

tests/test-strnlen.c

files modified:
Makefile.am

Show diffs side-by-side

added added

removed removed

src/aarch64/strnlen.S

/* strnlen - calculate the length of a string with limit.

Redistribution and use in source and binary forms, with or without

modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright

notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright

notice, this list of conditions and the following disclaimer in the

documentation and/or other materials provided with the distribution.

* Neither the name of the Linaro nor the

names of its contributors may be used to endorse or promote products

derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

/* Arguments and results. */

/* srcin is the incoming string pointer.  len names the same register
   (x0), so writing the result overwrites the original pointer; srcin
   must not be read after len has been written.  */
#define srcin x0

#define len x0

/* Maximum number of bytes to examine; also the value returned when no
   NUL is found within that many bytes.  */
#define limit x1

/* Locals and temporaries. */

/* Read pointer: srcin rounded down to a 16-byte boundary, advanced by
   16 on every ldp.  */
#define src x2

/* The two Dwords fetched by each ldp.  */
#define data1 x3

#define data2 x4

/* data2 ORed with the leading-byte mask (misaligned entry only).  */
#define data2a x5

/* NUL syndromes for data1 and data2: non-zero iff the Dword holds a
   zero byte.  */
#define has_nul1 x6

#define has_nul2 x7

/* General scratch registers.  */
#define tmp1 x8

#define tmp2 x9

#define tmp3 x10

#define tmp4 x11

/* Holds REP8_01 for the duration of the routine.  */
#define zeroones x12

/* Leading-zero bit count of the byte-reversed syndrome.  */
#define pos x13

/* Loop counter, in units of 16-byte blocks.  */
#define limit_wd x14

/* def_fn NAME [p2align=N]
   Open a function: export NAME, mark it as a function symbol, switch to
   .text, align to 2^N bytes (default N = 0) and emit the label NAME.  */
	.macro def_fn f p2align=0
	.global \f
	.type \f, %function
	.text
	.p2align \p2align
\f:
	.endm

/* Byte patterns replicated across all eight bytes of a Dword, used by
   the parallel NUL test (X - 1) & ~(X | 0x7f).  */

/* 0x01 in every byte: the "1" subtracted from each byte.  */
#define REP8_01 0x0101010101010101

/* 0x7f in every byte: ORed into X so only bit 7 of each byte survives
   the final bit-clear.  */
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* 0x80 in every byte.  Not referenced by the code visible in this file.  */
#define REP8_80 0x8080808080808080

/* size_t strnlen (const char *s, size_t maxlen)

   In:       srcin (x0) = s, limit (x1) = maxlen.
   Out:      len (x0)   = MIN (offset of the first NUL in s, maxlen).
   Clobbers: x2-x14 and the condition flags.  No stack is used.

   The string is read 16 bytes at a time from 16-byte aligned
   addresses, so a load can touch bytes before s and bytes after the
   NUL / limit that lie in the same aligned 16-byte block.

   limit_wd counts 16-byte blocks as "blocks to examine minus one";
   the loop stops when the counter goes negative.  It is derived from
   (limit - 1) rather than (limit + 15) so that limits close to the
   top of the address range (e.g. SIZE_MAX) cannot wrap around and
   cut the scan short.  */

	.text
	.p2align 6
.Lstart:
	/* Pre-pad to ensure critical loop begins an icache line.  */
	.rep 7
	nop
	.endr
	/* Put this code here to avoid wasting more space with pre-padding.  */
.Lhit_limit:
	mov	len, limit
	ret

def_fn strnlen
	cbz	limit, .Lhit_limit
	mov	zeroones, #REP8_01
	bic	src, srcin, #15
	ands	tmp1, srcin, #15	/* tmp1 = misalignment (0..15).  */
	b.ne	.Lmisaligned
	/* Number of full and partial Qwords, minus one.  */
	sub	limit_wd, limit, #1	/* Limit is never zero here.  */
	lsr	limit_wd, limit_wd, #4	/* Bytes -> Qwords.  */

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */

	/* The inner loop deals with two Dwords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */

	/* Start of critical section -- keep to one 64Byte cache line.  */
.Lloop:
	ldp	data1, data2, [src], #16
.Lrealigned:
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	subs	limit_wd, limit_wd, #1
	orr	tmp1, has_nul1, has_nul2
	/* Counter still >= 0: compare the combined syndrome with zero.
	   Counter went negative: force NZCV = 0000 (i.e. "ne") so that
	   the loop terminates.  */
	ccmp	tmp1, #0, #0, pl
	b.eq	.Lloop
	/* End of critical section -- keep to one 64Byte cache line.  */

	orr	tmp1, has_nul1, has_nul2
	cbz	tmp1, .Lhit_limit	/* No null in final Qword.  */

	/* We know there's a null in the final Qword.  The easiest thing
	   to do now is work out the length of the string and return
	   MIN (len, limit).  */

	sub	len, src, srcin		/* src is one Qword past the hit.  */
	cbz	has_nul1, .Lnul_in_data2
#ifdef __AARCH64EB__
	mov	data2, data1
#endif
	sub	len, len, #8
	mov	has_nul2, has_nul1
.Lnul_in_data2:
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	rev	data2, data2
	sub	tmp1, data2, zeroones
	orr	tmp2, data2, #REP8_7f
	bic	has_nul2, tmp1, tmp2
#endif
	sub	len, len, #8
	rev	has_nul2, has_nul2
	clz	pos, has_nul2
	add	len, len, pos, lsr #3	/* Bits to bytes.  */
	cmp	len, limit
	csel	len, len, limit, ls	/* Return the lower value.  */
	ret

.Lmisaligned:
	/* Deal with a partial first Qword.  Two things are interleaved:
	   1) Compute the Qword counter (limit - 1 + tmp1) >> 4 without
	      overflowing when limit is close to the maximum value: the
	      high part ((limit - 1) >> 4) and the low part
	      (((limit - 1) & 15) + tmp1) >> 4 are formed separately and
	      then added.
	   2) Load the first Qword and force the bytes that precede the
	      string to 0xff so that they can never look like a NUL.
	   The flags set by the cmp below must survive until the
	   csinv/csel pair; none of the instructions in between write
	   them.  */
	sub	limit_wd, limit, #1
	neg	tmp4, tmp1
	cmp	tmp1, #8

	and	tmp3, limit_wd, #15
	lsr	limit_wd, limit_wd, #4
	mov	tmp2, #~0

	ldp	data1, data2, [src], #16
	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
	add	tmp3, tmp3, tmp1

#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp4	/* Shift (tmp4 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp4	/* Shift (tmp4 & 63).  */
#endif
	add	limit_wd, limit_wd, tmp3, lsr #4

	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2

	/* Misalignment <= 8: the string starts in data1, so keep the
	   masked data1 and the untouched data2.  Otherwise data1 lies
	   wholly before the string (force it to all-ones) and data2
	   needs the mask.  */
	csinv	data1, data1, xzr, le
	csel	data2, data2, data2a, le
	b	.Lrealigned

	.size	strnlen, . - .Lstart	/* Include pre-padding in size.  */

Older »