81
81
bic src, srcin, #15
82
82
ands tmp1, srcin, #15
84
add limit_wd, limit, #15
85
lsr limit_wd, limit_wd, #4
84
/* Calculate the number of full and partial words -1. */
85
sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
86
lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
86
88
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
87
89
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
88
90
can be done in parallel across the entire word. */
103
105
bic has_nul2, tmp3, tmp4
104
106
subs limit_wd, limit_wd, #1
105
107
orr tmp1, has_nul1, has_nul2
106
ccmp tmp1, #0, #0, ne /* NZCV = 0000 */
108
ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
108
110
/* End of critical section -- keep to one 64Byte cache line. */
144
add tmp3, limit, tmp1
146
/* Deal with a partial first word.
147
We're doing two things in parallel here;
148
1) Calculate the number of words (but avoiding overflow if
149
limit is near ULONG_MAX) - to do this we need to work out
150
limit + tmp1 - 1 as a 65-bit value before shifting it;
151
2) Load and mask the initial data words - we force the bytes
152
before the ones we are interested in to 0xff - this ensures
153
early bytes will not hit any zero detection. */
154
sub limit_wd, limit, #1
147
ldp data1, data2, [src], #16
148
add limit_wd, tmp3, #15
149
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
158
and tmp3, limit_wd, #15
159
lsr limit_wd, limit_wd, #4
151
lsr limit_wd, limit_wd, #4
162
ldp data1, data2, [src], #16
163
lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
152
166
#ifdef __AARCH64EB__
153
167
/* Big-endian. Early bytes are at MSB. */
154
lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
168
lsl tmp2, tmp2, tmp4 /* Shift (tmp4 & 63). */
156
170
/* Little-endian. Early bytes are at LSB. */
157
lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
171
lsr tmp2, tmp2, tmp4 /* Shift (tmp4 & 63). */
173
add limit_wd, limit_wd, tmp3, lsr #4
159
175
orr data1, data1, tmp2
160
176
orr data2a, data2, tmp2
161
178
csinv data1, data1, xzr, le
162
179
csel data2, data2, data2a, le