1
1
/* strstr with SSE4.2 intrinsics
2
Copyright (C) 2009 Free Software Foundation, Inc.
2
Copyright (C) 2009, 2010 Free Software Foundation, Inc.
3
3
Contributed by Intel Corporation.
4
4
This file is part of the GNU C Library.
82
83
5. failed string compare, go back to scanning
85
/* Fix-up of removal of unneeded data due to 16B aligned load
87
value: 16B data loaded from 16B aligned address.
88
offset: Offset of target data address relative to 16B aligned load
92
static __inline__ __m128i
93
__m128i_shift_right (__m128i value, int offset)
98
value = _mm_srli_si128 (value, 1);
101
value = _mm_srli_si128 (value, 2);
104
value = _mm_srli_si128 (value, 3);
107
value = _mm_srli_si128 (value, 4);
110
value = _mm_srli_si128 (value, 5);
113
value = _mm_srli_si128 (value, 6);
116
value = _mm_srli_si128 (value, 7);
119
value = _mm_srli_si128 (value, 8);
122
value = _mm_srli_si128 (value, 9);
125
value = _mm_srli_si128 (value, 10);
128
value = _mm_srli_si128 (value, 11);
131
value = _mm_srli_si128 (value, 12);
134
value = _mm_srli_si128 (value, 13);
137
value = _mm_srli_si128 (value, 14);
140
value = _mm_srli_si128 (value, 15);
146
86
/* Simple replacement of movdqu to address 4KB boundary cross issue.
147
87
If EOS occurs within less than 16B before 4KB boundary, we don't
148
88
cross to next page. */
151
__attribute__ ((section (".text.sse4.2")))
152
91
__m128i_strloadu (const unsigned char * p)
154
93
int offset = ((size_t) p & (16 - 1));
164
103
return _mm_loadu_si128 ((__m128i *) p);
167
#ifdef USE_AS_STRCASESTR
106
#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
169
108
/* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
173
__attribute__ ((section (".text.sse4.2")))
174
__m128i_strloadu_tolower_posix (const unsigned char * p)
110
static inline __m128i
111
__m128i_strloadu_tolower (const unsigned char *p, __m128i rangeuc,
176
114
__m128i frag = __m128i_strloadu (p);
178
/* Convert frag to lower case for POSIX/C locale. */
179
__m128i rangeuc = _mm_set_epi64x (0x0, 0x5a41);
180
__m128i u2ldelta = _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0);
181
__m128i mask1 = _mm_cmpistrm (rangeuc, frag, 0x44);
182
__m128i mask2 = _mm_blendv_epi8 (u2ldelta, frag, mask1);
183
mask2 = _mm_sub_epi8 (mask2, u2ldelta);
184
return _mm_blendv_epi8 (frag, mask2, mask1);
187
/* Similar to __m128i_strloadu. Convert to lower case for non-POSIX/C
191
__attribute__ ((section (".text.sse4.2")))
192
__m128i_strloadu_tolower (const unsigned char * p)
200
for (int i = 0; i < 16; i++)
207
u.b[i] = tolower (p[i]);
116
#define UCLOW 0x4040404040404040ULL
117
#define UCHIGH 0x5b5b5b5b5b5b5b5bULL
118
#define LCQWORD 0x2020202020202020ULL
119
/* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'. */
120
__m128i r2 = _mm_cmpgt_epi8 (_mm_set1_epi64x (UCHIGH), frag);
121
/* Compare if bytes are > 'A' - 1. */
122
__m128i r1 = _mm_cmpgt_epi8 (frag, _mm_set1_epi64x (UCLOW));
123
/* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1. */
124
__m128i mask = _mm_and_si128 (r2, r1);
125
/* Apply lowercase bit 6 mask for above mask bytes == ff. */
126
return _mm_or_si128 (frag, _mm_and_si128 (mask, _mm_set1_epi64x (LCQWORD)));
213
131
/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
214
132
algorithm) overlap for a fully populated 16B vector.
215
133
Input parameter: 1st 16Byte loaded from the reference string of a
217
We don't use KMP algorithm if reference string is less than 16B.
135
We don't use KMP algorithm if reference string is less than 16B. */
221
137
__inline__ __attribute__ ((__always_inline__,))
222
138
KMP16Bovrlap (__m128i s2)
257
173
const unsigned char *p2 = s2;
175
#ifndef STRCASESTR_NONASCII
176
if (__builtin_expect (p2[0] == '\0', 0))
260
177
return (char *) p1;
179
if (__builtin_expect (p1[0] == '\0', 0))
265
182
/* Check if p1 length is 1 byte long. */
183
if (__builtin_expect (p1[1] == '\0', 0))
267
184
return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
269
187
#ifdef USE_AS_STRCASESTR
270
__m128i (*strloadu) (const unsigned char *);
188
# ifndef STRCASESTR_NONASCII
189
if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
191
return __strcasestr_sse42_nonascii (s1, s2);
272
if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE) == 0)
273
strloadu = __m128i_strloadu_tolower_posix;
275
strloadu = __m128i_strloadu_tolower;
193
const __m128i rangeuc = _mm_set_epi64x (0x0, 0x5a41);
194
const __m128i u2ldelta = _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0);
195
# define strloadu(p) __m128i_strloadu_tolower (p, rangeuc, u2ldelta)
197
# define strloadu __m128i_strloadu_tolower
277
200
# define strloadu __m128i_strloadu