1
// Copyright (c) 2014 Couchbase, Inc.
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
7
// http://www.apache.org/licenses/LICENSE-2.0
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
21
"github.com/blevesearch/bleve/analysis"
24
func TestShingleFilter(t *testing.T) {
32
input analysis.TokenStream
33
output analysis.TokenStream
38
outputOriginal: false,
41
input: analysis.TokenStream{
46
Term: []byte("quick"),
49
Term: []byte("brown"),
55
output: analysis.TokenStream{
57
Term: []byte("the quick"),
58
Type: analysis.Shingle,
61
Term: []byte("quick brown"),
62
Type: analysis.Shingle,
65
Term: []byte("brown fox"),
66
Type: analysis.Shingle,
73
outputOriginal: false,
76
input: analysis.TokenStream{
81
Term: []byte("quick"),
84
Term: []byte("brown"),
90
output: analysis.TokenStream{
92
Term: []byte("the quick brown"),
93
Type: analysis.Shingle,
96
Term: []byte("quick brown fox"),
97
Type: analysis.Shingle,
104
outputOriginal: false,
107
input: analysis.TokenStream{
112
Term: []byte("quick"),
115
Term: []byte("brown"),
121
output: analysis.TokenStream{
123
Term: []byte("the quick"),
124
Type: analysis.Shingle,
127
Term: []byte("quick brown"),
128
Type: analysis.Shingle,
131
Term: []byte("the quick brown"),
132
Type: analysis.Shingle,
135
Term: []byte("brown fox"),
136
Type: analysis.Shingle,
139
Term: []byte("quick brown fox"),
140
Type: analysis.Shingle,
147
outputOriginal: false,
150
input: analysis.TokenStream{
152
Term: []byte("ugly"),
156
Term: []byte("quick"),
160
Term: []byte("brown"),
164
output: analysis.TokenStream{
166
Term: []byte("ugly _ quick"),
167
Type: analysis.Shingle,
171
Term: []byte("_ quick brown"),
172
Type: analysis.Shingle,
180
outputOriginal: false,
183
input: analysis.TokenStream{
185
Term: []byte("test"),
189
Term: []byte("text"),
192
// token 3 removed by stop filter
198
Term: []byte("shingles"),
202
output: analysis.TokenStream{
204
Term: []byte("test"),
205
Type: analysis.Shingle,
209
Term: []byte("text"),
210
Type: analysis.Shingle,
214
Term: []byte("test text"),
215
Type: analysis.Shingle,
220
Type: analysis.Shingle,
223
Term: []byte("text _"),
224
Type: analysis.Shingle,
228
Term: []byte("test text _"),
229
Type: analysis.Shingle,
234
Type: analysis.Shingle,
238
Term: []byte("_ see"),
239
Type: analysis.Shingle,
243
Term: []byte("text _ see"),
244
Type: analysis.Shingle,
248
Term: []byte("test text _ see"),
249
Type: analysis.Shingle,
253
Term: []byte("shingles"),
254
Type: analysis.Shingle,
258
Term: []byte("see shingles"),
259
Type: analysis.Shingle,
263
Term: []byte("_ see shingles"),
264
Type: analysis.Shingle,
268
Term: []byte("text _ see shingles"),
269
Type: analysis.Shingle,
273
Term: []byte("test text _ see shingles"),
274
Type: analysis.Shingle,
282
outputOriginal: true,
285
input: analysis.TokenStream{
290
Term: []byte("quick"),
293
Term: []byte("brown"),
299
output: analysis.TokenStream{
304
Term: []byte("quick"),
307
Term: []byte("the quick"),
308
Type: analysis.Shingle,
311
Term: []byte("brown"),
314
Term: []byte("quick brown"),
315
Type: analysis.Shingle,
321
Term: []byte("brown fox"),
322
Type: analysis.Shingle,
328
for _, test := range tests {
329
shingleFilter := NewShingleFilter(test.min, test.max, test.outputOriginal, test.separator, test.filler)
330
actual := shingleFilter.Filter(test.input)
331
if !reflect.DeepEqual(actual, test.output) {
332
t.Errorf("expected %s, got %s", test.output, actual)
337
// TestShingleFilterBug431 tests that the shingle filter is in fact stateless
338
// by using the same filter instance twice and ensuring we do not get
339
// contaminated output
340
func TestShingleFilterBug431(t *testing.T) {
343
input analysis.TokenStream
344
output analysis.TokenStream
347
input: analysis.TokenStream{
352
Term: []byte("quick"),
355
Term: []byte("brown"),
361
output: analysis.TokenStream{
363
Term: []byte("the quick"),
364
Type: analysis.Shingle,
367
Term: []byte("quick brown"),
368
Type: analysis.Shingle,
371
Term: []byte("brown fox"),
372
Type: analysis.Shingle,
377
input: analysis.TokenStream{
385
Term: []byte("dirty"),
388
Term: []byte("sock"),
391
output: analysis.TokenStream{
393
Term: []byte("a sad"),
394
Type: analysis.Shingle,
397
Term: []byte("sad dirty"),
398
Type: analysis.Shingle,
401
Term: []byte("dirty sock"),
402
Type: analysis.Shingle,
408
shingleFilter := NewShingleFilter(2, 2, false, " ", "_")
409
for _, test := range tests {
410
actual := shingleFilter.Filter(test.input)
411
if !reflect.DeepEqual(actual, test.output) {
412
t.Errorf("expected %s, got %s", test.output, actual)