~ubuntu-branches/debian/sid/golang-github-blevesearch-bleve/sid

« back to all changes in this revision

Viewing changes to analysis/token/shingle/shingle_test.go

  • Committer: Package Import Robot
  • Author(s): Michael Lustfield
  • Date: 2017-03-30 16:06:03 UTC
  • Revision ID: package-import@ubuntu.com-20170330160603-0oogmb960l7918jx
Tags: upstream-0.5.0+git20170324.202.4702785f
Import upstream version 0.5.0+git20170324.202.4702785f

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
//  Copyright (c) 2014 Couchbase, Inc.
 
2
//
 
3
// Licensed under the Apache License, Version 2.0 (the "License");
 
4
// you may not use this file except in compliance with the License.
 
5
// You may obtain a copy of the License at
 
6
//
 
7
//              http://www.apache.org/licenses/LICENSE-2.0
 
8
//
 
9
// Unless required by applicable law or agreed to in writing, software
 
10
// distributed under the License is distributed on an "AS IS" BASIS,
 
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
12
// See the License for the specific language governing permissions and
 
13
// limitations under the License.
 
14
 
 
15
package shingle
 
16
 
 
17
import (
 
18
        "reflect"
 
19
        "testing"
 
20
 
 
21
        "github.com/blevesearch/bleve/analysis"
 
22
)
 
23
 
 
24
func TestShingleFilter(t *testing.T) {
 
25
 
 
26
        tests := []struct {
 
27
                min            int
 
28
                max            int
 
29
                outputOriginal bool
 
30
                separator      string
 
31
                filler         string
 
32
                input          analysis.TokenStream
 
33
                output         analysis.TokenStream
 
34
        }{
 
35
                {
 
36
                        min:            2,
 
37
                        max:            2,
 
38
                        outputOriginal: false,
 
39
                        separator:      " ",
 
40
                        filler:         "_",
 
41
                        input: analysis.TokenStream{
 
42
                                &analysis.Token{
 
43
                                        Term: []byte("the"),
 
44
                                },
 
45
                                &analysis.Token{
 
46
                                        Term: []byte("quick"),
 
47
                                },
 
48
                                &analysis.Token{
 
49
                                        Term: []byte("brown"),
 
50
                                },
 
51
                                &analysis.Token{
 
52
                                        Term: []byte("fox"),
 
53
                                },
 
54
                        },
 
55
                        output: analysis.TokenStream{
 
56
                                &analysis.Token{
 
57
                                        Term: []byte("the quick"),
 
58
                                        Type: analysis.Shingle,
 
59
                                },
 
60
                                &analysis.Token{
 
61
                                        Term: []byte("quick brown"),
 
62
                                        Type: analysis.Shingle,
 
63
                                },
 
64
                                &analysis.Token{
 
65
                                        Term: []byte("brown fox"),
 
66
                                        Type: analysis.Shingle,
 
67
                                },
 
68
                        },
 
69
                },
 
70
                {
 
71
                        min:            3,
 
72
                        max:            3,
 
73
                        outputOriginal: false,
 
74
                        separator:      " ",
 
75
                        filler:         "_",
 
76
                        input: analysis.TokenStream{
 
77
                                &analysis.Token{
 
78
                                        Term: []byte("the"),
 
79
                                },
 
80
                                &analysis.Token{
 
81
                                        Term: []byte("quick"),
 
82
                                },
 
83
                                &analysis.Token{
 
84
                                        Term: []byte("brown"),
 
85
                                },
 
86
                                &analysis.Token{
 
87
                                        Term: []byte("fox"),
 
88
                                },
 
89
                        },
 
90
                        output: analysis.TokenStream{
 
91
                                &analysis.Token{
 
92
                                        Term: []byte("the quick brown"),
 
93
                                        Type: analysis.Shingle,
 
94
                                },
 
95
                                &analysis.Token{
 
96
                                        Term: []byte("quick brown fox"),
 
97
                                        Type: analysis.Shingle,
 
98
                                },
 
99
                        },
 
100
                },
 
101
                {
 
102
                        min:            2,
 
103
                        max:            3,
 
104
                        outputOriginal: false,
 
105
                        separator:      " ",
 
106
                        filler:         "_",
 
107
                        input: analysis.TokenStream{
 
108
                                &analysis.Token{
 
109
                                        Term: []byte("the"),
 
110
                                },
 
111
                                &analysis.Token{
 
112
                                        Term: []byte("quick"),
 
113
                                },
 
114
                                &analysis.Token{
 
115
                                        Term: []byte("brown"),
 
116
                                },
 
117
                                &analysis.Token{
 
118
                                        Term: []byte("fox"),
 
119
                                },
 
120
                        },
 
121
                        output: analysis.TokenStream{
 
122
                                &analysis.Token{
 
123
                                        Term: []byte("the quick"),
 
124
                                        Type: analysis.Shingle,
 
125
                                },
 
126
                                &analysis.Token{
 
127
                                        Term: []byte("quick brown"),
 
128
                                        Type: analysis.Shingle,
 
129
                                },
 
130
                                &analysis.Token{
 
131
                                        Term: []byte("the quick brown"),
 
132
                                        Type: analysis.Shingle,
 
133
                                },
 
134
                                &analysis.Token{
 
135
                                        Term: []byte("brown fox"),
 
136
                                        Type: analysis.Shingle,
 
137
                                },
 
138
                                &analysis.Token{
 
139
                                        Term: []byte("quick brown fox"),
 
140
                                        Type: analysis.Shingle,
 
141
                                },
 
142
                        },
 
143
                },
 
144
                {
 
145
                        min:            3,
 
146
                        max:            3,
 
147
                        outputOriginal: false,
 
148
                        separator:      " ",
 
149
                        filler:         "_",
 
150
                        input: analysis.TokenStream{
 
151
                                &analysis.Token{
 
152
                                        Term:     []byte("ugly"),
 
153
                                        Position: 1,
 
154
                                },
 
155
                                &analysis.Token{
 
156
                                        Term:     []byte("quick"),
 
157
                                        Position: 3,
 
158
                                },
 
159
                                &analysis.Token{
 
160
                                        Term:     []byte("brown"),
 
161
                                        Position: 4,
 
162
                                },
 
163
                        },
 
164
                        output: analysis.TokenStream{
 
165
                                &analysis.Token{
 
166
                                        Term:     []byte("ugly _ quick"),
 
167
                                        Type:     analysis.Shingle,
 
168
                                        Position: 1,
 
169
                                },
 
170
                                &analysis.Token{
 
171
                                        Term:     []byte("_ quick brown"),
 
172
                                        Type:     analysis.Shingle,
 
173
                                        Position: 3,
 
174
                                },
 
175
                        },
 
176
                },
 
177
                {
 
178
                        min:            1,
 
179
                        max:            5,
 
180
                        outputOriginal: false,
 
181
                        separator:      " ",
 
182
                        filler:         "_",
 
183
                        input: analysis.TokenStream{
 
184
                                &analysis.Token{
 
185
                                        Term:     []byte("test"),
 
186
                                        Position: 1,
 
187
                                },
 
188
                                &analysis.Token{
 
189
                                        Term:     []byte("text"),
 
190
                                        Position: 2,
 
191
                                },
 
192
                                // token 3 removed by stop filter
 
193
                                &analysis.Token{
 
194
                                        Term:     []byte("see"),
 
195
                                        Position: 4,
 
196
                                },
 
197
                                &analysis.Token{
 
198
                                        Term:     []byte("shingles"),
 
199
                                        Position: 5,
 
200
                                },
 
201
                        },
 
202
                        output: analysis.TokenStream{
 
203
                                &analysis.Token{
 
204
                                        Term:     []byte("test"),
 
205
                                        Type:     analysis.Shingle,
 
206
                                        Position: 1,
 
207
                                },
 
208
                                &analysis.Token{
 
209
                                        Term:     []byte("text"),
 
210
                                        Type:     analysis.Shingle,
 
211
                                        Position: 2,
 
212
                                },
 
213
                                &analysis.Token{
 
214
                                        Term:     []byte("test text"),
 
215
                                        Type:     analysis.Shingle,
 
216
                                        Position: 1,
 
217
                                },
 
218
                                &analysis.Token{
 
219
                                        Term: []byte("_"),
 
220
                                        Type: analysis.Shingle,
 
221
                                },
 
222
                                &analysis.Token{
 
223
                                        Term:     []byte("text _"),
 
224
                                        Type:     analysis.Shingle,
 
225
                                        Position: 2,
 
226
                                },
 
227
                                &analysis.Token{
 
228
                                        Term:     []byte("test text _"),
 
229
                                        Type:     analysis.Shingle,
 
230
                                        Position: 1,
 
231
                                },
 
232
                                &analysis.Token{
 
233
                                        Term:     []byte("see"),
 
234
                                        Type:     analysis.Shingle,
 
235
                                        Position: 4,
 
236
                                },
 
237
                                &analysis.Token{
 
238
                                        Term:     []byte("_ see"),
 
239
                                        Type:     analysis.Shingle,
 
240
                                        Position: 4,
 
241
                                },
 
242
                                &analysis.Token{
 
243
                                        Term:     []byte("text _ see"),
 
244
                                        Type:     analysis.Shingle,
 
245
                                        Position: 2,
 
246
                                },
 
247
                                &analysis.Token{
 
248
                                        Term:     []byte("test text _ see"),
 
249
                                        Type:     analysis.Shingle,
 
250
                                        Position: 1,
 
251
                                },
 
252
                                &analysis.Token{
 
253
                                        Term:     []byte("shingles"),
 
254
                                        Type:     analysis.Shingle,
 
255
                                        Position: 5,
 
256
                                },
 
257
                                &analysis.Token{
 
258
                                        Term:     []byte("see shingles"),
 
259
                                        Type:     analysis.Shingle,
 
260
                                        Position: 4,
 
261
                                },
 
262
                                &analysis.Token{
 
263
                                        Term:     []byte("_ see shingles"),
 
264
                                        Type:     analysis.Shingle,
 
265
                                        Position: 4,
 
266
                                },
 
267
                                &analysis.Token{
 
268
                                        Term:     []byte("text _ see shingles"),
 
269
                                        Type:     analysis.Shingle,
 
270
                                        Position: 2,
 
271
                                },
 
272
                                &analysis.Token{
 
273
                                        Term:     []byte("test text _ see shingles"),
 
274
                                        Type:     analysis.Shingle,
 
275
                                        Position: 1,
 
276
                                },
 
277
                        },
 
278
                },
 
279
                {
 
280
                        min:            2,
 
281
                        max:            2,
 
282
                        outputOriginal: true,
 
283
                        separator:      " ",
 
284
                        filler:         "_",
 
285
                        input: analysis.TokenStream{
 
286
                                &analysis.Token{
 
287
                                        Term: []byte("the"),
 
288
                                },
 
289
                                &analysis.Token{
 
290
                                        Term: []byte("quick"),
 
291
                                },
 
292
                                &analysis.Token{
 
293
                                        Term: []byte("brown"),
 
294
                                },
 
295
                                &analysis.Token{
 
296
                                        Term: []byte("fox"),
 
297
                                },
 
298
                        },
 
299
                        output: analysis.TokenStream{
 
300
                                &analysis.Token{
 
301
                                        Term: []byte("the"),
 
302
                                },
 
303
                                &analysis.Token{
 
304
                                        Term: []byte("quick"),
 
305
                                },
 
306
                                &analysis.Token{
 
307
                                        Term: []byte("the quick"),
 
308
                                        Type: analysis.Shingle,
 
309
                                },
 
310
                                &analysis.Token{
 
311
                                        Term: []byte("brown"),
 
312
                                },
 
313
                                &analysis.Token{
 
314
                                        Term: []byte("quick brown"),
 
315
                                        Type: analysis.Shingle,
 
316
                                },
 
317
                                &analysis.Token{
 
318
                                        Term: []byte("fox"),
 
319
                                },
 
320
                                &analysis.Token{
 
321
                                        Term: []byte("brown fox"),
 
322
                                        Type: analysis.Shingle,
 
323
                                },
 
324
                        },
 
325
                },
 
326
        }
 
327
 
 
328
        for _, test := range tests {
 
329
                shingleFilter := NewShingleFilter(test.min, test.max, test.outputOriginal, test.separator, test.filler)
 
330
                actual := shingleFilter.Filter(test.input)
 
331
                if !reflect.DeepEqual(actual, test.output) {
 
332
                        t.Errorf("expected %s, got %s", test.output, actual)
 
333
                }
 
334
        }
 
335
}
 
336
 
 
337
// TestShingleFilterBug431 tests that the shingle filter is in fact stateless
 
338
// by making using the same filter instance twice and ensuring we do not get
 
339
// contaminated output
 
340
func TestShingleFilterBug431(t *testing.T) {
 
341
 
 
342
        tests := []struct {
 
343
                input  analysis.TokenStream
 
344
                output analysis.TokenStream
 
345
        }{
 
346
                {
 
347
                        input: analysis.TokenStream{
 
348
                                &analysis.Token{
 
349
                                        Term: []byte("the"),
 
350
                                },
 
351
                                &analysis.Token{
 
352
                                        Term: []byte("quick"),
 
353
                                },
 
354
                                &analysis.Token{
 
355
                                        Term: []byte("brown"),
 
356
                                },
 
357
                                &analysis.Token{
 
358
                                        Term: []byte("fox"),
 
359
                                },
 
360
                        },
 
361
                        output: analysis.TokenStream{
 
362
                                &analysis.Token{
 
363
                                        Term: []byte("the quick"),
 
364
                                        Type: analysis.Shingle,
 
365
                                },
 
366
                                &analysis.Token{
 
367
                                        Term: []byte("quick brown"),
 
368
                                        Type: analysis.Shingle,
 
369
                                },
 
370
                                &analysis.Token{
 
371
                                        Term: []byte("brown fox"),
 
372
                                        Type: analysis.Shingle,
 
373
                                },
 
374
                        },
 
375
                },
 
376
                {
 
377
                        input: analysis.TokenStream{
 
378
                                &analysis.Token{
 
379
                                        Term: []byte("a"),
 
380
                                },
 
381
                                &analysis.Token{
 
382
                                        Term: []byte("sad"),
 
383
                                },
 
384
                                &analysis.Token{
 
385
                                        Term: []byte("dirty"),
 
386
                                },
 
387
                                &analysis.Token{
 
388
                                        Term: []byte("sock"),
 
389
                                },
 
390
                        },
 
391
                        output: analysis.TokenStream{
 
392
                                &analysis.Token{
 
393
                                        Term: []byte("a sad"),
 
394
                                        Type: analysis.Shingle,
 
395
                                },
 
396
                                &analysis.Token{
 
397
                                        Term: []byte("sad dirty"),
 
398
                                        Type: analysis.Shingle,
 
399
                                },
 
400
                                &analysis.Token{
 
401
                                        Term: []byte("dirty sock"),
 
402
                                        Type: analysis.Shingle,
 
403
                                },
 
404
                        },
 
405
                },
 
406
        }
 
407
 
 
408
        shingleFilter := NewShingleFilter(2, 2, false, " ", "_")
 
409
        for _, test := range tests {
 
410
                actual := shingleFilter.Filter(test.input)
 
411
                if !reflect.DeepEqual(actual, test.output) {
 
412
                        t.Errorf("expected %s, got %s", test.output, actual)
 
413
                }
 
414
        }
 
415
 
 
416
}