~ubuntu-branches/debian/sid/golang-github-blevesearch-bleve/sid

« back to all changes in this revision

Viewing changes to analysis/lang/ar/stemmer_ar_test.go

  • Committer: Package Import Robot
  • Author(s): Michael Lustfield
  • Date: 2017-03-30 16:06:03 UTC
  • Revision ID: package-import@ubuntu.com-20170330160603-0oogmb960l7918jx
Tags: upstream-0.5.0+git20170324.202.4702785f
Import upstream version 0.5.0+git20170324.202.4702785f

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
//  Copyright (c) 2014 Couchbase, Inc.
 
2
//
 
3
// Licensed under the Apache License, Version 2.0 (the "License");
 
4
// you may not use this file except in compliance with the License.
 
5
// You may obtain a copy of the License at
 
6
//
 
7
//              http://www.apache.org/licenses/LICENSE-2.0
 
8
//
 
9
// Unless required by applicable law or agreed to in writing, software
 
10
// distributed under the License is distributed on an "AS IS" BASIS,
 
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
12
// See the License for the specific language governing permissions and
 
13
// limitations under the License.
 
14
 
 
15
package ar
 
16
 
 
17
import (
 
18
        "reflect"
 
19
        "testing"
 
20
 
 
21
        "github.com/blevesearch/bleve/analysis"
 
22
)
 
23
 
 
24
func TestArabicStemmerFilter(t *testing.T) {
 
25
        tests := []struct {
 
26
                input  analysis.TokenStream
 
27
                output analysis.TokenStream
 
28
        }{
 
29
                // AlPrefix
 
30
                {
 
31
                        input: analysis.TokenStream{
 
32
                                &analysis.Token{
 
33
                                        Term: []byte("الحسن"),
 
34
                                },
 
35
                        },
 
36
                        output: analysis.TokenStream{
 
37
                                &analysis.Token{
 
38
                                        Term: []byte("حسن"),
 
39
                                },
 
40
                        },
 
41
                },
 
42
                // WalPrefix
 
43
                {
 
44
                        input: analysis.TokenStream{
 
45
                                &analysis.Token{
 
46
                                        Term: []byte("والحسن"),
 
47
                                },
 
48
                        },
 
49
                        output: analysis.TokenStream{
 
50
                                &analysis.Token{
 
51
                                        Term: []byte("حسن"),
 
52
                                },
 
53
                        },
 
54
                },
 
55
                // BalPrefix
 
56
                {
 
57
                        input: analysis.TokenStream{
 
58
                                &analysis.Token{
 
59
                                        Term: []byte("بالحسن"),
 
60
                                },
 
61
                        },
 
62
                        output: analysis.TokenStream{
 
63
                                &analysis.Token{
 
64
                                        Term: []byte("حسن"),
 
65
                                },
 
66
                        },
 
67
                },
 
68
                // KalPrefix
 
69
                {
 
70
                        input: analysis.TokenStream{
 
71
                                &analysis.Token{
 
72
                                        Term: []byte("كالحسن"),
 
73
                                },
 
74
                        },
 
75
                        output: analysis.TokenStream{
 
76
                                &analysis.Token{
 
77
                                        Term: []byte("حسن"),
 
78
                                },
 
79
                        },
 
80
                },
 
81
                // FalPrefix
 
82
                {
 
83
                        input: analysis.TokenStream{
 
84
                                &analysis.Token{
 
85
                                        Term: []byte("فالحسن"),
 
86
                                },
 
87
                        },
 
88
                        output: analysis.TokenStream{
 
89
                                &analysis.Token{
 
90
                                        Term: []byte("حسن"),
 
91
                                },
 
92
                        },
 
93
                },
 
94
                // LlPrefix
 
95
                {
 
96
                        input: analysis.TokenStream{
 
97
                                &analysis.Token{
 
98
                                        Term: []byte("للاخر"),
 
99
                                },
 
100
                        },
 
101
                        output: analysis.TokenStream{
 
102
                                &analysis.Token{
 
103
                                        Term: []byte("اخر"),
 
104
                                },
 
105
                        },
 
106
                },
 
107
                // WaPrefix
 
108
                {
 
109
                        input: analysis.TokenStream{
 
110
                                &analysis.Token{
 
111
                                        Term: []byte("وحسن"),
 
112
                                },
 
113
                        },
 
114
                        output: analysis.TokenStream{
 
115
                                &analysis.Token{
 
116
                                        Term: []byte("حسن"),
 
117
                                },
 
118
                        },
 
119
                },
 
120
                // AhSuffix
 
121
                {
 
122
                        input: analysis.TokenStream{
 
123
                                &analysis.Token{
 
124
                                        Term: []byte("زوجها"),
 
125
                                },
 
126
                        },
 
127
                        output: analysis.TokenStream{
 
128
                                &analysis.Token{
 
129
                                        Term: []byte("زوج"),
 
130
                                },
 
131
                        },
 
132
                },
 
133
                // AnSuffix
 
134
                {
 
135
                        input: analysis.TokenStream{
 
136
                                &analysis.Token{
 
137
                                        Term: []byte("ساهدان"),
 
138
                                },
 
139
                        },
 
140
                        output: analysis.TokenStream{
 
141
                                &analysis.Token{
 
142
                                        Term: []byte("ساهد"),
 
143
                                },
 
144
                        },
 
145
                },
 
146
                // AtSuffix
 
147
                {
 
148
                        input: analysis.TokenStream{
 
149
                                &analysis.Token{
 
150
                                        Term: []byte("ساهدات"),
 
151
                                },
 
152
                        },
 
153
                        output: analysis.TokenStream{
 
154
                                &analysis.Token{
 
155
                                        Term: []byte("ساهد"),
 
156
                                },
 
157
                        },
 
158
                },
 
159
                // WnSuffix
 
160
                {
 
161
                        input: analysis.TokenStream{
 
162
                                &analysis.Token{
 
163
                                        Term: []byte("ساهدون"),
 
164
                                },
 
165
                        },
 
166
                        output: analysis.TokenStream{
 
167
                                &analysis.Token{
 
168
                                        Term: []byte("ساهد"),
 
169
                                },
 
170
                        },
 
171
                },
 
172
                // YnSuffix
 
173
                {
 
174
                        input: analysis.TokenStream{
 
175
                                &analysis.Token{
 
176
                                        Term: []byte("ساهدين"),
 
177
                                },
 
178
                        },
 
179
                        output: analysis.TokenStream{
 
180
                                &analysis.Token{
 
181
                                        Term: []byte("ساهد"),
 
182
                                },
 
183
                        },
 
184
                },
 
185
                // YhSuffix
 
186
                {
 
187
                        input: analysis.TokenStream{
 
188
                                &analysis.Token{
 
189
                                        Term: []byte("ساهديه"),
 
190
                                },
 
191
                        },
 
192
                        output: analysis.TokenStream{
 
193
                                &analysis.Token{
 
194
                                        Term: []byte("ساهد"),
 
195
                                },
 
196
                        },
 
197
                },
 
198
                // YpSuffix
 
199
                {
 
200
                        input: analysis.TokenStream{
 
201
                                &analysis.Token{
 
202
                                        Term: []byte("ساهدية"),
 
203
                                },
 
204
                        },
 
205
                        output: analysis.TokenStream{
 
206
                                &analysis.Token{
 
207
                                        Term: []byte("ساهد"),
 
208
                                },
 
209
                        },
 
210
                },
 
211
                // HSuffix
 
212
                {
 
213
                        input: analysis.TokenStream{
 
214
                                &analysis.Token{
 
215
                                        Term: []byte("ساهده"),
 
216
                                },
 
217
                        },
 
218
                        output: analysis.TokenStream{
 
219
                                &analysis.Token{
 
220
                                        Term: []byte("ساهد"),
 
221
                                },
 
222
                        },
 
223
                },
 
224
                // PSuffix
 
225
                {
 
226
                        input: analysis.TokenStream{
 
227
                                &analysis.Token{
 
228
                                        Term: []byte("ساهدة"),
 
229
                                },
 
230
                        },
 
231
                        output: analysis.TokenStream{
 
232
                                &analysis.Token{
 
233
                                        Term: []byte("ساهد"),
 
234
                                },
 
235
                        },
 
236
                },
 
237
                // YSuffix
 
238
                {
 
239
                        input: analysis.TokenStream{
 
240
                                &analysis.Token{
 
241
                                        Term: []byte("ساهدي"),
 
242
                                },
 
243
                        },
 
244
                        output: analysis.TokenStream{
 
245
                                &analysis.Token{
 
246
                                        Term: []byte("ساهد"),
 
247
                                },
 
248
                        },
 
249
                },
 
250
                // ComboPrefSuf
 
251
                {
 
252
                        input: analysis.TokenStream{
 
253
                                &analysis.Token{
 
254
                                        Term: []byte("وساهدون"),
 
255
                                },
 
256
                        },
 
257
                        output: analysis.TokenStream{
 
258
                                &analysis.Token{
 
259
                                        Term: []byte("ساهد"),
 
260
                                },
 
261
                        },
 
262
                },
 
263
                // ComboSuf
 
264
                {
 
265
                        input: analysis.TokenStream{
 
266
                                &analysis.Token{
 
267
                                        Term: []byte("ساهدهات"),
 
268
                                },
 
269
                        },
 
270
                        output: analysis.TokenStream{
 
271
                                &analysis.Token{
 
272
                                        Term: []byte("ساهد"),
 
273
                                },
 
274
                        },
 
275
                },
 
276
                // Shouldn't Stem
 
277
                {
 
278
                        input: analysis.TokenStream{
 
279
                                &analysis.Token{
 
280
                                        Term: []byte("الو"),
 
281
                                },
 
282
                        },
 
283
                        output: analysis.TokenStream{
 
284
                                &analysis.Token{
 
285
                                        Term: []byte("الو"),
 
286
                                },
 
287
                        },
 
288
                },
 
289
                // NonArabic
 
290
                {
 
291
                        input: analysis.TokenStream{
 
292
                                &analysis.Token{
 
293
                                        Term: []byte("English"),
 
294
                                },
 
295
                        },
 
296
                        output: analysis.TokenStream{
 
297
                                &analysis.Token{
 
298
                                        Term: []byte("English"),
 
299
                                },
 
300
                        },
 
301
                },
 
302
                {
 
303
                        input: analysis.TokenStream{
 
304
                                &analysis.Token{
 
305
                                        Term: []byte("سلام"),
 
306
                                },
 
307
                        },
 
308
                        output: analysis.TokenStream{
 
309
                                &analysis.Token{
 
310
                                        Term: []byte("سلام"),
 
311
                                },
 
312
                        },
 
313
                },
 
314
                {
 
315
                        input: analysis.TokenStream{
 
316
                                &analysis.Token{
 
317
                                        Term: []byte("السلام"),
 
318
                                },
 
319
                        },
 
320
                        output: analysis.TokenStream{
 
321
                                &analysis.Token{
 
322
                                        Term: []byte("سلام"),
 
323
                                },
 
324
                        },
 
325
                },
 
326
                {
 
327
                        input: analysis.TokenStream{
 
328
                                &analysis.Token{
 
329
                                        Term: []byte("سلامة"),
 
330
                                },
 
331
                        },
 
332
                        output: analysis.TokenStream{
 
333
                                &analysis.Token{
 
334
                                        Term: []byte("سلام"),
 
335
                                },
 
336
                        },
 
337
                },
 
338
                {
 
339
                        input: analysis.TokenStream{
 
340
                                &analysis.Token{
 
341
                                        Term: []byte("السلامة"),
 
342
                                },
 
343
                        },
 
344
                        output: analysis.TokenStream{
 
345
                                &analysis.Token{
 
346
                                        Term: []byte("سلام"),
 
347
                                },
 
348
                        },
 
349
                },
 
350
                {
 
351
                        input: analysis.TokenStream{
 
352
                                &analysis.Token{
 
353
                                        Term: []byte("الوصل"),
 
354
                                },
 
355
                        },
 
356
                        output: analysis.TokenStream{
 
357
                                &analysis.Token{
 
358
                                        Term: []byte("وصل"),
 
359
                                },
 
360
                        },
 
361
                },
 
362
                {
 
363
                        input: analysis.TokenStream{
 
364
                                &analysis.Token{
 
365
                                        Term: []byte("والصل"),
 
366
                                },
 
367
                        },
 
368
                        output: analysis.TokenStream{
 
369
                                &analysis.Token{
 
370
                                        Term: []byte("صل"),
 
371
                                },
 
372
                        },
 
373
                },
 
374
                // Empty
 
375
                {
 
376
                        input: analysis.TokenStream{
 
377
                                &analysis.Token{
 
378
                                        Term: []byte(""),
 
379
                                },
 
380
                        },
 
381
                        output: analysis.TokenStream{
 
382
                                &analysis.Token{
 
383
                                        Term: []byte(""),
 
384
                                },
 
385
                        },
 
386
                },
 
387
        }
 
388
 
 
389
        arabicStemmerFilter := NewArabicStemmerFilter()
 
390
        for _, test := range tests {
 
391
                actual := arabicStemmerFilter.Filter(test.input)
 
392
                if !reflect.DeepEqual(actual, test.output) {
 
393
                        t.Errorf("expected %#v, got %#v", test.output, actual)
 
394
                        t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
 
395
                }
 
396
        }
 
397
}