~ubuntu-branches/debian/sid/golang-github-blevesearch-bleve/sid

« back to all changes in this revision

Viewing changes to analysis/lang/ar/analyzer_ar_test.go

  • Committer: Package Import Robot
  • Author(s): Michael Lustfield
  • Date: 2017-03-30 16:06:03 UTC
  • Revision ID: package-import@ubuntu.com-20170330160603-0oogmb960l7918jx
Tags: upstream-0.5.0+git20170324.202.4702785f
Import upstream version 0.5.0+git20170324.202.4702785f

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
//  Copyright (c) 2014 Couchbase, Inc.
 
2
//
 
3
// Licensed under the Apache License, Version 2.0 (the "License");
 
4
// you may not use this file except in compliance with the License.
 
5
// You may obtain a copy of the License at
 
6
//
 
7
//              http://www.apache.org/licenses/LICENSE-2.0
 
8
//
 
9
// Unless required by applicable law or agreed to in writing, software
 
10
// distributed under the License is distributed on an "AS IS" BASIS,
 
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
12
// See the License for the specific language governing permissions and
 
13
// limitations under the License.
 
14
 
 
15
package ar
 
16
 
 
17
import (
 
18
        "reflect"
 
19
        "testing"
 
20
 
 
21
        "github.com/blevesearch/bleve/analysis"
 
22
        "github.com/blevesearch/bleve/registry"
 
23
)
 
24
 
 
25
func TestArabicAnalyzer(t *testing.T) {
 
26
        tests := []struct {
 
27
                input  []byte
 
28
                output analysis.TokenStream
 
29
        }{
 
30
                {
 
31
                        input: []byte("كبير"),
 
32
                        output: analysis.TokenStream{
 
33
                                &analysis.Token{
 
34
                                        Term:     []byte("كبير"),
 
35
                                        Position: 1,
 
36
                                        Start:    0,
 
37
                                        End:      8,
 
38
                                },
 
39
                        },
 
40
                },
 
41
                // feminine marker
 
42
                {
 
43
                        input: []byte("كبيرة"),
 
44
                        output: analysis.TokenStream{
 
45
                                &analysis.Token{
 
46
                                        Term:     []byte("كبير"),
 
47
                                        Position: 1,
 
48
                                        Start:    0,
 
49
                                        End:      10,
 
50
                                },
 
51
                        },
 
52
                },
 
53
                {
 
54
                        input: []byte("مشروب"),
 
55
                        output: analysis.TokenStream{
 
56
                                &analysis.Token{
 
57
                                        Term:     []byte("مشروب"),
 
58
                                        Position: 1,
 
59
                                        Start:    0,
 
60
                                        End:      10,
 
61
                                },
 
62
                        },
 
63
                },
 
64
                // plural -at
 
65
                {
 
66
                        input: []byte("مشروبات"),
 
67
                        output: analysis.TokenStream{
 
68
                                &analysis.Token{
 
69
                                        Term:     []byte("مشروب"),
 
70
                                        Position: 1,
 
71
                                        Start:    0,
 
72
                                        End:      14,
 
73
                                },
 
74
                        },
 
75
                },
 
76
                // plural -in
 
77
                {
 
78
                        input: []byte("أمريكيين"),
 
79
                        output: analysis.TokenStream{
 
80
                                &analysis.Token{
 
81
                                        Term:     []byte("امريك"),
 
82
                                        Position: 1,
 
83
                                        Start:    0,
 
84
                                        End:      16,
 
85
                                },
 
86
                        },
 
87
                },
 
88
                // singular with bare alif
 
89
                {
 
90
                        input: []byte("امريكي"),
 
91
                        output: analysis.TokenStream{
 
92
                                &analysis.Token{
 
93
                                        Term:     []byte("امريك"),
 
94
                                        Position: 1,
 
95
                                        Start:    0,
 
96
                                        End:      12,
 
97
                                },
 
98
                        },
 
99
                },
 
100
                {
 
101
                        input: []byte("كتاب"),
 
102
                        output: analysis.TokenStream{
 
103
                                &analysis.Token{
 
104
                                        Term:     []byte("كتاب"),
 
105
                                        Position: 1,
 
106
                                        Start:    0,
 
107
                                        End:      8,
 
108
                                },
 
109
                        },
 
110
                },
 
111
                // definite article
 
112
                {
 
113
                        input: []byte("الكتاب"),
 
114
                        output: analysis.TokenStream{
 
115
                                &analysis.Token{
 
116
                                        Term:     []byte("كتاب"),
 
117
                                        Position: 1,
 
118
                                        Start:    0,
 
119
                                        End:      12,
 
120
                                },
 
121
                        },
 
122
                },
 
123
                {
 
124
                        input: []byte("ما ملكت أيمانكم"),
 
125
                        output: analysis.TokenStream{
 
126
                                &analysis.Token{
 
127
                                        Term:     []byte("ملكت"),
 
128
                                        Position: 2,
 
129
                                        Start:    5,
 
130
                                        End:      13,
 
131
                                },
 
132
                                &analysis.Token{
 
133
                                        Term:     []byte("ايمانكم"),
 
134
                                        Position: 3,
 
135
                                        Start:    14,
 
136
                                        End:      28,
 
137
                                },
 
138
                        },
 
139
                },
 
140
                // stopwords
 
141
                {
 
142
                        input: []byte("الذين ملكت أيمانكم"),
 
143
                        output: analysis.TokenStream{
 
144
                                &analysis.Token{
 
145
                                        Term:     []byte("ملكت"),
 
146
                                        Position: 2,
 
147
                                        Start:    11,
 
148
                                        End:      19,
 
149
                                },
 
150
                                &analysis.Token{
 
151
                                        Term:     []byte("ايمانكم"),
 
152
                                        Position: 3,
 
153
                                        Start:    20,
 
154
                                        End:      34,
 
155
                                },
 
156
                        },
 
157
                },
 
158
                // presentation form normalization
 
159
                {
 
160
                        input: []byte("ﺍﻟﺴﻼﻢ"),
 
161
                        output: analysis.TokenStream{
 
162
                                &analysis.Token{
 
163
                                        Term:     []byte("سلام"),
 
164
                                        Position: 1,
 
165
                                        Start:    0,
 
166
                                        End:      15,
 
167
                                },
 
168
                        },
 
169
                },
 
170
        }
 
171
 
 
172
        cache := registry.NewCache()
 
173
        analyzer, err := cache.AnalyzerNamed(AnalyzerName)
 
174
        if err != nil {
 
175
                t.Fatal(err)
 
176
        }
 
177
        for _, test := range tests {
 
178
                actual := analyzer.Analyze(test.input)
 
179
                if !reflect.DeepEqual(actual, test.output) {
 
180
                        t.Errorf("expected %v, got %v", test.output, actual)
 
181
                        t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
 
182
                }
 
183
        }
 
184
}