1
// Copyright (c) 2014 Couchbase, Inc.
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
7
// http://www.apache.org/licenses/LICENSE-2.0
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
21
"github.com/blevesearch/bleve/analysis"
22
"github.com/blevesearch/bleve/registry"
25
// StemmerName is the name passed to registry.RegisterTokenFilter together
// with StemmerFilterConstructor.
const StemmerName = "stemmer_ckb"
27
// SoraniStemmerFilter is the type returned by NewSoraniStemmerFilter; its
// Filter method runs stem over the Term of each token it is given.
// NOTE(review): the struct body and closing brace are not visible in this
// view of the file — confirm it declares no fields.
type SoraniStemmerFilter struct {
30
// NewSoraniStemmerFilter returns a pointer to a new, zero-valued
// SoraniStemmerFilter.
func NewSoraniStemmerFilter() *SoraniStemmerFilter {
31
return &SoraniStemmerFilter{}
34
// Filter iterates over every token in input and computes the stemmed form of
// its Term with stem.
// NOTE(review): this view of the method is incomplete. The condition that the
// inline comment below refers to, whatever is done with stemmed afterwards,
// and the return statement are not visible here — confirm against the
// complete file before changing this method.
func (s *SoraniStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
35
for _, token := range input {
36
// if not protected keyword, stem it
38
stemmed := stem(token.Term)
45
// stem removes recognised suffixes from input and returns the shortened term.
//
// Every length check counts runes (utf8.RuneCount), not bytes, and every
// removal goes through truncateRunes. The first two groups of checks reassign
// input and inputLen and let processing continue; the remaining checks return
// as soon as one suffix matches, so at most one of them is applied.
//
// NOTE(review): this view of the function is incomplete. The opening of the
// second condition, the closing braces, and the final return are not visible —
// confirm against the complete file before restructuring.
func stem(input []byte) []byte {
46
inputLen := utf8.RuneCount(input)
49
if inputLen > 5 && bytes.HasSuffix(input, []byte("دا")) {
50
input = truncateRunes(input, 2)
51
inputLen = utf8.RuneCount(input)
52
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("نا")) {
53
// only the last rune of the two-rune suffix is removed here
// NOTE(review): confirm that dropping one rune rather than two is intended
input = truncateRunes(input, 1)
54
inputLen = utf8.RuneCount(input)
55
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەوە")) {
56
input = truncateRunes(input, 3)
57
inputLen = utf8.RuneCount(input)
62
// NOTE(review): the start of this condition (the `if` and any length guard
// preceding the parenthesised suffix tests) is not visible in this view.
(bytes.HasSuffix(input, []byte("مان")) ||
63
bytes.HasSuffix(input, []byte("یان")) ||
64
bytes.HasSuffix(input, []byte("تان"))) {
65
// like the first group, this one updates input and inputLen instead of returning
input = truncateRunes(input, 3)
66
inputLen = utf8.RuneCount(input)
69
// indefinite singular ezafe
70
if inputLen > 6 && bytes.HasSuffix(input, []byte("ێکی")) {
71
return truncateRunes(input, 3)
72
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یەکی")) {
73
return truncateRunes(input, 4)
76
// within each pair below the longer suffix is tested before the shorter one
// that it ends with
if inputLen > 5 && bytes.HasSuffix(input, []byte("ێک")) {
77
// indefinite singular
78
return truncateRunes(input, 2)
79
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یەک")) {
80
// indefinite singular
81
return truncateRunes(input, 3)
82
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەکە")) {
84
return truncateRunes(input, 3)
85
} else if inputLen > 5 && bytes.HasSuffix(input, []byte("کە")) {
87
return truncateRunes(input, 2)
88
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("ەکان")) {
90
return truncateRunes(input, 4)
91
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("کان")) {
93
return truncateRunes(input, 3)
94
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانی")) {
95
// indefinite plural ezafe
96
return truncateRunes(input, 4)
97
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انی")) {
98
// indefinite plural ezafe
99
return truncateRunes(input, 3)
100
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یان")) {
102
return truncateRunes(input, 3)
103
} else if inputLen > 5 && bytes.HasSuffix(input, []byte("ان")) {
105
return truncateRunes(input, 2)
106
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانە")) {
107
// demonstrative plural
108
return truncateRunes(input, 4)
109
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انە")) {
110
// demonstrative plural
111
return truncateRunes(input, 3)
112
} else if inputLen > 5 && (bytes.HasSuffix(input, []byte("ایە")) || bytes.HasSuffix(input, []byte("ەیە"))) {
113
// demonstrative singular
114
// two runes are removed although both matched suffixes are three runes long,
// so the first rune of the suffix stays on the term
return truncateRunes(input, 2)
115
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ە")) {
116
// demonstrative singular
117
return truncateRunes(input, 1)
118
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ی")) {
119
// absolute singular ezafe
120
return truncateRunes(input, 1)
125
// truncateRunes returns a newly allocated copy of input with its last num
// runes removed.
//
// Length is measured in runes, not bytes, so multi-byte UTF-8 characters are
// always dropped whole. The result never shares memory with input. Bytes that
// are not valid UTF-8 decode to U+FFFD one byte at a time and are re-encoded
// as that replacement character in the result.
//
// num is clamped to the range [0, rune count of input]: asking for more runes
// than input holds yields an empty slice instead of a slice-bounds panic, and
// a negative num returns an untruncated copy.
func truncateRunes(input []byte, num int) []byte {
	runes := bytes.Runes(input)

	// Clamp num itself (rather than the computed length) so that extreme
	// values cannot overflow the subtraction below.
	switch {
	case num < 0:
		num = 0
	case num > len(runes):
		num = len(runes)
	}

	// Converting through string re-encodes the kept runes as UTF-8.
	return []byte(string(runes[:len(runes)-num]))
}
132
// buildTermFromRunes encodes runes as UTF-8 and returns the resulting bytes
// in a newly allocated slice.
//
// Runes that cannot be encoded (surrogate halves, negative values, values
// above utf8.MaxRune) are written as U+FFFD, the Unicode replacement
// character. Sizing a per-rune buffer with utf8.RuneLen would panic on such
// input, because RuneLen reports -1 for them.
func buildTermFromRunes(runes []rune) []byte {
	// Worst case is utf8.UTFMax bytes per rune, so the appends never reallocate.
	rv := make([]byte, 0, len(runes)*utf8.UTFMax)

	// A single fixed-size scratch buffer replaces one allocation per rune.
	var scratch [utf8.UTFMax]byte
	for _, r := range runes {
		n := utf8.EncodeRune(scratch[:], r)
		rv = append(rv, scratch[:n]...)
	}
	return rv
}
142
// StemmerFilterConstructor returns a new SoraniStemmerFilter and a nil error.
// Neither config nor cache is read in the visible body. This is the function
// handed to registry.RegisterTokenFilter under StemmerName.
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
143
return NewSoraniStemmerFilter(), nil
147
// Register the constructor with the registry under StemmerName.
// NOTE(review): the header of the function enclosing this call is not visible
// in this view — presumably an init function; confirm against the complete file.
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)