1
// Copyright (c) 2014 Couchbase, Inc.
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
7
// http://www.apache.org/licenses/LICENSE-2.0
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
22
"github.com/blevesearch/bleve/analysis"
23
"github.com/blevesearch/bleve/registry"
26
// BigramName is the name under which CJKBigramFilterConstructor is
// registered as a token filter.
const BigramName = "cjk_bigram"
28
// CJKBigramFilter is the receiver type of the Filter method in this file.
// It carries an outputUnigram setting, which NewCJKBigramFilter populates
// and Filter consults when deciding whether to emit unigram tokens.
//
// NOTE(review): the struct body is not visible in this view — confirm the
// field list against the complete file.
type CJKBigramFilter struct {
32
// NewCJKBigramFilter returns a pointer to a new CJKBigramFilter whose
// outputUnigram field is set to the supplied value.
func NewCJKBigramFilter(outputUnigram bool) *CJKBigramFilter {
33
return &CJKBigramFilter{
34
outputUnigram: outputUnigram,
38
// Filter walks the input token stream and builds the output stream rv.
//
// Tokens whose Type is analysis.Ideographic are split into one token per
// rune. Each per-rune token is checked for adjacency against the token held
// in r; bigram tokens (and, when s.outputUnigram is set, unigram tokens) are
// appended to rv. Any other token first triggers a flush of whatever is
// buffered and is then appended to rv with its Position set to outputPos.
// After the loop a possible trailing unigram is appended.
//
// NOTE(review): r, itemsInRing, outputPos and sofar are used below, but
// their declarations and updates are not visible in this view, nor are the
// closing braces and the return statement — confirm against the complete
// file before relying on this description.
func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
44
// pre-size the result to the input length
rv := make(analysis.TokenStream, 0, len(input))
46
for _, tokout := range input {
47
if tokout.Type == analysis.Ideographic {
48
// decode the term into runes so each rune can become its own token
runes := bytes.Runes(tokout.Term)
50
for _, run := range runes {
51
// UTF-8 byte length of this rune; used to slice Term and to
// compute the byte offsets Start and End below
rlen := utf8.RuneLen(run)
52
token := &analysis.Token{
53
Term: tokout.Term[sofar : sofar+rlen],
54
Start: tokout.Start + sofar,
55
End: tokout.Start + sofar + rlen,
58
KeyWord: tokout.KeyWord,
63
// if a token is already buffered,
64
// check whether the new token starts exactly where it ends
65
curr := r.Value.(*analysis.Token)
66
// a non-zero gap means the two tokens are not adjacent, so the
// buffered content is flushed instead of being paired with this one
if token.Start-curr.End != 0 {
68
flushToken := s.flush(r, &itemsInRing, outputPos)
69
if flushToken != nil {
71
rv = append(rv, flushToken)
75
// now we can add this token to the buffer
81
// with more than one item buffered and unigram output enabled,
// emit a unigram before the bigram
if itemsInRing > 1 && s.outputUnigram {
82
unigram := s.buildUnigram(r, &itemsInRing, outputPos)
84
rv = append(rv, unigram)
87
// outputBigram yields a token only when two items are buffered
bigramToken := s.outputBigram(r, &itemsInRing, outputPos)
88
if bigramToken != nil {
89
rv = append(rv, bigramToken)
95
// non-ideographic token: flush anything already buffered
96
flushToken := s.flush(r, &itemsInRing, outputPos)
97
if flushToken != nil {
98
rv = append(rv, flushToken)
101
// output this token as is, apart from its position
102
tokout.Position = outputPos
103
rv = append(rv, tokout)
108
// deal with a possible trailing unigram once the input is exhausted
109
if itemsInRing == 1 || s.outputUnigram {
110
if itemsInRing == 2 {
113
unigram := s.buildUnigram(r, &itemsInRing, outputPos)
115
rv = append(rv, unigram)
121
// flush produces a unigram token via buildUnigram when exactly one item is
// in the ring (*itemsInRing == 1); in every other case shown here rv is left
// nil. Callers in Filter check the result for nil before appending it.
//
// NOTE(review): the remainder of this function (the return statement and any
// reset of r or *itemsInRing) is not visible in this view — confirm against
// the complete file.
func (s *CJKBigramFilter) flush(r *ring.Ring, itemsInRing *int, pos int) *analysis.Token {
122
var rv *analysis.Token
123
if *itemsInRing == 1 {
124
rv = s.buildUnigram(r, itemsInRing, pos)
131
// outputBigram builds a bigram token when exactly two items are in the ring
// (*itemsInRing == 2). It concatenates the Term bytes of the previous ring
// element and the element after it, and creates a token of Type
// analysis.Double.
//
// NOTE(review): the token's remaining fields, the return statements and the
// result for other item counts are not visible in this view. Filter checks
// the result for nil, so presumably nil is returned otherwise — confirm.
func (s *CJKBigramFilter) outputBigram(r *ring.Ring, itemsInRing *int, pos int) *analysis.Token {
132
if *itemsInRing == 2 {
133
// step back one element to reach the previous token
thisShingleRing := r.Move(-1)
134
shingledBytes := make([]byte, 0)
137
prev := thisShingleRing.Value.(*analysis.Token)
138
shingledBytes = append(shingledBytes, prev.Term...)
141
// advance to the following element and append its term as well
thisShingleRing = thisShingleRing.Next()
142
curr := thisShingleRing.Value.(*analysis.Token)
143
shingledBytes = append(shingledBytes, curr.Term...)
145
token := analysis.Token{
146
Type: analysis.Double,
157
// buildUnigram builds a token of Type analysis.Single from a buffered token.
// With two items in the ring it reads the previous element (r.Move(-1));
// with one item it reads the element r currently points at.
//
// NOTE(review): the token's remaining fields, the return statements and the
// result for other item counts are not visible in this view — confirm
// against the complete file.
func (s *CJKBigramFilter) buildUnigram(r *ring.Ring, itemsInRing *int, pos int) *analysis.Token {
158
if *itemsInRing == 2 {
159
// two items buffered: the unigram comes from the previous element
thisShingleRing := r.Move(-1)
161
prev := thisShingleRing.Value.(*analysis.Token)
162
token := analysis.Token{
163
Type: analysis.Single,
170
} else if *itemsInRing == 1 {
172
// one item buffered: the unigram comes from the current element
prev := r.Value.(*analysis.Token)
173
token := analysis.Token{
174
Type: analysis.Single,
185
// CJKBigramFilterConstructor builds a CJKBigramFilter from a config map.
// It reads the "output_unigram" key as a bool and passes the resulting value
// to NewCJKBigramFilter; outputUnigram starts out false. The returned error
// is nil on the path shown here. The cache argument is not used in the
// visible lines.
//
// NOTE(review): the assignment from outVal is presumably guarded by ok, but
// the guard line is not visible in this view — confirm.
func CJKBigramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
186
outputUnigram := false
187
// the type assertion yields ok == false when the key is absent or not a bool
outVal, ok := config["output_unigram"].(bool)
189
outputUnigram = outVal
191
return NewCJKBigramFilter(outputUnigram), nil
195
// Registers CJKBigramFilterConstructor with the registry under BigramName.
// NOTE(review): the enclosing function header is not visible in this view.
registry.RegisterTokenFilter(BigramName, CJKBigramFilterConstructor)