~ubuntu-branches/debian/sid/golang-github-blevesearch-bleve/sid

« back to all changes in this revision

Viewing changes to analysis/lang/cjk/cjk_bigram.go

  • Committer: Package Import Robot
  • Author(s): Michael Lustfield
  • Date: 2017-03-30 16:06:03 UTC
  • Revision ID: package-import@ubuntu.com-20170330160603-0oogmb960l7918jx
Tags: upstream-0.5.0+git20170324.202.4702785f
Import upstream version 0.5.0+git20170324.202.4702785f

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
//  Copyright (c) 2014 Couchbase, Inc.
 
2
//
 
3
// Licensed under the Apache License, Version 2.0 (the "License");
 
4
// you may not use this file except in compliance with the License.
 
5
// You may obtain a copy of the License at
 
6
//
 
7
//              http://www.apache.org/licenses/LICENSE-2.0
 
8
//
 
9
// Unless required by applicable law or agreed to in writing, software
 
10
// distributed under the License is distributed on an "AS IS" BASIS,
 
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
12
// See the License for the specific language governing permissions and
 
13
// limitations under the License.
 
14
 
 
15
package cjk
 
16
 
 
17
import (
 
18
        "bytes"
 
19
        "container/ring"
 
20
        "unicode/utf8"
 
21
 
 
22
        "github.com/blevesearch/bleve/analysis"
 
23
        "github.com/blevesearch/bleve/registry"
 
24
)
 
25
 
 
26
// BigramName is the name under which the CJK bigram token filter is
// registered with the registry.
const BigramName = "cjk_bigram"
 
27
 
 
28
// CJKBigramFilter is a token filter that combines adjacent ideographic
// characters into two-character bigram tokens.
type CJKBigramFilter struct {
 
29
        // outputUnigram, when true, makes Filter emit single-character
        // tokens in addition to the bigrams.
        outputUnigram bool
 
30
}
 
31
 
 
32
func NewCJKBigramFilter(outputUnigram bool) *CJKBigramFilter {
 
33
        return &CJKBigramFilter{
 
34
                outputUnigram: outputUnigram,
 
35
        }
 
36
}
 
37
 
 
38
func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 
39
        r := ring.New(2)
 
40
        itemsInRing := 0
 
41
        pos := 1
 
42
        outputPos := 1
 
43
 
 
44
        rv := make(analysis.TokenStream, 0, len(input))
 
45
 
 
46
        for _, tokout := range input {
 
47
                if tokout.Type == analysis.Ideographic {
 
48
                        runes := bytes.Runes(tokout.Term)
 
49
                        sofar := 0
 
50
                        for _, run := range runes {
 
51
                                rlen := utf8.RuneLen(run)
 
52
                                token := &analysis.Token{
 
53
                                        Term:     tokout.Term[sofar : sofar+rlen],
 
54
                                        Start:    tokout.Start + sofar,
 
55
                                        End:      tokout.Start + sofar + rlen,
 
56
                                        Position: pos,
 
57
                                        Type:     tokout.Type,
 
58
                                        KeyWord:  tokout.KeyWord,
 
59
                                }
 
60
                                pos++
 
61
                                sofar += rlen
 
62
                                if itemsInRing > 0 {
 
63
                                        // if items already buffered
 
64
                                        // check to see if this is aligned
 
65
                                        curr := r.Value.(*analysis.Token)
 
66
                                        if token.Start-curr.End != 0 {
 
67
                                                // not aligned flush
 
68
                                                flushToken := s.flush(r, &itemsInRing, outputPos)
 
69
                                                if flushToken != nil {
 
70
                                                        outputPos++
 
71
                                                        rv = append(rv, flushToken)
 
72
                                                }
 
73
                                        }
 
74
                                }
 
75
                                // now we can add this token to the buffer
 
76
                                r = r.Next()
 
77
                                r.Value = token
 
78
                                if itemsInRing < 2 {
 
79
                                        itemsInRing++
 
80
                                }
 
81
                                if itemsInRing > 1 && s.outputUnigram {
 
82
                                        unigram := s.buildUnigram(r, &itemsInRing, outputPos)
 
83
                                        if unigram != nil {
 
84
                                                rv = append(rv, unigram)
 
85
                                        }
 
86
                                }
 
87
                                bigramToken := s.outputBigram(r, &itemsInRing, outputPos)
 
88
                                if bigramToken != nil {
 
89
                                        rv = append(rv, bigramToken)
 
90
                                        outputPos++
 
91
                                }
 
92
                        }
 
93
 
 
94
                } else {
 
95
                        // flush anything already buffered
 
96
                        flushToken := s.flush(r, &itemsInRing, outputPos)
 
97
                        if flushToken != nil {
 
98
                                rv = append(rv, flushToken)
 
99
                                outputPos++
 
100
                        }
 
101
                        // output this token as is
 
102
                        tokout.Position = outputPos
 
103
                        rv = append(rv, tokout)
 
104
                        outputPos++
 
105
                }
 
106
        }
 
107
 
 
108
        // deal with possible trailing unigram
 
109
        if itemsInRing == 1 || s.outputUnigram {
 
110
                if itemsInRing == 2 {
 
111
                        r = r.Next()
 
112
                }
 
113
                unigram := s.buildUnigram(r, &itemsInRing, outputPos)
 
114
                if unigram != nil {
 
115
                        rv = append(rv, unigram)
 
116
                }
 
117
        }
 
118
        return rv
 
119
}
 
120
 
 
121
func (s *CJKBigramFilter) flush(r *ring.Ring, itemsInRing *int, pos int) *analysis.Token {
 
122
        var rv *analysis.Token
 
123
        if *itemsInRing == 1 {
 
124
                rv = s.buildUnigram(r, itemsInRing, pos)
 
125
        }
 
126
        r.Value = nil
 
127
        *itemsInRing = 0
 
128
        return rv
 
129
}
 
130
 
 
131
func (s *CJKBigramFilter) outputBigram(r *ring.Ring, itemsInRing *int, pos int) *analysis.Token {
 
132
        if *itemsInRing == 2 {
 
133
                thisShingleRing := r.Move(-1)
 
134
                shingledBytes := make([]byte, 0)
 
135
 
 
136
                // do first token
 
137
                prev := thisShingleRing.Value.(*analysis.Token)
 
138
                shingledBytes = append(shingledBytes, prev.Term...)
 
139
 
 
140
                // do second token
 
141
                thisShingleRing = thisShingleRing.Next()
 
142
                curr := thisShingleRing.Value.(*analysis.Token)
 
143
                shingledBytes = append(shingledBytes, curr.Term...)
 
144
 
 
145
                token := analysis.Token{
 
146
                        Type:     analysis.Double,
 
147
                        Term:     shingledBytes,
 
148
                        Position: pos,
 
149
                        Start:    prev.Start,
 
150
                        End:      curr.End,
 
151
                }
 
152
                return &token
 
153
        }
 
154
        return nil
 
155
}
 
156
 
 
157
func (s *CJKBigramFilter) buildUnigram(r *ring.Ring, itemsInRing *int, pos int) *analysis.Token {
 
158
        if *itemsInRing == 2 {
 
159
                thisShingleRing := r.Move(-1)
 
160
                // do first token
 
161
                prev := thisShingleRing.Value.(*analysis.Token)
 
162
                token := analysis.Token{
 
163
                        Type:     analysis.Single,
 
164
                        Term:     prev.Term,
 
165
                        Position: pos,
 
166
                        Start:    prev.Start,
 
167
                        End:      prev.End,
 
168
                }
 
169
                return &token
 
170
        } else if *itemsInRing == 1 {
 
171
                // do first token
 
172
                prev := r.Value.(*analysis.Token)
 
173
                token := analysis.Token{
 
174
                        Type:     analysis.Single,
 
175
                        Term:     prev.Term,
 
176
                        Position: pos,
 
177
                        Start:    prev.Start,
 
178
                        End:      prev.End,
 
179
                }
 
180
                return &token
 
181
        }
 
182
        return nil
 
183
}
 
184
 
 
185
func CJKBigramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
 
186
        outputUnigram := false
 
187
        outVal, ok := config["output_unigram"].(bool)
 
188
        if ok {
 
189
                outputUnigram = outVal
 
190
        }
 
191
        return NewCJKBigramFilter(outputUnigram), nil
 
192
}
 
193
 
 
194
// init registers CJKBigramFilterConstructor with the registry under
// BigramName when the package is initialized.
func init() {
 
195
        registry.RegisterTokenFilter(BigramName, CJKBigramFilterConstructor)
 
196
}