2
* Licensed to the Apache Software Foundation (ASF) under one or more
3
* contributor license agreements. See the NOTICE file distributed with
4
* this work for additional information regarding copyright ownership.
5
* The ASF licenses this file to You under the Apache License, Version 2.0
6
* (the "License"); you may not use this file except in compliance with
7
* the License. You may obtain a copy of the License at
9
* http://www.apache.org/licenses/LICENSE-2.0
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
18
package org.apache.lucene.analysis.cn.smart;
20
import java.util.Collections;
21
import java.util.List;
23
import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
24
import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
25
import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;
28
* Segment a sentence of Chinese text into words.
29
* @lucene.experimental
33
private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter();
35
private SegTokenFilter tokenFilter = new SegTokenFilter();
38
* Segment a sentence into words with {@link HHMMSegmenter}.
40
* @param sentence input sentence
41
* @param startOffset start offset of sentence
42
* @return {@link List} of {@link SegToken}
44
public List<SegToken> segmentSentence(String sentence, int startOffset) {
46
// Run the HHMM segmenter over the raw sentence text.
List<SegToken> segTokenList = hhmmSegmenter.process(sentence);
47
// tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END
48
// Default to an empty list so a sentence with no real tokens yields nothing.
List<SegToken> result = Collections.emptyList();
50
if (segTokenList.size() > 2) // if it's not an empty sentence
51
// Drop the first and last entries; subList returns a view backed by segTokenList, not a copy.
result = segTokenList.subList(1, segTokenList.size() - 1);
53
// Convert every remaining token. The value returned by convertSegToken is discarded here,
// so this loop relies on the token being updated in place.
// NOTE(review): presumably tokenFilter.filter returns the same instance it was given - confirm.
for (SegToken st : result)
54
convertSegToken(st, sentence, startOffset);
60
* Process a {@link SegToken} so that it is ready for indexing.
62
* This method calculates offsets and normalizes the token with {@link SegTokenFilter}.
64
* @param st input {@link SegToken}
65
* @param sentence the sentence that the token's offsets index into
66
* @param sentenceStartOffset start offset of the sentence; added to the token's start and end offsets
67
* @return Lucene {@link SegToken}
69
public SegToken convertSegToken(SegToken st, String sentence,
70
int sentenceStartOffset) {
72
// For the word types listed below, take the token text from the slice of the sentence it covers.
// st.startOffset and st.endOffset are still relative to the sentence at this point.
switch (st.wordType) {
75
case WordType.FULLWIDTH_NUMBER:
76
case WordType.FULLWIDTH_STRING:
77
st.charArray = sentence.substring(st.startOffset, st.endOffset)
84
// Pass the token through SegTokenFilter (its normalization rules are defined outside this view).
st = tokenFilter.filter(st);
85
// Shift both offsets by the sentence's start offset; this must happen after the substring above,
// which needs the sentence-relative values.
st.startOffset += sentenceStartOffset;
86
st.endOffset += sentenceStartOffset;