1
package org.apache.lucene.analysis.icu.segmentation;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20
import com.ibm.icu.lang.UScript;
21
import com.ibm.icu.text.BreakIterator;
24
* An internal BreakIterator for multilingual text, following recommendations
25
* from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/)
27
* See http://unicode.org/reports/tr29/#Tailoring for the motivation of this
30
* Text is first divided into script boundaries. The processing is then
31
* delegated to the appropriate break iterator for that specific script.
33
* This break iterator also allows you to retrieve the ISO 15924 script code
34
* associated with a piece of text.
36
* See also UAX #29, UTR #24
37
* @lucene.experimental
39
final class CompositeBreakIterator {
40
private final ICUTokenizerConfig config;
41
private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
43
private BreakIteratorWrapper rbbi;
44
private final ScriptIterator scriptIterator = new ScriptIterator();
48
CompositeBreakIterator(ICUTokenizerConfig config) {
53
* Retrieve the next break position. If the RBBI range is exhausted within the
54
* script boundary, examine the next script boundary.
56
* @return the next break position or BreakIterator.DONE
59
int next = rbbi.next();
60
while (next == BreakIterator.DONE && scriptIterator.next()) {
61
rbbi = getBreakIterator(scriptIterator.getScriptCode());
62
rbbi.setText(text, scriptIterator.getScriptStart(),
63
scriptIterator.getScriptLimit() - scriptIterator.getScriptStart());
66
return (next == BreakIterator.DONE) ? BreakIterator.DONE : next
67
+ scriptIterator.getScriptStart();
71
* Retrieve the current break position.
73
* @return the current break position or BreakIterator.DONE
76
final int current = rbbi.current();
77
return (current == BreakIterator.DONE) ? BreakIterator.DONE : current
78
+ scriptIterator.getScriptStart();
82
* Retrieve the rule status code (token type) from the underlying break
85
* @return rule status code (see RuleBasedBreakIterator constants)
88
return rbbi.getRuleStatus();
92
* Retrieve the UScript script code for the current token. This code can be
93
* decoded with UScript into a name or ISO 15924 code.
95
* @return UScript script code for the current token.
98
return scriptIterator.getScriptCode();
102
* Set a new region of text to be examined by this iterator
104
* @param text buffer of text
105
* @param start offset into buffer
106
* @param length maximum length to examine
108
void setText(final char text[], int start, int length) {
110
scriptIterator.setText(text, start, length);
111
if (scriptIterator.next()) {
112
rbbi = getBreakIterator(scriptIterator.getScriptCode());
113
rbbi.setText(text, scriptIterator.getScriptStart(),
114
scriptIterator.getScriptLimit() - scriptIterator.getScriptStart());
116
rbbi = getBreakIterator(UScript.COMMON);
117
rbbi.setText(text, 0, 0);
121
private BreakIteratorWrapper getBreakIterator(int scriptCode) {
122
if (wordBreakers[scriptCode] == null)
123
wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode));
124
return wordBreakers[scriptCode];