1
package org.apache.lucene.analysis.util;
3
import java.text.BreakIterator; // javadoc
4
import java.text.CharacterIterator;
5
import java.util.Locale;
8
* A CharacterIterator used internally with {@link BreakIterator}
11
public abstract class CharArrayIterator implements CharacterIterator {
18
public char [] getText() {
22
public int getStart() {
26
public int getLength() {
31
* Set a new region of text to be examined by this iterator
33
* @param array text buffer to examine
34
* @param start offset into buffer
35
* @param length maximum length to examine
37
public void setText(final char array[], int start, int length) {
42
this.limit = start + length;
45
public char current() {
46
return (index == limit) ? DONE : jreBugWorkaround(array[index]);
49
/**
 * Translates a character read from the text buffer into the character this
 * iterator actually reports; {@code current()} passes the buffer character
 * at the current index through this method before returning it.
 * <p>
 * The instances created by the factory methods of this class override it to
 * substitute a fixed stand-in character for every UTF-16 surrogate
 * (0xD800-0xDFFF) when {@code HAS_BUGGY_BREAKITERATORS} is set.
 *
 * @param ch character read from the underlying array
 * @return the character to report in place of {@code ch}
 */
protected abstract char jreBugWorkaround(char ch);
56
public int getBeginIndex() {
60
public int getEndIndex() {
64
public int getIndex() {
69
index = (limit == start) ? limit : limit - 1;
74
if (++index >= limit) {
82
public char previous() {
83
if (--index < start) {
91
public char setIndex(int position) {
92
if (position < getBeginIndex() || position > getEndIndex())
93
throw new IllegalArgumentException("Illegal Position: " + position);
94
index = start + position;
99
public Object clone() {
101
return super.clone();
102
} catch (CloneNotSupportedException e) {
103
// CharacterIterator.clone() does not declare CloneNotSupportedException, so rethrow it unchecked
104
throw new RuntimeException(e);
109
* Create a new CharArrayIterator that works around JRE bugs
110
* in a manner suitable for {@link BreakIterator#getSentenceInstance()}
112
public static CharArrayIterator newSentenceInstance() {
113
if (HAS_BUGGY_BREAKITERATORS) {
114
return new CharArrayIterator() {
115
// work around this for now by lying about all surrogates to
116
// the sentence tokenizer, instead we treat them all as
117
// SContinue so we won't break around them.
119
protected char jreBugWorkaround(char ch) {
120
return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch;
124
return new CharArrayIterator() {
127
protected char jreBugWorkaround(char ch) {
135
* Create a new CharArrayIterator that works around JRE bugs
136
* in a manner suitable for {@link BreakIterator#getWordInstance()}
138
public static CharArrayIterator newWordInstance() {
139
if (HAS_BUGGY_BREAKITERATORS) {
140
return new CharArrayIterator() {
141
// work around this for now by lying about all surrogates to the word tokenizer,
142
// instead we treat them all as ALetter so we won't break around them.
144
protected char jreBugWorkaround(char ch) {
145
return ch >= 0xD800 && ch <= 0xDFFF ? 0x0041 : ch;
149
return new CharArrayIterator() {
152
protected char jreBugWorkaround(char ch) {
160
* True if this JRE has a buggy BreakIterator implementation
162
public static final boolean HAS_BUGGY_BREAKITERATORS;
166
BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
167
bi.setText("\udb40\udc53");
170
} catch (Exception e) {
173
HAS_BUGGY_BREAKITERATORS = v;