2
*******************************************************************************
3
* Copyright (C) 2008-2010, Google Inc, International Business Machines Corporation
4
* and others. All Rights Reserved.
5
*******************************************************************************
7
package com.ibm.icu.text;
9
import java.util.ArrayList;
10
import java.util.Collection;
11
import java.util.Collections;
12
import java.util.Comparator;
13
import java.util.Iterator;
14
import java.util.LinkedHashMap;
15
import java.util.LinkedHashSet;
16
import java.util.List;
19
import java.util.TreeSet;
21
import com.ibm.icu.impl.MultiComparator;
22
import com.ibm.icu.lang.UCharacter;
23
import com.ibm.icu.util.LocaleData;
24
import com.ibm.icu.util.ULocale;
27
* A set of characters for use as a UI "index", that is, a
28
* list of clickable characters (or character sequences) that allow the user to
29
* see a segment of a larger "target" list. That is, each character corresponds
30
* to a bucket in the target list, where everything in the bucket is greater
31
* than or equal to the character (according to the locale's collation). The
32
* intention is to have two main functions; one that produces an index list that
33
* is relatively static, and the other is a list that produces roughly
34
* equally-sized buckets. Only the first is currently provided.
36
* The static list would be presented as something like
39
* A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
42
* In the UI, an index character could be omitted if its bucket is empty. For
43
* example, if there is nothing in the bucket for Q, then Q could be omitted.
45
* <b>Important Notes:</b>
47
* <li>Although we say "character" above, the index character could be a
48
* sequence, like "CH".</li>
49
* <li>There could be items in a target list that are less than the first or
50
* (much) greater than the last; examples include words from other scripts. The
51
* UI could bucket them with the first or last respectively, or have some symbol
52
* for those categories.</li>
53
* <li>The use of the list requires that the target list be sorted according to
54
* the locale that is used to create that list.</li>
55
* <li>For languages without widely accepted sorting methods (eg Chinese/Japanese)
56
* the results may appear arbitrary, and it may be best not to use these methods.</li>
57
* <li>In the initial version, an arbitrary limit of 100 is placed on these lists.</li>
62
* @provisional This API might change or be removed in a future release.
64
//TODO(markdavis) return an additional character that is the "least greater" character than
66
public class IndexCharacters {
67
private static final char CGJ = '\u034F';
68
private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]");
69
private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14 \uC0AC \uC544 \uC790 \uCC28 \uCE74 \uD0C0 \uD30C \uD558]");
70
private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
71
private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]");
73
private ULocale locale;
74
private Collator comparator;
75
private Set<String> indexCharacters;
76
private LinkedHashMap<String, Set<String>> alreadyIn = new LinkedHashMap<String, Set<String>>();
77
private List<String> noDistinctSorting = new ArrayList<String>();
78
private List<String> notAlphabetic = new ArrayList<String>();
81
* Create the index object.
82
* @param locale The locale to be passed.
84
* @provisional This API might change or be removed in a future release.
86
public IndexCharacters(ULocale locale) {
87
this(locale, LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD), Collator.getInstance(locale));
92
* @deprecated This API is ICU internal only.
94
@SuppressWarnings("unchecked")
95
public IndexCharacters(ULocale locale, UnicodeSet exemplarSet, Collator collator) {
98
comparator = (Collator) collator.clone();
99
} catch (CloneNotSupportedException e) {
100
throw new IllegalArgumentException(e);
102
comparator.setStrength(Collator.PRIMARY);
104
// get the exemplars, and handle special cases
106
UnicodeSet exemplars = exemplarSet.cloneAsThawed();
107
// question: should we add auxiliary exemplars?
108
if (exemplars.containsSome(CORE_LATIN)) {
109
exemplars.addAll(CORE_LATIN);
111
if (exemplars.containsSome(HANGUL)) {
112
// cut down to small list
113
exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL);
115
if (exemplars.containsSome(ETHIOPIC)) {
116
// cut down to small list
117
// make use of the fact that Ethiopic is allocated in 8's, where
118
// the base is 0 mod 8.
119
for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) {
120
if ((it.codepoint & 0x7) != 0) {
121
exemplars.remove(it.codepoint);
126
// first sort them, with an "best" ordering among items that are the same according
128
Comparator<Object>[] comparators = (Comparator<Object>[])new Comparator[2];
129
comparators[0] = comparator;
130
comparators[1] = new PreferenceComparator(Collator.getInstance(locale));
132
Set<String> preferenceSorting = new TreeSet<String>(new MultiComparator<Object>(comparators));
133
for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {
134
preferenceSorting.add(it.getString());
137
indexCharacters = new TreeSet<String>(comparator);
139
// We nw make a sorted array of elements, uppercased
140
// Some of the input may, however, be redundant.
141
// That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
142
// So we make a pass through, filtering out those cases.
144
for (String item : preferenceSorting) {
145
item = UCharacter.toUpperCase(locale, item);
146
if (indexCharacters.contains(item)) {
147
for (String itemAlreadyIn : indexCharacters) {
148
if (comparator.compare(item, itemAlreadyIn) == 0) {
149
Set<String> targets = alreadyIn.get(itemAlreadyIn);
150
if (targets == null) {
151
alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
157
} else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){
158
noDistinctSorting.add(item);
159
} else if (!ALPHABETIC.containsSome(item)) {
160
notAlphabetic.add(item);
162
indexCharacters.add(item);
166
// if the result is still too large, cut down to 100 elements
168
final int size = indexCharacters.size() - 1;
172
for (Iterator<String> it = indexCharacters.iterator(); it.hasNext();) {
175
final int bump = count * 99 / size;
183
indexCharacters = Collections.unmodifiableSet(indexCharacters);
187
* Return the string with interspersed CGJs. Input must have more than 2 codepoints.
189
private String separated(String item) {
190
StringBuilder result = new StringBuilder();
191
// add a CGJ except within surrogates
192
char last = item.charAt(0);
194
for (int i = 1; i < item.length(); ++i) {
195
char ch = item.charAt(i);
196
if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
202
return result.toString();
206
* Get the index characters.
207
* @return A collection including the index characters
209
* @provisional This API might change or be removed in a future release.
211
public Collection<String> getIndexCharacters() {
212
return indexCharacters;
217
* @return The locale.
219
* @provisional This API might change or be removed in a future release.
221
public ULocale getLocale() {
226
* As the index is built, items may be discarded from the exemplars.
227
* This contains some of the discards, and is intended for debugging.
229
* @deprecated This API is ICU internal only.
231
public Map<String, Set<String>> getAlreadyIn() {
236
* As the index is built, items may be discarded from the exemplars.
237
* This contains some of the discards, and is intended for debugging.
239
* @deprecated This API is ICU internal only.
241
public List<String> getNoDistinctSorting() {
242
return noDistinctSorting;
246
* As the index is built, items may be discarded from the exemplars.
247
* This contains some of the discards, and is intended for debugging.
249
* @deprecated This API is ICU internal only.
251
public List<String> getNotAlphabetic() {
252
return notAlphabetic;
256
* Comparator that returns "better" items first, where shorter NFKD is better,
257
* and otherwise NFKD binary order is better, and otherwise binary order is better.
259
private static class PreferenceComparator implements Comparator<Object> {
260
static final Comparator<String> binary = new UTF16.StringComparator(true,false,0);
261
final Collator collator;
263
public PreferenceComparator(Collator collator) {
264
this.collator = collator;
267
public int compare(Object o1, Object o2) {
268
return compare((String)o1, (String)o2);
271
public int compare(String s1, String s2) {
275
String n1 = Normalizer.decompose(s1, true);
276
String n2 = Normalizer.decompose(s2, true);
277
int result = n1.length() - n2.length();
281
result = collator.compare(n1, n2);
285
return binary.compare(s1, s2);