~ubuntu-branches/ubuntu/vivid/icu4j-4.4/vivid

private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14 \uC0AC \uC544 \uC790 \uCC28 \uCE74 \uD0C0 \uD30C \uD558]");

private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");

private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]");

private ULocale locale;

private Collator comparator;

private Set<String> indexCharacters;

private LinkedHashMap<String, Set<String>> alreadyIn = new LinkedHashMap<String, Set<String>>();

private List<String> noDistinctSorting = new ArrayList<String>();

private List<String> notAlphabetic = new ArrayList<String>();

/**

* Create the index object.

* @param locale The locale to be passed.

* @draft ICU 4.2

* @provisional This API might change or be removed in a future release.

public IndexCharacters(ULocale locale) {

this(locale, LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD), Collator.getInstance(locale));

}

/**

* @internal

* @deprecated This API is ICU internal only.

@SuppressWarnings("unchecked")

public IndexCharacters(ULocale locale, UnicodeSet exemplarSet, Collator collator) {

this.locale = locale;

try {

comparator = (Collator) collator.clone();

} catch (CloneNotSupportedException e) {

100

throw new IllegalArgumentException(e);

101

}

102

comparator.setStrength(Collator.PRIMARY);

103

104

// get the exemplars, and handle special cases

105

106

UnicodeSet exemplars = exemplarSet.cloneAsThawed();

107

// question: should we add auxiliary exemplars?

108

if (exemplars.containsSome(CORE_LATIN)) {

109

exemplars.addAll(CORE_LATIN);

110

}

111

if (exemplars.containsSome(HANGUL)) {

112

// cut down to small list

113

exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL);

114

}

115

if (exemplars.containsSome(ETHIOPIC)) {

116

// cut down to small list

117

// make use of the fact that Ethiopic is allocated in 8's, where

118

// the base is 0 mod 8.

119

for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) {

120

if ((it.codepoint & 0x7) != 0) {

121

exemplars.remove(it.codepoint);

122

}

123

}

124

}

125

126

// first sort them, with an "best" ordering among items that are the same according

127

// to the collator

128

Comparator<Object>[] comparators = (Comparator<Object>[])new Comparator[2];

129

comparators[0] = comparator;

130

comparators[1] = new PreferenceComparator(Collator.getInstance(locale));

131

132

Set<String> preferenceSorting = new TreeSet<String>(new MultiComparator<Object>(comparators));

133

for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {

134

preferenceSorting.add(it.getString());

135

}

136

137

indexCharacters = new TreeSet<String>(comparator);

138

139

// We nw make a sorted array of elements, uppercased

140

// Some of the input may, however, be redundant.

141

// That is, we might have c, ch, d, where "ch" sorts just like "c", "h"

142

// So we make a pass through, filtering out those cases.

143

144

for (String item : preferenceSorting) {

145

item = UCharacter.toUpperCase(locale, item);

146

if (indexCharacters.contains(item)) {

147

for (String itemAlreadyIn : indexCharacters) {

148

if (comparator.compare(item, itemAlreadyIn) == 0) {

149

Set<String> targets = alreadyIn.get(itemAlreadyIn);

150

if (targets == null) {

151

alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());

152

}

153

targets.add(item);

154

break;

155

}

156

}

157

} else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){

158

noDistinctSorting.add(item);

159

} else if (!ALPHABETIC.containsSome(item)) {

160

notAlphabetic.add(item);

161

} else {

162

indexCharacters.add(item);

163

}

164

}

165

166

// if the result is still too large, cut down to 100 elements

167

168

final int size = indexCharacters.size() - 1;

169

if (size > 99) {

170

int count = 0;

171

int old = -1;

172

for (Iterator<String> it = indexCharacters.iterator(); it.hasNext();) {

173

++ count;

174

it.next();

175

final int bump = count * 99 / size;

176

if (bump == old) {

177

it.remove();

178

} else {

179

old = bump;

180

}

181

}

182

}

183

indexCharacters = Collections.unmodifiableSet(indexCharacters);

184

}

185

186

187

* Return the string with interspersed CGJs. Input must have more than 2 codepoints.

188

189

private String separated(String item) {

190

StringBuilder result = new StringBuilder();

191

// add a CGJ except within surrogates

192

char last = item.charAt(0);

193

result.append(last);

194

for (int i = 1; i < item.length(); ++i) {

195

char ch = item.charAt(i);

196

if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {

197

result.append(CGJ);

198

}

199

result.append(ch);

200

last = ch;

201

}

202

return result.toString();

203

}

204

205

/**

206

* Get the index characters.

207

* @return A collection including the index characters

208

* @draft ICU 4.2

209

* @provisional This API might change or be removed in a future release.

210

211

public Collection<String> getIndexCharacters() {

212

return indexCharacters;

213

}

214

215

/**

216

* Get the locale

217

* @return The locale.

218

* @draft ICU 4.2

219

* @provisional This API might change or be removed in a future release.

220

221

public ULocale getLocale() {

222

return locale;

223

}

224

225

/**

226

* As the index is built, items may be discarded from the exemplars.

227

* This contains some of the discards, and is intended for debugging.

228

* @internal

229

* @deprecated This API is ICU internal only.

230

231

public Map<String, Set<String>> getAlreadyIn() {

232

return alreadyIn;

233

}

234

235

/**

236

* As the index is built, items may be discarded from the exemplars.

237

* This contains some of the discards, and is intended for debugging.

238

* @internal

239

* @deprecated This API is ICU internal only.

240

241

public List<String> getNoDistinctSorting() {

242

return noDistinctSorting;

243

}

244

245

/**

246

* As the index is built, items may be discarded from the exemplars.

247

* This contains some of the discards, and is intended for debugging.

248

* @internal

249

* @deprecated This API is ICU internal only.

250

251

public List<String> getNotAlphabetic() {

252

return notAlphabetic;

253

}

254

255

256

* Comparator that returns "better" items first, where shorter NFKD is better,

257

* and otherwise NFKD binary order is better, and otherwise binary order is better.

258

259

private static class PreferenceComparator implements Comparator<Object> {

260

static final Comparator<String> binary = new UTF16.StringComparator(true,false,0);

261

final Collator collator;

262

263

public PreferenceComparator(Collator collator) {

264

this.collator = collator;

265

}

266

267

public int compare(Object o1, Object o2) {

268

return compare((String)o1, (String)o2);

269

}

270

271

public int compare(String s1, String s2) {

272

if (s1 == s2) {

273

return 0;

274

}

275

String n1 = Normalizer.decompose(s1, true);

276

String n2 = Normalizer.decompose(s2, true);

277

int result = n1.length() - n2.length();

278

if (result != 0) {

279

return result;

280

}

281

result = collator.compare(n1, n2);

282

if (result != 0) {

283

return result;

284

}

285

return binary.compare(s1, s2);

286

}

287

}

288

}

Older »