43
60
private String description;
44
61
private Charset charset;
46
private final byte[] primary = new byte[256];
47
private final byte[] secondary = new byte[256];
48
private final byte[] tertiary = new byte[256];
49
private final byte[] flags = new byte[256];
63
private final Page[] pages = new Page[256];
51
private final List<CodePosition> expansions = new ArrayList<CodePosition>();
65
private final List<CodePosition> expansions = new ArrayList<>();
52
66
private int maxExpSize = 1;
54
68
private CharsetEncoder encoder;
69
private boolean multi;
73
pages[0] = new Page();
56
76
public void add(int ch, int primary, int secondary, int tertiary, int flags) {
57
if (this.primary[ch & 0xff] != 0)
78
if (getPrimary(ch) != 0)
58
79
throw new ExitException(String.format("Repeated primary index 0x%x", ch & 0xff));
59
this.primary[ch & 0xff] = (byte) primary;
60
this.secondary[ch & 0xff] = (byte) secondary;
61
this.tertiary[ch & 0xff] = (byte) tertiary;
63
this.flags[ch & 0xff] = (byte) flags;
80
setPrimary (ch, primary);
81
setSecondary(ch, secondary);
82
setTertiary( ch, tertiary);
88
* Run after all sorting order points have been added.
90
* Make sure that all tertiary values of secondary ignorable are greater
91
* than any normal tertiary value.
93
* And the same for secondaries on primary ignorable.
95
public void finish() {
98
for (Page p : pages) {
102
for (int i = 0; i < 256; i++) {
103
if (((p.flags[i] >>> 4) & 0x3) == 0) {
104
if (p.getPrimary(i) != 0) {
105
byte second = p.getSecondary(i);
106
maxSecondary = Math.max(maxSecondary, second);
108
maxTertiary = Math.max(maxTertiary, p.getTertiary(i));
115
for (Page p : pages) {
119
for (int i = 0; i < 256; i++) {
120
if (((p.flags[i] >>> 4) & 0x3) != 0) continue;
122
if (p.getPrimary(i) == 0) {
123
if (p.getSecondary(i) == 0) {
124
if (p.getTertiary(i) != 0) {
125
p.setTertiary(i, p.getTertiary(i) + maxTertiary);
128
p.setSecondary(i, p.getSecondary(i) + maxSecondary);
67
136
* Return a table indexed by a character value in the target codepage, that gives the complete sort
68
137
* position of the character.
139
* This is only used for testing.
69
141
* @return A table of sort positions.
71
143
public char[] getSortPositions() {
72
144
char[] tab = new char[256];
74
146
for (int i = 1; i < 256; i++) {
75
tab[i] = (char) (((primary[i] << 8) & 0xff00) | ((secondary[i] << 4) & 0xf0) | (tertiary[i] & 0xf));
147
tab[i] = (char) (((getPrimary(i) << 8) & 0xff00) | ((getSecondary(i) << 4) & 0xf0) | (getTertiary(i) & 0xf));
115
194
// We need +1 for the null bytes, we also +2 for a couple of expanded characters. For a complete
116
195
// german map this was always enough in tests.
117
key = new byte[(bval.length + 1 + 2) * 3];
196
key = new byte[(chars.length + 1 + 2) * 4];
119
fillCompleteKey(bval, key);
198
fillCompleteKey(chars, key);
120
199
} catch (ArrayIndexOutOfBoundsException e) {
121
200
// Ok try again with the max possible key size allocated.
122
key = new byte[bval.length * 3 * maxExpSize + 3];
201
key = new byte[(chars.length+1) * 4 * maxExpSize];
202
fillCompleteKey(chars, key);
125
205
if (cache != null)
126
206
cache.put(s, key);
128
return new SrtSortKey<T>(object, key, second);
208
return new SrtSortKey<>(object, key, second);
129
209
} catch (CharacterCodingException e) {
130
return new SrtSortKey<T>(object, ZERO_KEY);
210
return new SrtSortKey<>(object, ZERO_KEY);
215
* Create a sort key based on a Label.
217
* The label will contain the actual characters (after transliteration for example)
218
* @param object This is saved in the sort key for later retrieval and plays no part in the sorting.
219
* @param label The label, the actual written bytes/chars will be used as input to the sort.
220
* @param second Secondary sort key.
221
* @param cache A cache for the created keys. This is for saving memory so it is essential that this
222
* is managed by the caller.
223
* @return A sort key.
225
public <T> SortKey<T> createSortKey(T object, Label label, int second, Map<Label, byte[]> cache) {
228
key = cache.get(label);
230
return new SrtSortKey<>(object, key, second);
233
char[] encText = label.getEncText();
235
// In theory you could have a string where every character expands into maxExpSize separate characters
236
// in the key. However if we allocate enough space to deal with the worst case, then we waste a
237
// vast amount of memory. So allocate a minimal amount of space, try it and if it fails reallocate the
240
// We need +1 for the null bytes, we also +2 for a couple of expanded characters. For a complete
241
// german map this was always enough in tests.
242
key = new byte[(encText.length + 1 + 2) * 4];
244
fillCompleteKey(encText, key);
245
} catch (ArrayIndexOutOfBoundsException e) {
246
// Ok try again with the max possible key size allocated.
247
key = new byte[encText.length * 4 * maxExpSize + 4];
248
fillCompleteKey(encText, key);
252
cache.put(label, key);
254
return new SrtSortKey<>(object, key, second);
258
* Convenient version of create sort key method.
259
* @see #createSortKey(Object, String, int, Map)
134
261
public <T> SortKey<T> createSortKey(T object, String s, int second) {
135
262
return createSortKey(object, s, second, null);
266
* Convenient version of create sort key method.
268
* @see #createSortKey(Object, String, int, Map)
138
270
public <T> SortKey<T> createSortKey(T object, String s) {
139
271
return createSortKey(object, s, 0, null);
274
public <T> SortKey<T> createSortKey(T object, Label label) {
275
return createSortKey(object, label, 0, null);
278
public <T> SortKey<T> createSortKey(T object, Label label, int second) {
279
return createSortKey(object, label, second, null);
143
283
* Fill in the key from the given byte string.
145
* @param bval The string for which we are creating the sort key.
285
* @param bVal The string for which we are creating the sort key.
146
286
* @param key The sort key. This will be filled in.
148
private void fillCompleteKey(byte[] bval, byte[] key) {
149
int start = fillKey(Collator.PRIMARY, primary, bval, key, 0);
150
start = fillKey(Collator.SECONDARY, secondary, bval, key, start);
151
fillKey(Collator.TERTIARY, tertiary, bval, key, start);
288
private void fillCompleteKey(char[] bVal, byte[] key) {
289
int start = fillKey(Collator.PRIMARY, bVal, key, 0);
290
start = fillKey(Collator.SECONDARY, bVal, key, start);
291
fillKey(Collator.TERTIARY, bVal, key, start);
155
295
* Fill in the output key for a given strength.
157
* @param sortPositions An array giving the sort position for each of the 256 characters.
158
297
* @param input The input string in a particular 8 bit codepage.
159
298
* @param outKey The output sort key.
160
299
* @param start The index into the output key to start at.
161
300
* @return The next position in the output key.
163
private int fillKey(int type, byte[] sortPositions, byte[] input, byte[] outKey, int start) {
302
private int fillKey(int type, char[] input, byte[] outKey, int start) {
164
303
int index = start;
165
for (byte inb : input) {
168
int exp = (flags[b] >> 4) & 0x3;
304
for (char c : input) {
306
if (!hasPage(c >>> 8))
309
int exp = (getFlags(c) >> 4) & 0x3;
170
// I am guessing that a sort position of 0 means that the character is ignorable at this
171
// strength. In other words it is as if it is not present in the string. This appears to
172
// be true for shield symbols, but perhaps not for other kinds of control characters.
173
byte pos = sortPositions[b];
175
outKey[index++] = pos;
311
index = writePos(type, c, outKey, index);
177
313
// now have to redirect to a list of input chars, get the list via the primary value always.
178
byte idx = primary[b];
179
//List<CodePosition> list = expansions.get(idx-1);
314
int idx = getPrimary(c);
181
315
for (int i = idx - 1; i < idx + exp; i++) {
182
byte pos = expansions.get(i).getPosition(type);
184
outKey[index++] = pos;
316
int pos = expansions.get(i).getPosition(type);
318
if (type == Collator.PRIMARY)
319
outKey[index++] = (byte) ((pos >>> 8) & 0xff);
320
outKey[index++] = (byte) pos;
326
if (type == Collator.PRIMARY)
327
outKey[index++] = '\0';
189
328
outKey[index++] = '\0';
193
public byte getPrimary(int ch) {
197
public byte getSecondary(int ch) {
198
return secondary[ch];
201
public byte getTertiary(int ch) {
332
public int getPrimary(int ch) {
333
return this.pages[ch >>> 8].getPrimary(ch);
336
public int getSecondary(int ch) {
337
return this.pages[ch >>> 8].getSecondary(ch);
340
public int getTertiary(int ch) {
341
return this.pages[ch >>> 8].getTertiary(ch);
205
344
public byte getFlags(int ch) {
346
return this.pages[ch >>> 8].flags[ch & 0xff];
209
349
public int getCodepage() {
280
412
* The case were two letters sort as if the were just one (and more complex cases) are
281
413
* not supported or are unknown to us.
283
* @param bval The code point of this letter in the code page.
415
* @param ch The code point of this letter in the code page.
284
416
* @param inFlags The initial flags, eg if it is a letter or not.
285
417
* @param expansionList The letters that this letter sorts as, as code points in the codepage.
287
public void addExpansion(byte bval, int inFlags, List<Byte> expansionList) {
288
int idx = bval & 0xff;
289
flags[idx] = (byte) ((inFlags & 0xf) | (((expansionList.size()-1) << 4) & 0x30));
419
public void addExpansion(int ch, int inFlags, List<Integer> expansionList) {
420
ensurePage(ch >>> 8);
421
setFlags(ch, (byte) ((inFlags & 0xf) | (((expansionList.size()-1) << 4) & 0xf0)));
291
423
// Check for repeated definitions
292
if (primary[idx] != 0)
293
throw new ExitException(String.format("repeated code point %x", idx));
424
if (getPrimary(ch) != 0)
425
throw new ExitException(String.format("repeated code point %x", ch));
295
primary[idx] = (byte) (expansions.size() + 1);
427
setPrimary(ch, (expansions.size() + 1));
298
430
maxExpSize = Math.max(maxExpSize, expansionList.size());
300
for (Byte b : expansionList) {
432
for (Integer b : expansionList) {
301
433
CodePosition cp = new CodePosition();
302
cp.setPrimary(primary[b & 0xff]);
303
cp.setSecondary(secondary[b & 0xff]);
304
cp.setTertiary((byte) (tertiary[b & 0xff] + 2));
434
cp.setPrimary((char) getPrimary(b & 0xff));
436
// Currently sort without secondary or tertiary differences to the base letters.
437
cp.setSecondary((byte) getSecondary(b & 0xff));
438
cp.setTertiary((byte) getTertiary(b & 0xff));
305
439
expansions.add(cp);
349
460
return String.format("sort cp=%d order=%08x", codepage, getSortOrderId());
463
private void setPrimary(int ch, int val) {
464
this.pages[ch >>> 8].setPrimary(ch, val);
467
private void setSecondary(int ch, int val) {
468
this.pages[ch >>> 8].setSecondary(ch, val);
471
private void setTertiary(int ch, int val) {
472
this.pages[ch >>> 8].setTertiary(ch, val);
475
private void setFlags(int ch, int val) {
476
this.pages[ch >>> 8].flags[ch & 0xff] = (byte) val;
479
public static Charset charsetFromCodepage(int codepage) {
483
charset = Charset.forName("ascii");
486
charset = Charset.forName("UTF-8");
489
// Java uses "ms932" for code page 932
490
// (Windows-31J, Shift-JIS + MS extensions)
491
charset = Charset.forName("ms932");
494
charset = Charset.forName("cp" + codepage);
500
public void setMulti(boolean multi) {
504
public boolean isMulti() {
508
public int getPos(int type, int ch) {
509
return pages[ch >>> 8].getPos(type, ch);
512
public int writePos(int type, int ch, byte[] outkey, int start) {
513
return pages[ch >>> 8].writePos(type, ch, outkey, start);
517
* Ensure that the given page exists in the page array.
519
* @param n The page index.
521
private void ensurePage(int n) {
522
assert n == 0 || isMulti();
523
if (this.pages[n] == null) {
524
this.pages[n] = new Page();
531
* The max page, top 8+ bits of the character that we have information on.
533
public int getMaxPage() {
538
* @return True if there is at least one character with the given page/block number.
540
public boolean hasPage(int p) {
541
return pages[p] != null;
545
* Holds the sort positions of a 256 character block.
547
private static class Page {
548
private final char[] primary = new char[256];
549
private final byte[] secondary = new byte[256];
550
private final byte[] tertiary = new byte[256];
551
private final byte[] flags = new byte[256];
553
char getPrimary(int ch) {
554
return primary[ch & 0xff];
557
void setPrimary(int ch, int val) {
558
primary[ch & 0xff] = (char) val;
561
byte getSecondary(int ch) {
562
return secondary[ch & 0xff];
565
void setSecondary(int ch, int val) {
566
secondary[ch & 0xff] = (byte) val;
569
byte getTertiary(int ch) {
570
return tertiary[ch & 0xff];
573
void setTertiary(int ch, int val) {
574
tertiary[ch & 0xff] = (byte) val;
578
* Get the sort position data for a given strength for a character.
579
* @param type The collation strength PRIMARY, SECONDARY etc.
580
* @param ch The character.
581
* @return The sorting weight for the given character.
583
public int getPos(int type, int ch) {
585
case Collator.PRIMARY:
586
return getPrimary(ch) & 0xffff;
587
case Collator.SECONDARY:
588
return getSecondary(ch) & 0xff;
589
case Collator.TERTIARY:
590
return getTertiary(ch) & 0xff;
592
assert false : "bad collation type passed";
598
* Write a sort position for a given character to a sort key.
599
* @param strength The sort strength type.
600
* @param ch The character.
601
* @param outKey The output key.
602
* @param start The offset into outKey, the new position is written here.
603
* @return The new start offset, after the key information has been written.
605
public int writePos(int strength, int ch, byte[] outKey, int start) {
606
int pos = getPos(strength, ch);
608
if (strength == Collator.PRIMARY)
609
outKey[start++] = (byte) ((pos >> 8) & 0xff); // for 2 byte charsets
610
outKey[start++] = (byte) (pos & 0xff);
353
617
* A collator that works with this sort. This should be used if you just need to compare two
354
618
* strings against each other once.
356
620
* The sort key is better when the comparison must be done several times as in a sort operation.
622
* This implementation has the same effect when used for sorting as the sort keys.
358
624
private class SrtCollator extends Collator {
359
625
private final int codepage;
365
631
public int compare(String source, String target) {
366
CharBuffer in1 = CharBuffer.wrap(source);
367
CharBuffer in2 = CharBuffer.wrap(target);
371
bytes1 = encoder.encode(in1).array();
372
bytes2 = encoder.encode(in2).array();
373
} catch (CharacterCodingException e) {
374
throw new ExitException("character encoding failed unexpectedly", e);
635
chars1 = source.toCharArray();
636
chars2 = target.toCharArray();
638
CharBuffer in1 = CharBuffer.wrap(source);
639
CharBuffer in2 = CharBuffer.wrap(target);
641
byte[] bytes1 = encoder.encode(in1).array();
642
byte[] bytes2 = encoder.encode(in2).array();
643
chars1 = new char[bytes1.length];
644
for (int i = 0; i < bytes1.length; i++)
645
chars1[i] = (char) (bytes1[i] & 0xff);
646
chars2 = new char[bytes2.length];
647
for (int i = 0; i < bytes2.length; i++)
648
chars2[i] = (char) (bytes2[i] & 0xff);
649
} catch (CharacterCodingException e) {
650
throw new ExitException("character encoding failed unexpectedly", e);
377
654
int strength = getStrength();
378
int res = compareOneStrength(bytes1, bytes2, primary, Collator.PRIMARY);
655
int res = compareOneStrength(chars1, chars2, Collator.PRIMARY);
380
657
if (res == 0 && strength != PRIMARY) {
381
res = compareOneStrength(bytes1, bytes2, secondary, Collator.SECONDARY);
658
res = compareOneStrength(chars1, chars2, Collator.SECONDARY);
382
659
if (res == 0 && strength != SECONDARY) {
383
res = compareOneStrength(bytes1, bytes2, tertiary, Collator.TERTIARY);
660
res = compareOneStrength(chars1, chars2, Collator.TERTIARY);
388
if (source.length() < target.length())
390
else if (source.length() > target.length())
397
668
* Compare the bytes against primary, secondary or tertiary arrays.
398
* @param bytes1 Bytes for the first string in the codepage encoding.
399
* @param bytes2 Bytes for the second string in the codepage encoding.
400
* @param typePositions The strength array to use in the comparison.
669
* @param char1 Bytes for the first string in the codepage encoding.
670
* @param char2 Bytes for the second string in the codepage encoding.
401
671
* @return Comparison result -1, 0 or 1.
403
@SuppressWarnings({"AssignmentToForLoopParameter"})
404
private int compareOneStrength(byte[] bytes1, byte[] bytes2, byte[] typePositions, int type) {
673
private int compareOneStrength(char[] char1, char[] char2, int type) {
407
PositionIterator it1 = new PositionIterator(bytes1, typePositions, type);
408
PositionIterator it2 = new PositionIterator(bytes2, typePositions, type);
676
PositionIterator it1 = new PositionIterator(char1, type);
677
PositionIterator it2 = new PositionIterator(char2, type);
410
while (it1.hasNext() && it2.hasNext()) {
679
while (it1.hasNext() || it2.hasNext()) {
411
680
int p1 = it1.next();
412
681
int p2 = it2.next();