2
*******************************************************************************
3
* Copyright (C) 2006-2008, International Business Machines Corporation and *
4
* others. All Rights Reserved. *
5
*******************************************************************************
7
*******************************************************************************
10
package com.ibm.icu.charset;
12
import java.nio.ByteBuffer;
13
import java.nio.CharBuffer;
14
import java.nio.IntBuffer;
15
import java.nio.charset.CharsetDecoder;
16
import java.nio.charset.CharsetEncoder;
17
import java.nio.charset.CoderResult;
19
import com.ibm.icu.text.UTF16;
20
import com.ibm.icu.text.UnicodeSet;
23
* @author Niti Hantaweepant
25
class CharsetUTF8 extends CharsetICU {
27
/* The bytes EF BF BD (UTF-8 for U+FFFD); handed to the encoder's superclass constructor as the substitution sequence. */
private static final byte[] fromUSubstitution = new byte[] { (byte) 0xef, (byte) 0xbf, (byte) 0xbd };
29
/* Creates the charset; the ICU name, the Java name and the aliases are forwarded unchanged to CharsetICU. */
public CharsetUTF8(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
30
super(icuCanonicalName, javaCanonicalName, aliases);
31
/* max 3 bytes per code unit from UTF-8 (4 bytes from surrogate _pair_) */
37
/* Indexed by sequence length (0..6): mask that strips the length marker bits from a lead byte; index 0 (invalid lead) keeps every bit. */
private static final int BITMASK_FROM_UTF8[] = { -1, 0x7f, 0x1f, 0xf, 0x7, 0x3, 0x1 };
39
/*
 * Indexed by the unsigned value of a lead byte: length of the sequence that byte starts.
 * 0x00-0x7F -> 1, 0x80-0xBF -> 0 (cannot start a sequence), 0xC0-0xDF -> 2, 0xE0-0xEF -> 3,
 * 0xF0-0xF7 -> 4, 0xF8-0xFB -> 5, 0xFC-0xFD -> 6, 0xFE-0xFF -> 0.
 */
private static final byte BYTES_FROM_UTF8[] = {
40
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
42
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
43
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
44
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
45
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
46
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
47
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
51
* Starting with Unicode 3.0.1: UTF-8 byte sequences of length N _must_ encode code points of or
52
* above utf8_minChar32[N]; byte sequences with more than 4 bytes are illegal in UTF-8, which is
53
* tested with impossible values for them
55
/* Indexed by sequence length: smallest code point accepted for that length; lengths 5 and 6 use Integer.MAX_VALUE so the decoder's range check can never pass for them. */
private static final int UTF8_MIN_CHAR32[] = { 0, 0, 0x80, 0x800, 0x10000,
56
Integer.MAX_VALUE, Integer.MAX_VALUE };
58
/* True when this object is a CharsetCESU8; the decoder and encoder consult it to decide how surrogates are treated. */
private final boolean isCESU8 = this instanceof CharsetCESU8;
60
/* Decoder for this charset: turns UTF-8 byte sequences into UTF-16 chars (see decodeLoop). */
class CharsetDecoderUTF8 extends CharsetDecoderICU {
62
/* Creates a decoder for the given charset (constructor body is not visible in this excerpt). */
public CharsetDecoderUTF8(CharsetICU cs) {
66
/**
 * Decodes UTF-8 bytes from source into UTF-16 chars in target.
 *
 * Two equivalent paths are used: when both buffers are array-backed the backing arrays are
 * indexed directly, otherwise the buffers are accessed through absolute get/put. In both
 * paths a sequence that is cut off by the end of the input is parked in toUnicodeStatus
 * (bits collected so far), mode (expected sequence length) and toULength (bytes consumed),
 * and the bytes of the current sequence are mirrored into toUBytesArray.
 *
 * A completed sequence is accepted only if it has the expected length, is not below
 * UTF8_MIN_CHAR32 for that length, is not above 0x10ffff, and - outside CESU-8 - is not a
 * surrogate code point. When isCESU8 is set, only sequences of up to 3 bytes are accepted.
 *
 * @param source bytes to decode; its position is set past the bytes that were read
 * @param target receives the decoded chars; its position is set past the chars written
 * @param offsets not referenced by the visible code
 * @return UNDERFLOW immediately if source is empty, OVERFLOW immediately if target is full;
 *         otherwise the loop records UNDERFLOW, OVERFLOW or
 *         CoderResult.malformedForLength(bytesSoFar) in cr
 */
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets,
68
if (!source.hasRemaining()) {
69
/* no input, nothing to do */
70
return CoderResult.UNDERFLOW;
72
if (!target.hasRemaining()) {
73
/* no output available, can't do anything */
74
return CoderResult.OVERFLOW;
77
if (source.hasArray() && target.hasArray()) {
78
/* source and target are backed by arrays, so use the arrays for optimal performance */
79
byte[] sourceArray = source.array();
80
int sourceIndex = source.arrayOffset() + source.position();
81
int sourceLimit = source.arrayOffset() + source.limit();
82
char[] targetArray = target.array();
83
int targetIndex = target.arrayOffset() + target.position();
84
int targetLimit = target.arrayOffset() + target.limit();
87
int char32, bytesExpected, bytesSoFar;
91
/* nothing is stored in toUnicodeStatus, read a byte as input */
92
char32 = (toUBytesArray[0] = sourceArray[sourceIndex++]) & 0xff;
93
bytesExpected = BYTES_FROM_UTF8[char32];
94
char32 &= BITMASK_FROM_UTF8[bytesExpected];
97
/* a partially or fully built code point is stored in toUnicodeStatus */
98
char32 = toUnicodeStatus;
/* NOTE(review): the buffer-based path below also restores bytesExpected from mode at this point; confirm the same assignment is present here. */
100
bytesSoFar = toULength;
107
outer: while (true) {
108
if (bytesSoFar < bytesExpected) {
109
/* read a trail byte and insert its relevant bits into char32 */
110
if (sourceIndex >= sourceLimit) {
111
/* no source left, save the state for later and break out of the loop */
112
toUnicodeStatus = char32;
113
mode = bytesExpected;
114
toULength = bytesSoFar;
115
cr = CoderResult.UNDERFLOW;
118
if (((ch = toUBytesArray[bytesSoFar] = sourceArray[sourceIndex++]) & 0xc0) != 0x80) {
119
/* not a trail byte (is not of the form 10xxxxxx) */
121
toULength = bytesSoFar;
122
cr = CoderResult.malformedForLength(bytesSoFar);
125
char32 = (char32 << 6) | (ch & 0x3f);
127
} else if (bytesSoFar == bytesExpected && UTF8_MIN_CHAR32[bytesExpected] <= char32 && char32 <= 0x10ffff
128
/* surrogate test on the full int: a (char) cast would make e.g. U+1D800 look like U+D800 and reject it */
&& (isCESU8 ? bytesExpected <= 3 : (char32 & 0xfffff800) != 0xd800)) {
130
* char32 is a valid code point and is composed of the correct number of
131
* bytes ... we now need to output it in UTF-16
134
if (char32 <= UConverterConstants.MAXIMUM_UCS2) {
135
/* fits in 16 bits */
136
targetArray[targetIndex++] = (char) char32;
138
/* fit char32 into 20 bits */
139
char32 -= UConverterConstants.HALF_BASE;
141
/* write out the surrogates */
142
targetArray[targetIndex++] = (char) ((char32 >>> UConverterConstants.HALF_SHIFT) + UConverterConstants.SURROGATE_HIGH_START);
144
if (targetIndex >= targetLimit) {
145
/* put in overflow buffer (not handled here) */
/* NOTE(review): this stores (char) char32 after the HALF_BASE subtraction, not the low surrogate computed a few lines below - confirm what the overflow consumer expects. */
charErrorBufferArray[charErrorBufferBegin++] = (char) char32;
147
cr = CoderResult.OVERFLOW;
150
targetArray[targetIndex++] = (char) ((char32 & UConverterConstants.HALF_MASK) + UConverterConstants.SURROGATE_LOW_START);
154
* we're finished outputing, so now we need to read in the first byte of the
155
* next byte sequence that could form a code point
158
if (sourceIndex >= sourceLimit) {
159
cr = CoderResult.UNDERFLOW;
162
if (targetIndex >= targetLimit) {
163
cr = CoderResult.OVERFLOW;
167
/* keep reading the next input (and writing it) while bytes == 1 */
168
while ((bytesExpected = BYTES_FROM_UTF8[char32 = (toUBytesArray[0] = sourceArray[sourceIndex++]) & 0xff]) == 1) {
169
targetArray[targetIndex++] = (char) char32;
170
if (sourceIndex >= sourceLimit) {
171
cr = CoderResult.UNDERFLOW;
174
if (targetIndex >= targetLimit) {
175
cr = CoderResult.OVERFLOW;
180
/* remove the bits that indicate the number of bytes */
181
char32 &= BITMASK_FROM_UTF8[bytesExpected];
185
* either the lead byte in the code sequence is invalid (bytes == 0) or the
186
* lead byte combined with all the trail chars does not form a valid code
189
toULength = bytesSoFar;
190
cr = CoderResult.malformedForLength(bytesSoFar);
195
/* convert the array indices back into buffer positions */
source.position(sourceIndex - source.arrayOffset());
196
target.position(targetIndex - target.arrayOffset());
201
/* general path: same algorithm, using absolute get/put on the buffers */
int sourceIndex = source.position();
202
int sourceLimit = source.limit();
203
int targetIndex = target.position();
204
int targetLimit = target.limit();
207
int char32, bytesExpected, bytesSoFar;
211
/* nothing is stored in toUnicodeStatus, read a byte as input */
212
char32 = (toUBytesArray[0] = source.get(sourceIndex++)) & 0xff;
213
bytesExpected = BYTES_FROM_UTF8[char32];
214
char32 &= BITMASK_FROM_UTF8[bytesExpected];
217
/* a partially or fully built code point is stored in toUnicodeStatus */
218
char32 = toUnicodeStatus;
219
bytesExpected = mode;
220
bytesSoFar = toULength;
227
outer: while (true) {
228
if (bytesSoFar < bytesExpected) {
229
/* read a trail byte and insert its relevant bits into char32 */
230
if (sourceIndex >= sourceLimit) {
231
/* no source left, save the state for later and break out of the loop */
232
toUnicodeStatus = char32;
233
mode = bytesExpected;
234
toULength = bytesSoFar;
235
cr = CoderResult.UNDERFLOW;
238
if (((ch = toUBytesArray[bytesSoFar] = source.get(sourceIndex++)) & 0xc0) != 0x80) {
239
/* not a trail byte (is not of the form 10xxxxxx) */
241
toULength = bytesSoFar;
242
cr = CoderResult.malformedForLength(bytesSoFar);
245
char32 = (char32 << 6) | (ch & 0x3f);
249
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
250
* - use only trail bytes after a lead byte (checked above)
251
* - use the right number of trail bytes for a given lead byte
252
* - encode a code point <= U+10ffff
253
* - use the fewest possible number of bytes for their code points
254
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
256
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
257
* There are no irregular sequences any more.
258
* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
260
else if (bytesSoFar == bytesExpected && UTF8_MIN_CHAR32[bytesExpected] <= char32 && char32 <= 0x10ffff
261
/* surrogate test on the full int: a (char) cast would make e.g. U+1D800 look like U+D800 and reject it */
&& (isCESU8 ? bytesExpected <= 3 : (char32 & 0xfffff800) != 0xd800)) {
263
* char32 is a valid code point and is composed of the correct number of
264
* bytes ... we now need to output it in UTF-16
267
if (char32 <= UConverterConstants.MAXIMUM_UCS2) {
268
/* fits in 16 bits */
269
target.put(targetIndex++, (char) char32);
271
/* fit char32 into 20 bits */
272
char32 -= UConverterConstants.HALF_BASE;
274
/* write out the surrogates */
277
(char) ((char32 >>> UConverterConstants.HALF_SHIFT) + UConverterConstants.SURROGATE_HIGH_START));
279
if (targetIndex >= targetLimit) {
280
/* put in overflow buffer (not handled here) */
/* NOTE(review): this stores (char) char32 after the HALF_BASE subtraction, not the low surrogate computed a few lines below - confirm what the overflow consumer expects. */
charErrorBufferArray[charErrorBufferBegin++] = (char) char32;
282
cr = CoderResult.OVERFLOW;
287
(char) ((char32 & UConverterConstants.HALF_MASK) + UConverterConstants.SURROGATE_LOW_START));
291
* we're finished outputing, so now we need to read in the first byte of the
292
* next byte sequence that could form a code point
295
if (sourceIndex >= sourceLimit) {
296
cr = CoderResult.UNDERFLOW;
299
if (targetIndex >= targetLimit) {
300
cr = CoderResult.OVERFLOW;
304
/* keep reading the next input (and writing it) while bytes == 1 */
305
while ((bytesExpected = BYTES_FROM_UTF8[char32 = (toUBytesArray[0] = source.get(sourceIndex++)) & 0xff]) == 1) {
306
target.put(targetIndex++, (char) char32);
307
if (sourceIndex >= sourceLimit) {
308
cr = CoderResult.UNDERFLOW;
311
if (targetIndex >= targetLimit) {
312
cr = CoderResult.OVERFLOW;
317
/* remove the bits that indicate the number of bytes */
318
char32 &= BITMASK_FROM_UTF8[bytesExpected];
322
* either the lead byte in the code sequence is invalid (bytes == 0) or the
323
* lead byte combined with all the trail chars does not form a valid code
326
toULength = bytesSoFar;
327
cr = CoderResult.malformedForLength(bytesSoFar);
332
/* absolute get/put did not move the buffers, so publish the new positions explicitly */
source.position(sourceIndex);
333
target.position(targetIndex);
340
/* Encoder for this charset: turns UTF-16 chars into UTF-8 byte sequences (see encodeLoop). */
class CharsetEncoderUTF8 extends CharsetEncoderICU {
342
/* Creates an encoder for the given charset, passing fromUSubstitution to the superclass. */
public CharsetEncoderUTF8(CharsetICU cs) {
343
super(cs, fromUSubstitution);
347
/* Reset hook of the encoder; only the signature is visible in this excerpt. */
protected void implReset() {
351
/**
 * Encodes UTF-16 chars from source into UTF-8 bytes in target.
 *
 * Two equivalent paths are used: array-backed buffers are indexed through their backing
 * arrays, other buffers are accessed with relative get/put. Chars up to 0x7f produce one
 * byte, chars up to 0x7ff two bytes, other non-surrogate chars (or any char when isCESU8 is
 * set) three bytes; remaining surrogates are delegated to encodeFourBytes. When target
 * fills up in the middle of a sequence, the bytes that did not fit are appended to
 * errorBuffer and OVERFLOW is recorded. A non-zero fromUChar32 on entry is handled first
 * through encodeFourBytes.
 *
 * @param source chars to encode; its position is advanced past the chars that were read
 * @param target receives the encoded bytes; its position is advanced past the bytes written
 * @param offsets not referenced by the visible code
 * @return UNDERFLOW immediately if source is empty, OVERFLOW immediately if target is full;
 *         otherwise the result recorded in cr by the loop
 */
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets,
353
if (!source.hasRemaining()) {
354
/* no input, nothing to do */
355
return CoderResult.UNDERFLOW;
357
if (!target.hasRemaining()) {
358
/* no output available, can't do anything */
359
return CoderResult.OVERFLOW;
362
if (source.hasArray() && target.hasArray()) {
363
/* source and target are backed by arrays, so use the arrays for optimal performance */
364
char[] sourceArray = source.array();
365
int srcIdx = source.arrayOffset() + source.position();
366
int sourceLimit = source.arrayOffset() + source.limit();
367
byte[] targetArray = target.array();
368
int tgtIdx = target.arrayOffset() + target.position();
369
int targetLimit = target.arrayOffset() + target.limit();
374
/* take care of the special condition of fromUChar32 not being 0 (it is a surrogate) */
375
if (fromUChar32 != 0) {
376
/* 4 bytes to encode from char32 and a following char in source */
378
/* the array-based encodeFourBytes works on the sourceIndex/targetIndex fields, so hand the positions over and read them back */
sourceIndex = srcIdx;
379
targetIndex = tgtIdx;
380
cr = encodeFourBytes(sourceArray, targetArray, sourceLimit, targetLimit,
382
srcIdx = sourceIndex;
383
tgtIdx = targetIndex;
385
source.position(srcIdx - source.arrayOffset());
386
target.position(tgtIdx - target.arrayOffset());
392
if (srcIdx >= sourceLimit) {
393
/* nothing left to read */
394
cr = CoderResult.UNDERFLOW;
397
if (tgtIdx >= targetLimit) {
398
/* no space left to write */
399
cr = CoderResult.OVERFLOW;
403
/* reach the next char into char32 */
404
char32 = sourceArray[srcIdx++];
406
if (char32 <= 0x7f) {
407
/* 1 byte to encode from char32 */
409
targetArray[tgtIdx++] = encodeHeadOf1(char32);
411
} else if (char32 <= 0x7ff) {
412
/* 2 bytes to encode from char32 */
414
targetArray[tgtIdx++] = encodeHeadOf2(char32);
416
if (tgtIdx >= targetLimit) {
/* target is full: queue the remaining byte in errorBuffer */
417
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
418
cr = CoderResult.OVERFLOW;
421
targetArray[tgtIdx++] = encodeLastTail(char32);
423
} else if (!UTF16.isSurrogate((char) char32) || isCESU8) {
424
/* 3 bytes to encode from char32 */
426
targetArray[tgtIdx++] = encodeHeadOf3(char32);
428
if (tgtIdx >= targetLimit) {
/* target is full: queue both remaining bytes in errorBuffer */
429
errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);
430
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
431
cr = CoderResult.OVERFLOW;
434
targetArray[tgtIdx++] = encodeSecondToLastTail(char32);
436
if (tgtIdx >= targetLimit) {
437
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
438
cr = CoderResult.OVERFLOW;
441
targetArray[tgtIdx++] = encodeLastTail(char32);
444
/* 4 bytes to encode from char32 and a following char in source */
446
sourceIndex = srcIdx;
447
targetIndex = tgtIdx;
448
cr = encodeFourBytes(sourceArray, targetArray, sourceLimit, targetLimit,
450
srcIdx = sourceIndex;
451
tgtIdx = targetIndex;
457
/* set the new source and target positions and return the CoderResult stored in cr */
458
source.position(srcIdx - source.arrayOffset());
459
target.position(tgtIdx - target.arrayOffset());
466
/* general path: relative get/put advance the buffer positions themselves */
/* take care of the special condition of fromUChar32 not being 0 (it is a surrogate) */
467
if (fromUChar32 != 0) {
468
/* 4 bytes to encode from char32 and a following char in source */
470
cr = encodeFourBytes(source, target, fromUChar32);
476
if (!source.hasRemaining()) {
477
/* nothing left to read */
478
cr = CoderResult.UNDERFLOW;
481
if (!target.hasRemaining()) {
482
/* no space left to write */
483
cr = CoderResult.OVERFLOW;
487
/* reach the next char into char32 */
488
char32 = source.get();
490
if (char32 <= 0x7f) {
491
/* 1 byte to encode from char32 */
493
target.put(encodeHeadOf1(char32));
495
} else if (char32 <= 0x7ff) {
496
/* 2 bytes to encode from char32 */
498
target.put(encodeHeadOf2(char32));
500
if (!target.hasRemaining()) {
/* target is full: queue the remaining byte in errorBuffer */
501
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
502
cr = CoderResult.OVERFLOW;
505
target.put(encodeLastTail(char32));
507
} else if (!UTF16.isSurrogate((char) char32) || isCESU8) {
508
/* 3 bytes to encode from char32 */
510
target.put(encodeHeadOf3(char32));
512
if (!target.hasRemaining()) {
/* target is full: queue both remaining bytes in errorBuffer */
513
errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);
514
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
515
cr = CoderResult.OVERFLOW;
518
target.put(encodeSecondToLastTail(char32));
520
if (!target.hasRemaining()) {
521
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
522
cr = CoderResult.OVERFLOW;
525
target.put(encodeLastTail(char32));
528
/* 4 bytes to encode from char32 and a following char in source */
530
cr = encodeFourBytes(source, target, char32);
536
/* set the new source and target positions and return the CoderResult stored in cr */
541
/**
 * Array-based encoding of a surrogate as a four-byte sequence.
 *
 * First calls handleSurrogates with the char in char32, then takes the code point to encode
 * from fromUChar32. Reads and writes go through the sourceIndex and targetIndex fields of
 * this encoder, which the caller sets before the call and reads back afterwards.
 * NOTE(review): handleSurrogates is defined elsewhere; presumably it pairs char32 with the
 * next source char and leaves the combined code point in fromUChar32 - confirm.
 *
 * @param sourceArray backing array of the source buffer
 * @param targetArray backing array of the target buffer
 * @param sourceLimit exclusive end index in sourceArray
 * @param targetLimit exclusive end index in targetArray
 * @param char32 the surrogate char that triggered the call
 * @return OVERFLOW when targetArray fills up before all four bytes are written (the bytes
 *         that did not fit are appended to errorBuffer); per the trailing comment, null on
 *         success
 */
private final CoderResult encodeFourBytes(char[] sourceArray, byte[] targetArray,
542
int sourceLimit, int targetLimit, int char32) {
544
/* we need to read another char to match up the surrogate stored in char32 */
545
/* handle the surrogate stuff, returning on a non-null CoderResult */
546
CoderResult cr = handleSurrogates(sourceArray, sourceIndex, sourceLimit, (char)char32);
551
char32 = fromUChar32;
554
/* the rest is routine -- encode four bytes, stopping on overflow */
556
targetArray[targetIndex++] = encodeHeadOf4(char32);
558
if (targetIndex >= targetLimit) {
/* three bytes still pending: queue them in order */
559
errorBuffer[errorBufferLength++] = encodeThirdToLastTail(char32);
560
errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);
561
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
562
return CoderResult.OVERFLOW;
564
targetArray[targetIndex++] = encodeThirdToLastTail(char32);
566
if (targetIndex >= targetLimit) {
567
errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);
568
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
569
return CoderResult.OVERFLOW;
571
targetArray[targetIndex++] = encodeSecondToLastTail(char32);
573
if (targetIndex >= targetLimit) {
574
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
575
return CoderResult.OVERFLOW;
577
targetArray[targetIndex++] = encodeLastTail(char32);
579
/* return null for success */
583
/**
 * Buffer-based encoding of a surrogate as a four-byte sequence.
 *
 * First calls handleSurrogates with the char in char32, then takes the code point to encode
 * from fromUChar32 and writes it with relative puts on target.
 * NOTE(review): handleSurrogates is defined elsewhere; presumably it pairs char32 with the
 * next char of source and leaves the combined code point in fromUChar32 - confirm.
 *
 * @param source buffer passed on to handleSurrogates
 * @param target buffer that receives the encoded bytes
 * @param char32 the surrogate char that triggered the call
 * @return OVERFLOW when target fills up before all four bytes are written (the bytes that
 *         did not fit are appended to errorBuffer); per the trailing comment, null on success
 */
private final CoderResult encodeFourBytes(CharBuffer source, ByteBuffer target, int char32) {
585
/* handle the surrogate stuff, returning on a non-null CoderResult */
586
CoderResult cr = handleSurrogates(source, (char)char32);
590
char32 = fromUChar32;
593
/* the rest is routine -- encode four bytes, stopping on overflow */
595
target.put(encodeHeadOf4(char32));
597
if (!target.hasRemaining()) {
/* three bytes still pending: queue them in order */
598
errorBuffer[errorBufferLength++] = encodeThirdToLastTail(char32);
599
errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);
600
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
601
return CoderResult.OVERFLOW;
603
target.put(encodeThirdToLastTail(char32));
605
if (!target.hasRemaining()) {
606
errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);
607
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
608
return CoderResult.OVERFLOW;
610
target.put(encodeSecondToLastTail(char32));
612
if (!target.hasRemaining()) {
613
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
614
return CoderResult.OVERFLOW;
616
target.put(encodeLastTail(char32));
618
/* return null for success */
622
/* Read index into the source array; encodeLoop sets it before calling the array-based encodeFourBytes and reads it back afterwards. */
private int sourceIndex;
624
/* Write index into the target array; handed to and from the array-based encodeFourBytes the same way as sourceIndex. */
private int targetIndex;
628
/* Only byte of a one-byte sequence: char32 narrowed to a byte (callers pass values <= 0x7f). */
private static final byte encodeHeadOf1(int char32) {
629
return (byte) char32;
632
/* Lead byte of a two-byte sequence: marker 0xc0 combined with the bits of char32 above the low 6. */
private static final byte encodeHeadOf2(int char32) {
633
return (byte) (0xc0 | (char32 >>> 6));
636
/* Lead byte of a three-byte sequence: marker 0xe0 combined with the bits of char32 above the low 12. */
private static final byte encodeHeadOf3(int char32) {
637
return (byte) (0xe0 | ((char32 >>> 12)));
640
/* Lead byte of a four-byte sequence: marker 0xf0 combined with the bits of char32 above the low 18. */
private static final byte encodeHeadOf4(int char32) {
641
return (byte) (0xf0 | ((char32 >>> 18)));
644
/* Trail byte 10xxxxxx carrying bits 12..17 of char32. */
private static final byte encodeThirdToLastTail(int char32) {
645
return (byte) (0x80 | ((char32 >>> 12) & 0x3f));
648
/* Trail byte 10xxxxxx carrying bits 6..11 of char32. */
private static final byte encodeSecondToLastTail(int char32) {
649
return (byte) (0x80 | ((char32 >>> 6) & 0x3f));
652
/* Trail byte 10xxxxxx carrying bits 0..5 of char32. */
private static final byte encodeLastTail(int char32) {
653
return (byte) (0x80 | (char32 & 0x3f));
656
/* single-code point definitions -------------------------------------------- */
659
* Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
660
* @param c 8-bit code unit (byte)
661
* @return TRUE or FALSE
663
// static final boolean isSingle(byte c) {return (((c)&0x80)==0);}
665
* Is this code unit (byte) a UTF-8 lead byte?
666
* @param c 8-bit code unit (byte)
667
* @return TRUE or FALSE
669
// static final boolean isLead(byte c) {return ((((c)-0xc0) &
670
// UConverterConstants.UNSIGNED_BYTE_MASK)<0x3e);}
672
* Is this code unit (byte) a UTF-8 trail byte?
675
* 8-bit code unit (byte)
676
* @return TRUE or FALSE
678
/*private static final boolean isTrail(byte c) {
679
return (((c) & 0xc0) == 0x80);
682
// Returns a new CharsetDecoderUTF8 bound to this charset.
public CharsetDecoder newDecoder() {
683
return new CharsetDecoderUTF8(this);
686
// Returns a new CharsetEncoderUTF8 bound to this charset.
public CharsetEncoder newEncoder() {
687
return new CharsetEncoderUTF8(this);
691
// Fills setFillIn by delegating to getNonSurrogateUnicodeSet; the 'which' argument is not used in the visible code.
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
692
getNonSurrogateUnicodeSet(setFillIn);