2
* @(#)XmlReader.java 1.5 06/10/30
4
* Copyright (c) 2006 Sun Microsystems, Inc. All Rights Reserved.
5
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7
* This code is free software; you can redistribute it and/or modify it
8
* under the terms of the GNU General Public License version 2 only, as
9
* published by the Free Software Foundation. Sun designates this
10
* particular file as subject to the "Classpath" exception as provided
11
* by Sun in the LICENSE file that accompanied this code.
13
* This code is distributed in the hope that it will be useful, but WITHOUT
14
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16
* version 2 for more details (a copy is included in the LICENSE file that
17
* accompanied this code).
19
* You should have received a copy of the GNU General Public License version
20
* 2 along with this work; if not, write to the Free Software Foundation,
21
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
23
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
24
* CA 95054 USA or visit www.sun.com if you need additional information or
28
* @(#) XmlReader.java 1.5 - last change made 10/30/06
31
package com.sun.java.help.impl;
34
import java.net.URLConnection;
37
* This handles several XML-related tasks that normal java.io Readers
38
* don't support, including use of IETF standard encoding names and
39
* automatic detection of most XML encodings. The former is needed
40
* for interoperability; the latter is needed to conform with the XML
41
* spec. This class also optimizes reading some common encodings by
42
* providing low-overhead unsynchronized Reader support.
44
* <P> Note that the autodetection facility should be used only on
45
* data streams which have an unknown character encoding. For example,
46
* it should never be used on MIME text/xml entities.
48
* <P> Note that XML processors are only required to support UTF-8 and
49
* UTF-16 character encodings. Autodetection permits the underlying Java
50
* implementation to provide support for many other encodings, such as
51
* ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
53
* @author David Brownell
54
* @author Roger D. Brinkley
58
final public class XmlReader extends Reader
60
private boolean closed;
61
private InputStreamReader in;
64
// This class either handles reading itself, in common encodings
65
// (US-ASCII, ISO-Latin-1, UTF-8) or delegates to another Reader.
67
// Autodetection requires reading/buffering part of the XML declaration,
68
// then potentially switching _after entire characters are read_ to
69
// delegate further operations to such a Reader. The reader will get
70
// no header (e.g. no UTF-16 Byte Order Mark). This can work since,
71
// XML declarations contain only ASCII characters, which are a subset
72
// of many encodings (Shift_JIS, ISO-8859-*, EUC-*, ISO-2022-*, more).
74
// It's got to do this efficiently: character I/O is solidly on the
75
// critical path. Synchronization isn't needed, and buffering must
76
// as always be done only with care. (So keep buffer length over
77
// 2 Kbytes to avoid excess buffering, since most URL handlers stuff
78
// a BufferedInputStream between here and the real data source. That
79
// class tries to be smart enough not to try buffering if you ask for
80
// more data than it could buffer for you.)
82
private InputStream raw;
83
private byte buffer [];
84
private boolean isASCII, isLatin1;
85
private int offset, length;
87
// 2nd half of UTF-8 surrogate pair
88
private char nextChar;
90
private int switchover;
91
private String encodingAssigned;
94
* Constructs the reader from a URLConnection. Uses the encoding
95
* specified in the HTTP header or autodetects
96
* the encoding to use according to the heuristic specified
97
* in the XML 1.0 recommendation.
99
* @param uc the URLConnection from which the reader is constructed
100
* @exception IOException on error
101
* @exception UnsupportedEncodingException when the input stream
102
* is not in an encoding which is supported; this is a fatal XML
105
public static Reader createReader (URLConnection uc) throws IOException
107
String encoding = getEncodingFromContentType(uc.getContentType());
108
if (encoding == null) {
109
return createReader (uc.getInputStream());
111
return createReader (uc.getInputStream(), encoding);
115
* Gets the encoding from the content type string.
116
* If there is a charset definition specified as a parameter
117
* of the content type specification, it will be used when
118
* loading input streams using the associated XmlReader.
119
* For example if the type is specified as
120
* <code>text/xml; charset=EUC-JP</code> the Reader will
121
* use the <code>EUC-JP</code> charset for translating
124
* @param type the non-null mime type for the content editing
127
private static String getEncodingFromContentType(String type) {
129
debug ("type=" + type);
130
// The type could have optional info as part of it,
131
// for example some charset info. We need to strip that
136
int parm = type.indexOf(";");
138
// Save the paramList.
139
String paramList = type.substring(parm);
140
// update the content type string.
141
type = type.substring(0, parm).trim();
142
if (type.compareTo("text/xml") == 0) {
143
// Set the charset name from the paramlist
144
return getCharsetFromContentTypeParameters(paramList);
151
* This method gets the charset information specified as part
152
* of the content type in the http header information.
154
private static String getCharsetFromContentTypeParameters(String paramlist) {
155
String charset = null;
157
// paramlist is handed to us with a leading ';', strip it.
158
int semi = paramlist.indexOf(';');
159
if (semi > -1 && semi < paramlist.length()-1) {
160
paramlist = paramlist.substring(semi + 1);
163
if (paramlist.length() > 0) {
164
// parse the paramlist into attr-value pairs & get the
165
// charset pair's value
166
HeaderParser hdrParser = new HeaderParser(paramlist);
167
charset = hdrParser.findValue("charset");
171
catch (IndexOutOfBoundsException e) {
172
// malformed parameter list, use charset we have
174
catch (NullPointerException e) {
175
// malformed parameter list, use charset we have
177
catch (Exception e) {
178
// malformed parameter list, use charset we have; but complain
179
System.err.println("Indexer.getCharsetFromContentTypeParameters failed on: " + paramlist);
186
* Constructs the reader from an input stream, autodetecting
187
* the encoding to use according to the heuristic specified
188
* in the XML 1.0 recommendation.
190
* @param in the input stream from which the reader is constructed
191
* @exception IOException on error
192
* @exception UnsupportedEncodingException when the input stream
193
* is not in an encoding which is supported; this is a fatal XML
196
public static Reader createReader (InputStream in) throws IOException
198
return new XmlReader (in);
202
* Creates a reader supporting the given encoding, mapping
203
* from standard encoding names to ones that are understood by
204
* Java where necessary.
206
* @param in the input stream from which the reader is constructed
207
* @param encoding the IETF standard name of the encoding to use;
208
* if null, autodetection is used.
209
* @exception IOException on error
210
* @exception UnsupportedEncodingException when the input stream
211
* is not in an encoding which is supported; this is a fatal XML
214
public static Reader createReader (InputStream in, String encoding)
217
if (encoding == null)
218
return new XmlReader (in);
220
// UTF-16 == ISO-10646-UCS-2 plus surrogates.
221
// The sun.io "Unicode" encoders/decoders don't check
222
// for correctly paired surrogates, so they accept UTF-16.
224
if ("UTF-16".equalsIgnoreCase (encoding)
225
|| "ISO-106460-UCS-2".equalsIgnoreCase (encoding))
226
encoding = "Unicode";
227
else if ("UTF-8".equalsIgnoreCase (encoding))
228
return new XmlReader (in, "UTF-8");
229
else if ("EUC-JP".equalsIgnoreCase (encoding))
230
encoding = "EUCJIS"; // works on JDK 1.1 and 1.2
231
else if (isAsciiName (encoding))
232
return new XmlReader (in, "US-ASCII");
233
else if (isLatinName (encoding))
234
return new XmlReader (in, "ISO-8859-1");
236
// XXX we should provide provide better "unsupported encoding"
237
// diagnostics than this produces...
238
return new InputStreamReader (in, encoding);
241
private XmlReader (InputStream in, String encoding)
244
buffer = new byte [8 * 1024];
247
if ("US-ASCII".equals (encoding))
249
else if ("ISO-8859-1".equals (encoding))
251
else if (!"UTF-8".equals (encoding))
252
throw new UnsupportedEncodingException (encoding);
257
private static boolean isAsciiName (String encoding)
259
return "US-ASCII".equalsIgnoreCase (encoding)
260
|| "ASCII".equalsIgnoreCase (encoding);
263
private static boolean isLatinName (String encoding)
265
return "ISO-8859-1".equalsIgnoreCase (encoding)
266
|| "Latin1".equalsIgnoreCase (encoding)
267
|| "8859_1".equalsIgnoreCase (encoding);
270
private void setASCII ()
272
encodingAssigned = "US-ASCII";
278
private void setLatin1 ()
280
encodingAssigned = "ISO-8859-1";
286
private void setUTF8 ()
288
encodingAssigned = "UTF-8";
294
/** Returns the (non)standard name of the encoding in use */
295
public String getEncoding ()
297
return encodingAssigned;
300
private XmlReader (InputStream in) throws IOException
305
// Set up buffering ... we buffer at least the XML text
306
// declaration (if it's there), and for some encodings we
307
// manage bulk character I/O. We can reset within this
308
// buffer so long as we've not assigned the encoding.
312
buffer = new byte [8 * 1024];
317
// See if we can figure out the character encoding used
318
// in this file by peeking at the first few bytes. If not,
319
// we may be able to look at the whole XML declaration.
321
switch ((c = read ())) {
323
// 00 3c 00 3f == illegal UTF-16 big-endian
324
if ((c = read ()) == 0x3c
325
&& (c = read ()) == 0x00
326
&& (c = read ()) == 0x3f) {
327
setSwitchover ("UnicodeBig");
332
// 00 00 00 3c == UCS-4 big endian
333
// 00 00 3c 00 == UCS-4 unusual "2143" order
334
// 00 3c 00 00 == UCS-4 unusual "3412" order
335
// ... or something else! note that only some parts
336
// of UCS-4 work with UNICODE based systems.
337
throw new UnsupportedEncodingException ("UCS-4 (?)");
339
case '<': // 0x3c: the most common cases!
340
switch ((c = read ())) {
341
// First character is '<'; could be XML without
342
// an XML directive such as "<hello>", "<!-- ...",
343
// and so on. Default intelligently; the byte we
344
// just read could be _part_ of a UTF-8 character!
348
// 3c 00 3f 00 == illegal UTF-16 little endian
349
// 3c 00 00 00 == UCS-4 little endian
351
if (read () == 0x3f && read () == 0x00) {
352
setSwitchover ("UnicodeLittle");
356
throw new UnsupportedEncodingException ("UCS-4");
358
// 3c 3f 78 6d == ASCII and supersets '<?xm'
360
if (read () != 'x' || read () != 'm'
361
|| read () != 'l' || read () != ' ')
364
// One of several encodings could be used:
365
// Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
374
if ((c = read ()) != 0xff)
376
setSwitchover ("UnicodeBig");
377
offset = 2; // skip BOM
380
// UTF-16 little-endian
382
if ((c = read ()) != 0xfe)
384
setSwitchover ("UnicodeLittle");
385
offset = 2; // skip BOM
392
// default ... no XML declaration
398
// If all else fails, assume XML without a declaration, and
399
// using UTF-8 encoding. We must be prepared to re-interpret
400
// bytes we've already read as parts of UTF-8 characters, so
401
// we can't use the "switchover" technique (which works only
402
// "between" characters).
407
// When the buffered input is done, switch to a reader
408
// that can decode data in that encoding. Only call this
409
// routine after entire characters have been scanned.
410
private void setSwitchover (String encoding) throws IOException
413
encodingAssigned = encoding;
417
// we've consumed our buffer, now switch over to a reader
418
// which will decode the rest of the (non-ASCII) input
419
private void doSwitchover () throws IOException
421
if (offset != switchover)
422
throw new InternalError ();
423
in = new InputStreamReader (raw, encodingAssigned);
430
* Used for ASCII supersets ... including the default UTF-8 encoding.
432
private void guessEncoding () throws IOException
435
// We know that "<?xml " has been seen; so we'll skip any
436
// S? version="..." [or single quotes]
438
// S? encoding="..." [or single quotes]
439
// parts. We place an arbitrary limit on the amount of text we
440
// expect to find in the XML declarations; excessive whitespace
441
// will cause this to guess "UTF-8".
444
StringBuffer buf = new StringBuffer ();
445
StringBuffer keyBuf = null;
447
boolean sawEq = false;
449
boolean sawQuestion = false;
452
for (int i = 0; i < 100; ++i) {
453
if ((c = read ()) == -1) {
458
// ignore whitespace before/between "key = 'value'"
459
if (Character.isWhitespace ((char) c))
462
// terminate the loop ASAP
465
else if (sawQuestion) {
471
// did we get the "key =" bit yet?
472
if (key == null || !sawEq) {
473
if (keyBuf == null) {
474
if (Character.isWhitespace ((char) c))
478
buf.append ((char)c);
480
} else if (Character.isWhitespace ((char) c)) {
481
key = keyBuf.toString ();
482
} else if (c == '=') {
484
key = keyBuf.toString ();
489
keyBuf.append ((char)c);
493
// space before quoted value
494
if (Character.isWhitespace ((char) c))
496
if (c == '"' || c == '\'') {
497
if (quoteChar == 0) {
498
quoteChar = (char) c;
501
} else if (c == quoteChar) {
502
if ("encoding".equals (key)) {
503
String encoding = buf.toString ();
505
// [81] Encname ::= [A-Za-z] ([A-Za-z0-9._]|'-')*
506
for (i = 0; i < encoding.length(); i++) {
507
c = encoding.charAt (i);
508
if ((c >= 'A' && c <= 'Z')
509
|| (c >= 'a' && c <= 'z'))
511
if (i > 0 && (c == '-'
512
|| (c >= '0' && c <= '9')
513
|| c == '.' || c == '_'))
515
// map errors to UTF-8 default
519
// we handle ASCII directly
520
if (isAsciiName (encoding)) {
526
if (isLatinName (encoding)) {
532
if ("UTF-8".equalsIgnoreCase (encoding)
533
|| "UTF8".equalsIgnoreCase (encoding)
537
// JDK uses nonstandard names internally
538
if ("EUC-JP".equalsIgnoreCase (encoding))
541
// other encodings ... use a reader.
542
setSwitchover (encoding);
550
buf.append ((char) c);
557
* Converts a UTF-8 character from the input buffer, optionally
558
* restricting to the US-ASCII subset of UTF-8.
560
private char utf8char () throws IOException
565
// return second half of a surrogate pair
573
// Excerpted from RFC 2279:
575
// UCS-4 range (hex.) UTF-8 octet sequence (binary)
576
// 0000 0000-0000 007F 0xxxxxxx
577
// 0000 0080-0000 07FF 110xxxxx 10xxxxxx
578
// 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
579
// 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
580
// 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
581
// 0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx
583
// The last two encodings (5 and 6 bytes per char) aren't allowed
584
// in XML documents since the characters are out of range.
586
retval = (char) buffer [offset];
587
if ((retval & 0x80) == 0x00) { // 1 byte
592
throw new CharConversionException ("Not US-ASCII: 0x"
593
+ Integer.toHexString (retval & 0xff));
596
// Multibyte sequences -- check offsets optimistically,
597
// ditto the "10xx xxxx" format for non-initial bytes
602
if ((buffer [off] & 0x0E0) == 0x0C0) { // 2 bytes
603
character = (buffer [off++] & 0x1f) << 6;
604
character += buffer [off++] & 0x3f;
605
retval = (char) character;
607
} else if ((buffer [off] & 0x0F0) == 0x0E0) { // 3 bytes
608
character = (buffer [off++] & 0x0f) << 12;
609
character += (buffer [off++] & 0x3f) << 6;
610
character += buffer [off++] & 0x3f;
611
retval = (char) character;
613
} else if ((buffer [off] & 0x0f8) == 0x0F0) { // 4 bytes
614
character = (buffer [off++] & 0x07) << 18;
615
character += (buffer [off++] & 0x3f) << 12;
616
character += (buffer [off++] & 0x3f) << 6;
617
character += buffer [off++] & 0x3f;
618
// Convert UCS-4 char to a Unicode surrogate pair
619
character -= 0x10000;
620
retval = (char) (0xD800 + (character >> 10));
621
character = 0xDC00 + (character & 0x03ff);
623
// XXX actually a WF error ...
624
throw new CharConversionException ("Illegal XML character"
625
+ " 0x" + Integer.toHexString (buffer [offset] & 0xff)
628
} catch (ArrayIndexOutOfBoundsException e) {
629
// that is, off > length && length >= buffer.length
635
// if the buffer held only a partial character, compact it
636
// and try to read the rest of the character. worst case
637
// involves three single-byte reads.
640
System.arraycopy (buffer, offset, buffer, 0, length - offset);
643
off = raw.read (buffer, length, buffer.length - length);
645
throw new CharConversionException ("Partial UTF-8 char");
651
// check the format of the non-initial bytes, and return
653
for (offset++; offset < off; offset++)
654
if ((buffer [offset] & 0xC0) != 0x80)
655
throw new CharConversionException ("Malformed UTF-8 char");
656
nextChar = (char) character;
661
* Reads the number of characters read into the buffer, or -1 on EOF.
663
public int read (char buf [], int off, int len) throws IOException
669
if (switchover > 0 && offset == switchover)
672
return in.read (buf, off, len);
674
if (offset >= length) {
676
length = raw.read (buffer, 0, buffer.length);
680
if (encodingAssigned == null || isLatin1)
681
for (i = 0; i < len && offset < length; i++)
682
buf [off++] = (char) (buffer [offset++] & 0xff);
684
for (i = 0; i < len && offset < length; i++)
685
buf [off++] = utf8char ();
690
* Reads a single character.
692
public int read () throws IOException
696
if (switchover > 0 && offset == switchover)
701
if (offset >= length) {
702
if (encodingAssigned == null) {
703
// minimize readahead we might regret...
704
if (length == buffer.length)
705
throw new InternalError ("too much peekahead");
706
int len = raw.read (buffer, offset, 1);
712
length = raw.read (buffer, 0, buffer.length);
718
if (isLatin1 || encodingAssigned == null)
719
return buffer [offset++] & 0x0ff;
725
* Returns true iff the reader supports mark/reset.
727
public boolean markSupported ()
729
return in != null && in.markSupported ();
733
* Sets a mark allowing a limited number of characters to
734
* be "peeked", by reading and then resetting.
735
* @param value how many characters may be "peeked".
737
public void mark (int value) throws IOException
744
* Resets the current position to the last marked position.
746
public void reset () throws IOException
753
* Skips a specified number of characters.
755
public long skip (long value) throws IOException
760
return in.skip (value);
761
long avail = length - offset;
762
if (avail >= value) {
763
offset += (int) value;
767
return avail + raw.skip (value - avail);
771
* Returns true iff input characters are known to be ready.
773
public boolean ready () throws IOException
777
return (length > offset) || raw.available () != 0;
783
public void close () throws IOException
795
* For printf debugging.
797
private static boolean debug = false;
798
private static void debug(String str) {
800
System.out.println("XmlReader: " + str);