2
* The Apache Software License, Version 1.1
5
* Copyright (c) 2001-2004 The Apache Software Foundation. All rights
8
* Redistribution and use in source and binary forms, with or without
9
* modification, are permitted provided that the following conditions
12
* 1. Redistributions of source code must retain the above copyright
13
* notice, this list of conditions and the following disclaimer.
15
* 2. Redistributions in binary form must reproduce the above copyright
16
* notice, this list of conditions and the following disclaimer in
17
* the documentation and/or other materials provided with the
20
* 3. The end-user documentation included with the redistribution,
21
* if any, must include the following acknowledgment:
22
* "This product includes software developed by the
23
* Apache Software Foundation (http://www.apache.org/)."
24
* Alternately, this acknowledgment may appear in the software itself,
25
* if and wherever such third-party acknowledgments normally appear.
27
* 4. The names "Xerces" and "Apache Software Foundation" must
28
* not be used to endorse or promote products derived from this
29
* software without prior written permission. For written
30
* permission, please contact apache@apache.org.
32
* 5. Products derived from this software may not be called "Apache",
33
* nor may "Apache" appear in their name, without prior written
34
* permission of the Apache Software Foundation.
36
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48
* ====================================================================
50
* This software consists of voluntary contributions made by many
51
* individuals on behalf of the Apache Software Foundation and was
52
* originally based on software copyright (c) 2003, International
53
* Business Machines, Inc., http://www.apache.org. For more
54
* information on the Apache Software Foundation, please see
55
* <http://www.apache.org/>.
57
package org.apache.xerces.xinclude;
59
import java.io.BufferedInputStream;
60
import java.io.IOException;
61
import java.io.InputStream;
62
import java.io.InputStreamReader;
63
import java.io.Reader;
64
import java.net.HttpURLConnection;
66
import java.net.URLConnection;
67
import java.util.Locale;
69
import org.apache.xerces.impl.io.ASCIIReader;
70
import org.apache.xerces.impl.io.UTF8Reader;
71
import org.apache.xerces.impl.msg.XMLMessageFormatter;
72
import org.apache.xerces.impl.XMLEntityManager;
73
import org.apache.xerces.impl.XMLErrorReporter;
74
import org.apache.xerces.util.EncodingMap;
75
import org.apache.xerces.util.MessageFormatter;
76
import org.apache.xerces.util.XMLChar;
77
import org.apache.xerces.util.XMLStringBuffer;
78
import org.apache.xerces.xni.parser.XMLInputSource;
81
* This class is used for reading resources requested in <include> elements,
82
* when the parse attribute of the <include> element is "text". Using this
83
* class will open the location, detect the encoding, and discard the byte order
84
* mark, if applicable.
87
* Much of the code in this class is taken from XMLEntityManager. It would be nice
88
* if this code could be shared in some way. However, since XMLEntityManager is used
89
* for reading files as XML, and this needs to read files as text, there would need
90
* to be some refactoring done.
92
* @author Michael Glavassevich, IBM
93
* @author Peter McCracken, IBM
94
* @author Arun Yadav, Sun Microsystems Inc.
96
* @version $Id: XIncludeTextReader.java,v 1.7 2004/01/21 17:07:30 mrglavas Exp $
98
* @see XIncludeHandler
100
public class XIncludeTextReader {
102
private Reader fReader;
103
private XIncludeHandler fHandler;
104
private XMLInputSource fSource;
105
private XMLErrorReporter fErrorReporter;
107
// Content negotiation parameters
108
private String fAccept;
109
private String fAcceptCharset;
110
private String fAcceptLanguage;
113
* Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
115
* @param source The XMLInputSource to use.
116
* @param handler The XIncludeHandler to use.
118
public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler)
125
* Sets the XMLErrorReporter used for reporting errors while
126
* reading the text include.
128
* @param errorReporter the XMLErrorReporter to be used for
131
public void setErrorReporter(XMLErrorReporter errorReporter) {
132
fErrorReporter = errorReporter;
136
* Sets content negotation parameters to
137
* be attached to an HTTP request.
139
* @param accept the Accept HTTP request property
140
* @param acceptCharset the Accept-Charset HTTP request property
141
* @param acceptLanguage the Accept-Language HTTP request property
143
public void setHttpProperties(String accept, String acceptCharset, String acceptLanguage) {
145
fAcceptCharset = acceptCharset;
146
fAcceptLanguage = acceptLanguage;
150
* Return the Reader for given XMLInputSource.
152
* @param source The XMLInputSource to use.
154
protected Reader getReader(XMLInputSource source) throws IOException {
155
if (source.getCharacterStream() != null) {
156
return source.getCharacterStream();
159
InputStream stream = null;
161
String encoding = source.getEncoding();
162
if (encoding == null) {
165
if (source.getByteStream() != null) {
166
stream = source.getByteStream();
167
// Wrap the InputStream so that it is possible to rewind it.
168
if (!(stream instanceof BufferedInputStream)) {
169
stream = new BufferedInputStream(stream);
173
String expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);
175
URL url = new URL(expandedSystemId);
176
URLConnection urlCon = url.openConnection();
178
// If this is an HTTP connection attach any
179
// content negotation parameters to the request.
180
if (urlCon instanceof HttpURLConnection) {
181
if( fAccept != null && fAccept.length() > 0) {
182
urlCon.setRequestProperty(XIncludeHandler.HTTP_ACCEPT, fAccept);
184
if( fAcceptCharset != null && fAcceptCharset.length() > 0) {
185
urlCon.setRequestProperty(XIncludeHandler.HTTP_ACCEPT_CHARSET, fAcceptCharset);
187
if( fAcceptLanguage != null && fAcceptLanguage.length() > 0) {
188
urlCon.setRequestProperty(XIncludeHandler.HTTP_ACCEPT_LANGUAGE, fAcceptLanguage);
192
// Wrap the InputStream so that it is possible to rewind it.
193
stream = new BufferedInputStream(urlCon.getInputStream());
195
// content type will be string like "text/xml; charset=UTF-8" or "text/xml"
196
String rawContentType = urlCon.getContentType();
198
// text/xml and application/xml offer only one optional parameter
199
int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1;
201
String contentType = null;
202
String charset = null;
204
// this should be something like "text/xml"
205
contentType = rawContentType.substring(0, index).trim();
207
// this should be something like "charset=UTF-8", but we want to
208
// strip it down to just "UTF-8"
209
charset = rawContentType.substring(index + 1).trim();
210
if (charset.startsWith("charset=")) {
211
// 8 is the length of "charset="
212
charset = charset.substring(8).trim();
213
// strip quotes, if present
214
if ((charset.charAt(0) == '"'
215
&& charset.charAt(charset.length() - 1) == '"')
216
|| (charset.charAt(0) == '\''
217
&& charset.charAt(charset.length() - 1)
220
charset.substring(1, charset.length() - 1);
228
contentType = rawContentType.trim();
231
String detectedEncoding = null;
232
/** The encoding of such a resource is determined by:
233
1 external encoding information, if available, otherwise
234
-- the most common type of external information is the "charset" parameter of a MIME package
235
2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
236
3 the value of the encoding attribute if one exists, otherwise
239
if (contentType.equals("text/xml")) {
240
if (charset != null) {
241
detectedEncoding = charset;
244
// see RFC2376 or 3023, section 3.1
245
detectedEncoding = "US-ASCII";
248
else if (contentType.equals("application/xml")) {
249
if (charset != null) {
250
detectedEncoding = charset;
253
// see RFC2376 or 3023, section 3.2
254
detectedEncoding = getEncodingName(stream);
257
else if (contentType.endsWith("+xml")) {
258
detectedEncoding = getEncodingName(stream);
261
if (detectedEncoding != null) {
262
encoding = detectedEncoding;
267
encoding = encoding.toUpperCase(Locale.ENGLISH);
269
// eat the Byte Order Mark
270
consumeBOM(stream, encoding);
272
// If the document is UTF-8 or US-ASCII use
273
// the Xerces readers for these encodings.
274
if (encoding.equals("UTF-8")) {
275
return new UTF8Reader(stream,
276
XMLEntityManager.DEFAULT_BUFFER_SIZE,
277
fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
278
fErrorReporter.getLocale() );
280
else if (encoding.equals("US-ASCII")) {
281
return new ASCIIReader(stream,
282
XMLEntityManager.DEFAULT_BUFFER_SIZE,
283
fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
284
fErrorReporter.getLocale() );
287
// Try to use a Java reader.
288
String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);
290
// If the specified encoding wasn't a recognized IANA encoding throw an IOException.
291
// The XIncludeHandler will report this as a ResourceError and then will
292
// attempt to include a fallback if there is one.
293
if (javaEncoding == null) {
294
MessageFormatter aFormatter =
295
fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
296
Locale aLocale = fErrorReporter.getLocale();
297
throw new IOException( aFormatter.formatMessage( aLocale,
298
"EncodingDeclInvalid",
299
new Object[] {encoding} ) );
302
return new InputStreamReader(stream, javaEncoding);
307
* XMLEntityManager cares about endian-ness, since it creates its own optimized
308
* readers. Since we're just using generic Java readers for now, we're not caring
309
* about endian-ness. If this changes, even more code needs to be copied from
310
* XMLEntity manager. -- PJM
312
protected String getEncodingName(InputStream stream) throws IOException {
313
final byte[] b4 = new byte[4];
314
String encoding = null;
316
// this has the potential to throw an exception
317
// it will be fixed when we ensure the stream is rewindable (see note above)
319
int count = stream.read(b4, 0, 4);
322
encoding = getEncodingName(b4);
329
* Removes the byte order mark from the stream, if it exists.
332
* @throws IOException
334
protected void consumeBOM(InputStream stream, String encoding)
337
byte[] b = new byte[3];
340
if (encoding.equals("UTF-8")) {
341
count = stream.read(b, 0, 3);
343
int b0 = b[0] & 0xFF;
344
int b1 = b[1] & 0xFF;
345
int b2 = b[2] & 0xFF;
346
if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
347
// First three bytes are not BOM, so reset.
355
else if (encoding.startsWith("UTF-16")) {
356
count = stream.read(b, 0, 2);
358
int b0 = b[0] & 0xFF;
359
int b1 = b[1] & 0xFF;
360
if ((b0 != 0xFE || b1 != 0xFF)
361
&& (b0 != 0xFF || b1 != 0xFE)) {
362
// First two bytes are not BOM, so reset.
370
// We could do UTF-32, but since the getEncodingName() doesn't support that
371
// we won't support it here.
372
// To implement UTF-32, look for: 00 00 FE FF for big-endian
373
// or FF FE 00 00 for little-endian
377
* REVISIT: This code is taken from org.apache.xerces.impl.XMLEntityManager.
378
* Is there any way we can share the code, without having it implemented twice?
379
* I think we should make it public and static in XMLEntityManager. --PJM
381
* Returns the IANA encoding name that is auto-detected from
382
* the bytes specified, with the endian-ness of that encoding where appropriate.
384
* @param b4 The first four bytes of the input.
385
* @return the encoding name, or null if no encoding could be detected
387
protected String getEncodingName(byte[] b4) {
390
int b0 = b4[0] & 0xFF;
391
int b1 = b4[1] & 0xFF;
392
if (b0 == 0xFE && b1 == 0xFF) {
393
// UTF-16, big-endian
396
if (b0 == 0xFF && b1 == 0xFE) {
397
// UTF-16, little-endian
402
int b2 = b4[2] & 0xFF;
403
if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
408
int b3 = b4[3] & 0xFF;
409
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
410
// UCS-4, big endian (1234)
411
return "ISO-10646-UCS-4";
413
if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
414
// UCS-4, little endian (4321)
415
return "ISO-10646-UCS-4";
417
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
418
// UCS-4, unusual octet order (2143)
419
return "ISO-10646-UCS-4";
421
if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
422
// UCS-4, unusual octect order (3412)
423
return "ISO-10646-UCS-4";
425
if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
426
// UTF-16, big-endian, no BOM
427
// (or could turn out to be UCS-2...
430
if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
431
// UTF-16, little-endian, no BOM
432
// (or could turn out to be UCS-2...
435
if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
437
// a la xerces1, return CP037 instead of EBCDIC here
441
// this signals us to use the value from the encoding attribute
444
} // getEncodingName(byte[]):Object[]
447
* Read the input stream as text, and pass the text on to the XIncludeHandler
448
* using calls to characters(). This will read all of the text it can from the
451
* @throws IOException
453
public void parse() throws IOException {
454
// REVISIT: This method needs to be rewritten to improve performance: both
455
// time and memory. We should be reading chunks and reporting chunks instead
456
// of reading characters individually and reporting all the characters in
457
// one callback. Also, currently we don't provide any locator information:
458
// line number, column number, etc... so if we report an error it will appear
459
// as if the invalid XML character was in the include parent. -- mrglavas
460
XMLStringBuffer buffer = new XMLStringBuffer();
461
fReader = getReader(fSource);
463
while((ch = fReader.read()) != -1) {
465
buffer.append((char)ch);
467
else if (XMLChar.isHighSurrogate(ch)) {
468
int ch2 = fReader.read();
469
if (XMLChar.isLowSurrogate(ch2)) {
471
// convert surrogates to a supplemental character
472
int sup = XMLChar.supplemental((char)ch, (char)ch2);
474
// supplemental character must be a valid XML character
476
fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
477
"InvalidCharInContent",
478
new Object[] { Integer.toString(sup, 16) },
479
XMLErrorReporter.SEVERITY_FATAL_ERROR);
482
buffer.append((char) ch);
483
buffer.append((char) ch2);
486
fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
487
"InvalidCharInContent",
488
new Object[] { Integer.toString(ch, 16) },
489
XMLErrorReporter.SEVERITY_FATAL_ERROR);
493
fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
494
"InvalidCharInContent",
495
new Object[] { Integer.toString(ch, 16) },
496
XMLErrorReporter.SEVERITY_FATAL_ERROR);
499
if (fHandler != null && buffer.length > 0) {
502
fHandler.modifyAugmentations(null, true));
507
* Closes the stream. Call this after parse(), or when there is no longer any need
510
* @throws IOException
512
public void close() throws IOException {
513
if (fReader != null) {
519
* Returns true if the specified character is a valid XML character
520
* as per the rules of XML 1.0.
522
* @param ch The character to check.
524
protected boolean isValid(int ch) {
525
return XMLChar.isValid(ch);
b'\\ No newline at end of file'