1
/* Copyright 2002, 2003, 2005 Elliotte Rusty Harold
3
This library is free software; you can redistribute it and/or modify
4
it under the terms of version 2.1 of the GNU Lesser General Public
5
License as published by the Free Software Foundation.
7
This library is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU Lesser General Public License for more details.
12
You should have received a copy of the GNU Lesser General Public
13
License along with this library; if not, write to the
14
Free Software Foundation, Inc., 59 Temple Place, Suite 330,
15
Boston, MA 02111-1307 USA
17
You can contact Elliotte Rusty Harold by sending e-mail to
18
elharo@metalab.unc.edu. Please include the word "XOM" in the
19
subject line. The XOM home page is located at http://www.xom.nu/
22
package nu.xom.xinclude;
24
import java.io.IOException;
25
import java.io.InputStream;
29
* <code>EncodingHeuristics</code> reads from a stream
30
* (which should be buffered) and attempts to guess
31
* what the encoding of the text in the stream is.
32
* Byte order marks are stripped from the stream.
33
* If it fails to determine the type of the encoding,
34
* it returns the default UTF-8.
38
* @author Elliotte Rusty Harold
41
class EncodingHeuristics {
43
// No instances allowed
44
private EncodingHeuristics() {}
49
* This utility method uses a variety of heuristics to
50
* attempt to guess the encoding from the initial bytes of the stream.
54
* @param in <code>InputStream</code> to read from.
55
* @return String The name of the encoding.
56
* @throws IOException if the stream cannot be reset back
57
* to where it was when the method was invoked.
59
public static String readEncodingFromStream(InputStream in)
62
// This may fail if there are a lot of space
63
// characters before the end of the encoding declaration
67
// Lots of things can go wrong here. If any do,
68
// return "UTF-8" as the default.
69
int byte1 = in.read();
70
int byte2 = in.read();
71
if (byte1 == 0xFE && byte2 == 0xFF) {
72
// Don't reset because the byte order mark should not be
73
// included per section 4.3 of the XInclude spec
76
else if (byte1 == 0xFF && byte2 == 0xFE) {
77
// Don't reset because the byte order mark should not be
78
// included per section 4.3 of the XInclude spec
79
return "UnicodeLittle";
82
/* In accordance with the Character Model,
83
when the text format is a Unicode encoding, the XInclude
84
processor must fail the inclusion when the text in the
85
selected range is non-normalized. When transcoding
86
characters to a Unicode encoding from a legacy encoding,
87
a normalizing transcoder must be used. */
89
int byte3 = in.read();
90
// check for UTF-8 byte order mark
91
if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) {
92
// Don't reset because the byte order mark should not be
93
// included per section 4.3 of the XInclude spec
97
int byte4 = in.read();
99
&& byte2 == 0x00 && byte3 == 0xFE && byte4 == 0xFF) {
100
// Don't reset because the byte order mark should not be
101
// included per section 4.3 of the XInclude spec
102
// Most Java VMs don't support this next one
105
else if (byte1 == 0x00 && byte2 == 0x00
106
&& byte3 == 0xFF && byte4 == 0xFE) {
107
// Don't reset because the byte order mark should not be
108
// included per section 4.3 of the XInclude spec
109
// Most Java VMs don't support this next one
113
// no byte order mark present; first character must be
114
// less than sign or white space
115
// Let's look for less-than signs first
116
if (byte1 == 0x00 && byte2 == 0x00
117
&& byte3 == 0x00 && byte4 == '<') {
121
else if (byte1 == '<' && byte2 == 0x00
122
&& byte3 == 0x00 && byte4 == 0x00) {
126
else if (byte1 == 0x00 && byte2 == '<'
127
&& byte3 == 0x00 && byte4 == '?') {
129
return "UnicodeBigUnmarked";
131
else if (byte1 == '<' && byte2 == 0x00
132
&& byte3 == '?' && byte4 == 0x00) {
134
return "UnicodeLittleUnmarked";
136
else if (byte1 == '<' && byte2 == '?'
137
&& byte3 == 'x' && byte4 == 'm') {
138
// ASCII compatible, must read encoding declaration.
139
// 1024 bytes will be far enough to read most XML declarations
141
byte[] data = new byte[1024];
142
data[0] = (byte) byte1;
143
data[1] = (byte) byte2;
144
data[2] = (byte) byte3;
145
data[3] = (byte) byte4;
146
int length = in.read(data, 4, 1020) + 4;
147
// Use Latin-1 (ISO-8859-1) because it's ASCII compatible
148
// and all byte sequences are legal Latin-1 sequences
149
// so I don't have to worry about encoding errors if I
150
// slip past the end of the XML/text declaration
151
String declaration=new String(data, 0, length, "8859_1");
152
// If any of these throw a
153
// StringIndexOutOfBoundsException,
154
// we just fall into the catch block and return the default "UTF-8"
155
// since this can't be well-formed XML
156
String encoding = findEncodingDeclaration(declaration);
161
else if (byte1 == 0x4C && byte2 == 0x6F
162
&& byte3 == 0xA7 && byte4 == 0x94) {
163
// EBCDIC compatible, must read encoding declaration
164
byte[] buffer = new byte[1016];
165
for (int i = 0; i < buffer.length; i++) {
168
buffer[i] = (byte) c;
171
// Most EBCDIC encodings are compatible with Cp037 over
172
// the range we care about
173
return findEncodingDeclaration(new String(buffer, "Cp037"));
177
catch (Exception ex) {
182
// no XML or text declaration present
189
private static String findEncodingDeclaration(String declaration)
192
int position = declaration.indexOf("encoding") + 8;
194
// get rid of white space before equals sign
196
c = declaration.charAt(position++);
197
if (c !=' ' && c != '\t' && c != '\r' && c != '\n') {
201
if (c != '=') { // malformed
202
throw new IOException("Couldn't determine encoding");
204
// get rid of white space after equals sign
206
c = declaration.charAt(position++);
207
if (c !=' ' && c != '\t' && c != '\r' && c != '\n') {
212
if (delimiter != '\'' && delimiter != '"') { // malformed
215
// now positioned to read encoding name
216
StringBuffer encodingName = new StringBuffer();
218
c = declaration.charAt(position++);
219
if (c == delimiter) break;
220
encodingName.append(c);
222
return encodingName.toString();
b'\\ No newline at end of file'