/*
 * Copyright 2003-2007 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20
import java.nio.charset.Charset;
21
import java.util.Collection;
/**
 * <p>Utility class to guess the encoding of a given text file.</p>
 *
 * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
 * is wide enough, the charset should also be discovered.</p>
 *
 * <p>A byte buffer of 4KB is usually sufficient to be able to guess the encoding.</p>
 *
 * <p>Usage:</p>
 * <pre>
 * // guess the encoding
 * CharsetToolkit toolkit = new CharsetToolkit(file);
 * Charset guessedCharset = toolkit.getCharset();
 *
 * // create a reader with the correct charset
 * BufferedReader reader = toolkit.getReader();
 *
 * // read the file content
 * String line;
 * while ((line = reader.readLine()) != null) {
 *     System.out.println(line);
 * }
 * </pre>
 *
 * @author Guillaume Laforge
 */
public class CharsetToolkit {
    /** Number of leading bytes of the file that are sampled to guess the encoding. */
    private static final int BUFFER_SIZE = 4096;
    private static final byte[] EMPTY_BYTE_ARRAY = new byte[0];

    /** The sampled leading bytes of the file; never null, at most {@link #BUFFER_SIZE} long. */
    private final byte[] buffer;
    private Charset defaultCharset;
    /** Cached result of the guess; null until first requested or after a setter changed the inputs. */
    private Charset charset;
    private boolean enforce8Bit = true;
    private final File file;

    /**
     * Constructor of the <code>CharsetToolkit</code> utility class.
     * The first {@value #BUFFER_SIZE} bytes of the file (or fewer, if the file is shorter)
     * are read immediately and kept for the later guess.
     *
     * @param file of which we want to know the encoding.
     * @throws IOException if the file cannot be opened or read.
     */
    public CharsetToolkit(File file) throws IOException {
        this.file = file;
        this.defaultCharset = getDefaultSystemCharset();
        this.charset = null;
        this.buffer = readSample(file);
    }

    /**
     * Reads up to {@value #BUFFER_SIZE} leading bytes of the file.
     *
     * @param file the file to sample.
     * @return the bytes read, sized exactly to the number of bytes obtained (empty for an empty file).
     * @throws IOException if the file cannot be opened or read.
     */
    private static byte[] readSample(File file) throws IOException {
        InputStream input = new FileInputStream(file);
        try {
            byte[] bytes = new byte[BUFFER_SIZE];
            int total = 0;
            // A single read() is allowed to return fewer bytes than are available,
            // so keep reading until the sample is full or the end of the file is reached.
            while (total < BUFFER_SIZE) {
                int count = input.read(bytes, total, BUFFER_SIZE - total);
                if (count == -1) {
                    break;
                }
                total += count;
            }
            if (total == 0) {
                return EMPTY_BYTE_ARRAY;
            }
            if (total < BUFFER_SIZE) {
                byte[] bytesToGuess = new byte[total];
                System.arraycopy(bytes, 0, bytesToGuess, 0, total);
                return bytesToGuess;
            }
            return bytes;
        } finally {
            try {
                input.close();
            } catch (IOException ignored) {
                // best effort: the sample has already been read (or the read failure is propagating)
            }
        }
    }

    /**
     * Defines the default <code>Charset</code> used in case the buffer represents
     * an 8-bit <code>Charset</code>. Any previously cached guess is discarded so that
     * the next call to {@link #getCharset()} takes the new default into account.
     *
     * @param defaultCharset the default <code>Charset</code> to be returned by <code>guessEncoding()</code>
     * if an 8-bit <code>Charset</code> is encountered; <code>null</code> selects the system default.
     */
    public void setDefaultCharset(Charset defaultCharset) {
        if (defaultCharset != null) {
            this.defaultCharset = defaultCharset;
        } else {
            this.defaultCharset = getDefaultSystemCharset();
        }
        this.charset = null;
    }

    /**
     * Returns the guessed charset of the file, computing it on first use and caching the result.
     *
     * @return the <code>Charset</code> guessed from the sampled bytes.
     */
    public Charset getCharset() {
        if (this.charset == null) {
            this.charset = guessEncoding();
        }
        return this.charset;
    }

    /**
     * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
     * It might be a file without any special character in the range 128-255, but that may be or become
     * a file encoded with the default <code>charset</code> rather than US-ASCII.
     * Any previously cached guess is discarded so that the next call to
     * {@link #getCharset()} takes the new flag into account.
     *
     * @param enforce a boolean specifying the use or not of US-ASCII.
     */
    public void setEnforce8Bit(boolean enforce) {
        this.enforce8Bit = enforce;
        this.charset = null;
    }

    /**
     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
     *
     * @return a boolean representing the flag of use of US-ASCII.
     */
    public boolean getEnforce8Bit() {
        return this.enforce8Bit;
    }

    /**
     * Retrieves the default Charset.
     *
     * @return the <code>Charset</code> returned when the file looks like an 8-bit encoding.
     */
    public Charset getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * <p>Guess the encoding of the provided buffer.</p>
     * <p>If Byte Order Markers are encountered at the beginning of the buffer, we immediately
     * return the charset implied by this BOM. Otherwise, the file would not be a human
     * readable text file.</p>
     *
     * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
     * If it is not UTF-8, we assume the encoding is the default system encoding
     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
     *
     * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
     * <pre>
     * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F       0xxxxxxx
     * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
     *
     * <p>The whole sample is scanned. A multi-byte sequence cut off by the end of the sample
     * is tolerated only when the sample is full (the file may continue beyond it); in a
     * shorter sample the file really ends there, so the sequence is invalid.</p>
     *
     * @return the Charset recognized.
     */
    private Charset guessEncoding() {
        // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
        // otherwise, the file would not be human readable
        if (hasUTF8Bom()) {
            return Charset.forName("UTF-8");
        }
        if (hasUTF16LEBom()) {
            return Charset.forName("UTF-16LE");
        }
        if (hasUTF16BEBom()) {
            return Charset.forName("UTF-16BE");
        }

        // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
        // otherwise, the file is in US-ASCII
        boolean highOrderBit = false;

        // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
        // if it's not the case, we can assume the encoding is the default encoding of the system
        boolean validU8Char = true;

        final int length = buffer.length;
        // a full sample may have been cut in the middle of a multi-byte character
        final boolean sampleMayBeTruncated = length == BUFFER_SIZE;

        int i = 0;
        while (validU8Char && i < length) {
            byte b0 = buffer[i];
            // a high order bit was encountered, thus the encoding is not US-ASCII
            // it may be either an 8-bit encoding or UTF-8
            if (b0 < 0) {
                highOrderBit = true;
                int expected = continuationBytesExpected(b0);
                if (expected < 0) {
                    // a stray continuation byte, or 0xFE / 0xFF: not a valid UTF-8 construct
                    validU8Char = false;
                } else {
                    // only look at the continuation bytes that are actually inside the sample
                    int present = Math.min(expected, length - i - 1);
                    for (int k = 1; k <= present && validU8Char; k++) {
                        validU8Char = isContinuationChar(buffer[i + k]);
                    }
                    if (validU8Char && present < expected) {
                        // the sequence runs past the end of the sample
                        validU8Char = sampleMayBeTruncated;
                    }
                    // skip the continuation bytes that were just validated
                    i += expected;
                }
            }
            i++;
        }

        // if no byte with an high order bit set, the encoding is US-ASCII
        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
        if (!highOrderBit) {
            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
            if (this.enforce8Bit) {
                return this.defaultCharset;
            }
            return Charset.forName("US-ASCII");
        }
        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
        // otherwise the file would not be human readable
        if (validU8Char) {
            return Charset.forName("UTF-8");
        }
        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
        return this.defaultCharset;
    }

    /**
     * Tells how many continuation bytes must follow the given lead byte.
     *
     * @param lead a byte with its high order bit set.
     * @return the number of continuation bytes (1 to 5), or -1 if the byte cannot start a sequence.
     */
    private static int continuationBytesExpected(byte lead) {
        if (isTwoBytesSequence(lead)) {
            return 1;
        }
        if (isThreeBytesSequence(lead)) {
            return 2;
        }
        if (isFourBytesSequence(lead)) {
            return 3;
        }
        if (isFiveBytesSequence(lead)) {
            return 4;
        }
        if (isSixBytesSequence(lead)) {
            return 5;
        }
        return -1;
    }

    /**
     * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character.
     *
     * @param b a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(byte b) {
        return -128 <= b && b <= -65;
    }

    /**
     * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a two-bytes sequence.
     */
    private static boolean isTwoBytesSequence(byte b) {
        return -64 <= b && b <= -33;
    }

    /**
     * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a three-bytes sequence.
     */
    private static boolean isThreeBytesSequence(byte b) {
        return -32 <= b && b <= -17;
    }

    /**
     * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a four-bytes sequence.
     */
    private static boolean isFourBytesSequence(byte b) {
        return -16 <= b && b <= -9;
    }

    /**
     * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a five-bytes sequence.
     */
    private static boolean isFiveBytesSequence(byte b) {
        return -8 <= b && b <= -5;
    }

    /**
     * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a six-bytes sequence.
     */
    private static boolean isSixBytesSequence(byte b) {
        return -4 <= b && b <= -3;
    }

    /**
     * Retrieve the default charset of the system, as named by the <code>file.encoding</code>
     * system property. If that property is missing or does not name a supported charset,
     * the JVM's own default charset is returned instead of failing.
     *
     * @return the default <code>Charset</code>.
     */
    public static Charset getDefaultSystemCharset() {
        String name = System.getProperty("file.encoding");
        if (name != null) {
            try {
                return Charset.forName(name);
            } catch (IllegalArgumentException ignored) {
                // illegal or unsupported charset name: fall through to the JVM default
            }
        }
        return Charset.defaultCharset();
    }

    /**
     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
     *
     * @return true if the buffer has a BOM for UTF8.
     */
    public boolean hasUTF8Bom() {
        return buffer.length >= 3
                && buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Little Endian (bytes FF FE).
     * Only the first two bytes are examined, so a UTF-32 little endian BOM
     * (FF FE 00 00) is reported as UTF-16 little endian as well.
     *
     * @return true if the buffer has a BOM for UTF-16 Little Endian.
     */
    public boolean hasUTF16LEBom() {
        return buffer.length >= 2 && buffer[0] == -1 && buffer[1] == -2;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Big Endian (bytes FE FF).
     *
     * @return true if the buffer has a BOM for UTF-16 Big Endian.
     */
    public boolean hasUTF16BEBom() {
        return buffer.length >= 2 && buffer[0] == -2 && buffer[1] == -1;
    }

    /**
     * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
     * specified in the constructor of <code>CharsetToolkit</code> using the charset discovered by the
     * method <code>guessEncoding()</code>. If the file starts with a BOM, the reader is positioned
     * just after it. The caller is responsible for closing the returned reader.
     *
     * @return a <code>BufferedReader</code>
     * @throws FileNotFoundException if the file is not found.
     */
    public BufferedReader getReader() throws FileNotFoundException {
        // resolve the charset before opening the file so nothing can fail between open and wrap
        Charset detected = getCharset();
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), detected));
        if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
            try {
                // the BOM decodes to a single character: consume it
                reader.read();
            } catch (IOException ignored) {
                // should never happen, as a file with no content
                // but with a BOM has at least one char
            }
        }
        return reader;
    }

    /**
     * Retrieves all the available <code>Charset</code>s on the platform,
     * among which the default <code>charset</code>.
     *
     * @return an array of <code>Charset</code>s.
     */
    public static Charset[] getAvailableCharsets() {
        Collection<Charset> collection = Charset.availableCharsets().values();
        return collection.toArray(new Charset[collection.size()]);
    }
}
/*
 * Copyright 2003-2007 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20
import java.nio.charset.Charset;
21
import java.util.Collection;
/**
 * <p>Utility class to guess the encoding of a given text file.</p>
 *
 * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
 * is wide enough, the charset should also be discovered.</p>
 *
 * <p>A byte buffer of 4KB is usually sufficient to be able to guess the encoding.</p>
 *
 * <p>Usage:</p>
 * <pre>
 * // guess the encoding
 * CharsetToolkit toolkit = new CharsetToolkit(file);
 * Charset guessedCharset = toolkit.getCharset();
 *
 * // create a reader with the correct charset
 * BufferedReader reader = toolkit.getReader();
 *
 * // read the file content
 * String line;
 * while ((line = reader.readLine()) != null) {
 *     System.out.println(line);
 * }
 * </pre>
 *
 * @author Guillaume Laforge
 */
public class CharsetToolkit {
    /** Number of leading bytes of the file that are sampled to guess the encoding. */
    private static final int BUFFER_SIZE = 4096;
    private static final byte[] EMPTY_BYTE_ARRAY = new byte[0];

    /** The sampled leading bytes of the file; never null, at most {@link #BUFFER_SIZE} long. */
    private final byte[] buffer;
    private Charset defaultCharset;
    /** Cached result of the guess; null until first requested or after a setter changed the inputs. */
    private Charset charset;
    private boolean enforce8Bit = true;
    private final File file;

    /**
     * Constructor of the <code>CharsetToolkit</code> utility class.
     * The first {@value #BUFFER_SIZE} bytes of the file (or fewer, if the file is shorter)
     * are read immediately and kept for the later guess.
     *
     * @param file of which we want to know the encoding.
     * @throws IOException if the file cannot be opened or read.
     */
    public CharsetToolkit(File file) throws IOException {
        this.file = file;
        this.defaultCharset = getDefaultSystemCharset();
        this.charset = null;
        this.buffer = readSample(file);
    }

    /**
     * Reads up to {@value #BUFFER_SIZE} leading bytes of the file.
     *
     * @param file the file to sample.
     * @return the bytes read, sized exactly to the number of bytes obtained (empty for an empty file).
     * @throws IOException if the file cannot be opened or read.
     */
    private static byte[] readSample(File file) throws IOException {
        InputStream input = new FileInputStream(file);
        try {
            byte[] bytes = new byte[BUFFER_SIZE];
            int total = 0;
            // A single read() is allowed to return fewer bytes than are available,
            // so keep reading until the sample is full or the end of the file is reached.
            while (total < BUFFER_SIZE) {
                int count = input.read(bytes, total, BUFFER_SIZE - total);
                if (count == -1) {
                    break;
                }
                total += count;
            }
            if (total == 0) {
                return EMPTY_BYTE_ARRAY;
            }
            if (total < BUFFER_SIZE) {
                byte[] bytesToGuess = new byte[total];
                System.arraycopy(bytes, 0, bytesToGuess, 0, total);
                return bytesToGuess;
            }
            return bytes;
        } finally {
            try {
                input.close();
            } catch (IOException ignored) {
                // best effort: the sample has already been read (or the read failure is propagating)
            }
        }
    }

    /**
     * Defines the default <code>Charset</code> used in case the buffer represents
     * an 8-bit <code>Charset</code>. Any previously cached guess is discarded so that
     * the next call to {@link #getCharset()} takes the new default into account.
     *
     * @param defaultCharset the default <code>Charset</code> to be returned by <code>guessEncoding()</code>
     * if an 8-bit <code>Charset</code> is encountered; <code>null</code> selects the system default.
     */
    public void setDefaultCharset(Charset defaultCharset) {
        if (defaultCharset != null) {
            this.defaultCharset = defaultCharset;
        } else {
            this.defaultCharset = getDefaultSystemCharset();
        }
        this.charset = null;
    }

    /**
     * Returns the guessed charset of the file, computing it on first use and caching the result.
     *
     * @return the <code>Charset</code> guessed from the sampled bytes.
     */
    public Charset getCharset() {
        if (this.charset == null) {
            this.charset = guessEncoding();
        }
        return this.charset;
    }

    /**
     * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
     * It might be a file without any special character in the range 128-255, but that may be or become
     * a file encoded with the default <code>charset</code> rather than US-ASCII.
     * Any previously cached guess is discarded so that the next call to
     * {@link #getCharset()} takes the new flag into account.
     *
     * @param enforce a boolean specifying the use or not of US-ASCII.
     */
    public void setEnforce8Bit(boolean enforce) {
        this.enforce8Bit = enforce;
        this.charset = null;
    }

    /**
     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
     *
     * @return a boolean representing the flag of use of US-ASCII.
     */
    public boolean getEnforce8Bit() {
        return this.enforce8Bit;
    }

    /**
     * Retrieves the default Charset.
     *
     * @return the <code>Charset</code> returned when the file looks like an 8-bit encoding.
     */
    public Charset getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * <p>Guess the encoding of the provided buffer.</p>
     * <p>If Byte Order Markers are encountered at the beginning of the buffer, we immediately
     * return the charset implied by this BOM. Otherwise, the file would not be a human
     * readable text file.</p>
     *
     * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
     * If it is not UTF-8, we assume the encoding is the default system encoding
     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
     *
     * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
     * <pre>
     * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F       0xxxxxxx
     * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
     *
     * <p>The whole sample is scanned. A multi-byte sequence cut off by the end of the sample
     * is tolerated only when the sample is full (the file may continue beyond it); in a
     * shorter sample the file really ends there, so the sequence is invalid.</p>
     *
     * @return the Charset recognized.
     */
    private Charset guessEncoding() {
        // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
        // otherwise, the file would not be human readable
        if (hasUTF8Bom()) {
            return Charset.forName("UTF-8");
        }
        if (hasUTF16LEBom()) {
            return Charset.forName("UTF-16LE");
        }
        if (hasUTF16BEBom()) {
            return Charset.forName("UTF-16BE");
        }

        // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
        // otherwise, the file is in US-ASCII
        boolean highOrderBit = false;

        // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
        // if it's not the case, we can assume the encoding is the default encoding of the system
        boolean validU8Char = true;

        final int length = buffer.length;
        // a full sample may have been cut in the middle of a multi-byte character
        final boolean sampleMayBeTruncated = length == BUFFER_SIZE;

        int i = 0;
        while (validU8Char && i < length) {
            byte b0 = buffer[i];
            // a high order bit was encountered, thus the encoding is not US-ASCII
            // it may be either an 8-bit encoding or UTF-8
            if (b0 < 0) {
                highOrderBit = true;
                int expected = continuationBytesExpected(b0);
                if (expected < 0) {
                    // a stray continuation byte, or 0xFE / 0xFF: not a valid UTF-8 construct
                    validU8Char = false;
                } else {
                    // only look at the continuation bytes that are actually inside the sample
                    int present = Math.min(expected, length - i - 1);
                    for (int k = 1; k <= present && validU8Char; k++) {
                        validU8Char = isContinuationChar(buffer[i + k]);
                    }
                    if (validU8Char && present < expected) {
                        // the sequence runs past the end of the sample
                        validU8Char = sampleMayBeTruncated;
                    }
                    // skip the continuation bytes that were just validated
                    i += expected;
                }
            }
            i++;
        }

        // if no byte with an high order bit set, the encoding is US-ASCII
        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
        if (!highOrderBit) {
            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
            if (this.enforce8Bit) {
                return this.defaultCharset;
            }
            return Charset.forName("US-ASCII");
        }
        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
        // otherwise the file would not be human readable
        if (validU8Char) {
            return Charset.forName("UTF-8");
        }
        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
        return this.defaultCharset;
    }

    /**
     * Tells how many continuation bytes must follow the given lead byte.
     *
     * @param lead a byte with its high order bit set.
     * @return the number of continuation bytes (1 to 5), or -1 if the byte cannot start a sequence.
     */
    private static int continuationBytesExpected(byte lead) {
        if (isTwoBytesSequence(lead)) {
            return 1;
        }
        if (isThreeBytesSequence(lead)) {
            return 2;
        }
        if (isFourBytesSequence(lead)) {
            return 3;
        }
        if (isFiveBytesSequence(lead)) {
            return 4;
        }
        if (isSixBytesSequence(lead)) {
            return 5;
        }
        return -1;
    }

    /**
     * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character.
     *
     * @param b a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(byte b) {
        return -128 <= b && b <= -65;
    }

    /**
     * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a two-bytes sequence.
     */
    private static boolean isTwoBytesSequence(byte b) {
        return -64 <= b && b <= -33;
    }

    /**
     * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a three-bytes sequence.
     */
    private static boolean isThreeBytesSequence(byte b) {
        return -32 <= b && b <= -17;
    }

    /**
     * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a four-bytes sequence.
     */
    private static boolean isFourBytesSequence(byte b) {
        return -16 <= b && b <= -9;
    }

    /**
     * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a five-bytes sequence.
     */
    private static boolean isFiveBytesSequence(byte b) {
        return -8 <= b && b <= -5;
    }

    /**
     * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a six-bytes sequence.
     */
    private static boolean isSixBytesSequence(byte b) {
        return -4 <= b && b <= -3;
    }

    /**
     * Retrieve the default charset of the system, as named by the <code>file.encoding</code>
     * system property. If that property is missing or does not name a supported charset,
     * the JVM's own default charset is returned instead of failing.
     *
     * @return the default <code>Charset</code>.
     */
    public static Charset getDefaultSystemCharset() {
        String name = System.getProperty("file.encoding");
        if (name != null) {
            try {
                return Charset.forName(name);
            } catch (IllegalArgumentException ignored) {
                // illegal or unsupported charset name: fall through to the JVM default
            }
        }
        return Charset.defaultCharset();
    }

    /**
     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
     *
     * @return true if the buffer has a BOM for UTF8.
     */
    public boolean hasUTF8Bom() {
        return buffer.length >= 3
                && buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Little Endian (bytes FF FE).
     * Only the first two bytes are examined, so a UTF-32 little endian BOM
     * (FF FE 00 00) is reported as UTF-16 little endian as well.
     *
     * @return true if the buffer has a BOM for UTF-16 Little Endian.
     */
    public boolean hasUTF16LEBom() {
        return buffer.length >= 2 && buffer[0] == -1 && buffer[1] == -2;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Big Endian (bytes FE FF).
     *
     * @return true if the buffer has a BOM for UTF-16 Big Endian.
     */
    public boolean hasUTF16BEBom() {
        return buffer.length >= 2 && buffer[0] == -2 && buffer[1] == -1;
    }

    /**
     * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
     * specified in the constructor of <code>CharsetToolkit</code> using the charset discovered by the
     * method <code>guessEncoding()</code>. If the file starts with a BOM, the reader is positioned
     * just after it. The caller is responsible for closing the returned reader.
     *
     * @return a <code>BufferedReader</code>
     * @throws FileNotFoundException if the file is not found.
     */
    public BufferedReader getReader() throws FileNotFoundException {
        // resolve the charset before opening the file so nothing can fail between open and wrap
        Charset detected = getCharset();
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), detected));
        if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
            try {
                // the BOM decodes to a single character: consume it
                reader.read();
            } catch (IOException ignored) {
                // should never happen, as a file with no content
                // but with a BOM has at least one char
            }
        }
        return reader;
    }

    /**
     * Retrieves all the available <code>Charset</code>s on the platform,
     * among which the default <code>charset</code>.
     *
     * @return an array of <code>Charset</code>s.
     */
    public static Charset[] getAvailableCharsets() {
        Collection<Charset> collection = Charset.availableCharsets().values();
        return collection.toArray(new Charset[collection.size()]);
    }
}