/* Copyright 2002-2005 Elliotte Rusty Harold

   This library is free software; you can redistribute it and/or modify
   it under the terms of version 2.1 of the GNU Lesser General Public
   License as published by the Free Software Foundation.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the
   Free Software Foundation, Inc., 59 Temple Place, Suite 330,
   Boston, MA 02111-1307 USA

   You can contact Elliotte Rusty Harold by sending e-mail to
   elharo@metalab.unc.edu. Please include the word "XOM" in the
   subject line. The XOM home page is located at http://www.xom.nu/
*/
24
import java.io.ByteArrayInputStream;
25
import java.io.ByteArrayOutputStream;
26
import java.io.IOException;
27
import java.io.InputStream;
28
import java.io.UnsupportedEncodingException;
30
import nu.xom.Attribute;
31
import nu.xom.Builder;
32
import nu.xom.Document;
33
import nu.xom.Element;
34
import nu.xom.ParsingException;
35
import nu.xom.Serializer;
/**
 * Check serialization of almost all of Unicode
 * in a variety of encodings.
 *
 * @author Elliotte Rusty Harold
 */
public class EncodingTest extends XOMTestCase {
50
public EncodingTest(String name) {
57
/**
 * Constant added to {@code (high << 10) + low} to convert a UTF-16
 * surrogate pair (high surrogate, low surrogate) into the code point
 * the pair encodes. Declared final because it is never meant to change.
 */
private static final int SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;
59
// Builds the document used by the tests: a "root" element with one "d"
// child per character. Each child's text is the character itself and its
// "c" attribute is the character's code point written in decimal.
// NOTE(review): this listing is missing lines (closing braces and some
// statements), so the comments below describe only what is visible.
protected void setUp() {
61
Element root = new Element("root");
62
// doc is not declared in this method; presumably a field of the test class.
doc = new Document(root);
64
// Every data element is produced by copying this empty "d" element.
Element prototype = new Element("d");
65
// Characters from 0x20 up to the last code unit before the surrogate range.
for (int i = 0x20; i <= 0xD7FF; i++) {
66
Element data = (Element) prototype.copy();
67
data.appendChild(String.valueOf(((char) i)));
68
data.addAttribute(new Attribute("c", String.valueOf(i)));
69
root.appendChild(data);
72
// skip surrogates between 0xD800 and 0xDFFF
73
// Characters from 0xE000 through 0xFFFD.
for (int i = 0xE000; i <= 0xFFFD; i++) {
74
Element data = (Element) prototype.copy();
75
data.appendChild(String.valueOf(((char) i)));
76
data.addAttribute(new Attribute("c", String.valueOf(i)));
77
root.appendChild(data);
80
// Test Plane-1 characters. These are tricky because Java
81
// strings encode them as surrogate pairs. We'll test with
82
// the characters from 1D100 to 1D1FF (the musical symbols)
83
StringBuffer sb = new StringBuffer(2);
86
for (int i = 0; i < 256; i++) {
87
// Low surrogate for the i-th character of the block.
char low = (char) (0xDD00+i);
90
// NOTE(review): the statements that put characters into sb are not
// visible here; presumably sb holds the high surrogate 0xD834 followed
// by low at this point - confirm against the full file.
String s = sb.toString();
91
Element data = (Element) prototype.copy();
92
data.appendChild( s );
93
// The attribute records the code point, not the individual surrogates.
data.addAttribute(new Attribute("c", String.valueOf(0x1D100 + i)));
94
root.appendChild(data);
100
protected void tearDown() {
106
public void testEUCJP() throws ParsingException, IOException {
111
public void testShift_JIS() throws ParsingException, IOException {
112
checkAll("Shift_JIS");
116
public void testISO2022JP() throws ParsingException, IOException {
117
checkAll("ISO-2022-JP");
121
public void testGeneric() throws ParsingException, IOException {
126
// Main purpose here is to test a character set whose name is
128
public void testMacRoman() throws ParsingException, IOException {
129
checkAll("MacRoman");
133
public void testBig5() throws ParsingException, IOException {
137
public void testUSASCII() throws ParsingException, IOException {
138
checkAll("US-ASCII");
141
public void testASCII() throws ParsingException, IOException {
145
public void testLatin1() throws ParsingException, IOException {
146
checkAll("ISO-8859-1");
149
public void testLatin2() throws ParsingException, IOException {
150
checkAll("ISO-8859-2");
153
public void testLatin3() throws ParsingException, IOException {
154
checkAll("ISO-8859-3");
157
public void testLatin4() throws ParsingException, IOException {
158
checkAll("ISO-8859-4");
161
public void testCyrillic() throws ParsingException, IOException {
162
checkAll("ISO-8859-5");
165
public void testArabic() throws ParsingException, IOException {
166
checkAll("ISO-8859-6");
169
public void testGreek() throws ParsingException, IOException {
170
// This test seems to fail in Java 1.5, at least on Mac OS X
171
// It passes in 1.4. The problem is the delete character 127
172
checkAll("ISO-8859-7");
175
public void testThai() throws ParsingException, IOException {
179
public void testHebrew() throws ParsingException, IOException {
180
checkAll("ISO-8859-8");
183
public void testLatin5() throws ParsingException, IOException {
184
checkAll("ISO-8859-9");
187
public void testUTF8() throws ParsingException, IOException {
191
public void testUTF16() throws ParsingException, IOException {
195
public void testUCS2() throws ParsingException, IOException {
196
checkAll("ISO-10646-UCS-2");
199
public void testEBCDIC() throws ParsingException, IOException {
// These encodings are only available after Java 1.3
private static boolean java14OrLater = false;
207
String version = System.getProperty("java.version");
208
String majorVersion = version.substring(0, 3);
209
double versionNumber = Double.parseDouble(majorVersion);
210
if (versionNumber >= 1.4) java14OrLater = true;
213
public void testLatin7() throws ParsingException, IOException {
214
if (java14OrLater) checkAll("ISO-8859-13");
217
public void testLatin9() throws ParsingException, IOException {
218
if (java14OrLater) checkAll("ISO-8859-15");
221
public void testGB18030() throws ParsingException, IOException {
222
if (java14OrLater) checkAll("GB18030");
// These encodings are not installed in all distributions by
// default. They are currently found only in IBM's Java 1.4.1 VM.
// They don't seem to be supported in the 1.5 alpha.
public void testUCS4() throws ParsingException, IOException {
230
if (charsetAvailable("ISO-10646-UCS-4")) checkAll("ISO-10646-UCS-4");
233
public void testLatin6() throws ParsingException, IOException {
234
if (charsetAvailable("ISO-8859-10")) checkAll("ISO-8859-10");
237
public void testLatin8() throws ParsingException, IOException {
238
if (charsetAvailable("ISO-8859-14")) checkAll("ISO-8859-14");
241
public void testLatin10() throws ParsingException, IOException {
242
if (charsetAvailable("ISO-8859-16")) checkAll("ISO-8859-16");
// Test that with an encoding XOM does not specifically support
// but the VM does, everything still works.
248
public void testUnsupportedEncoding()
249
throws ParsingException, IOException {
254
// Helper used by the tests to decide whether an encoding can be exercised
// on the running VM.
// NOTE(review): most of this method's body is not visible in this listing.
// Presumably it attempts an operation that needs the named encoding and
// treats UnsupportedEncodingException as "not available" - confirm against
// the full file.
private static boolean charsetAvailable(String name) {
255
// hack to avoid using 1.4 classes
260
// Reached when the VM rejects the encoding name.
catch (UnsupportedEncodingException ex) {
267
// Serializes doc to a byte array in the given encoding, parses those bytes
// back into a Document, and then checks every child element of the reparsed
// root: the code point recovered from the element's text must equal the
// integer stored in its "c" attribute.
// NOTE(review): this listing is missing lines (closing braces and some
// statements), so the comments below describe only what is visible.
private void checkAll(String encoding)
268
throws ParsingException, IOException {
270
Builder builder = new Builder();
272
ByteArrayOutputStream out = new ByteArrayOutputStream(100000);
273
// Write data into a byte array using encoding
274
Serializer serializer = new Serializer(out, encoding);
275
serializer.write(doc);
279
// data is not declared in this method; presumably a field of the test class.
data = out.toByteArray();
280
InputStream in = new ByteArrayInputStream(data);
281
Document reparsed = builder.build(in);
285
Element reparsedRoot = reparsed.getRootElement();
286
int childCount = reparsedRoot.getChildCount();
287
for (int i = 0; i < childCount; i++) {
288
Element test = (Element) reparsedRoot.getChild(i);
289
String value = test.getValue();
291
// NOTE(review): the left-hand side of this assignment is not visible;
// presumably it declares the int variable "expected" used below.
= Integer.parseInt(test.getAttributeValue("c"));
292
// workaround for EBCDIC bugs
293
// NOTE(review): the body of this special case (code point 133 under
// Cp037) is not visible in this listing.
if (expected == 133 && encoding.equalsIgnoreCase("Cp037")) {
296
// Start from the first UTF-16 code unit of the element's text.
int actual = value.charAt(0);
297
// More than one code unit: combine the first two as a surrogate pair.
if (value.length() > 1) {
298
int low = value.charAt(1);
299
actual = (actual << 10) + low + SURROGATE_OFFSET;
301
// This doesn't work for all encodings, because there are
302
// a few cases where you write a Unicode compatibility
303
// character such as an Arabic presentation form,
304
// but read back what is essentially a different version
305
// of the same character. That is, the mapping from some
306
// legacy character sets to Unicode is not always 1-1.
308
// NOTE(review): the comment terminator at the end of the third line below
// suggests this message-building assertion is commented out in the full
// file; the matching opening delimiter is not visible here.
assertEquals("Expected 0x"
309
+ Integer.toHexString(expected).toUpperCase()
311
+ Integer.toHexString(actual).toUpperCase(), expected, actual); */
312
assertEquals(expected, actual);