1
/* Copyright 2002, 2003, 2005 Elliotte Rusty Harold
3
This library is free software; you can redistribute it and/or modify
4
it under the terms of version 2.1 of the GNU Lesser General Public
5
License as published by the Free Software Foundation.
7
This library is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU Lesser General Public License for more details.
12
You should have received a copy of the GNU Lesser General Public
13
License along with this library; if not, write to the
14
Free Software Foundation, Inc., 59 Temple Place, Suite 330,
15
Boston, MA 02111-1307 USA
17
You can contact Elliotte Rusty Harold by sending e-mail to
18
elharo@metalab.unc.edu. Please include the word "XOM" in the
19
subject line. The XOM home page is located at http://www.xom.nu/
24
import java.io.IOException;
25
import java.io.Writer;
28
* @author Elliotte Rusty Harold
32
final class UnicodeWriter extends TextWriter {
34
UnicodeWriter(Writer out, String encoding) {
39
* @see nu.xom.TextWriter#needsEscaping(char)
41
boolean needsEscaping(char c) {
46
// Writes a string of markup. Takes a fast path when
// getUnicodeLengthForMarkup(s) reports a non-negative character count,
// and otherwise falls back to handling s one char at a time.
void writeMarkup(String s) throws IOException {
52
int unicodeStringLength = getUnicodeLengthForMarkup(s);
53
// A negative result (-1) means s contains something that needs per-character handling.
if (unicodeStringLength >= 0) {
55
// Only a non-empty string moves the column or resets the whitespace flags.
if (unicodeStringLength > 0) {
56
// Advance by Unicode characters rather than Java chars, so a surrogate pair counts once.
column += unicodeStringLength;
57
lastCharacterWasSpace = false;
58
skipFollowingLinefeed = false;
62
else { // write character by character
63
int length = s.length();
64
for (int i=0; i < length; i++) {
65
// Delegates each Java char to the single-character writeMarkup overload.
writeMarkup(s.charAt(i));
73
* This is tricky. This method is doing two things:
75
* 1. It's counting the number of Unicode characters in s.
76
* 2. It's checking to see if this text contains anything
77
* that might need to be escaped.
79
* If the latter it returns -1; otherwise it returns the number of characters.
81
private static int getUnicodeLengthForMarkup(String s) {
83
// Running count of Unicode characters seen so far; a surrogate pair adds one.
int unicodeLength = 0;
84
// Length in UTF-16 code units, which may exceed the Unicode character count.
int javaLength = s.length();
85
for (int i = 0; i < javaLength; i++) {
88
// Really we're testing only for \t, \n, and space here.
89
// However all other characters less than or equal to 32
90
// can't appear in markup sections.
91
// These characters cause an adjustment of
92
// lastCharacterWasSpace, skipFollowingLinefeed, and justBroke
93
// They may need to be escaped but only in doctype declarations.
94
// Should these have their own writeDoctypeDeclaration method????
95
// Also an issue with spaces and such in PIs, XML declaration, comments
98
// Count the low surrogates but skip the high surrogates
99
// so surrogate pairs aren't counted twice.
100
// 0xD800-0xDBFF is the high-surrogate range; every other char increments the count.
else if (c < 0xD800 || c > 0xDBFF) unicodeLength++;
102
// Reached only when no character triggered an early exit from the loop.
return unicodeLength;
107
// Writes an attribute value. Takes a fast path when
// getUnicodeLengthForAttributeValue(s) reports a non-negative character
// count, and otherwise handles s one char at a time.
void writeAttributeValue(String s) throws IOException {
112
int unicodeStringLength = getUnicodeLengthForAttributeValue(s);
113
// A negative result (-1) means s contains a character that needs per-character handling.
if (unicodeStringLength >= 0) {
115
// Only a non-empty string moves the column or resets the whitespace flags.
if (unicodeStringLength > 0) {
116
// Advance by Unicode characters rather than Java chars, so a surrogate pair counts once.
column += unicodeStringLength;
117
lastCharacterWasSpace = false;
118
skipFollowingLinefeed = false;
123
int length = s.length();
124
for (int i=0; i < length; i++) {
125
// Delegates each Java char to the single-character writeAttributeValue overload.
writeAttributeValue(s.charAt(i));
132
// All three getUnicodeLengthForFOO methods are very similar.
133
// Could the code duplication be eliminated efficiently somehow?
134
// Scans an attribute value and returns its length in Unicode characters
// (a surrogate pair counts once), or -1 as soon as a character is found
// that the caller must handle individually; tab, line feed, and carriage
// return are such characters.
// Throws XMLException for control characters 11, 12, and 14 through 31.
private static int getUnicodeLengthForAttributeValue(String s) {
136
// Running count of Unicode characters seen so far.
int unicodeLength = 0;
137
// Length in UTF-16 code units, which may exceed the Unicode character count.
int javaLength = s.length();
138
for (int i = 0; i < javaLength; i++) {
139
char c = s.charAt(i);
141
// Whitespace control characters: signal the caller with -1.
case '\t': return -1;
142
case '\n': return -1;
143
// 11 falls through to 12; both raise the same exception.
// NOTE(review): "unreachable" presumably means these characters are
// rejected before reaching the writer - confirm against the callers.
case 11: // unreachable
144
case 12: throw new XMLException("Bad character snuck into document");
145
case '\r': return -1;
146
// Cases 14 through 31 all fall through to the single throw after case 31.
case 14: // unreachable
147
case 15: // unreachable
148
case 16: // unreachable
149
case 17: // unreachable
150
case 18: // unreachable
151
case 19: // unreachable
152
case 20: // unreachable
153
case 21: // unreachable
154
case 22: // unreachable
155
case 23: // unreachable
156
case 24: // unreachable
157
case 25: // unreachable
158
case 26: // unreachable
159
case 27: // unreachable
160
case 28: // unreachable
161
case 29: // unreachable
162
case 30: // unreachable
163
case 31: // unreachable
164
throw new XMLException("Bad character snuck into document");
253
// Skip high surrogates (0xD800-0xDBFF) so each surrogate pair is counted once.
if (c < 0xd800 || c > 0xDBFF) unicodeLength++;
256
// Reached only when no character triggered an early return or exception.
return unicodeLength;
261
// Writes parsed character data. Takes a fast path when
// getUnicodeLengthForPCDATA(s) reports a non-negative character count,
// and otherwise handles s one char at a time.
void writePCDATA(String s) throws IOException {
267
int unicodeStringLength = getUnicodeLengthForPCDATA(s);
268
// A negative result (-1) means s contains a character that needs per-character handling.
if (unicodeStringLength >= 0) {
270
// Only a non-empty string moves the column or resets the whitespace flags.
if (unicodeStringLength > 0) {
271
// Advance by Unicode characters rather than Java chars, so a surrogate pair counts once.
column += unicodeStringLength;
272
lastCharacterWasSpace = false;
273
skipFollowingLinefeed = false;
278
int length = s.length();
279
for (int i=0; i < length; i++) {
280
// Delegates each Java char to the single-character writePCDATA overload.
writePCDATA(s.charAt(i));
287
// Scans character data and returns its length in Unicode characters
// (a surrogate pair counts once), or -1 as soon as a character is found
// that the caller must handle individually; tab, line feed, and carriage
// return are such characters.
// Throws XMLException for control characters 11, 12, and 14 through 31.
private static int getUnicodeLengthForPCDATA(String s) {
289
// Running count of Unicode characters seen so far.
int unicodeLength = 0;
290
// Length in UTF-16 code units, which may exceed the Unicode character count.
int javaLength = s.length();
291
for (int i = 0; i < javaLength; i++) {
292
char c = s.charAt(i);
294
// Whitespace control characters: signal the caller with -1.
case '\t': return -1;
295
case '\n': return -1;
296
// 11 falls through to 12; both raise the same exception.
// NOTE(review): "unreachable" presumably means these characters are
// rejected before reaching the writer - confirm against the callers.
case 11: // unreachable
297
case 12: throw new XMLException("Bad character snuck into document");
298
case '\r': return -1;
299
// Cases 14 through 31 all fall through to the single throw after case 31.
case 14: // unreachable
300
case 15: // unreachable
301
case 16: // unreachable
302
case 17: // unreachable
303
case 18: // unreachable
304
case 19: // unreachable
305
case 20: // unreachable
306
case 21: // unreachable
307
case 22: // unreachable
308
case 23: // unreachable
309
case 24: // unreachable
310
case 25: // unreachable
311
case 26: // unreachable
312
case 27: // unreachable
313
case 28: // unreachable
314
case 29: // unreachable
315
case 30: // unreachable
316
case 31: // unreachable
317
throw new XMLException("Bad character snuck into document");
407
// Skip high surrogates (0xD800-0xDBFF) so each surrogate pair is counted once.
if (c < 0xd800 || c > 0xDBFF) unicodeLength++;
410
// Reached only when no character triggered an early return or exception.
return unicodeLength;