2
* Licensed to the Apache Software Foundation (ASF) under one or more
3
* contributor license agreements. See the NOTICE file distributed with
4
* this work for additional information regarding copyright ownership.
5
* The ASF licenses this file to You under the Apache License, Version 2.0
6
* (the "License"); you may not use this file except in compliance with
7
* the License. You may obtain a copy of the License at
9
* http://www.apache.org/licenses/LICENSE-2.0
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
20
// Fixed serializer to report IO exception directly, instead of at
21
// the end of document processing.
22
// Reported by Patrick Higgins <phiggins@transzap.com>
24
// Fixed bug in startDocument not calling prepare.
25
// Reported by Mikael Staldal <d96-mst-ingen-reklam@d.kth.se>
27
// Added ability to omit DOCTYPE declaration.
29
// If no output format is provided the serializer now defaults
30
// to ISO-8859-1 encoding. Reported by Mikael Staldal
34
package org.apache.xml.serialize;
36
import org.apache.xerces.dom.DOMMessageFormatter;
38
import java.io.IOException;
39
import java.io.OutputStream;
40
import java.io.Writer;
41
import java.util.Enumeration;
42
import java.util.Locale;
44
import org.w3c.dom.Attr;
45
import org.w3c.dom.Element;
46
import org.w3c.dom.NamedNodeMap;
47
import org.w3c.dom.Node;
48
import org.xml.sax.AttributeList;
49
import org.xml.sax.Attributes;
50
import org.xml.sax.SAXException;
54
* Implements an HTML/XHTML serializer supporting both DOM and SAX
55
* pretty serializing. HTML/XHTML mode is determined in the
56
* constructor. For usage instructions see {@link Serializer}.
58
* If an output stream is used, the encoding is taken from the
59
* output format (defaults to <tt>UTF-8</tt>; when no output format is supplied the serializer uses <tt>ISO-8859-1</tt>). If a writer is
60
* used, make sure the writer uses the same encoding (if applicable)
61
* as specified in the output format.
63
* The serializer supports both DOM and SAX. DOM serializing is done
64
* by calling {@link #serialize} and SAX serializing is done by firing
65
* SAX events and using the serializer as a document handler.
67
* If an I/O exception occurs while serializing through the SAX
68
* callbacks, it is not deferred: the callback rethrows it at
69
* once, wrapped in a {@link org.xml.sax.SAXException}, instead
70
* of waiting for {@link org.xml.sax.DocumentHandler#endDocument}.
72
* For elements that are not specified as whitespace preserving,
73
* the serializer will potentially break long text lines at space
74
* boundaries, indent lines, and serialize elements on separate
75
* lines. Line terminators will be regarded as spaces, and
76
* spaces at beginning of line will be stripped.
78
* XHTML is slightly different than HTML:
80
* <li>Element/attribute names are lower case and case matters
81
* <li>Attributes must specify value, even if empty string
82
* <li>Empty elements must have '/' in empty tag
83
* <li>Contents of SCRIPT and STYLE elements serialized as CDATA
86
* @deprecated This class was deprecated in Xerces 2.6.2. It is
87
* recommended that new applications use JAXP's Transformation API
88
* for XML (TrAX) for serializing HTML. See the Xerces documentation
89
* for more information.
90
* @version $Revision: 1.2 $ $Date: 2009/12/10 03:18:31 $
91
* @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
94
public class HTMLSerializer
95
extends BaseMarkupSerializer
100
* True if serializing in XHTML format.
102
// Read by the serializing methods to choose XHTML rather than HTML output rules.
private boolean _xhtml;
105
// Namespace URI compared against element namespaces to recognise XHTML elements.
public static final String XHTMLNamespace = "http://www.w3.org/1999/xhtml";
107
// for users to override XHTMLNamespace if need be.
// Stays null until setXHTMLNamespace(String) supplies an alternate namespace URI.
108
private String fUserXHTMLNamespace = null;
112
* Constructs a new HTML/XHTML serializer depending on the value of
113
* <tt>xhtml</tt>. The serializer cannot be used without calling
114
* {@link #setOutputCharStream} or {@link #setOutputByteStream} first.
116
* @param xhtml True if XHTML serializing
118
// Base constructor that the public constructors chain to, passing
// xhtml == false together with the output format to use.
// NOTE(review): the constructor body is not present in this extract —
// confirm what it does with both arguments against the upstream file.
protected HTMLSerializer( boolean xhtml, OutputFormat format )
126
* Constructs a new serializer. The serializer cannot be used without
127
* calling {@link #setOutputCharStream} or {@link #setOutputByteStream}
130
/**
 * Creates an HTML (non-XHTML) serializer configured with a fresh
 * {@link OutputFormat} built from {@code Method.HTML}, the
 * {@code "ISO-8859-1"} encoding and {@code false} as third argument.
 * An output stream or writer must still be supplied before use.
 */
public HTMLSerializer() {
    this(false, new OutputFormat(Method.HTML, "ISO-8859-1", false));
}
137
* Constructs a new serializer. The serializer cannot be used without
138
* calling {@link #setOutputCharStream} or {@link #setOutputByteStream}
141
/**
 * Creates an HTML (non-XHTML) serializer that uses the given output
 * format. An output stream or writer must still be supplied before use.
 *
 * @param format the output format to use; when {@code null} a format
 *               built from {@code Method.HTML} and {@code "ISO-8859-1"}
 *               is substituted
 */
public HTMLSerializer(OutputFormat format) {
    this(false, format == null
            ? new OutputFormat(Method.HTML, "ISO-8859-1", false)
            : format);
}
149
* Constructs a new serializer that writes to the specified writer
150
* using the specified output format. If <tt>format</tt> is null,
151
* will use a default output format.
153
* @param writer The writer to use
154
* @param format The output format to use, null for the default
156
/**
 * Creates an HTML (non-XHTML) serializer that writes to the given
 * writer using the given output format.
 *
 * @param writer the writer handed to {@code setOutputCharStream}
 * @param format the output format to use; when {@code null} a format
 *               built from {@code Method.HTML} and {@code "ISO-8859-1"}
 *               is substituted
 */
public HTMLSerializer(Writer writer, OutputFormat format) {
    this(false, format == null
            ? new OutputFormat(Method.HTML, "ISO-8859-1", false)
            : format);
    setOutputCharStream(writer);
}
164
* Constructs a new serializer that writes to the specified output
165
* stream using the specified output format. If <tt>format</tt>
166
* is null, will use a default output format.
168
* @param output The output stream to use
169
* @param format The output format to use, null for the default
171
/**
 * Creates an HTML (non-XHTML) serializer that writes to the given
 * output stream using the given output format.
 *
 * @param output the stream handed to {@code setOutputByteStream}
 * @param format the output format to use; when {@code null} a format
 *               built from {@code Method.HTML} and {@code "ISO-8859-1"}
 *               is substituted
 */
public HTMLSerializer(OutputStream output, OutputFormat format) {
    this(false, format == null
            ? new OutputFormat(Method.HTML, "ISO-8859-1", false)
            : format);
    setOutputByteStream(output);
}
178
public void setOutputFormat( OutputFormat format )
180
super.setOutputFormat( format != null ? format : new OutputFormat( Method.HTML, "ISO-8859-1", false ) );
183
// Set value for alternate XHTML namespace.
184
public void setXHTMLNamespace(String newNamespace) {
185
fUserXHTMLNamespace = newNamespace;
186
} // setXHTMLNamespace(String)
188
//-----------------------------------------//
189
// SAX content handler serializing methods //
190
//-----------------------------------------//
193
// SAX2 element-start callback: prints the opening tag of the element
// and its attributes, prints xmlns declarations taken from _prefixes,
// and then enters a new element state for the element.
// A missing printer raises IllegalStateException; an IOException raised
// while printing is rethrown wrapped in a SAXException.
// NOTE(review): several lines of this method (local declarations, the
// try/else keywords and some guards) are absent from this extract —
// confirm the exact control flow against the upstream file.
public void startElement( String namespaceURI, String localName,
194
String rawName, Attributes attrs )
198
boolean preserveSpace;
203
boolean addNSAttr = false;
206
// Nothing can be written until an output stream or writer was supplied.
if ( _printer == null )
207
throw new IllegalStateException(
208
DOMMessageFormatter.formatMessage(
209
DOMMessageFormatter.SERIALIZER_DOMAIN,
210
"NoWriterSupplied", null));
212
state = getElementState();
213
if ( isDocumentState() ) {
214
// If this is the root element handle it differently.
215
// If the first root element in the document, serialize
216
// the document's DOCTYPE. Space preserving defaults
217
// to that of the output format.
219
// The root name falls back to rawName when localName is null or empty.
startDocument( (localName == null || localName.length() == 0)
220
? rawName : localName );
222
// For any other element, if first in parent, then
223
// close parent's opening tag and use the parent's
226
// NOTE(review): the condition guarding this '>' is not visible in this extract.
_printer.printText( '>' );
227
// Indent this element on a new line if the first
228
// content of the parent element or immediately
229
// following an element.
230
if ( _indenting && ! state.preserveSpace &&
231
( state.empty || state.afterElement ) )
232
_printer.breakLine();
234
preserveSpace = state.preserveSpace;
236
// Do not change the current element state yet.
237
// This only happens in endElement().
239
// As per SAX2, the namespace URI is an empty string if the element has no
240
// namespace URI, or namespaces is turned off. The check against null protects
241
// against broken SAX implementations, so I've left it there. - mrglavas
242
boolean hasNamespaceURI = (namespaceURI != null && namespaceURI.length() != 0);
244
// SAX2: rawName (QName) could be empty string if
245
// namespace-prefixes property is false.
246
if ( rawName == null || rawName.length() == 0) {
248
if ( hasNamespaceURI ) {
250
// Rebuild a qualified name from the prefix bound to the namespace URI.
prefix = getPrefix( namespaceURI );
251
if ( prefix != null && prefix.length() != 0 )
252
rawName = prefix + ":" + localName;
256
// htmlName is the name used for the HTMLdtd lookups further down.
if ( !hasNamespaceURI )
259
if ( namespaceURI.equals( XHTMLNamespace ) ||
260
(fUserXHTMLNamespace != null && fUserXHTMLNamespace.equals(namespaceURI)) )
261
htmlName = localName;
266
// XHTML: element names are lower case, DOM will be different
267
_printer.printText( '<' );
269
// NOTE(review): the condition choosing between the lower-cased and the
// unchanged name is not visible in this extract.
_printer.printText( rawName.toLowerCase(Locale.ENGLISH) );
271
_printer.printText( rawName );
274
// For each attribute serialize its name and value as one part,
275
// separated with a space so the element can be broken on
277
if ( attrs != null ) {
278
for ( i = 0 ; i < attrs.getLength() ; ++i ) {
279
_printer.printSpace();
280
// Attribute names are lower-cased with a fixed locale so the result
// does not depend on the platform default locale.
// NOTE(review): the comment above says an element QName may be empty;
// confirm whether an attribute QName can be empty here as well.
name = attrs.getQName( i ).toLowerCase(Locale.ENGLISH);
281
value = attrs.getValue( i );
282
if ( _xhtml || hasNamespaceURI ) {
283
// XHTML: print empty string for null values.
284
if ( value == null ) {
285
_printer.printText( name );
286
_printer.printText( "=\"\"" );
288
_printer.printText( name );
289
_printer.printText( "=\"" );
290
printEscaped( value );
291
_printer.printText( '"' );
294
// HTML: Empty values print as attribute name, no value.
295
// HTML: URI attributes will print unescaped
296
if ( value == null ) {
299
// An empty value collapses to the bare attribute name unless the
// output format asks to preserve empty attributes.
if ( !_format.getPreserveEmptyAttributes() && value.length() == 0 )
300
_printer.printText( name );
301
else if ( HTMLdtd.isURI( rawName, name ) ) {
302
_printer.printText( name );
303
_printer.printText( "=\"" );
304
_printer.printText( escapeURI( value ) );
305
_printer.printText( '"' );
306
} else if ( HTMLdtd.isBoolean( rawName, name ) )
307
_printer.printText( name );
309
_printer.printText( name );
310
_printer.printText( "=\"" );
311
printEscaped( value );
312
_printer.printText( '"' );
317
if ( htmlName != null && HTMLdtd.isPreserveSpace( htmlName ) )
318
preserveSpace = true;
323
// _prefixes is iterated by key: the key is printed as the declaration
// value (the namespace URI) and the mapped value as the prefix.
// NOTE(review): addNSAttr is declared above but its use is not visible
// in this extract; presumably it guards this loop — confirm.
keys = _prefixes.keys();
324
while ( keys.hasMoreElements() ) {
325
_printer.printSpace();
326
value = (String) keys.nextElement();
327
name = (String) _prefixes.get( value );
328
// An empty prefix is written as the default namespace declaration.
if ( name.length() == 0 ) {
329
_printer.printText( "xmlns=\"" );
330
printEscaped( value );
331
_printer.printText( '"' );
333
_printer.printText( "xmlns:" );
334
_printer.printText( name );
335
_printer.printText( "=\"" );
336
printEscaped( value );
337
_printer.printText( '"' );
342
// Now it's time to enter a new element state
343
// with the tag name and space preserving.
344
// We still do not change the current element state.
345
state = enterElementState( namespaceURI, localName, rawName, preserveSpace );
347
// Prevents line breaks inside A/TD
349
if ( htmlName != null && ( htmlName.equalsIgnoreCase( "A" ) ||
350
htmlName.equalsIgnoreCase( "TD" ) ) ) {
352
_printer.printText( '>' );
355
// Handle SCRIPT and STYLE specifically by changing the
356
// state of the current element to CDATA (XHTML) or
358
// NOTE(review): the A/TD test above compares htmlName while this test
// compares rawName, so a prefixed name such as "x:script" would not
// match here — confirm whether that difference is intended.
if ( htmlName != null && ( rawName.equalsIgnoreCase( "SCRIPT" ) ||
359
rawName.equalsIgnoreCase( "STYLE" ) ) ) {
361
// XHTML: Print contents as CDATA section
362
state.doCData = true;
364
// HTML: Print contents unescaped
365
state.unescaped = true;
368
// I/O failures are reported to the SAX caller immediately.
} catch ( IOException except ) {
369
throw new SAXException( except );
374
// SAX2 element-end callback: delegates to endElementIO and rethrows an
// IOException from it wrapped in a SAXException.
// NOTE(review): the rest of the signature and the opening of the try
// block are absent from this extract.
public void endElement( String namespaceURI, String localName,
379
endElementIO( namespaceURI, localName, rawName );
380
} catch ( IOException except ) {
381
throw new SAXException( except );
386
// Closes the element described by the current element state: prints
// either an empty-tag terminator or a closing tag, leaves the element
// state, and marks the parent state as "after element" unless the
// closed element was A or TD.
// NOTE(review): the branch keywords (if/else on XHTML vs HTML and on
// empty vs non-empty elements) are absent from this extract — confirm
// which printText calls belong to which branch against upstream.
public void endElementIO( String namespaceURI, String localName,
393
// Works much like content() with additions for closing
394
// an element. Note the different checks for the closed
395
// element's state and the parent element's state.
397
state = getElementState();
399
// The names are read back from the element state, not from the arguments.
if ( state.namespaceURI == null || state.namespaceURI.length() == 0 )
400
htmlName = state.rawName;
402
if ( state.namespaceURI.equals( XHTMLNamespace ) ||
403
(fUserXHTMLNamespace != null && fUserXHTMLNamespace.equals(state.namespaceURI)) )
404
htmlName = state.localName;
411
_printer.printText( " />" );
413
// Must leave CData section first
415
_printer.printText( "]]>" );
416
// XHTML: element names are lower case, DOM will be different
417
_printer.printText( "</" );
418
_printer.printText( state.rawName.toLowerCase(Locale.ENGLISH) );
419
_printer.printText( '>' );
423
_printer.printText( '>' );
424
// This element is not empty and that last content was
425
// another element, so print a line break before that
426
// last element and this element's closing tag.
427
// [keith] Provided this is not an anchor.
428
// HTML: some elements do not print closing tag (e.g. LI)
429
if ( htmlName == null || ! HTMLdtd.isOnlyOpening( htmlName ) ) {
430
if ( _indenting && ! state.preserveSpace && state.afterElement )
431
_printer.breakLine();
432
// Must leave CData section first (Illegal in HTML, but still)
434
_printer.printText( "]]>" );
435
_printer.printText( "</" );
436
_printer.printText( state.rawName );
437
_printer.printText( '>' );
440
// Leave the element state and update that of the parent
441
// (if we're not root) to not empty and after element.
442
state = leaveElementState();
443
// Temporary hack to prevent line breaks inside A/TD
444
if ( htmlName == null || ( ! htmlName.equalsIgnoreCase( "A" ) &&
445
! htmlName.equalsIgnoreCase( "TD" ) ) )
447
state.afterElement = true;
449
// NOTE(review): the statement executed in document state is not visible in this extract.
if ( isDocumentState() )
454
//------------------------------------------//
455
// SAX document handler serializing methods //
456
//------------------------------------------//
459
// SAX character-data callback: clears the doCData flag on the element
// state and then delegates to super.characters; an IOException is
// rethrown wrapped in a SAXException.
// NOTE(review): the statement that assigns `state` is absent from this
// extract — confirm where it comes from against upstream.
public void characters( char[] chars, int start, int length )
465
// HTML: no CDATA section
467
state.doCData = false;
468
super.characters( chars, start, length );
469
} catch ( IOException except ) {
470
throw new SAXException( except );
475
// SAX1 element-start callback: prints the opening tag named tagName and
// its attributes, then enters a new element state with null namespace
// URI and local name.
// A missing printer raises IllegalStateException; an IOException raised
// while printing is rethrown wrapped in a SAXException.
// NOTE(review): several lines of this method (local declarations, the
// try/else keywords and some guards) are absent from this extract —
// confirm the exact control flow against the upstream file.
public void startElement( String tagName, AttributeList attrs )
479
boolean preserveSpace;
485
// Nothing can be written until an output stream or writer was supplied.
if ( _printer == null )
486
throw new IllegalStateException(
487
DOMMessageFormatter.formatMessage(
488
DOMMessageFormatter.SERIALIZER_DOMAIN,
489
"NoWriterSupplied", null));
492
state = getElementState();
493
if ( isDocumentState() ) {
494
// If this is the root element handle it differently.
495
// If the first root element in the document, serialize
496
// the document's DOCTYPE. Space preserving defaults
497
// to that of the output format.
499
startDocument( tagName );
501
// For any other element, if first in parent, then
502
// close parent's opening tag and use the parent's
505
// NOTE(review): the condition guarding this '>' is not visible in this extract.
_printer.printText( '>' );
506
// Indent this element on a new line if the first
507
// content of the parent element or immediately
508
// following an element.
509
if ( _indenting && ! state.preserveSpace &&
510
( state.empty || state.afterElement ) )
511
_printer.breakLine();
513
preserveSpace = state.preserveSpace;
515
// Do not change the current element state yet.
516
// This only happens in endElement().
518
// XHTML: element names are lower case, DOM will be different
519
_printer.printText( '<' );
521
// NOTE(review): the condition choosing between the lower-cased and the
// unchanged name is not visible in this extract.
_printer.printText( tagName.toLowerCase(Locale.ENGLISH) );
523
_printer.printText( tagName );
526
// For each attribute serialize its name and value as one part,
527
// separated with a space so the element can be broken on
529
if ( attrs != null ) {
530
for ( i = 0 ; i < attrs.getLength() ; ++i ) {
531
_printer.printSpace();
532
// Attribute names are lower-cased with a fixed locale so the result
// does not depend on the platform default locale.
name = attrs.getName( i ).toLowerCase(Locale.ENGLISH);
533
value = attrs.getValue( i );
535
// XHTML: print empty string for null values.
536
if ( value == null ) {
537
_printer.printText( name );
538
_printer.printText( "=\"\"" );
540
_printer.printText( name );
541
_printer.printText( "=\"" );
542
printEscaped( value );
543
_printer.printText( '"' );
546
// HTML: Empty values print as attribute name, no value.
547
// HTML: URI attributes will print unescaped
548
if ( value == null ) {
551
// An empty value collapses to the bare attribute name unless the
// output format asks to preserve empty attributes.
if ( !_format.getPreserveEmptyAttributes() && value.length() == 0 )
552
_printer.printText( name );
553
else if ( HTMLdtd.isURI( tagName, name ) ) {
554
_printer.printText( name );
555
_printer.printText( "=\"" );
556
_printer.printText( escapeURI( value ) );
557
_printer.printText( '"' );
558
} else if ( HTMLdtd.isBoolean( tagName, name ) )
559
_printer.printText( name );
561
_printer.printText( name );
562
_printer.printText( "=\"" );
563
printEscaped( value );
564
_printer.printText( '"' );
569
if ( HTMLdtd.isPreserveSpace( tagName ) )
570
preserveSpace = true;
572
// Now it's time to enter a new element state
573
// with the tag name and space preserving.
574
// We still do not change the current element state.
575
state = enterElementState( null, null, tagName, preserveSpace );
577
// Prevents line breaks inside A/TD
578
if ( tagName.equalsIgnoreCase( "A" ) || tagName.equalsIgnoreCase( "TD" ) ) {
580
_printer.printText( '>' );
583
// Handle SCRIPT and STYLE specifically by changing the
584
// state of the current element to CDATA (XHTML) or
586
if ( tagName.equalsIgnoreCase( "SCRIPT" ) ||
587
tagName.equalsIgnoreCase( "STYLE" ) ) {
589
// XHTML: Print contents as CDATA section
590
state.doCData = true;
592
// HTML: Print contents unescaped
593
state.unescaped = true;
596
// I/O failures are reported to the SAX caller immediately.
} catch ( IOException except ) {
597
throw new SAXException( except );
602
// SAX1 element-end callback: forwards to the three-argument endElement
// with null namespace URI and local name, using tagName as the raw name.
public void endElement( String tagName )
605
endElement( null, null, tagName );
609
//------------------------------------------//
610
// Generic node serializing methods         //
611
//------------------------------------------//
615
* Called to serialize the document's DOCTYPE by the root element.
616
* The document type declaration must name the root element,
617
* but the root element is only known when that element is serialized,
618
* and not at the start of the document.
620
* This method will check if it has not been called before ({@link #_started}),
621
* will serialize the document type declaration, and will serialize all
622
* pre-root comments and PIs that were accumulated in the document
623
* (see {@link #serializePreRoot}). Pre-root will be serialized even if
624
* this is not the first root element of the document.
626
// Prints the DOCTYPE declaration unless the output format asks to omit
// it. When neither a public nor a system identifier is set, both are
// first taken from the HTMLdtd constants.
// NOTE(review): the rootTagName parameter is not referenced in the
// visible lines, and the XHTML/HTML branch keywords plus the trailing
// statements are absent from this extract — confirm against upstream.
protected void startDocument( String rootTagName )
629
// Not supported in HTML/XHTML, but we still have to switch
633
// If the public and system identifiers were not specified
634
// in the output format, use the appropriate ones for HTML
636
if ( _docTypePublicId == null && _docTypeSystemId == null ) {
638
// NOTE(review): the condition choosing the XHTML or the HTML
// identifier pair is not visible in this extract.
_docTypePublicId = HTMLdtd.XHTMLPublicId;
639
_docTypeSystemId = HTMLdtd.XHTMLSystemId;
641
_docTypePublicId = HTMLdtd.HTMLPublicId;
642
_docTypeSystemId = HTMLdtd.HTMLSystemId;
646
if ( ! _format.getOmitDocumentType() ) {
647
// XHTML: If public identifier and system identifier
648
// specified, print them, else print just system identifier
649
// HTML: If public identifier specified, print it with
650
// system identifier, if specified.
651
// XHTML requires that all element names are lower case, so the
652
// root on the DOCTYPE must be 'html'. - mrglavas
653
if ( _docTypePublicId != null && ( ! _xhtml || _docTypeSystemId != null ) ) {
655
_printer.printText( "<!DOCTYPE html PUBLIC " );
658
_printer.printText( "<!DOCTYPE HTML PUBLIC " );
660
printDoctypeURL( _docTypePublicId );
661
if ( _docTypeSystemId != null ) {
663
// The system identifier follows either on a new line or after one space.
_printer.breakLine();
664
_printer.printText( " " );
666
_printer.printText( ' ' );
667
printDoctypeURL( _docTypeSystemId );
669
_printer.printText( '>' );
670
_printer.breakLine();
671
} else if ( _docTypeSystemId != null ) {
673
_printer.printText( "<!DOCTYPE html SYSTEM " );
676
_printer.printText( "<!DOCTYPE HTML SYSTEM " );
678
printDoctypeURL( _docTypeSystemId );
679
_printer.printText( '>' );
680
_printer.breakLine();
686
// Always serialize these, even if not the first root element.
692
* Called to serialize a DOM element. Equivalent to calling {@link
693
* #startElement}, {@link #endElement} and serializing everything
694
* in between, but better optimized.
696
// Serializes a DOM element: prints the opening tag and the attributes
// whose getSpecified() is true, then either serializes the children and
// closes the element through endElementIO, or terminates the tag
// directly when the element has no children and HTMLdtd reports it as
// an empty tag.
// NOTE(review): several lines of this method (local declarations, the
// else keywords and some guards) are absent from this extract — confirm
// the exact control flow against the upstream file.
protected void serializeElement( Element elem )
700
NamedNodeMap attrMap;
704
boolean preserveSpace;
709
tagName = elem.getTagName();
710
state = getElementState();
711
if ( isDocumentState() ) {
712
// If this is the root element handle it differently.
713
// If the first root element in the document, serialize
714
// the document's DOCTYPE. Space preserving defaults
715
// to that of the output format.
717
startDocument( tagName );
719
// For any other element, if first in parent, then
720
// close parent's opening tag and use the parent's
723
// NOTE(review): the condition guarding this '>' is not visible in this extract.
_printer.printText( '>' );
724
// Indent this element on a new line if the first
725
// content of the parent element or immediately
726
// following an element.
727
if ( _indenting && ! state.preserveSpace &&
728
( state.empty || state.afterElement ) )
729
_printer.breakLine();
731
preserveSpace = state.preserveSpace;
733
// Do not change the current element state yet.
734
// This only happens in endElement().
736
// XHTML: element names are lower case, DOM will be different
737
_printer.printText( '<' );
739
// NOTE(review): the condition choosing between the lower-cased and the
// unchanged name is not visible in this extract.
_printer.printText( tagName.toLowerCase(Locale.ENGLISH) );
741
_printer.printText( tagName );
744
// Lookup the element's attribute, but only print specified
745
// attributes. (Unspecified attributes are derived from the DTD.
746
// For each attribute print its name and value as one part,
747
// separated with a space so the element can be broken on
749
attrMap = elem.getAttributes();
750
if ( attrMap != null ) {
751
for ( i = 0 ; i < attrMap.getLength() ; ++i ) {
752
attr = (Attr) attrMap.item( i );
753
// Attribute names are lower-cased with a fixed locale so the result
// does not depend on the platform default locale.
name = attr.getName().toLowerCase(Locale.ENGLISH);
754
value = attr.getValue();
755
if ( attr.getSpecified() ) {
756
_printer.printSpace();
758
// XHTML: print empty string for null values.
759
if ( value == null ) {
760
_printer.printText( name );
761
_printer.printText( "=\"\"" );
763
_printer.printText( name );
764
_printer.printText( "=\"" );
765
printEscaped( value );
766
_printer.printText( '"' );
769
// HTML: Empty values print as attribute name, no value.
770
// HTML: URI attributes will print unescaped
771
if ( value == null ) {
774
// An empty value collapses to the bare attribute name unless the
// output format asks to preserve empty attributes.
if ( !_format.getPreserveEmptyAttributes() && value.length() == 0 )
775
_printer.printText( name );
776
else if ( HTMLdtd.isURI( tagName, name ) ) {
777
_printer.printText( name );
778
_printer.printText( "=\"" );
779
_printer.printText( escapeURI( value ) );
780
_printer.printText( '"' );
781
} else if ( HTMLdtd.isBoolean( tagName, name ) )
782
_printer.printText( name );
784
_printer.printText( name );
785
_printer.printText( "=\"" );
786
printEscaped( value );
787
_printer.printText( '"' );
793
if ( HTMLdtd.isPreserveSpace( tagName ) )
794
preserveSpace = true;
796
// If element has children, or if element is not an empty tag,
797
// serialize an opening tag.
798
if ( elem.hasChildNodes() || ! HTMLdtd.isEmptyTag( tagName ) ) {
799
// Enter an element state, and serialize the children
800
// one by one. Finally, end the element.
801
state = enterElementState( null, null, tagName, preserveSpace );
803
// Prevents line breaks inside A/TD
804
if ( tagName.equalsIgnoreCase( "A" ) || tagName.equalsIgnoreCase( "TD" ) ) {
806
_printer.printText( '>' );
809
// Handle SCRIPT and STYLE specifically by changing the
810
// state of the current element to CDATA (XHTML) or
812
if ( tagName.equalsIgnoreCase( "SCRIPT" ) ||
813
tagName.equalsIgnoreCase( "STYLE" ) ) {
815
// XHTML: Print contents as CDATA section
816
state.doCData = true;
818
// HTML: Print contents unescaped
819
state.unescaped = true;
822
// Walk the child list in document order.
child = elem.getFirstChild();
823
while ( child != null ) {
824
serializeNode( child );
825
child = child.getNextSibling();
827
endElementIO( null, null, tagName );
830
// XHTML: Close empty tag with ' />' so it's XML and HTML compatible.
831
// HTML: Empty tags are defined as such in DTD, not in document.
833
_printer.printText( " />" );
835
_printer.printText( '>' );
836
// After element but parent element is no longer empty.
837
state.afterElement = true;
839
// NOTE(review): the statement executed in document state is not visible in this extract.
if ( isDocumentState() )
846
// Text hook for string content: forwards the text to super.characters.
// NOTE(review): a line between the comment below and the super call is
// absent from this extract, so the step that keeps the text out of a
// CDATA section is not visible here — confirm against upstream.
protected void characters( String text )
849
// HTML: no CDATA section
851
super.characters( text );
855
protected String getEntityRef( int ch )
857
return HTMLdtd.fromChar( ch );
861
protected String escapeURI( String uri )
865
// XXX Apparently Netscape doesn't like if we escape the URI
866
// using %nn, so we leave it as is, just remove any quotes.
867
index = uri.indexOf( "\"" );
869
return uri.substring( 0, index );