1
/* Copyright (c) 2006-2007, Vladimir Nikic
4
Redistribution and use of this software in source and binary forms,
5
with or without modification, are permitted provided that the following
8
* Redistributions of source code must retain the above
9
copyright notice, this list of conditions and the
12
* Redistributions in binary form must reproduce the above
13
copyright notice, this list of conditions and the
14
following disclaimer in the documentation and/or other
15
materials provided with the distribution.
17
* The name of HtmlCleaner may not be used to endorse or promote
18
products derived from this software without specific prior
21
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31
POSSIBILITY OF SUCH DAMAGE.
33
You can contact Vladimir Nikic by sending e-mail to
34
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
38
package org.htmlcleaner;
44
* Main HTML tokenizer.
45
* <p>Its task is to parse HTML and produce a list of valid tokens:
46
* open tag tokens, end tag tokens, contents (text) and comments.
47
* As soon as a new item is added to the token list, the cleaner is invoked
48
* to clean the current list at the end.</p>
50
abstract public class HtmlTokenizer {
52
/**
 * Capacity, in chars, of the {@code _working} buffer. {@code readIfNeeded}
 * refills the buffer once the current position plus the requested look-ahead
 * reaches this size, as long as the end of input has not been recorded yet.
 */
private final static int WORKING_BUFFER_SIZE = 1024;
54
private BufferedReader _reader;
55
private char[] _working = new char[WORKING_BUFFER_SIZE];
57
/**
 * Index into {@code _working} of the character currently being examined.
 * {@code start()} sets it to {@code WORKING_BUFFER_SIZE} before parsing begins.
 */
private transient int _pos = 0;
58
/**
 * Number of valid chars in {@code _working}, assigned by {@code readIfNeeded};
 * stays -1 until then. {@code isAllRead} treats a non-negative value as the
 * end-of-content boundary, and {@code readIfNeeded} only refills the buffer
 * while this is still -1.
 */
private transient int _len = -1;
60
/**
 * Temporary buffer that {@code save(char)} appends to; when it is full,
 * {@code save} allocates a copy that is 512 chars larger.
 */
private transient char _saved[] = new char[512];
61
/** Number of chars currently stored in {@code _saved}; also the index of the next free slot. */
private transient int _savedLen = 0;
63
private transient DoctypeToken _docType = null;
64
private transient TagToken _currentTagToken = null;
65
private transient List<BaseToken> _tokenList = new ArrayList<BaseToken>();
67
private boolean _asExpected = true;
69
private boolean _isScriptContext = false;
71
private CleanerProperties props;
73
private boolean isOmitUnknownTags;
74
private boolean isTreatUnknownTagsAsContent;
75
private boolean isOmitDeprecatedTags;
76
private boolean isTreatDeprecatedTagsAsContent;
77
private boolean isNamespacesAware;
78
private boolean isOmitComments;
79
private boolean isAllowMultiWordAttributes;
80
private boolean isAllowHtmlInsideAttributes;
82
private CleanerTransformations transformations;
83
private ITagInfoProvider tagInfoProvider;
85
private StringBuilder commonStr = new StringBuilder();
88
* Constructor - creates an instance of the parser with the specified content.
92
* @param transformations
93
* @param tagInfoProvider
97
public HtmlTokenizer(Reader reader, CleanerProperties props, CleanerTransformations transformations, ITagInfoProvider tagInfoProvider) throws IOException {
98
this._reader = new BufferedReader(reader);
100
this.isOmitUnknownTags = props.isOmitUnknownTags();
101
this.isTreatUnknownTagsAsContent = props.isTreatUnknownTagsAsContent();
102
this.isOmitDeprecatedTags = props.isOmitDeprecatedTags();
103
this.isTreatDeprecatedTagsAsContent = props.isTreatDeprecatedTagsAsContent();
104
this.isNamespacesAware = props.isNamespacesAware();
105
this.isOmitComments = props.isOmitComments();
106
this.isAllowMultiWordAttributes = props.isAllowMultiWordAttributes();
107
this.isAllowHtmlInsideAttributes = props.isAllowHtmlInsideAttributes();
108
this.transformations = transformations;
109
this.tagInfoProvider = tagInfoProvider;
112
private void addToken(BaseToken token) {
113
_tokenList.add(token);
114
makeTree(_tokenList);
117
/**
 * Hook invoked by {@code addToken} each time a token has been appended.
 * It receives the tokenizer's own token list (not a copy).
 * NOTE(review): per the class description, implementations are expected to
 * clean the end of the list; confirm against the concrete subclass.
 *
 * @param tokenList all tokens produced so far, including the one just added
 */
abstract void makeTree(List<BaseToken> tokenList);
119
/**
 * Factory for the node that represents an opening tag. {@code tagStart} calls
 * it with the tag name (after any tag transformation has been applied) and
 * uses the result as the current tag token.
 *
 * @param name name of the tag to create a node for
 * @return the tag node for that name
 */
abstract TagNode createTagNode(String name);
121
private void readIfNeeded(int neededChars) throws IOException {
122
if (_len == -1 && _pos + neededChars >= WORKING_BUFFER_SIZE) {
123
int numToCopy = WORKING_BUFFER_SIZE - _pos;
124
System.arraycopy(_working, _pos, _working, 0, numToCopy);
127
int expected = WORKING_BUFFER_SIZE - numToCopy;
130
int offset = numToCopy;
132
charsRead = _reader.read(_working, offset, expected);
133
if (charsRead >= 0) {
136
expected -= charsRead;
138
} while (charsRead >= 0 && expected > 0);
141
_len = size + numToCopy;
144
// convert invalid XML characters to spaces
145
for (int i = 0; i < (_len >= 0 ? _len : WORKING_BUFFER_SIZE); i++) {
146
int ch = _working[i];
147
if (ch >= 1 && ch <= 32 && ch != 10 && ch != 13) {
154
List<BaseToken> getTokenList() {
155
return this._tokenList;
158
private void go() throws IOException {
163
private void go(int step) throws IOException {
165
readIfNeeded(step - 1);
169
* Checks if content starts with specified value at the current position.
171
* @return true if starts with specified value, false otherwise.
172
* @throws IOException
174
private boolean startsWith(String value) throws IOException {
175
int valueLen = value.length();
176
readIfNeeded(valueLen);
177
if (_len >= 0 && _pos + valueLen > _len) {
181
for (int i = 0; i < valueLen; i++) {
182
char ch1 = Character.toLowerCase( value.charAt(i) );
183
char ch2 = Character.toLowerCase( _working[_pos + i] );
192
private boolean startsWithSimple(String value) throws IOException {
193
int valueLen = value.length();
194
readIfNeeded(valueLen);
195
if (_len >= 0 && _pos + valueLen > _len) {
199
for (int i = 0; i < valueLen; i++) {
200
if (value.charAt(i) != _working[_pos + i]) {
209
* Checks if character at specified position is whitespace.
211
* @return true if whitespace, false otherwise.
213
private boolean isWhitespace(int position) {
214
if (_len >= 0 && position >= _len) {
218
return Character.isWhitespace( _working[position] );
222
* Checks if character at current runtime position is whitespace.
223
* @return true if whitespace, false otherwise.
225
private boolean isWhitespace() {
226
return isWhitespace(_pos);
229
private boolean isWhitespaceSafe() {
230
return Character.isWhitespace( _working[_pos] );
234
* Checks if character at specified position is equal to specified char.
237
* @return true if equal (ignoring case), false otherwise.
239
private boolean isChar(int position, char ch) {
240
if (_len >= 0 && position >= _len) {
244
return Character.toLowerCase(ch) == Character.toLowerCase(_working[position]);
248
* Checks if character at current runtime position is equal to specified char.
250
* @return true if equal (ignoring case), false otherwise.
252
private boolean isChar(char ch) {
253
return isChar(_pos, ch);
256
private boolean isCharSimple(char ch) {
257
return (_len < 0 || _pos < _len) && (ch == _working[_pos]);
261
* @return Current character to be read, but first it must be checked if it exists.
262
* This method is made for performance reasons to be used instead of isChar(...).
264
private char getCurrentChar() {
265
return _working[_pos];
268
private boolean isCharEquals(char ch) {
269
return _working[_pos] == ch;
273
* Checks if character at specified position can be identifier start.
275
* @return true if it may be an identifier start, false otherwise.
277
private boolean isIdentifierStartChar(int position) {
278
if (_len >= 0 && position >= _len) {
282
char ch = _working[position];
283
return Character.isUnicodeIdentifierStart(ch) || ch == '_';
287
* Checks if character at current runtime position can be identifier start.
288
* @return true if it may be an identifier start, false otherwise.
290
private boolean isIdentifierStartChar() {
291
return isIdentifierStartChar(_pos);
295
* Checks if character at current runtime position can be identifier part.
296
* @return true if it may be an identifier part, false otherwise.
298
private boolean isIdentifierChar() {
299
if (_len >= 0 && _pos >= _len) {
303
char ch = _working[_pos];
304
return Character.isUnicodeIdentifierStart(ch) || Character.isDigit(ch) || Utils.isIdentifierHelperChar(ch);
307
private boolean isValidXmlChar() {
308
return isAllRead() || Utils.isValidXmlChar(_working[_pos]);
311
private boolean isValidXmlCharSafe() {
312
return Utils.isValidXmlChar(_working[_pos]);
316
* Checks if end of the content is reached.
318
private boolean isAllRead() {
319
return _len >= 0 && _pos >= _len;
323
* Saves specified character to the temporary buffer.
326
private void save(char ch) {
327
if (_savedLen >= _saved.length) {
328
char newSaved[] = new char[_saved.length + 512];
329
System.arraycopy(_saved, 0, newSaved, 0, _saved.length);
332
_saved[_savedLen++] = ch;
336
* Saves character at current runtime position to the temporary buffer.
338
private void saveCurrent() {
340
save( _working[_pos] );
344
private void saveCurrentSafe() {
345
save( _working[_pos] );
349
* Saves specified number of characters at current runtime position to the temporary buffer.
350
* @throws IOException
352
private void saveCurrent(int size) throws IOException {
355
while ( !isAllRead() && (size > 0) ) {
356
save( _working[pos] );
363
* Skips whitespace at the current position and moves forward until a
364
* non-whitespace character is found or the end of content is reached.
365
* @throws IOException
367
private void skipWhitespaces() throws IOException {
368
while ( !isAllRead() && isWhitespaceSafe() ) {
374
private boolean addSavedAsContent() {
376
addToken(new ContentNode(_saved, _savedLen));
385
* Starts parsing HTML.
386
* @throws IOException
388
void start() throws IOException {
389
// initialize runtime values
390
_currentTagToken = null;
393
_isScriptContext = false;
395
boolean isLateForDoctype = false;
397
this._pos = WORKING_BUFFER_SIZE;
400
boolean isScriptEmpty = true;
402
while ( !isAllRead() ) {
403
// resets all the runtime values
405
_currentTagToken = null;
408
// this is enough for making decision
411
if (_isScriptContext) {
412
if ( startsWith("</script") && (isWhitespace(_pos + 8) || isChar(_pos + 8, '>')) ) {
414
} else if ( isScriptEmpty && startsWithSimple("<!--") ) {
417
boolean isTokenAdded = content();
418
if (isScriptEmpty && isTokenAdded) {
419
final BaseToken lastToken = _tokenList.get(_tokenList.size() - 1);
420
if (lastToken != null) {
421
final String lastTokenAsString = lastToken.toString();
422
if (lastTokenAsString != null && lastTokenAsString.trim().length() > 0) {
423
isScriptEmpty = false;
428
if (!_isScriptContext) {
429
isScriptEmpty = true;
432
if ( startsWith("<!doctype") ) {
433
if ( !isLateForDoctype ) {
435
isLateForDoctype = true;
439
} else if ( startsWithSimple("</") && isIdentifierStartChar(_pos + 2) ) {
440
isLateForDoctype = true;
442
} else if ( startsWithSimple("<!--") ) {
444
} else if ( startsWithSimple("<") && isIdentifierStartChar(_pos + 1) ) {
445
isLateForDoctype = true;
447
} else if ( props.isIgnoreQuestAndExclam() && (startsWithSimple("<!") || startsWithSimple("<?")) ) {
449
if (isCharSimple('>')) {
462
* Checks if specified tag name is one of the reserved tags: HTML, HEAD or BODY
466
private boolean isReservedTag(String tagName) {
467
tagName = tagName.toLowerCase();
468
return "html".equals(tagName) || "head".equals(tagName) || "body".equals(tagName);
472
* Parses start of the tag.
473
* It expects that current position is at the "<" after which
474
* the tag's name follows.
475
* @throws IOException
477
private void tagStart() throws IOException {
485
String tagName = identifier();
487
TagTransformation tagTransformation = null;
488
if (transformations != null && transformations.hasTransformationForTag(tagName)) {
489
tagTransformation = transformations.getTransformation(tagName);
490
if (tagTransformation != null) {
491
tagName = tagTransformation.getDestTag();
495
if (tagName != null) {
496
TagInfo tagInfo = tagInfoProvider.getTagInfo(tagName);
497
if ( (tagInfo == null && !isOmitUnknownTags && isTreatUnknownTagsAsContent && !isReservedTag(tagName)) ||
498
(tagInfo != null && tagInfo.isDeprecated() && !isOmitDeprecatedTags && isTreatDeprecatedTagsAsContent) ) {
504
TagNode tagNode = createTagNode(tagName);
505
_currentTagToken = tagNode;
511
if (tagName != null) {
512
if (tagTransformation != null) {
513
tagNode.transformAttributes(tagTransformation);
515
addToken(_currentTagToken);
518
if ( isCharSimple('>') ) {
520
if ( "script".equalsIgnoreCase(tagName) ) {
521
_isScriptContext = true;
523
} else if ( startsWithSimple("/>") ) {
525
if ( "script".equalsIgnoreCase(tagName) ) {
526
addToken( new EndTagToken(tagName) );
530
_currentTagToken = null;
538
* Parses end of the tag.
539
* It expects that current position is at the "<" after which
540
* "/" and the tag's name follows.
541
* @throws IOException
543
private void tagEnd() throws IOException {
551
String tagName = identifier();
552
if (transformations != null && transformations.hasTransformationForTag(tagName)) {
553
TagTransformation tagTransformation = transformations.getTransformation(tagName);
554
if (tagTransformation != null) {
555
tagName = tagTransformation.getDestTag();
559
if (tagName != null) {
560
TagInfo tagInfo = tagInfoProvider.getTagInfo(tagName);
561
if ( (tagInfo == null && !isOmitUnknownTags && isTreatUnknownTagsAsContent && !isReservedTag(tagName)) ||
562
(tagInfo != null && tagInfo.isDeprecated() && !isOmitDeprecatedTags && isTreatDeprecatedTagsAsContent) ) {
568
_currentTagToken = new EndTagToken(tagName);
574
if (tagName != null) {
575
addToken(_currentTagToken);
578
if ( isCharSimple('>') ) {
582
if ( "script".equalsIgnoreCase(tagName) ) {
583
_isScriptContext = false;
586
_currentTagToken = null;
593
* Parses an identifier from the current position.
594
* @throws IOException
596
private String identifier() throws IOException {
599
if ( !isIdentifierStartChar() ) {
604
commonStr.delete(0, commonStr.length());
606
while ( !isAllRead() && isIdentifierChar() ) {
608
commonStr.append( _working[_pos] );
612
// strip invalid characters from the end
613
while ( commonStr.length() > 0 && Utils.isIdentifierHelperChar(commonStr.charAt(commonStr.length() - 1)) ) {
614
commonStr.deleteCharAt( commonStr.length() - 1 );
617
if ( commonStr.length() == 0 ) {
621
String id = commonStr.toString();
623
int columnIndex = id.indexOf(':');
624
if (columnIndex >= 0) {
625
String prefix = id.substring(0, columnIndex);
626
String suffix = id.substring(columnIndex + 1);
627
int nextColumnIndex = suffix.indexOf(':');
628
if (nextColumnIndex >= 0) {
629
suffix = suffix.substring(0, nextColumnIndex);
631
id = isNamespacesAware ? (prefix + ":" + suffix) : suffix;
638
* Parses list tag attributes from the current position.
639
* @throws IOException
641
private void tagAttributes() throws IOException {
642
while( !isAllRead() && _asExpected && !isCharSimple('>') && !startsWithSimple("/>") ) {
644
String attName = identifier();
647
if ( !isCharSimple('<') && !isCharSimple('>') && !startsWithSimple("/>") ) {
648
if (isValidXmlChar()) {
654
if (!isCharSimple('<')) {
664
if ( isCharSimple('=') ) {
667
attValue = attributeValue();
668
} else if (CleanerProperties.BOOL_ATT_EMPTY.equals(props.booleanAttributeValues)) {
670
} else if (CleanerProperties.BOOL_ATT_TRUE.equals(props.booleanAttributeValues)) {
677
_currentTagToken.setAttribute(attName, attValue);
683
* Parses a single tag attribute - it is expected to be in one of the forms:
688
* @throws IOException
690
private String attributeValue() throws IOException {
693
if ( isCharSimple('<') || isCharSimple('>') || startsWithSimple("/>") ) {
697
boolean isQuoteMode = false;
698
boolean isAposMode = false;
700
commonStr.delete(0, commonStr.length());
702
if ( isCharSimple('\'') ) {
706
} else if ( isCharSimple('\"') ) {
712
while ( !isAllRead() &&
713
( ((isAposMode && !isCharEquals('\'') || isQuoteMode && !isCharEquals('\"')) && (isAllowHtmlInsideAttributes || !isCharEquals('>') && !isCharEquals('<')) && (isAllowMultiWordAttributes || !isWhitespaceSafe())) ||
714
(!isAposMode && !isQuoteMode && !isWhitespaceSafe() && !isCharEquals('>') && !isCharEquals('<'))
717
if (isValidXmlCharSafe()) {
718
commonStr.append( _working[_pos] );
724
if ( isCharSimple('\'') && isAposMode ) {
727
} else if ( isCharSimple('\"') && isQuoteMode ) {
733
return commonStr.toString();
736
private boolean content() throws IOException {
737
while ( !isAllRead() ) {
738
if (isValidXmlCharSafe()) {
743
if ( isCharSimple('<') ) {
748
return addSavedAsContent();
751
private void ignoreUntil(char ch) throws IOException {
752
while ( !isAllRead() ) {
760
private void comment() throws IOException {
762
while ( !isAllRead() && !startsWithSimple("-->") ) {
763
if (isValidXmlCharSafe()) {
769
if (startsWithSimple("-->")) {
774
if (!isOmitComments) {
775
String hyphenRepl = props.getHyphenReplacementInComment();
776
String comment = new String(_saved, 0, _savedLen).replaceAll("--", hyphenRepl + hyphenRepl);
778
if ( comment.length() > 0 && comment.charAt(0) == '-' ) {
779
comment = hyphenRepl + comment.substring(1);
781
int len = comment.length();
782
if ( len > 0 && comment.charAt(len - 1) == '-' ) {
783
comment = comment.substring(0, len - 1) + hyphenRepl;
786
addToken( new CommentNode(comment) );
792
private void doctype() throws IOException {
796
String part1 = identifier();
798
String part2 = identifier();
800
String part3 = attributeValue();
802
String part4 = attributeValue();
806
_docType = new DoctypeToken(part1, part2, part3, part4);
809
public DoctypeToken getDocType() {