1
/* Copyright (c) 2006-2007, Vladimir Nikic
4
Redistribution and use of this software in source and binary forms,
5
with or without modification, are permitted provided that the following
8
* Redistributions of source code must retain the above
9
copyright notice, this list of conditions and the
12
* Redistributions in binary form must reproduce the above
13
copyright notice, this list of conditions and the
14
following disclaimer in the documentation and/or other
15
materials provided with the distribution.
17
* The name of HtmlCleaner may not be used to endorse or promote
18
products derived from this software without specific prior
21
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31
POSSIBILITY OF SUCH DAMAGE.
33
You can contact Vladimir Nikic by sending e-mail to
34
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
38
package org.htmlcleaner;
44
* Class contains information about single HTML tag.<br/>
45
* It also contains rules for tag balancing. For each tag, a list of dependent
46
* tags may be defined. There are several kinds of dependencies used to reorder
50
* fatal tags - required outer tag - the tag will be ignored during
51
* parsing (will be skipped) if this fatal tag is missing. For example, most web
52
* browsers ignore elements TD, TR, TBODY if they are not in the context of TABLE tag.
55
* required enclosing tags - if there is no such tag, it is implicitly
56
* created. For example if TD is out of TR - open TR is created before.
59
* forbidden tags - this tag is not allowed to occur inside them - for example
60
* FORM cannot be inside other FORM and it will be ignored during cleanup.
63
* allowed children tags - for example TR allows TD and TH. If there
64
* are some dependent allowed tags defined then cleaner ignores other tags, treating
65
* them as unallowed, unless they are in some other relationship with this tag.
68
* higher level tags - for example for TR higher tags are THEAD, TBODY, TFOOT.
71
* tags that must be closed and copied - for example, in
72
* <code><a href="#"><div>....</code> tag A must be closed before DIV but
73
* copied again inside DIV.
76
* tags that must be closed before closing this tag and copied again after -
77
* for example, in <code><i><b>at</i> first</b> text </code>
78
* tag B must be closed before closing I, but it must be copied again after resulting
79
* finally in sequence: <code><i><b>at</b></i><b> first</b> text </code>.
85
* Tag TR for instance (table row) may define the following dependencies:
87
* <li>fatal tag is <code>table</code></li>
88
* <li>required enclosing tag is <code>tbody</code></li>
89
* <li>allowed children tags are <code>td,th</code></li>
90
* <li>higher level tags are <code>thead,tfoot</code></li>
91
* <li>tags that must be closed before are <code>tr,td,th,caption,colgroup</code></li>
93
* meaning the following: <br>
95
* <li><code>tr</code> must be in context of <code>table</code>, otherwise it will be ignored,</li>
96
* <li><code>tr</code> may be directly inside <code>tbody</code>, <code>tfoot</code> and <code>thead</code>,
97
* otherwise <code>tbody</code> will be implicitly created in front of it.</li>
98
* <li><code>tr</code> can contain <code>td</code> and <code>th</code>, all other tags and content will be pushed out of current
99
* limiting context, in the case of html tables, in front of enclosing <code>table</code> tag.</li>
100
* <li>if previous open tag is one of <code>tr</code>, <code>caption</code> or <code>colgroup</code>, it will be implicitly closed.</li>
104
public class TagInfo {
106
// Values for the belongsTo field (compared in isHeadTag() and isHeadAndBodyTag()).
protected static final int HEAD_AND_BODY = 0;
107
protected static final int HEAD = 1;
108
protected static final int BODY = 2;
110
// Values for the contentType field:
//   CONTENT_ALL  - allowsItem() decides per token using childTags / permittedTags,
//   CONTENT_NONE - the tag has no body (see allowsBody() and isEmptyTag()),
//   CONTENT_TEXT - allowsItem() accepts only tokens that are not TagToken instances.
protected static final int CONTENT_ALL = 0;
111
protected static final int CONTENT_NONE = 1;
112
protected static final int CONTENT_TEXT = 2;
115
// One of CONTENT_ALL, CONTENT_NONE or CONTENT_TEXT.
private int contentType;
116
// Names of tags tested by isMustCloseTag(); filled by defineCloseBeforeTags()
// and defineCloseBeforeCopyInsideTags().
private Set mustCloseTags = new HashSet();
117
// Names tested by isHigher(); filled by defineHigherLevelTags() and also by
// defineFatalTags() and defineRequiredEnclosingTags().
private Set higherTags = new HashSet();
118
// Names of allowed child tags; filled by defineAllowedChildrenTags().
private Set childTags = new HashSet();
119
// NOTE: despite the name, this set holds the FORBIDDEN tags - it is filled by
// defineForbiddenTags() and allowsItem() rejects tag tokens whose name is in it.
private Set permittedTags = new HashSet();
120
// Names tested by isCopy(); filled by defineCloseBeforeCopyInsideTags().
private Set copyTags = new HashSet();
121
// Names tested by isContinueAfter(); filled by defineCloseInsideCopyAfterTags().
private Set continueAfterTags = new HashSet();
122
// One of HEAD_AND_BODY, HEAD or BODY; defaults to BODY.
private int belongsTo = BODY;
123
// Last tag name passed to defineRequiredEnclosingTags(), or null if never defined.
private String requiredParent = null;
124
// Last tag name passed to defineFatalTags(), or null if never defined.
private String fatalTag = null;
125
private boolean deprecated = false;
126
private boolean unique = false;
127
private boolean ignorePermitted = false;
130
/**
 * Creates the description of a single tag.
 *
 * @param name            the tag name (presumably the value returned by getName() - TODO confirm)
 * @param contentType     one of CONTENT_ALL, CONTENT_NONE or CONTENT_TEXT
 * @param belongsTo       one of HEAD_AND_BODY, HEAD or BODY
 * @param depricated      stored in the deprecated flag (parameter name is misspelled)
 * @param unique          stored in the unique flag
 * @param ignorePermitted stored in the ignorePermitted flag
 */
public TagInfo(String name, int contentType, int belongsTo, boolean depricated, boolean unique, boolean ignorePermitted) {
132
this.contentType = contentType;
133
this.belongsTo = belongsTo;
134
this.deprecated = depricated;
135
this.unique = unique;
136
this.ignorePermitted = ignorePermitted;
139
/**
 * Registers the fatal (required outer) tags of this tag.
 * The argument is lower-cased (default locale) and split on commas; tokens are
 * not trimmed. Every token is added to higherTags.
 *
 * @param commaSeparatedListOfTags comma-separated tag names; must not be null
 *        because toLowerCase() is called on it directly
 */
public void defineFatalTags(String commaSeparatedListOfTags) {
140
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
141
while (tokenizer.hasMoreTokens()) {
142
String currTag = tokenizer.nextToken();
143
// Overwritten on every iteration: only the last token stays as the fatal tag.
this.fatalTag = currTag;
144
this.higherTags.add(currTag);
148
/**
 * Registers the required enclosing tags of this tag.
 * The argument is lower-cased and split on commas (tokens are not trimmed);
 * every token is added to higherTags.
 *
 * @param commaSeparatedListOfTags comma-separated tag names; must not be null
 */
public void defineRequiredEnclosingTags(String commaSeparatedListOfTags) {
149
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
150
while (tokenizer.hasMoreTokens()) {
151
String currTag = tokenizer.nextToken();
152
// Overwritten on every iteration: only the last token stays as the required parent.
this.requiredParent = currTag;
153
this.higherTags.add(currTag);
157
/**
 * Registers tags that are forbidden in relation to this tag.
 * The argument is lower-cased and split on commas (tokens are not trimmed).
 *
 * @param commaSeparatedListOfTags comma-separated tag names; must not be null
 */
public void defineForbiddenTags(String commaSeparatedListOfTags) {
158
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
159
while (tokenizer.hasMoreTokens()) {
160
String currTag = tokenizer.nextToken();
161
// The forbidden names are kept in the set called permittedTags (misleading name).
this.permittedTags.add(currTag);
165
/**
 * Registers the tags allowed as children of this tag by adding each name to childTags.
 * The argument is lower-cased and split on commas (tokens are not trimmed).
 *
 * @param commaSeparatedListOfTags comma-separated tag names; must not be null
 */
public void defineAllowedChildrenTags(String commaSeparatedListOfTags) {
166
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
167
while (tokenizer.hasMoreTokens()) {
168
String currTag = tokenizer.nextToken();
169
this.childTags.add(currTag);
173
/**
 * Registers higher-level tags of this tag by adding each name to higherTags.
 * The argument is lower-cased and split on commas (tokens are not trimmed).
 *
 * @param commaSeparatedListOfTags comma-separated tag names; must not be null
 */
public void defineHigherLevelTags(String commaSeparatedListOfTags) {
174
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
175
while (tokenizer.hasMoreTokens()) {
176
String currTag = tokenizer.nextToken();
177
this.higherTags.add(currTag);
181
/**
 * Registers tags that must be closed before this tag and copied again inside it.
 * Each name is added to both copyTags and mustCloseTags.
 * The argument is lower-cased and split on commas (tokens are not trimmed).
 *
 * @param commaSeparatedListOfTags comma-separated tag names; must not be null
 */
public void defineCloseBeforeCopyInsideTags(String commaSeparatedListOfTags) {
182
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
183
while (tokenizer.hasMoreTokens()) {
184
String currTag = tokenizer.nextToken();
185
this.copyTags.add(currTag);
186
this.mustCloseTags.add(currTag);
190
/**
 * Registers tags that must be closed inside this tag and copied again after it,
 * by adding each name to continueAfterTags.
 * The argument is lower-cased and split on commas (tokens are not trimmed).
 *
 * @param commaSeparatedListOfTags comma-separated tag names; must not be null
 */
public void defineCloseInsideCopyAfterTags(String commaSeparatedListOfTags) {
191
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
192
while (tokenizer.hasMoreTokens()) {
193
String currTag = tokenizer.nextToken();
194
this.continueAfterTags.add(currTag);
198
/**
 * Registers tags that must be closed before this tag, by adding each name to mustCloseTags.
 * The argument is lower-cased and split on commas (tokens are not trimmed).
 *
 * @param commaSeparatedListOfTags comma-separated tag names; must not be null
 */
public void defineCloseBeforeTags(String commaSeparatedListOfTags) {
199
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
200
while (tokenizer.hasMoreTokens()) {
201
String currTag = tokenizer.nextToken();
202
this.mustCloseTags.add(currTag);
206
// getters and setters
208
public String getName() {
212
public void setName(String name) {
216
public int getContentType() {
220
/**
 * @return the internal must-close tag set itself (not a copy), so changes made
 *         by the caller affect this TagInfo
 */
public Set getMustCloseTags() {
221
return mustCloseTags;
224
/**
 * Replaces the must-close tag set; the given set is stored by reference, not copied.
 *
 * @param mustCloseTags the new set
 */
public void setMustCloseTags(Set mustCloseTags) {
225
this.mustCloseTags = mustCloseTags;
228
public Set getHigherTags() {
232
/**
 * Replaces the higher-level tag set; the given set is stored by reference, not copied.
 *
 * @param higherTags the new set
 */
public void setHigherTags(Set higherTags) {
233
this.higherTags = higherTags;
236
public Set getChildTags() {
240
/**
 * Replaces the allowed-children tag set; the given set is stored by reference, not copied.
 *
 * @param childTags the new set
 */
public void setChildTags(Set childTags) {
241
this.childTags = childTags;
244
/**
 * Returns the set named permittedTags. Note that this set is the one filled by
 * defineForbiddenTags(), i.e. it holds forbidden tag names.
 *
 * @return the internal set itself (not a copy)
 */
public Set getPermittedTags() {
245
return permittedTags;
248
/**
 * Replaces the set named permittedTags (the set that defineForbiddenTags() fills);
 * the given set is stored by reference, not copied.
 *
 * @param permittedTags the new set
 */
public void setPermittedTags(Set permittedTags) {
249
this.permittedTags = permittedTags;
252
public Set getCopyTags() {
256
/**
 * Replaces the copy tag set; the given set is stored by reference, not copied.
 *
 * @param copyTags the new set
 */
public void setCopyTags(Set copyTags) {
257
this.copyTags = copyTags;
260
/**
 * @return the internal continue-after tag set itself (not a copy)
 */
public Set getContinueAfterTags() {
261
return continueAfterTags;
264
/**
 * Replaces the continue-after tag set; the given set is stored by reference, not copied.
 *
 * @param continueAfterTags the new set
 */
public void setContinueAfterTags(Set continueAfterTags) {
265
this.continueAfterTags = continueAfterTags;
268
/**
 * @return the required parent tag name, or null when none has been set
 *         (null is the field's initial value)
 */
public String getRequiredParent() {
269
return requiredParent;
272
/**
 * Sets the required parent tag name. Unlike defineRequiredEnclosingTags(), this
 * neither lower-cases the value nor adds it to higherTags.
 *
 * @param requiredParent the new required parent tag name
 */
public void setRequiredParent(String requiredParent) {
273
this.requiredParent = requiredParent;
276
public int getBelongsTo() {
280
/**
 * Sets the belongsTo value; no validation is performed.
 *
 * @param belongsTo expected to be one of HEAD_AND_BODY, HEAD or BODY
 */
public void setBelongsTo(int belongsTo) {
281
this.belongsTo = belongsTo;
284
public String getFatalTag() {
288
/**
 * Sets the fatal tag name. Unlike defineFatalTags(), this neither lower-cases
 * the value nor adds it to higherTags.
 *
 * @param fatalTag the new fatal tag name
 */
public void setFatalTag(String fatalTag) {
289
this.fatalTag = fatalTag;
292
public boolean isDeprecated() {
296
/**
 * Sets the deprecated flag.
 *
 * @param deprecated the new flag value
 */
public void setDeprecated(boolean deprecated) {
297
this.deprecated = deprecated;
300
public boolean isUnique() {
304
/**
 * Sets the unique flag.
 *
 * @param unique the new flag value
 */
public void setUnique(boolean unique) {
305
this.unique = unique;
308
/**
 * @return the current value of the ignorePermitted flag
 */
public boolean isIgnorePermitted() {
309
return ignorePermitted;
312
/**
 * @return true when the content type is CONTENT_NONE; this is the exact
 *         negation of allowsBody()
 */
public boolean isEmptyTag() {
313
return CONTENT_NONE == contentType;
316
/**
 * Sets the ignorePermitted flag.
 *
 * @param ignorePermitted the new flag value
 */
public void setIgnorePermitted(boolean ignorePermitted) {
317
this.ignorePermitted = ignorePermitted;
320
// other functionality
322
/**
 * @return true unless the content type is CONTENT_NONE
 */
boolean allowsBody() {
323
return CONTENT_NONE != contentType;
326
/**
 * Tests membership in higherTags. Names added through the define* methods are
 * stored lower-cased and the lookup is an exact match, so tagName is presumably
 * expected in lower case - verify against callers.
 *
 * @param tagName tag name to look up
 * @return true if higherTags contains tagName
 */
boolean isHigher(String tagName) {
327
return higherTags.contains(tagName);
330
/**
 * Tests membership in copyTags (exact match; names added through
 * defineCloseBeforeCopyInsideTags() are stored lower-cased).
 *
 * @param tagName tag name to look up
 * @return true if copyTags contains tagName
 */
boolean isCopy(String tagName) {
331
return copyTags.contains(tagName);
334
/**
 * @return true if at least one copy tag is registered
 */
boolean hasCopyTags() {
335
return !copyTags.isEmpty();
338
/**
 * Tests membership in continueAfterTags (exact match; names added through
 * defineCloseInsideCopyAfterTags() are stored lower-cased).
 *
 * @param tagName tag name to look up
 * @return true if continueAfterTags contains tagName
 */
boolean isContinueAfter(String tagName) {
339
return continueAfterTags.contains(tagName);
342
/**
 * @return true if the set named permittedTags is not empty. That set is filled
 *         by defineForbiddenTags(), so this effectively reports whether any
 *         forbidden tags are registered.
 */
boolean hasPermittedTags() {
343
return !permittedTags.isEmpty();
346
/**
 * @return true only when belongsTo is exactly HEAD (HEAD_AND_BODY does not count)
 */
boolean isHeadTag() {
347
return belongsTo == HEAD;
350
/**
 * Despite the name, this is true for HEAD as well as for HEAD_AND_BODY; it is
 * false only when belongsTo is neither of the two.
 *
 * @return true if belongsTo is HEAD or HEAD_AND_BODY
 */
boolean isHeadAndBodyTag() {
351
return belongsTo == HEAD || belongsTo == HEAD_AND_BODY;
354
/**
 * Tells whether the tag described by tagInfo is a must-close tag with respect to this tag.
 * For a non-null argument the answer is true when its name is in mustCloseTags
 * or when its content type is CONTENT_TEXT.
 *
 * @param tagInfo description of the other tag; may be null
 * @return see above. NOTE(review): the result for a null argument is presumably
 *         false - confirm against the fall-through return.
 */
boolean isMustCloseTag(TagInfo tagInfo) {
355
if (tagInfo != null) {
356
return mustCloseTags.contains( tagInfo.getName() ) || tagInfo.contentType == CONTENT_TEXT;
362
/**
 * Decides whether the given token may appear inside this tag.
 *
 * @param token the token to test
 * @return whether the token is allowed. NOTE(review): the handling of "script"
 *         tokens and the default results when no rule matches should be
 *         confirmed against the complete method body.
 */
boolean allowsItem(BaseToken token) {
363
// Tag tokens get a special check for "script" when this tag can have content.
if ( contentType != CONTENT_NONE && token instanceof TagToken ) {
364
TagToken tagToken = (TagToken) token;
365
String tagName = tagToken.getName();
366
if ( "script".equals(tagName) ) {
371
if (CONTENT_ALL == contentType) {
372
// An explicit child list acts as a whitelist: only tag tokens named in it pass,
// non-tag tokens are rejected.
if ( !childTags.isEmpty() ) {
373
return token instanceof TagToken ? childTags.contains( ((TagToken)token).getName() ) : false;
374
// Otherwise permittedTags (which holds the forbidden names) acts as a blacklist
// for tag tokens; non-tag tokens pass.
} else if ( !permittedTags.isEmpty() ) {
375
return token instanceof TagToken ? !permittedTags.contains( ((TagToken)token).getName() ) : true;
378
// Text-only content: every tag token is rejected.
} else if ( CONTENT_TEXT == contentType ) {
379
return !(token instanceof TagToken);
385
/**
 * @return true when the content type is CONTENT_ALL and no allowed-children
 *         list is defined; permittedTags is not consulted here
 */
boolean allowsAnything() {
386
return CONTENT_ALL == contentType && childTags.size() == 0;