5
* This set of functions allows you to filter html in order to remove
6
* any malicious tags from it. Useful in cases when you need to filter
7
* user input for any cross-site-scripting attempts.
9
* Copyright (C) 2002-2004 by Duke University
11
* This library is free software; you can redistribute it and/or
12
* modify it under the terms of the GNU Lesser General Public
13
* License as published by the Free Software Foundation; either
14
* version 2.1 of the License, or (at your option) any later version.
16
* This library is distributed in the hope that it will be useful,
17
* but WITHOUT ANY WARRANTY; without even the implied warranty of
18
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19
* Lesser General Public License for more details.
21
* You should have received a copy of the GNU Lesser General Public
22
* License along with this library; if not, write to the Free Software
23
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
* @Author Konstantin Riabitsev <icon@linux.duke.edu>
27
* @Version 1.1 ($Date: 2011-07-04 14:02:23 -0400 (Mon, 04 Jul 2011) $)
31
* @Author Jim Jagielski <jim@jaguNET.com / jimjag@gmail.com>
/**
 * Builds the textual form of a tag out of the tag name, an array of
 * attributes, and the type of the tag. This function is called by
 * tln_sanitize internally.
 *
 * @param  $tagname the name of the tag.
 * @param  $attary  the array of attributes (name => value). Values are
 *                  emitted verbatim, so they must already carry their
 *                  own quotes. Anything that is not a non-empty array
 *                  is ignored.
 * @param  $tagtype The type of the tag: 1 = opening tag, 2 = closing
 *                  tag, 3 = XHTML-style content-less tag.
 * @return a string with the final tag representation.
 */
function tln_tagprint($tagname, $attary, $tagtype){
    if ($tagtype == 2){
        // Closing tags never carry attributes.
        return '</' . $tagname . '>';
    }
    $fulltag = '<' . $tagname;
    if (is_array($attary) && sizeof($attary)){
        $atts = array();
        // foreach rather than each(): each() no longer exists in PHP 8.
        foreach ($attary as $attname => $attvalue){
            $atts[] = "$attname=$attvalue";
        }
        $fulltag .= ' ' . join(' ', $atts);
    }
    // NOTE(review): the " /" spelling for content-less tags follows the
    // upstream htmlfilter library -- confirm against the pristine file.
    if ($tagtype == 3){
        $fulltag .= ' /';
    }
    return $fulltag . '>';
}
/**
 * A small helper function to use with array_walk. Modifies a by-ref
 * value and makes it lowercase.
 *
 * @param  $val a value passed by-ref.
 * @return void since it modifies a by-ref value.
 */
function tln_casenormalize(&$val){
    $val = strtolower($val);
}
/**
 * This function skips any whitespace from the current position within
 * a string and to the next non-whitespace value.
 *
 * @param  $body   the string
 * @param  $offset the offset within the string where we should start
 *                 looking for the next non-whitespace character.
 * @return the location within the $body where the next
 *         non-whitespace char is located ($offset itself when the
 *         character at $offset is not whitespace).
 */
function tln_skipspace($body, $offset){
    // Measure the whitespace run with strlen(); counting a string with
    // sizeof() is a TypeError on PHP 8.
    if (preg_match('/^\s+/', substr($body, $offset), $matches)){
        $offset += strlen($matches[0]);
    }
    return $offset;
}
/**
 * This function looks for the next character within a string. It's
 * really just a glorified "strpos", except it catches the failures
 * and reports the end of the string instead.
 *
 * @param  $body   The string to look for needle in.
 * @param  $offset Start looking from this position.
 * @param  $needle The character/string to look for.
 * @return location of the next occurrence of the needle, or
 *         strlen($body) if needle wasn't found.
 */
function tln_findnxstr($body, $offset, $needle){
    $end = strlen($body);
    // strpos() raises a ValueError on PHP 8 when the offset lies past
    // the end of the haystack; nothing can be found there anyway.
    if ($offset > $end){
        return $end;
    }
    $pos = strpos($body, $needle, $offset);
    if ($pos === false){
        return $end;
    }
    return $pos;
}
/**
 * This function takes a PCRE-style regexp and tries to match it
 * within the string, starting at the given offset.
 *
 * @param  $body   The string to look for needle in.
 * @param  $offset Start looking from here.
 * @param  $reg    A PCRE-style regex to match. It is embedded between
 *                 "%" delimiters, so it must not contain a bare "%".
 * @return Returns false if no match is found, or an array
 *         with the following members:
 *         - integer with the location of the match within $body
 *         - string with whatever content between offset and the match
 *         - string with whatever it is we matched
 */
function tln_findnxreg($body, $offset, $reg){
    // Group 1 lazily swallows everything before the first place where
    // $reg (group 2) can match.
    $preg_rule = '%^(.*?)(' . $reg . ')%s';
    if (!preg_match($preg_rule, substr($body, $offset), $matches)){
        return false;
    }
    return array(
        $offset + strlen($matches[1]),
        $matches[1],
        $matches[2]
    );
}
146
* This function looks for the next tag.
148
* @param $body String where to look for the next tag.
149
* @param $offset Start looking from here.
150
* @return false if no more tags exist in the body, or
151
* an array with the following members:
152
* - string with the name of the tag
153
* - array with attributes and their values
154
* - integer with tag type (1, 2, or 3)
155
* - integer where the tag starts (starting "<")
156
* - integer where the tag ends (ending ">")
157
* first three members will be false, if the tag is invalid.
159
function tln_getnxtag($body, $offset){
160
$me = 'tln_getnxtag';
161
if ($offset > strlen($body)){
164
$lt = tln_findnxstr($body, $offset, '<');
165
if ($lt == strlen($body)){
170
* blah blah <tag attribute="value">
173
$pos = tln_skipspace($body, $lt + 1);
174
if ($pos >= strlen($body)){
175
return Array(false, false, false, $lt, strlen($body));
178
* There are 3 kinds of tags:
179
* 1. Opening tag, e.g.:
181
* 2. Closing tag, e.g.:
183
* 3. XHTML-style content-less tag, e.g.:
187
switch (substr($body, $pos, 1)){
194
* A comment or an SGML declaration.
196
if (substr($body, $pos+1, 2) == '--'){
197
$gt = strpos($body, '-->', $pos);
203
return Array(false, false, false, $lt, $gt);
205
$gt = tln_findnxstr($body, $pos, '>');
206
return Array(false, false, false, $lt, $gt);
211
* Assume tagtype 1 for now. If it's type 3, we'll switch values
221
* Look for next [\W-_], which will indicate the end of the tag name.
223
$regary = tln_findnxreg($body, $pos, '[^\w\-_]');
224
if ($regary == false){
225
return Array(false, false, false, $lt, strlen($body));
227
list($pos, $tagname, $match) = $regary;
228
$tagname = strtolower($tagname);
231
* $match can be either of these:
232
* '>' indicating the end of the tag entirely.
233
* '\s' indicating the end of the tag name.
234
* '/' indicating that this is type-3 xhtml tag.
236
* Whatever else we find there indicates an invalid tag.
241
* This is an xhtml-style tag with a closing / at the
242
* end, like so: <img src="blah"/>. Check if it's followed
243
* by the closing bracket. If not, then this tag is invalid
245
if (substr($body, $pos, 2) == '/>'){
249
$gt = tln_findnxstr($body, $pos, '>');
250
$retary = Array(false, false, false, $lt, $gt);
254
return Array($tagname, false, $tagtype, $lt, $pos);
258
* Check if it's whitespace
260
if (preg_match('/\s/', $match)){
263
* This is an invalid tag! Look for the next closing ">".
265
$gt = tln_findnxstr($body, $lt, '>');
266
return Array(false, false, false, $lt, $gt);
271
* At this point we're here:
272
* <tagname attribute='blah'>
275
* At this point we loop in order to find all attributes.
281
while ($pos <= strlen($body)){
282
$pos = tln_skipspace($body, $pos);
283
if ($pos == strlen($body)){
287
return Array(false, false, false, $lt, $pos);
290
* See if we arrived at a ">" or "/>", which means that we reached
291
* the end of the tag.
294
preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches);
295
if (isset($matches[0]) && $matches[0]){
299
$pos += strlen($matches[1]);
300
if ($matches[2] == '/>'){
304
return Array($tagname, $attary, $tagtype, $lt, $pos);
308
* There are several types of attributes, with optional
309
* [:space:] between members.
311
* attrname[:space:]=[:space:]'CDATA'
313
* attrname[:space:]=[:space:]"CDATA"
315
* attr[:space:]=[:space:]CDATA
319
* We leave types 1 and 2 the same, type 3 we check for
320
* '"' and convert to "&quot;" if needed, then wrap in
321
* double quotes. Type 4 we convert into:
324
$regary = tln_findnxreg($body, $pos, '[^\w\-_]');
325
if ($regary == false){
327
* Looks like body ended before the end of tag.
329
return Array(false, false, false, $lt, strlen($body));
331
list($pos, $attname, $match) = $regary;
332
$attname = strtolower($attname);
334
* We arrived at the end of attribute name. Several things possible
336
* '>' means the end of the tag and this is attribute type 4
337
* '/' if followed by '>' means the same thing as above
338
* '\s' means a lot of things -- look what it's followed by.
339
* anything else means the attribute is invalid.
344
* This is an xhtml-style tag with a closing / at the
345
* end, like so: <img src="blah"/>. Check if it's followed
346
* by the closing bracket. If not, then this tag is invalid
348
if (substr($body, $pos, 2) == '/>'){
352
$gt = tln_findnxstr($body, $pos, '>');
353
$retary = Array(false, false, false, $lt, $gt);
357
$attary{$attname} = '"yes"';
358
return Array($tagname, $attary, $tagtype, $lt, $pos);
362
* Skip whitespace and see what we arrive at.
364
$pos = tln_skipspace($body, $pos);
365
$char = substr($body, $pos, 1);
367
* Two things are valid here:
368
* '=' means this is attribute type 1 2 or 3.
369
* \w means this was attribute type 4.
370
* anything else we ignore and re-loop. End of tag and
371
* invalid stuff will be caught by our checks at the beginning
376
$pos = tln_skipspace($body, $pos);
378
* Here are 3 possibilities:
379
* "'" attribute type 1
380
* '"' attribute type 2
381
* everything else is the content of tag type 3
383
$quot = substr($body, $pos, 1);
385
$regary = tln_findnxreg($body, $pos+1, '\'');
386
if ($regary == false){
387
return Array(false, false, false, $lt, strlen($body));
389
list($pos, $attval, $match) = $regary;
391
$attary{$attname} = '\'' . $attval . '\'';
392
} else if ($quot == '"'){
393
$regary = tln_findnxreg($body, $pos+1, '\"');
394
if ($regary == false){
395
return Array(false, false, false, $lt, strlen($body));
397
list($pos, $attval, $match) = $regary;
399
$attary{$attname} = '"' . $attval . '"';
402
* These are hateful. Look for \s, or >.
404
$regary = tln_findnxreg($body, $pos, '[\s>]');
405
if ($regary == false){
406
return Array(false, false, false, $lt, strlen($body));
408
list($pos, $attval, $match) = $regary;
410
* If it's ">" it will be caught at the top.
412
$attval = preg_replace('/\"/s', '"', $attval);
413
$attary{$attname} = '"' . $attval . '"';
415
} else if (preg_match('|[\w/>]|', $char)) {
417
* That was attribute type 4.
419
$attary{$attname} = '"yes"';
422
* An illegal character. Find next '>' and return.
424
$gt = tln_findnxstr($body, $pos, '>');
425
return Array(false, false, false, $lt, $gt);
430
* The fact that we got here indicates that the tag end was never
431
* found. Return invalid tag indication so it gets stripped.
433
return Array(false, false, false, $lt, strlen($body));
/**
 * Translates entities into literal values so they can be checked.
 *
 * @param  $attvalue the by-ref value to check; every match of $regex
 *                   in it is replaced by the single byte it encodes.
 * @param  $regex    the regular expression to check against. Its
 *                   first capture group must hold the digits.
 * @param  $hex      whether the entities are hexadecimal.
 * @return True or False depending on whether there were matches.
 */
function tln_deent(&$attvalue, $regex, $hex=false){
    if (!preg_match_all($regex, $attvalue, $matches)){
        return false;
    }
    $repl = array();
    foreach ($matches[0] as $i => $entity){
        $numval = $matches[1][$i];
        if ($hex){
            $numval = hexdec($numval);
        }
        // Square-bracket offset: the {} form is a parse error on PHP 8.
        $repl[$entity] = chr((int) $numval);
    }
    $attvalue = strtr($attvalue, $repl);
    return true;
}
/**
 * This function checks attribute values for entity-encoded values
 * and returns them translated into 8-bit strings so we can run
 * checks on them.
 *
 * Decimal character references, hexadecimal character references and
 * backslash-digit escapes are decoded repeatedly until none is left,
 * then the remaining backslashes are stripped.
 *
 * @param  $attvalue A string to run entity check against.
 * @return Nothing, modifies a reference value.
 */
function tln_defang(&$attvalue){
    /**
     * Skip this if there aren't ampersands or backslashes.
     */
    if (strpos($attvalue, '&') === false
        && strpos($attvalue, '\\') === false){
        return;
    }
    do {
        // Short-circuits exactly like the original chain of
        // "$m = $m || ..." statements: one decoder per pass, and the
        // loop repeats so nested encodings are unwrapped too.
        $m = tln_deent($attvalue, '/\&#0*(\d+);*/s')
            || tln_deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true)
            || tln_deent($attvalue, '/\\\\(\d+)/s', true);
    } while ($m);
    $attvalue = stripslashes($attvalue);
}
/**
 * Kill any tabs, newlines, carriage returns, NUL bytes and spaces.
 * Some browsers treat "java[tab]script" as just as good as
 * "javascript", so the value is collapsed before it is checked.
 *
 * @param  $attvalue The attribute value before extraneous spaces removed.
 * @return Nothing, modifies a reference value.
 */
function tln_unspace(&$attvalue){
    $attvalue = str_replace(array("\t", "\r", "\n", "\0", " "), '', $attvalue);
}
508
* This function runs various checks against the attributes.
510
* @param $tagname String with the name of the tag.
511
* @param $attary Array with all tag attributes.
512
* @param $rm_attnames See description for tln_sanitize
513
* @param $bad_attvals See description for tln_sanitize
514
* @param $add_attr_to_tag See description for tln_sanitize
515
* @return Array with modified attributes.
517
function tln_fixatts($tagname,
524
while (list($attname, $attvalue) = each($attary)){
526
* See if this attribute should be removed.
528
foreach ($rm_attnames as $matchtag=>$matchattrs){
529
if (preg_match($matchtag, $tagname)){
530
foreach ($matchattrs as $matchattr){
531
if (preg_match($matchattr, $attname)){
532
unset($attary{$attname});
539
* Remove any backslashes, entities, or extraneous whitespace.
541
tln_defang($attvalue);
542
tln_unspace($attvalue);
545
* Now let's run checks on the attvalues.
546
* I don't expect anyone to comprehend this. If you do,
547
* get in touch with me so I can drive to where you live and
548
* shake your hand personally. :)
550
foreach ($bad_attvals as $matchtag=>$matchattrs){
551
if (preg_match($matchtag, $tagname)){
552
foreach ($matchattrs as $matchattr=>$valary){
553
if (preg_match($matchattr, $attname)){
555
* There are two arrays in valary.
557
* Second one is replacements
559
list($valmatch, $valrepl) = $valary;
560
$newvalue = preg_replace($valmatch,$valrepl,$attvalue);
561
if ($newvalue != $attvalue){
562
$attary{$attname} = $newvalue;
570
* See if we need to append any attributes to this tag.
572
foreach ($add_attr_to_tag as $matchtag=>$addattary){
573
if (preg_match($matchtag, $tagname)){
574
$attary = array_merge($attary, $addattary);
582
* @param $body the string with HTML you wish to filter
583
* @param $tag_list see description above
584
* @param $rm_tags_with_content see description above
585
* @param $self_closing_tags see description above
586
* @param $force_tag_closing see description above
587
* @param $rm_attnames see description above
588
* @param $bad_attvals see description above
589
* @param $add_attr_to_tag see description above
590
* @return tln_sanitized html safe to show on your pages.
592
function tln_sanitize($body,
594
$rm_tags_with_content,
602
$me = 'tln_sanitize';
604
* Normalize rm_tags and rm_tags_with_content.
606
$rm_tags = array_shift($tag_list);
607
@array_walk($tag_list, 'tln_casenormalize');
608
@array_walk($rm_tags_with_content, 'tln_casenormalize');
609
@array_walk($self_closing_tags, 'tln_casenormalize');
611
* See if tag_list is of tags to remove or tags to allow.
612
* false means remove these tags
613
* true means allow these tags
616
$open_tags = Array();
617
$trusted = "<!-- begin tln_sanitized html -->\n";
618
$skip_content = false;
620
* Take care of netscape's stupid javascript entities like
623
$body = preg_replace('/&(\{.*?\};)/si', '&\\1', $body);
624
while (($curtag = tln_getnxtag($body, $curpos)) != FALSE){
625
list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
626
$free_content = substr($body, $curpos, $lt - $curpos);
627
if ($skip_content == false){
628
$trusted .= $free_content;
631
if ($tagname != FALSE){
633
if ($skip_content == $tagname){
635
* Got to the end of tag we needed to remove.
638
$skip_content = false;
640
if ($skip_content == false){
641
if (isset($open_tags{$tagname}) &&
642
$open_tags{$tagname} > 0){
643
$open_tags{$tagname}--;
652
* $rm_tags_with_content
654
if ($skip_content == false){
656
* See if this is a self-closing type and change
657
* tagtype appropriately.
660
&& in_array($tagname, $self_closing_tags)){
664
* See if we should skip this tag and any content
668
&& in_array($tagname, $rm_tags_with_content)){
669
$skip_content = $tagname;
671
if (($rm_tags == false
672
&& in_array($tagname, $tag_list)) ||
674
&& !in_array($tagname, $tag_list))){
678
if (isset($open_tags{$tagname})){
679
$open_tags{$tagname}++;
681
$open_tags{$tagname} = 1;
685
* This is where we run other checks.
687
if (is_array($attary) && sizeof($attary) > 0){
688
$attary = tln_fixatts($tagname,
699
if ($tagname != false && $skip_content == false){
700
$trusted .= tln_tagprint($tagname, $attary, $tagtype);
706
$trusted .= substr($body, $curpos, strlen($body) - $curpos);
707
if ($force_tag_closing == true){
708
foreach ($open_tags as $tagname=>$opentimes){
709
while ($opentimes > 0){
710
$trusted .= '</' . $tagname . '>';
716
$trusted .= "<!-- end tln_sanitized html -->\n";
721
// Use the nifty htmlfilter library
725
function HTMLFilter($body, $trans_image_path, $block_external_images = false) {
741
$rm_tags_with_content = Array(
751
$self_closing_tags = Array(
759
$force_tag_closing = true;
761
$rm_attnames = Array(
772
$bad_attvals = Array(
775
"/^src|background/i" =>
778
"/^([\'\"])\s*\S+script\s*:.*([\'\"])/si",
779
"/^([\'\"])\s*mocha\s*:*.*([\'\"])/si",
780
"/^([\'\"])\s*about\s*:.*([\'\"])/si"
783
"\\1$trans_image_path\\2",
784
"\\1$trans_image_path\\2",
785
"\\1$trans_image_path\\2",
786
"\\1$trans_image_path\\2"
792
"/^([\'\"])\s*\S+script\s*:.*([\'\"])/si",
793
"/^([\'\"])\s*mocha\s*:*.*([\'\"])/si",
794
"/^([\'\"])\s*about\s*:.*([\'\"])/si"
810
"/position\s*:\s*absolute/i",
811
"/url\s*\(\s*([\'\"])\s*\S+script\s*:.*([\'\"])\s*\)/si",
812
"/url\s*\(\s*([\'\"])\s*mocha\s*:.*([\'\"])\s*\)/si",
813
"/url\s*\(\s*([\'\"])\s*about\s*:.*([\'\"])\s*\)/si",
814
"/(.*)\s*:\s*url\s*\(\s*([\'\"]*)\s*\S+script\s*:.*([\'\"]*)\s*\)/si"
833
if ($block_external_images){
834
array_push($bad_attvals{'/.*/'}{'/^src|background/i'}[0],
835
'/^([\'\"])\s*https*:.*([\'\"])/si');
836
array_push($bad_attvals{'/.*/'}{'/^src|background/i'}[1],
837
"\\1$trans_image_path\\1");
838
array_push($bad_attvals{'/.*/'}{'/^style/i'}[0],
839
'/url\(([\'\"])\s*https*:.*([\'\"])\)/si');
840
array_push($bad_attvals{'/.*/'}{'/^style/i'}[1],
841
"url(\\1$trans_image_path\\1)");
844
$add_attr_to_tag = Array(
846
Array('target'=>'"_blank"')
849
$trusted = tln_sanitize($body,
851
$rm_tags_with_content,