5
* This file was auto-generated by generate-includes.php and includes all of
6
* the core files required by HTML Purifier. Use this if performance is a
7
* primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS
8
* FILE, changes will be overwritten the next time the script is run.
13
* You must *not* include any other HTML Purifier files before this file,
14
* because 'require' not 'require_once' is used.
17
* This file requires that the include path contains the HTML Purifier
18
* library directory; this is not auto-set.
25
* HTML Purifier is an HTML filter that will take an arbitrary snippet of
26
* HTML and rigorously test, validate and filter it into a version that
27
* is safe for output onto webpages. It achieves this by:
29
* -# Lexing (parsing into tokens) the document,
30
* -# Executing various strategies on the tokens:
31
* -# Removing all elements not in the whitelist,
32
* -# Making the tokens well-formed,
33
* -# Fixing the nesting of the nodes, and
34
* -# Validating attributes of the nodes; and
35
* -# Generating HTML from the purified tokens.
37
* However, most users will only need to interface with the HTMLPurifier
38
* and HTMLPurifier_Config.
42
HTML Purifier 3.2.0 - Standards Compliant HTML Filtering
43
Copyright (C) 2006-2008 Edward Z. Yang
45
This library is free software; you can redistribute it and/or
46
modify it under the terms of the GNU Lesser General Public
47
License as published by the Free Software Foundation; either
48
version 2.1 of the License, or (at your option) any later version.
50
This library is distributed in the hope that it will be useful,
51
but WITHOUT ANY WARRANTY; without even the implied warranty of
52
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
53
Lesser General Public License for more details.
55
You should have received a copy of the GNU Lesser General Public
56
License along with this library; if not, write to the Free Software
57
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
61
/**
 * Facade that coordinates HTML Purifier's subsystems in order to purify HTML.
 *
 * @note There are several points in which configuration can be specified
 *       for HTML Purifier.  The precedence of these (from lowest to
 *       highest) is as follows:
 *          -# Instance: new HTMLPurifier($config)
 *          -# Invocation: purify($html, $config)
 *       These configurations are entirely independent of each other and
 *       are *not* merged (this behavior may change in the future).
 *
 * @todo We need an easier way to inject strategies using the configuration
 *       object.
 */
class HTMLPurifier
{

    /** Version of HTML Purifier */
    public $version = '3.2.0';

    /** Constant with version of HTML Purifier */
    const VERSION = '3.2.0';

    /** Global configuration object */
    public $config;

    /** Array of extra HTMLPurifier_Filter objects to run on HTML, for backwards compatibility */
    private $filters = array();

    /** Single instance of HTML Purifier */
    private static $instance;

    protected $strategy, $generator;

    /**
     * Resultant HTMLPurifier_Context of last run purification. Is an array
     * of contexts if the last called method was purifyArray().
     */
    public $context;

    /**
     * Initializes the purifier.
     * @param $config Optional HTMLPurifier_Config object for all instances of
     *                the purifier, if omitted, a default configuration is
     *                supplied (which can be overridden on a per-use basis).
     *                The parameter can also be any type that
     *                HTMLPurifier_Config::create() supports.
     */
    public function __construct($config = null) {

        $this->config = HTMLPurifier_Config::create($config);

        $this->strategy = new HTMLPurifier_Strategy_Core();

    }

    /**
     * Adds a filter to process the output. First come first serve
     * @param $filter HTMLPurifier_Filter object
     * @note Deprecated: emits an E_USER_WARNING on every call, then still
     *       queues the filter so legacy callers keep working.
     */
    public function addFilter($filter) {
        trigger_error('HTMLPurifier->addFilter() is deprecated, use configuration directives in the Filter namespace or Filter.Custom', E_USER_WARNING);
        $this->filters[] = $filter;
    }

    /**
     * Filters an HTML snippet/document to be XSS-free and standards-compliant.
     *
     * @param $html String of HTML to purify
     * @param $config HTMLPurifier_Config object for this operation, if omitted,
     *                defaults to the config object specified during this
     *                object's construction. The parameter can also be any type
     *                that HTMLPurifier_Config::create() supports.
     * @return Purified HTML
     */
    public function purify($html, $config = null) {

        // :TODO: make the config merge in, instead of replace
        $config = $config ? HTMLPurifier_Config::create($config) : $this->config;

        // implementation is partially environment dependent, partially
        // configuration dependent
        $lexer = HTMLPurifier_Lexer::create($config);

        $context = new HTMLPurifier_Context();

        // setup HTML generator
        $this->generator = new HTMLPurifier_Generator($config, $context);
        $context->register('Generator', $this->generator);

        // set up global context variables
        if ($config->get('Core', 'CollectErrors')) {
            // may get moved out if other facilities use it
            $language_factory = HTMLPurifier_LanguageFactory::instance();
            $language = $language_factory->create($config, $context);
            $context->register('Locale', $language);

            $error_collector = new HTMLPurifier_ErrorCollector($context);
            $context->register('ErrorCollector', $error_collector);
        }

        // setup id_accumulator context, necessary due to the fact that
        // AttrValidator can be called from many places
        $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
        $context->register('IDAccumulator', $id_accumulator);

        $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);

        // setup filters: flag-enabled built-ins first, then Filter.Custom
        // objects, then anything queued through the deprecated addFilter()
        $filter_flags = $config->getBatch('Filter');
        $custom_filters = $filter_flags['Custom'];
        unset($filter_flags['Custom']);
        $filters = array();
        foreach ($filter_flags as $filter => $flag) {
            if (!$flag) continue;
            $class = "HTMLPurifier_Filter_$filter";
            $filters[] = new $class;
        }
        foreach ($custom_filters as $filter) {
            // maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat
            $filters[] = $filter;
        }
        $filters = array_merge($filters, $this->filters);
        // maybe prepare(), but later

        for ($i = 0, $filter_size = count($filters); $i < $filter_size; $i++) {
            $html = $filters[$i]->preFilter($html, $config, $context);
        }

        // purified HTML
        $html =
            $this->generator->generateFromTokens(
                // list of tokens
                $this->strategy->execute(
                    // list of un-purified tokens
                    $lexer->tokenizeHTML(
                        // un-purified HTML
                        $html, $config, $context
                    ),
                    $config, $context
                )
            );

        // post-filters run in the reverse order of the pre-filters
        for ($i = $filter_size - 1; $i >= 0; $i--) {
            $html = $filters[$i]->postFilter($html, $config, $context);
        }

        $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
        $this->context =& $context;
        return $html;
    }

    /**
     * Filters an array of HTML snippets
     * @param $array_of_html Array of HTML strings; keys are preserved
     * @param $config Optional HTMLPurifier_Config object for this operation.
     *                See HTMLPurifier::purify() for more details.
     * @return Array of purified HTML
     */
    public function purifyArray($array_of_html, $config = null) {
        $context_array = array();
        foreach ($array_of_html as $key => $html) {
            $array_of_html[$key] = $this->purify($html, $config);
            $context_array[$key] = $this->context;
        }
        // expose one context per input, indexed like the input array
        $this->context = $context_array;
        return $array_of_html;
    }

    /**
     * Singleton for enforcing just one HTML Purifier in your system
     * @param $prototype Optional prototype HTMLPurifier instance to
     *                   overload singleton with, or HTMLPurifier_Config
     *                   instance to configure the generated version with.
     */
    public static function instance($prototype = null) {
        if (!self::$instance || $prototype) {
            if ($prototype instanceof HTMLPurifier) {
                self::$instance = $prototype;
            } elseif ($prototype) {
                self::$instance = new HTMLPurifier($prototype);
            } else {
                self::$instance = new HTMLPurifier();
            }
        }
        return self::$instance;
    }

    /**
     * @note Backwards compatibility, see instance()
     */
    public static function getInstance($prototype = null) {
        return HTMLPurifier::instance($prototype);
    }

}
259
/**
 * Defines common attribute collections that modules reference
 */
class HTMLPurifier_AttrCollections
{

    /**
     * Associative array of attribute collections, indexed by name
     */
    public $info = array();

    /**
     * Performs all expansions on internal data for use by other inclusions
     * It also collects all attribute collection extensions from
     * modules
     * @param $attr_types HTMLPurifier_AttrTypes instance
     * @param $modules Hash array of HTMLPurifier_HTMLModule members
     */
    public function __construct($attr_types, $modules) {
        // load extensions from the modules
        foreach ($modules as $module) {
            foreach ($module->attr_collections as $coll_i => $coll) {
                if (!isset($this->info[$coll_i])) {
                    $this->info[$coll_i] = array();
                }
                foreach ($coll as $attr_i => $attr) {
                    if ($attr_i === 0 && isset($this->info[$coll_i][$attr_i])) {
                        // index 0 is the inclusion list: merge instead of
                        // overwriting what an earlier module declared
                        $this->info[$coll_i][$attr_i] = array_merge(
                            $this->info[$coll_i][$attr_i], $attr);
                        continue;
                    }
                    $this->info[$coll_i][$attr_i] = $attr;
                }
            }
        }
        // perform internal expansions and inclusions
        foreach ($this->info as $name => $attr) {
            // merge attribute collections that include others
            $this->performInclusions($this->info[$name]);
            // replace string identifiers with actual attribute objects
            $this->expandIdentifiers($this->info[$name], $attr_types);
        }
    }

    /**
     * Takes a reference to an attribute associative array and performs
     * all inclusions specified by the zero index.
     * @param &$attr Reference to attribute array
     */
    public function performInclusions(&$attr) {
        if (!isset($attr[0])) return;
        $merge = $attr[0];
        $seen  = array(); // recursion guard
        // loop through all the inclusions; $merge may grow while we iterate
        for ($i = 0; isset($merge[$i]); $i++) {
            if (isset($seen[$merge[$i]])) continue;
            $seen[$merge[$i]] = true;
            // foreach attribute of the inclusion, copy it over
            if (!isset($this->info[$merge[$i]])) continue;
            foreach ($this->info[$merge[$i]] as $key => $value) {
                if (isset($attr[$key])) continue; // also catches more inclusions
                $attr[$key] = $value;
            }
            if (isset($this->info[$merge[$i]][0])) {
                // recursion: queue the inclusions of the included collection
                $merge = array_merge($merge, $this->info[$merge[$i]][0]);
            }
        }
        // the inclusion list has been consumed
        unset($attr[0]);
    }

    /**
     * Expands all string identifiers in an attribute array by replacing
     * them with the appropriate values inside HTMLPurifier_AttrTypes
     * @param &$attr Reference to attribute array
     * @param $attr_types HTMLPurifier_AttrTypes instance
     */
    public function expandIdentifiers(&$attr, $attr_types) {

        // because foreach will process new elements we add, make sure we
        // skip duplicates
        $processed = array();

        foreach ($attr as $def_i => $def) {
            // skip inclusions
            if ($def_i === 0) continue;

            if (isset($processed[$def_i])) continue;

            // determine whether or not attribute is required
            if ($required = (strpos($def_i, '*') !== false)) {
                // rename the definition
                unset($attr[$def_i]);
                $def_i = trim($def_i, '*');
                $attr[$def_i] = $def;
            }

            $processed[$def_i] = true;

            // if we've already got a literal object, move on
            if (is_object($def)) {
                // preserve previous required
                $attr[$def_i]->required = ($required || $attr[$def_i]->required);
                continue;
            }

            // an explicit false removes the attribute
            if ($def === false) {
                unset($attr[$def_i]);
                continue;
            }

            if ($t = $attr_types->get($def)) {
                $attr[$def_i] = $t;
                $attr[$def_i]->required = $required;
            } else {
                // unknown type: drop the attribute
                unset($attr[$def_i]);
            }
        }

    }

}
387
/**
 * Base class for all validating attribute definitions.
 *
 * This family of classes forms the core for not only HTML attribute validation,
 * but also any sort of string that needs to be validated or cleaned (which
 * means CSS properties and composite definitions are defined here too).
 * Besides defining (through code) what precisely makes the string valid,
 * subclasses are also responsible for cleaning the code if possible.
 */
abstract class HTMLPurifier_AttrDef
{

    /**
     * Tells us whether or not an HTML attribute is minimized. Has no
     * meaning in other contexts.
     */
    public $minimized = false;

    /**
     * Tells us whether or not an HTML attribute is required. Has no
     * meaning in other contexts
     */
    public $required = false;

    /**
     * Validates and cleans passed string according to a definition.
     *
     * @param $string String to be validated and cleaned.
     * @param $config Mandatory HTMLPurifier_Config object.
     * @param $context Mandatory HTMLPurifier_AttrContext object.
     */
    abstract public function validate($string, $config, $context);

    /**
     * Convenience method that parses a string as if it were CDATA.
     *
     * This method processes a string in the manner of
     * <http://www.w3.org/TR/html4/types.html#h-6.2>: leading and trailing
     * whitespace is trimmed, then every remaining line feed, tab and
     * carriage return is replaced with a single space. While most useful
     * for HTML attributes specified as CDATA, it can also be applied to
     * most CSS values.
     *
     * @note This method is not entirely standards compliant, as trim() removes
     *       more types of whitespace than specified in the spec. In practice,
     *       this is rarely a problem, as those extra characters usually have
     *       already been removed by HTMLPurifier_Encoder.
     * @note The referenced spec says line feeds are to be ignored; this
     *       implementation turns them into spaces instead.
     *
     * @warning This processing is inconsistent with XML's whitespace handling
     *          as specified by section 3.3.3 and referenced XHTML 1.0 section
     *          4.7.  However, note that we are NOT necessarily
     *          parsing XML, thus, this behavior may still be correct. We
     *          assume that newlines have been normalized.
     *
     * @param $string String to normalize
     * @return Normalized string
     */
    public function parseCDATA($string) {
        $string = trim($string);
        $string = str_replace(array("\n", "\t", "\r"), ' ', $string);
        return $string;
    }

    /**
     * Factory method for creating this class from a string.
     * @param $string String construction info
     * @return Created AttrDef object corresponding to $string
     */
    public function make($string) {
        // default implementation, return a flyweight of this object.
        // If $string has an effect on the returned object (i.e. you
        // need to overload this method), it is best
        // to clone or instantiate new copies. (Instantiation is safer.)
        return $this;
    }

    /**
     * Removes spaces from rgb(0, 0, 0) so that shorthand CSS properties work
     * properly. THIS IS A HACK!
     * @param $string String that may contain rgb() triples
     * @return String with whitespace around the commas of each rgb() removed
     */
    protected function mungeRgb($string) {
        return preg_replace('/rgb\((\d+)\s*,\s*(\d+)\s*,\s*(\d+)\)/', 'rgb(\1,\2,\3)', $string);
    }

}
474
/**
 * Processes an entire attribute array for corrections needing multiple values.
 *
 * Occasionally, a certain attribute will need to be removed and popped onto
 * another value.  Instead of creating a complex return syntax for
 * HTMLPurifier_AttrDef, we just pass the whole attribute array to a
 * specialized object and have that do the special work.  That is the
 * family of HTMLPurifier_AttrTransform.
 *
 * An attribute transformation can be assigned to run before or after
 * HTMLPurifier_AttrDef validation.  See HTMLPurifier_HTMLDefinition for
 * more details.
 */
abstract class HTMLPurifier_AttrTransform
{

    /**
     * Abstract: makes changes to the attributes dependent on multiple values.
     *
     * @param $attr Assoc array of attributes, usually from
     *              HTMLPurifier_Token_Tag::$attr
     * @param $config Mandatory HTMLPurifier_Config object.
     * @param $context Mandatory HTMLPurifier_Context object
     * @returns Processed attribute array.
     */
    abstract public function transform($attr, $config, $context);

    /**
     * Prepends CSS properties to the style attribute, creating the
     * attribute if it doesn't exist.
     * @param $attr Attribute array to process (passed by reference)
     * @param $css CSS to prepend
     */
    public function prependCSS(&$attr, $css) {
        $attr['style'] = isset($attr['style']) ? $attr['style'] : '';
        $attr['style'] = $css . $attr['style'];
    }

    /**
     * Retrieves and removes an attribute
     * @param $attr Attribute array to process (passed by reference)
     * @param $key Key of attribute to confiscate
     * @return The removed value, or null when the key is not set
     */
    public function confiscateAttr(&$attr, $key) {
        if (!isset($attr[$key])) return null;
        $value = $attr[$key];
        unset($attr[$key]);
        return $value;
    }

}
530
/**
 * Provides lookup array of attribute types to HTMLPurifier_AttrDef objects
 */
class HTMLPurifier_AttrTypes
{
    /**
     * Lookup array of attribute string identifiers to concrete implementations
     */
    protected $info = array();

    /**
     * Constructs the info array, supplying default implementations for attribute
     * types.
     */
    public function __construct() {
        // pseudo-types, must be instantiated via shorthand
        $this->info['Enum']    = new HTMLPurifier_AttrDef_Enum();
        $this->info['Bool']    = new HTMLPurifier_AttrDef_HTML_Bool();

        $this->info['CDATA']    = new HTMLPurifier_AttrDef_Text();
        $this->info['ID']       = new HTMLPurifier_AttrDef_HTML_ID();
        $this->info['Length']   = new HTMLPurifier_AttrDef_HTML_Length();
        $this->info['MultiLength'] = new HTMLPurifier_AttrDef_HTML_MultiLength();
        $this->info['NMTOKENS'] = new HTMLPurifier_AttrDef_HTML_Nmtokens();
        $this->info['Pixels']   = new HTMLPurifier_AttrDef_HTML_Pixels();
        $this->info['Text']     = new HTMLPurifier_AttrDef_Text();
        $this->info['URI']      = new HTMLPurifier_AttrDef_URI();
        $this->info['LanguageCode'] = new HTMLPurifier_AttrDef_Lang();
        $this->info['Color']    = new HTMLPurifier_AttrDef_HTML_Color();

        // unimplemented aliases
        $this->info['ContentType'] = new HTMLPurifier_AttrDef_Text();
        $this->info['ContentTypes'] = new HTMLPurifier_AttrDef_Text();
        $this->info['Charsets'] = new HTMLPurifier_AttrDef_Text();
        $this->info['Character'] = new HTMLPurifier_AttrDef_Text();

        // number is really a positive integer (one or more digits)
        // FIXME: ^^ not always, see start and value of list items
        $this->info['Number']   = new HTMLPurifier_AttrDef_Integer(false, false, true);
    }

    /**
     * Retrieves a type
     * @param $type String type name, optionally followed by '#' and extra
     *              construction info that is handed to the type's make()
     * @return Object AttrDef for type; nothing if the type is undefined
     *         (an E_USER_ERROR is triggered in that case)
     */
    public function get($type) {

        // determine if there is any extra info tacked on
        if (strpos($type, '#') !== false) list($type, $string) = explode('#', $type, 2);
        else $string = ''; // no suffix: make() still needs a defined argument

        if (!isset($this->info[$type])) {
            trigger_error('Cannot retrieve undefined attribute type ' . $type, E_USER_ERROR);
            return;
        }

        return $this->info[$type]->make($string);

    }

    /**
     * Sets a new implementation for a type
     * @param $type String type name
     * @param $impl Object AttrDef for type
     */
    public function set($type, $impl) {
        $this->info[$type] = $impl;
    }
}
605
/**
 * Validates the attributes of a token. Doesn't manage required attributes
 * very well. The only reason we factored this out was because RemoveForeignElements
 * also needed it besides ValidateAttributes.
 */
class HTMLPurifier_AttrValidator
{

    /**
     * Validates the attributes of a token, writing the cleaned attribute
     * array back onto the token.
     * @param $token Reference to token to validate. We require a reference
     *     because the operation this class performs on the token are
     *     not atomic, so the context CurrentToken needs to be updated
     *     throughout
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return The untouched token when it is neither a start nor an empty
     *         tag; nothing otherwise (the token is modified in place)
     */
    public function validateToken(&$token, &$config, $context) {

        $definition = $config->getHTMLDefinition();
        $e =& $context->get('ErrorCollector', true);

        // initialize IDAccumulator if necessary
        $ok =& $context->get('IDAccumulator', true);
        if (!$ok) {
            $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
            $context->register('IDAccumulator', $id_accumulator);
        }

        // initialize CurrentToken if necessary
        $current_token =& $context->get('CurrentToken', true);
        if (!$current_token) $context->register('CurrentToken', $token);

        if (
          !$token instanceof HTMLPurifier_Token_Start &&
          !$token instanceof HTMLPurifier_Token_Empty
        ) {
            // nothing to validate; undo our own CurrentToken registration so
            // no stale reference is left behind in the context
            if (!$current_token) $context->destroy('CurrentToken');
            return $token;
        }

        // create alias to global definition array, see also $defs
        // DEFINITION CALL
        $d_defs = $definition->info_global_attr;

        // don't update token until the very end, to ensure an atomic update
        $attr = $token->attr;

        // do global transformations (pre)
        // nothing currently utilizes this
        foreach ($definition->info_attr_transform_pre as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        // do local transformations only applicable to this element (pre)
        // ex. <p align="right"> to <p style="text-align:right;">
        foreach ($definition->info[$token->name]->attr_transform_pre as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        // create alias to this element's attribute definition array, see
        // also $d_defs (global attribute definition array)
        // DEFINITION CALL
        $defs = $definition->info[$token->name]->attr;

        // registered by reference, so the context follows the loop variable
        $attr_key = false;
        $context->register('CurrentAttr', $attr_key);

        // iterate through all the attribute keypairs
        // Watch out for name collisions: $key has previously been used
        foreach ($attr as $attr_key => $value) {

            // call the definition
            if ( isset($defs[$attr_key]) ) {
                // there is a local definition defined
                if ($defs[$attr_key] === false) {
                    // We've explicitly been told not to allow this element.
                    // This is usually when there's a global definition
                    // that must be overridden.
                    // Theoretically speaking, we could have a
                    // AttrDef_DenyAll, but this is faster!
                    $result = false;
                } else {
                    // validate according to the element's definition
                    $result = $defs[$attr_key]->validate(
                                    $value, $config, $context
                               );
                }
            } elseif ( isset($d_defs[$attr_key]) ) {
                // there is a global definition defined, validate according
                // to the global definition
                $result = $d_defs[$attr_key]->validate(
                                $value, $config, $context
                           );
            } else {
                // system never heard of the attribute? DELETE!
                $result = false;
            }

            // put the results into effect
            if ($result === false || $result === null) {
                // this is a generic error message that should be replaced
                // with more specific ones when possible
                if ($e) $e->send(E_ERROR, 'AttrValidator: Attribute removed');

                // remove the attribute
                unset($attr[$attr_key]);
            } elseif (is_string($result)) {
                // generally, if a substitution is happening, there
                // was some sort of implicit correction going on. We'll
                // delegate it to the attribute classes to say exactly what.

                // simple substitution
                $attr[$attr_key] = $result;
            } else {
                // nothing happens
            }

            // we'd also want slightly more complicated substitution
            // involving an array as the return value,
            // although we're not sure how colliding attributes would
            // resolve (certain ones would be completely overridden,
            // others would prepend themselves).
        }

        $context->destroy('CurrentAttr');

        // post transforms

        // global (error reporting untested)
        foreach ($definition->info_attr_transform_post as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        // local (error reporting untested)
        foreach ($definition->info[$token->name]->attr_transform_post as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        $token->attr = $attr;

        // destroy CurrentToken if we made it ourselves
        if (!$current_token) $context->destroy('CurrentToken');

    }

}
766
// constants are slow, so we use as few as possible
if (!defined('HTMLPURIFIER_PREFIX')) {
    define('HTMLPURIFIER_PREFIX', dirname(__FILE__) . '/standalone');
    // make the bundled library directory take precedence on the include path
    set_include_path(HTMLPURIFIER_PREFIX . PATH_SEPARATOR . get_include_path());
}

// accommodations for versions earlier than 5.0.2
// borrowed from PHP_Compat, LGPL licensed, by Aidan Lister <aidan@php.net>
if (!defined('PHP_EOL')) {
    switch (strtoupper(substr(PHP_OS, 0, 3))) {
        case 'WIN':
            define('PHP_EOL', "\r\n");
            break;
        case 'DAR':
            define('PHP_EOL', "\r");
            break;
        default:
            define('PHP_EOL', "\n");
    }
}
788
/**
 * Bootstrap class that contains meta-functionality for HTML Purifier such as
 * the autoload function.
 *
 * @note
 *      This class may be used without any other files from HTML Purifier.
 */
class HTMLPurifier_Bootstrap
{

    /**
     * Autoload function for HTML Purifier
     * @param $class Class to load
     * @return True if a file was required for the class, false if the class
     *         does not map to an existing HTML Purifier file
     */
    public static function autoload($class) {
        $file = HTMLPurifier_Bootstrap::getPath($class);
        if (!$file) return false;
        require HTMLPURIFIER_PREFIX . '/' . $file;
        return true;
    }

    /**
     * Returns the path for a specific class.
     * @param $class Class name
     * @return Path relative to HTMLPURIFIER_PREFIX, or false when the name
     *         does not start with 'HTMLPurifier' or the file does not exist
     */
    public static function getPath($class) {
        if (strncmp('HTMLPurifier', $class, 12) !== 0) return false;
        // Custom implementations
        if (strncmp('HTMLPurifier_Language_', $class, 22) === 0) {
            // language classes live in one directory, named by language code
            $code = str_replace('_', '-', substr($class, 22));
            $file = 'HTMLPurifier/Language/classes/' . $code . '.php';
        } else {
            $file = str_replace('_', '/', $class) . '.php';
        }
        if (!file_exists(HTMLPURIFIER_PREFIX . '/' . $file)) return false;
        return $file;
    }

    /**
     * "Pre-registers" our autoloader on the SPL stack.
     */
    public static function registerAutoload() {
        $autoload = array('HTMLPurifier_Bootstrap', 'autoload');
        if ( ($funcs = spl_autoload_functions()) === false ) {
            spl_autoload_register($autoload);
        } elseif (function_exists('spl_autoload_unregister')) {
            if (version_compare(PHP_VERSION, '5.3.0', '>=')) {
                // the prepend flag exists from PHP 5.3 on, so there is no
                // need to tear down and rebuild the existing stack
                spl_autoload_register($autoload, true, true);
                return;
            }
            $compat = version_compare(PHP_VERSION, '5.1.2', '<=') &&
                      version_compare(PHP_VERSION, '5.1.0', '>=');
            foreach ($funcs as $func) {
                if (is_array($func)) {
                    // :TRICKY: There are some compatibility issues and some
                    // places where we need to error out
                    $reflector = new ReflectionMethod($func[0], $func[1]);
                    if (!$reflector->isStatic()) {
                        throw new Exception('
                            HTML Purifier autoloader registrar is not compatible
                            with non-static object methods due to PHP Bug #44144;
                            Please do not use HTMLPurifier.autoload.php (or any
                            file that includes this file); instead, place the code:
                            spl_autoload_register(array(\'HTMLPurifier_Bootstrap\', \'autoload\'))
                            after your own autoloaders.
                        ');
                    }
                    // Surprisingly, spl_autoload_register supports the
                    // Class::staticMethod callback format, although call_user_func doesn't
                    if ($compat) $func = implode('::', $func);
                }
                spl_autoload_unregister($func);
            }
            // ours goes first, then the previous autoloaders in their old order
            spl_autoload_register($autoload);
            foreach ($funcs as $func) spl_autoload_register($func);
        }
    }

}
865
/**
 * Super-class for definition datatype objects, implements serialization
 * functions for the class.
 */
abstract class HTMLPurifier_Definition
{

    /**
     * Has setup() been called yet?
     */
    public $setup = false;

    /**
     * What type of definition is it?
     */
    public $type;

    /**
     * Sets up the definition object into the final form, something
     * not done by the constructor
     * @param $config HTMLPurifier_Config instance
     */
    abstract protected function doSetup($config);

    /**
     * Setup function that aborts if already setup
     * @param $config HTMLPurifier_Config instance
     */
    public function setup($config) {
        if ($this->setup) return;
        // flag first, so a nested setup() call made during doSetup() is a no-op
        $this->setup = true;
        $this->doSetup($config);
    }

}
904
* Defines allowed CSS attributes and what their values are.
905
* @see HTMLPurifier_HTMLDefinition
907
class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
910
public $type = 'CSS';
913
* Assoc array of attribute name to definition object.
915
public $info = array();
918
* Constructs the info array. The meat of this class.
920
protected function doSetup($config) {
922
$this->info['text-align'] = new HTMLPurifier_AttrDef_Enum(
923
array('left', 'right', 'center', 'justify'), false);
926
$this->info['border-bottom-style'] =
927
$this->info['border-right-style'] =
928
$this->info['border-left-style'] =
929
$this->info['border-top-style'] = new HTMLPurifier_AttrDef_Enum(
930
array('none', 'hidden', 'dotted', 'dashed', 'solid', 'double',
931
'groove', 'ridge', 'inset', 'outset'), false);
933
$this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_style);
935
$this->info['clear'] = new HTMLPurifier_AttrDef_Enum(
936
array('none', 'left', 'right', 'both'), false);
937
$this->info['float'] = new HTMLPurifier_AttrDef_Enum(
938
array('none', 'left', 'right'), false);
939
$this->info['font-style'] = new HTMLPurifier_AttrDef_Enum(
940
array('normal', 'italic', 'oblique'), false);
941
$this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum(
942
array('normal', 'small-caps'), false);
944
$uri_or_none = new HTMLPurifier_AttrDef_CSS_Composite(
946
new HTMLPurifier_AttrDef_Enum(array('none')),
947
new HTMLPurifier_AttrDef_CSS_URI()
951
$this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum(
952
array('inside', 'outside'), false);
953
$this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum(
954
array('disc', 'circle', 'square', 'decimal', 'lower-roman',
955
'upper-roman', 'lower-alpha', 'upper-alpha', 'none'), false);
956
$this->info['list-style-image'] = $uri_or_none;
958
$this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config);
960
$this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum(
961
array('capitalize', 'uppercase', 'lowercase', 'none'), false);
962
$this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color();
964
$this->info['background-image'] = $uri_or_none;
965
$this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum(
966
array('repeat', 'repeat-x', 'repeat-y', 'no-repeat')
968
$this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum(
969
array('scroll', 'fixed')
971
$this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition();
974
$this->info['border-top-color'] =
975
$this->info['border-bottom-color'] =
976
$this->info['border-left-color'] =
977
$this->info['border-right-color'] =
978
$this->info['background-color'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
979
new HTMLPurifier_AttrDef_Enum(array('transparent')),
980
new HTMLPurifier_AttrDef_CSS_Color()
983
$this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config);
985
$this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_color);
988
$this->info['border-top-width'] =
989
$this->info['border-bottom-width'] =
990
$this->info['border-left-width'] =
991
$this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
992
new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')),
993
new HTMLPurifier_AttrDef_CSS_Length('0') //disallow negative
996
$this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_width);
998
$this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
999
new HTMLPurifier_AttrDef_Enum(array('normal')),
1000
new HTMLPurifier_AttrDef_CSS_Length()
1003
$this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1004
new HTMLPurifier_AttrDef_Enum(array('normal')),
1005
new HTMLPurifier_AttrDef_CSS_Length()
1008
$this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1009
new HTMLPurifier_AttrDef_Enum(array('xx-small', 'x-small',
1010
'small', 'medium', 'large', 'x-large', 'xx-large',
1011
'larger', 'smaller')),
1012
new HTMLPurifier_AttrDef_CSS_Percentage(),
1013
new HTMLPurifier_AttrDef_CSS_Length()
1016
$this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1017
new HTMLPurifier_AttrDef_Enum(array('normal')),
1018
new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives
1019
new HTMLPurifier_AttrDef_CSS_Length('0'),
1020
new HTMLPurifier_AttrDef_CSS_Percentage(true)
1024
$this->info['margin-top'] =
1025
$this->info['margin-bottom'] =
1026
$this->info['margin-left'] =
1027
$this->info['margin-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1028
new HTMLPurifier_AttrDef_CSS_Length(),
1029
new HTMLPurifier_AttrDef_CSS_Percentage(),
1030
new HTMLPurifier_AttrDef_Enum(array('auto'))
1033
$this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($margin);
1037
$this->info['padding-top'] =
1038
$this->info['padding-bottom'] =
1039
$this->info['padding-left'] =
1040
$this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1041
new HTMLPurifier_AttrDef_CSS_Length('0'),
1042
new HTMLPurifier_AttrDef_CSS_Percentage(true)
1045
$this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($padding);
1047
$this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1048
new HTMLPurifier_AttrDef_CSS_Length(),
1049
new HTMLPurifier_AttrDef_CSS_Percentage()
1052
$trusted_wh = new HTMLPurifier_AttrDef_CSS_Composite(array(
1053
new HTMLPurifier_AttrDef_CSS_Length('0'),
1054
new HTMLPurifier_AttrDef_CSS_Percentage(true),
1055
new HTMLPurifier_AttrDef_Enum(array('auto'))
1057
$max = $config->get('CSS', 'MaxImgLength');
1059
$this->info['width'] =
1060
$this->info['height'] =
1063
new HTMLPurifier_AttrDef_Switch('img',
1065
new HTMLPurifier_AttrDef_CSS_Composite(array(
1066
new HTMLPurifier_AttrDef_CSS_Length('0', $max),
1067
new HTMLPurifier_AttrDef_Enum(array('auto'))
1069
// For everyone else:
1073
$this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();
1075
$this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily();
1077
// this could use specialized code
1078
$this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum(
1079
array('normal', 'bold', 'bolder', 'lighter', '100', '200', '300',
1080
'400', '500', '600', '700', '800', '900'), false);
1082
// MUST be called after other font properties, as it references
1083
// a CSSDefinition object
1084
$this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config);
1087
$this->info['border'] =
1088
$this->info['border-bottom'] =
1089
$this->info['border-top'] =
1090
$this->info['border-left'] =
1091
$this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config);
1093
$this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(array(
1094
'collapse', 'separate'));
1096
$this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(array(
1099
$this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(array(
1102
$this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1103
new HTMLPurifier_AttrDef_Enum(array('baseline', 'sub', 'super',
1104
'top', 'text-top', 'middle', 'bottom', 'text-bottom')),
1105
new HTMLPurifier_AttrDef_CSS_Length(),
1106
new HTMLPurifier_AttrDef_CSS_Percentage()
1109
$this->info['border-spacing'] = new HTMLPurifier_AttrDef_CSS_Multiple(new HTMLPurifier_AttrDef_CSS_Length(), 2);
1112
$this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(array('nowrap'));
1114
if ($config->get('CSS', 'Proprietary')) {
1115
$this->doSetupProprietary($config);
1118
if ($config->get('CSS', 'AllowTricky')) {
1119
$this->doSetupTricky($config);
1122
$allow_important = $config->get('CSS', 'AllowImportant');
1123
// wrap all attr-defs with decorator that handles !important
1124
foreach ($this->info as $k => $v) {
1125
$this->info[$k] = new HTMLPurifier_AttrDef_CSS_ImportantDecorator($v, $allow_important);
1128
$this->setupConfigStuff($config);
1131
protected function doSetupProprietary($config) {
1132
// Internet Explorer only scrollbar colors
1133
$this->info['scrollbar-arrow-color'] = new HTMLPurifier_AttrDef_CSS_Color();
1134
$this->info['scrollbar-base-color'] = new HTMLPurifier_AttrDef_CSS_Color();
1135
$this->info['scrollbar-darkshadow-color'] = new HTMLPurifier_AttrDef_CSS_Color();
1136
$this->info['scrollbar-face-color'] = new HTMLPurifier_AttrDef_CSS_Color();
1137
$this->info['scrollbar-highlight-color'] = new HTMLPurifier_AttrDef_CSS_Color();
1138
$this->info['scrollbar-shadow-color'] = new HTMLPurifier_AttrDef_CSS_Color();
1140
// technically not proprietary, but CSS3, and no one supports it
1141
$this->info['opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
1142
$this->info['-moz-opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
1143
$this->info['-khtml-opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
1145
// only opacity, for now
1146
$this->info['filter'] = new HTMLPurifier_AttrDef_CSS_Filter();
1150
protected function doSetupTricky($config) {
1151
$this->info['display'] = new HTMLPurifier_AttrDef_Enum(array(
1152
'inline', 'block', 'list-item', 'run-in', 'compact',
1153
'marker', 'table', 'inline-table', 'table-row-group',
1154
'table-header-group', 'table-footer-group', 'table-row',
1155
'table-column-group', 'table-column', 'table-cell', 'table-caption', 'none'
1157
$this->info['visibility'] = new HTMLPurifier_AttrDef_Enum(array(
1158
'visible', 'hidden', 'collapse'
1164
* Performs extra config-based processing. Based off of
1165
* HTMLPurifier_HTMLDefinition.
1166
* @todo Refactor duplicate elements into common class (probably using
1167
* composition, not inheritance).
1169
protected function setupConfigStuff($config) {
1171
// setup allowed elements
1172
$support = "(for information on implementing this, see the ".
1174
$allowed_attributes = $config->get('CSS', 'AllowedProperties');
1175
if ($allowed_attributes !== null) {
1176
foreach ($this->info as $name => $d) {
1177
if(!isset($allowed_attributes[$name])) unset($this->info[$name]);
1178
unset($allowed_attributes[$name]);
1181
foreach ($allowed_attributes as $name => $d) {
1182
// :TODO: Is this htmlspecialchars() call really necessary?
1183
$name = htmlspecialchars($name);
1184
trigger_error("Style attribute '$name' is not supported $support", E_USER_WARNING);
1195
* Defines allowed child nodes and validates tokens against it.
1197
abstract class HTMLPurifier_ChildDef
1200
* Type of child definition, usually right-most part of class name lowercase.
1201
* Used occasionally in terms of context.
1206
* Bool that indicates whether or not an empty array of children is okay
1208
* This is necessary for redundant checking when changes affecting
1209
* a child node may cause a parent node to now be disallowed.
1211
public $allow_empty;
1214
* Lookup array of all elements that this definition could possibly allow
1216
public $elements = array();
1219
* Get lookup of tag names that should not close this element automatically.
1220
* All other elements will do so.
1222
public function getNonAutoCloseElements($config) {
1223
return $this->elements;
1227
* Validates nodes according to definition and returns modification.
1229
* @param $tokens_of_children Array of HTMLPurifier_Token
1230
* @param $config HTMLPurifier_Config object
1231
* @param $context HTMLPurifier_Context object
1232
* @return bool true to leave nodes as is
1233
* @return bool false to remove parent node
1234
* @return array of replacement child tokens
1236
abstract public function validateChildren($tokens_of_children, $config, $context);
1244
* Configuration object that triggers customizable behavior.
1246
* @warning This class is strongly defined: that means that the class
1247
* will fail if an undefined directive is retrieved or set.
1249
* @note Many classes that could (although many times don't) use the
1250
* configuration object make it a mandatory parameter. This is
1251
* because a configuration object should always be forwarded,
1252
* otherwise, you run the risk of missing a parameter and then
1253
* being stumped when a configuration directive doesn't work.
1255
* @todo Reconsider some of the public member variables
1257
class HTMLPurifier_Config
1261
* HTML Purifier's version
1263
public $version = '3.2.0';
1266
* Bool indicator whether or not to automatically finalize
1267
* the object if a read operation is done
1269
public $autoFinalize = true;
1271
// protected member variables
1274
* Namespace indexed array of serials for specific namespaces (see
1275
* getSerial() for more info).
1277
protected $serials = array();
1280
* Serial for entire configuration object
1285
* Two-level associative array of configuration directives
1290
* Parser for variables
1295
* Reference HTMLPurifier_ConfigSchema for value checking
1296
* @note This is public for introspective purposes. Please don't
1302
* Indexed array of definitions
1304
protected $definitions;
1307
* Bool indicator whether or not config is finalized
1309
protected $finalized = false;
1312
* @param $definition HTMLPurifier_ConfigSchema that defines what directives
1315
public function __construct($definition) {
1316
$this->conf = $definition->defaults; // set up, copy in defaults
1317
$this->def = $definition; // keep a copy around for checking
1318
$this->parser = new HTMLPurifier_VarParser_Flexible();
1322
* Convenience constructor that creates a config object based on a mixed var
1323
* @param mixed $config Variable that defines the state of the config
1324
* object. Can be: a HTMLPurifier_Config() object,
1325
* an array of directives based on loadArray(),
1326
* or a string filename of an ini file.
1327
* @param HTMLPurifier_ConfigSchema Schema object
1328
* @return Configured HTMLPurifier_Config object
1330
public static function create($config, $schema = null) {
1331
if ($config instanceof HTMLPurifier_Config) {
1336
$ret = HTMLPurifier_Config::createDefault();
1338
$ret = new HTMLPurifier_Config($schema);
1340
if (is_string($config)) $ret->loadIni($config);
1341
elseif (is_array($config)) $ret->loadArray($config);
1346
* Convenience constructor that creates a default configuration object.
1347
* @return Default HTMLPurifier_Config object.
1349
public static function createDefault() {
1350
$definition = HTMLPurifier_ConfigSchema::instance();
1351
$config = new HTMLPurifier_Config($definition);
1356
* Retrieves a value from the configuration.
1357
* @param $namespace String namespace
1358
* @param $key String key
1360
public function get($namespace, $key) {
1361
if (!$this->finalized && $this->autoFinalize) $this->finalize();
1362
if (!isset($this->def->info[$namespace][$key])) {
1363
// can't add % due to SimpleTest bug
1364
trigger_error('Cannot retrieve value of undefined directive ' . htmlspecialchars("$namespace.$key"),
1368
if (isset($this->def->info[$namespace][$key]->isAlias)) {
1369
$d = $this->def->info[$namespace][$key];
1370
trigger_error('Cannot get value from aliased directive, use real name ' . $d->namespace . '.' . $d->name,
1374
return $this->conf[$namespace][$key];
1378
* Retrieves an array of directives to values from a given namespace
1379
* @param $namespace String namespace
1381
public function getBatch($namespace) {
1382
if (!$this->finalized && $this->autoFinalize) $this->finalize();
1383
if (!isset($this->def->info[$namespace])) {
1384
trigger_error('Cannot retrieve undefined namespace ' . htmlspecialchars($namespace),
1388
return $this->conf[$namespace];
1392
* Returns a md5 signature of a segment of the configuration object
1393
* that uniquely identifies that particular configuration
1394
* @note Revision is handled specially and is removed from the batch
1395
* before processing!
1396
* @param $namespace Namespace to get serial for
1398
public function getBatchSerial($namespace) {
1399
if (empty($this->serials[$namespace])) {
1400
$batch = $this->getBatch($namespace);
1401
unset($batch['DefinitionRev']);
1402
$this->serials[$namespace] = md5(serialize($batch));
1404
return $this->serials[$namespace];
1408
* Returns a md5 signature for the entire configuration object
1409
* that uniquely identifies that particular configuration
1411
public function getSerial() {
1412
if (empty($this->serial)) {
1413
$this->serial = md5(serialize($this->getAll()));
1415
return $this->serial;
1419
* Retrieves all directives, organized by namespace
1421
public function getAll() {
1422
if (!$this->finalized && $this->autoFinalize) $this->finalize();
1427
* Sets a value to configuration.
1428
* @param $namespace String namespace
1429
* @param $key String key
1430
* @param $value Mixed value
1432
public function set($namespace, $key, $value, $from_alias = false) {
1433
if ($this->isFinalized('Cannot set directive after finalization')) return;
1434
if (!isset($this->def->info[$namespace][$key])) {
1435
trigger_error('Cannot set undefined directive ' . htmlspecialchars("$namespace.$key") . ' to value',
1439
$def = $this->def->info[$namespace][$key];
1441
if (isset($def->isAlias)) {
1443
trigger_error('Double-aliases not allowed, please fix '.
1444
'ConfigSchema bug with' . "$namespace.$key", E_USER_ERROR);
1447
$this->set($new_ns = $def->namespace,
1448
$new_dir = $def->name,
1450
trigger_error("$namespace.$key is an alias, preferred directive name is $new_ns.$new_dir", E_USER_NOTICE);
1454
// Raw type might be negative when using the fully optimized form
1455
// of stdclass, which indicates allow_null == true
1456
$rtype = is_int($def) ? $def : $def->type;
1462
$allow_null = isset($def->allow_null);
1466
$value = $this->parser->parse($value, $type, $allow_null);
1467
} catch (HTMLPurifier_VarParserException $e) {
1468
trigger_error('Value for ' . "$namespace.$key" . ' is of invalid type, should be ' . HTMLPurifier_VarParser::getTypeName($type), E_USER_WARNING);
1471
if (is_string($value) && is_object($def)) {
1472
// resolve value alias if defined
1473
if (isset($def->aliases[$value])) {
1474
$value = $def->aliases[$value];
1476
// check to see if the value is allowed
1477
if (isset($def->allowed) && !isset($def->allowed[$value])) {
1478
trigger_error('Value not supported, valid values are: ' .
1479
$this->_listify($def->allowed), E_USER_WARNING);
1483
$this->conf[$namespace][$key] = $value;
1485
// reset definitions if the directives they depend on changed
1486
// this is a very costly process, so it's discouraged
1487
// with finalization
1488
if ($namespace == 'HTML' || $namespace == 'CSS') {
1489
$this->definitions[$namespace] = null;
1492
$this->serials[$namespace] = false;
1496
* Convenience function for error reporting
1498
private function _listify($lookup) {
1500
foreach ($lookup as $name => $b) $list[] = $name;
1501
return implode(', ', $list);
1505
* Retrieves object reference to the HTML definition.
1506
* @param $raw Return a copy that has not been setup yet. Must be
1507
* called before it's been setup, otherwise won't work.
1509
public function getHTMLDefinition($raw = false) {
1510
return $this->getDefinition('HTML', $raw);
1514
* Retrieves object reference to the CSS definition
1515
* @param $raw Return a copy that has not been setup yet. Must be
1516
* called before it's been setup, otherwise won't work.
1518
public function getCSSDefinition($raw = false) {
1519
return $this->getDefinition('CSS', $raw);
1523
* Retrieves a definition
1524
* @param $type Type of definition: HTML, CSS, etc
1525
* @param $raw Whether or not definition should be returned raw
1527
public function getDefinition($type, $raw = false) {
1528
if (!$this->finalized && $this->autoFinalize) $this->finalize();
1529
$factory = HTMLPurifier_DefinitionCacheFactory::instance();
1530
$cache = $factory->create($type, $this);
1532
// see if we can quickly supply a definition
1533
if (!empty($this->definitions[$type])) {
1534
if (!$this->definitions[$type]->setup) {
1535
$this->definitions[$type]->setup($this);
1536
$cache->set($this->definitions[$type], $this);
1538
return $this->definitions[$type];
1540
// memory check missed, try cache
1541
$this->definitions[$type] = $cache->get($this);
1542
if ($this->definitions[$type]) {
1543
// definition in cache, return it
1544
return $this->definitions[$type];
1547
!empty($this->definitions[$type]) &&
1548
!$this->definitions[$type]->setup
1550
// raw requested, raw in memory, quick return
1551
return $this->definitions[$type];
1553
// quick checks failed, let's create the object
1554
if ($type == 'HTML') {
1555
$this->definitions[$type] = new HTMLPurifier_HTMLDefinition();
1556
} elseif ($type == 'CSS') {
1557
$this->definitions[$type] = new HTMLPurifier_CSSDefinition();
1558
} elseif ($type == 'URI') {
1559
$this->definitions[$type] = new HTMLPurifier_URIDefinition();
1561
throw new HTMLPurifier_Exception("Definition of $type type not supported");
1563
// quick abort if raw
1565
if (is_null($this->get($type, 'DefinitionID'))) {
1566
// fatally error out if definition ID not set
1567
throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
1569
return $this->definitions[$type];
1572
$this->definitions[$type]->setup($this);
1574
$cache->set($this->definitions[$type], $this);
1575
return $this->definitions[$type];
1579
* Loads configuration values from an array with the following structure:
1580
* Namespace.Directive => Value
1581
* @param $config_array Configuration associative array
1583
public function loadArray($config_array) {
1584
if ($this->isFinalized('Cannot load directives after finalization')) return;
1585
foreach ($config_array as $key => $value) {
1586
$key = str_replace('_', '.', $key);
1587
if (strpos($key, '.') !== false) {
1589
list($namespace, $directive) = explode('.', $key);
1590
$this->set($namespace, $directive, $value);
1593
$namespace_values = $value;
1594
foreach ($namespace_values as $directive => $value) {
1595
$this->set($namespace, $directive, $value);
1602
* Returns a list of array(namespace, directive) for all directives
1603
* that are allowed in a web-form context as per an allowed
1604
* namespaces/directives list.
1605
* @param $allowed List of allowed namespaces/directives
1607
public static function getAllowedDirectivesForForm($allowed, $schema = null) {
1609
$schema = HTMLPurifier_ConfigSchema::instance();
1611
if ($allowed !== true) {
1612
if (is_string($allowed)) $allowed = array($allowed);
1613
$allowed_ns = array();
1614
$allowed_directives = array();
1615
$blacklisted_directives = array();
1616
foreach ($allowed as $ns_or_directive) {
1617
if (strpos($ns_or_directive, '.') !== false) {
1619
if ($ns_or_directive[0] == '-') {
1620
$blacklisted_directives[substr($ns_or_directive, 1)] = true;
1622
$allowed_directives[$ns_or_directive] = true;
1626
$allowed_ns[$ns_or_directive] = true;
1631
foreach ($schema->info as $ns => $keypairs) {
1632
foreach ($keypairs as $directive => $def) {
1633
if ($allowed !== true) {
1634
if (isset($blacklisted_directives["$ns.$directive"])) continue;
1635
if (!isset($allowed_directives["$ns.$directive"]) && !isset($allowed_ns[$ns])) continue;
1637
if (isset($def->isAlias)) continue;
1638
if ($directive == 'DefinitionID' || $directive == 'DefinitionRev') continue;
1639
$ret[] = array($ns, $directive);
1646
* Loads configuration values from $_GET/$_POST that were posted
1648
* @param $array $_GET or $_POST array to import
1649
* @param $index Index/name that the config variables are in
1650
* @param $allowed List of allowed namespaces/directives
1651
* @param $mq_fix Boolean whether or not to enable magic quotes fix
1652
* @param $schema Instance of HTMLPurifier_ConfigSchema to use, if not global copy
1654
public static function loadArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
1655
$ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $schema);
1656
$config = HTMLPurifier_Config::create($ret, $schema);
1661
* Merges in configuration values from $_GET/$_POST to object. NOT STATIC.
1662
* @note Same parameters as loadArrayFromForm
1664
public function mergeArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true) {
1665
$ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $this->def);
1666
$this->loadArray($ret);
1670
* Prepares an array from a form into something usable for the more
1671
* strict parts of HTMLPurifier_Config
1673
public static function prepareArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
1674
if ($index !== false) $array = (isset($array[$index]) && is_array($array[$index])) ? $array[$index] : array();
1675
$mq = $mq_fix && function_exists('get_magic_quotes_gpc') && get_magic_quotes_gpc();
1677
$allowed = HTMLPurifier_Config::getAllowedDirectivesForForm($allowed, $schema);
1679
foreach ($allowed as $key) {
1680
list($ns, $directive) = $key;
1681
$skey = "$ns.$directive";
1682
if (!empty($array["Null_$skey"])) {
1683
$ret[$ns][$directive] = null;
1686
if (!isset($array[$skey])) continue;
1687
$value = $mq ? stripslashes($array[$skey]) : $array[$skey];
1688
$ret[$ns][$directive] = $value;
1694
* Loads configuration values from an ini file
1695
* @param $filename Name of ini file
1697
public function loadIni($filename) {
1698
if ($this->isFinalized('Cannot load directives after finalization')) return;
1699
$array = parse_ini_file($filename, true);
1700
$this->loadArray($array);
1704
* Checks whether or not the configuration object is finalized.
1705
* @param $error String error message, or false for no error
1707
public function isFinalized($error = false) {
1708
if ($this->finalized && $error) {
1709
trigger_error($error, E_USER_ERROR);
1711
return $this->finalized;
1715
* Finalizes configuration only if auto finalize is on and not
1718
public function autoFinalize() {
1719
if (!$this->finalized && $this->autoFinalize) $this->finalize();
1723
* Finalizes a configuration object, prohibiting further change
1725
public function finalize() {
1726
$this->finalized = true;
1737
* Configuration definition, defines directives and their defaults.
1739
class HTMLPurifier_ConfigSchema {
1742
* Defaults of the directives and namespaces.
1743
* @note This shares the exact same structure as HTMLPurifier_Config::$conf
1745
public $defaults = array();
1748
* Definition of the directives. The structure of this is:
1751
* 'Namespace' => array(
1752
* 'Directive' => new stdclass(),
1756
* The stdclass may have the following properties:
1758
* - If isAlias isn't set:
1759
* - type: Integer type of directive, see HTMLPurifier_VarParser for definitions
1760
* - allow_null: If set, this directive allows null values
1761
* - aliases: If set, an associative array of value aliases to real values
1762
* - allowed: If set, a lookup array of allowed (string) values
1763
* - If isAlias is set:
1764
* - namespace: Namespace this directive aliases to
1765
* - name: Directive name this directive aliases to
1767
* In certain degenerate cases, stdclass will actually be an integer. In
1768
* that case, the value is equivalent to an stdclass with the type
1769
* property set to the integer. If the integer is negative, type is
1770
* equal to the absolute value of integer, and allow_null is true.
1772
* This class is friendly with HTMLPurifier_Config. If you need introspection
1773
* about the schema, you're better off using the ConfigSchema_Interchange,
1774
* which uses more memory but has much richer information.
1776
public $info = array();
1779
* Application-wide singleton
1781
static protected $singleton;
1784
* Unserializes the default ConfigSchema.
1786
public static function makeFromSerial() {
1787
return unserialize(file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser'));
1791
* Retrieves an instance of the application-wide configuration definition.
1793
public static function instance($prototype = null) {
1794
if ($prototype !== null) {
1795
HTMLPurifier_ConfigSchema::$singleton = $prototype;
1796
} elseif (HTMLPurifier_ConfigSchema::$singleton === null || $prototype === true) {
1797
HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial();
1799
return HTMLPurifier_ConfigSchema::$singleton;
1803
* Defines a directive for configuration
1804
* @warning Will fail of directive's namespace is defined.
1805
* @warning This method's signature is slightly different from the legacy
1806
* define() static method! Beware!
1807
* @param $namespace Namespace the directive is in
1808
* @param $name Key of directive
1809
* @param $default Default value of directive
1810
* @param $type Allowed type of the directive. See
1811
* HTMLPurifier_DirectiveDef::$type for allowed values
1812
* @param $allow_null Whether or not to allow null values
1814
public function add($namespace, $name, $default, $type, $allow_null) {
1815
$obj = new stdclass();
1816
$obj->type = is_int($type) ? $type : HTMLPurifier_VarParser::$types[$type];
1817
if ($allow_null) $obj->allow_null = true;
1818
$this->info[$namespace][$name] = $obj;
1819
$this->defaults[$namespace][$name] = $default;
1823
* Defines a namespace for directives to be put into.
1824
* @warning This is slightly different from the corresponding static
1826
* @param $namespace Namespace's name
1828
public function addNamespace($namespace) {
1829
$this->info[$namespace] = array();
1830
$this->defaults[$namespace] = array();
1834
* Defines a directive value alias.
1836
* Directive value aliases are convenient for developers because it lets
1837
* them set a directive to several values and get the same result.
1838
* @param $namespace Directive's namespace
1839
* @param $name Name of Directive
1840
* @param $aliases Hash of aliased values to the real alias
1842
public function addValueAliases($namespace, $name, $aliases) {
1843
if (!isset($this->info[$namespace][$name]->aliases)) {
1844
$this->info[$namespace][$name]->aliases = array();
1846
foreach ($aliases as $alias => $real) {
1847
$this->info[$namespace][$name]->aliases[$alias] = $real;
1852
* Defines a set of allowed values for a directive.
1853
* @warning This is slightly different from the corresponding static
1854
* method definition.
1855
* @param $namespace Namespace of directive
1856
* @param $name Name of directive
1857
* @param $allowed Lookup array of allowed values
1859
public function addAllowedValues($namespace, $name, $allowed) {
1860
$this->info[$namespace][$name]->allowed = $allowed;
1864
* Defines a directive alias for backwards compatibility
1866
* @param $name Directive that will be aliased
1867
* @param $new_namespace
1868
* @param $new_name Directive that the alias will be to
1870
public function addAlias($namespace, $name, $new_namespace, $new_name) {
1871
$obj = new stdclass;
1872
$obj->namespace = $new_namespace;
1873
$obj->name = $new_name;
1874
$obj->isAlias = true;
1875
$this->info[$namespace][$name] = $obj;
1879
* Replaces any stdclass that only has the type property with type integer.
1881
public function postProcess() {
1882
foreach ($this->info as $namespace => $info) {
1883
foreach ($info as $directive => $v) {
1884
if (count((array) $v) == 1) {
1885
$this->info[$namespace][$directive] = $v->type;
1886
} elseif (count((array) $v) == 2 && isset($v->allow_null)) {
1887
$this->info[$namespace][$directive] = -$v->type;
1893
// DEPRECATED METHODS
1895
/** @see HTMLPurifier_ConfigSchema->add() */
1896
public static function define($namespace, $name, $default, $type, $description) {
1897
HTMLPurifier_ConfigSchema::deprecated(__METHOD__);
1898
$type_values = explode('/', $type, 2);
1899
$type = $type_values[0];
1900
$modifier = isset($type_values[1]) ? $type_values[1] : false;
1901
$allow_null = ($modifier === 'null');
1902
$def = HTMLPurifier_ConfigSchema::instance();
1903
$def->add($namespace, $name, $default, $type, $allow_null);
1906
/** @see HTMLPurifier_ConfigSchema->addNamespace() */
1907
public static function defineNamespace($namespace, $description) {
1908
HTMLPurifier_ConfigSchema::deprecated(__METHOD__);
1909
$def = HTMLPurifier_ConfigSchema::instance();
1910
$def->addNamespace($namespace);
1913
/** @see HTMLPurifier_ConfigSchema->addValueAliases() */
1914
public static function defineValueAliases($namespace, $name, $aliases) {
1915
HTMLPurifier_ConfigSchema::deprecated(__METHOD__);
1916
$def = HTMLPurifier_ConfigSchema::instance();
1917
$def->addValueAliases($namespace, $name, $aliases);
1920
/** @see HTMLPurifier_ConfigSchema->addAllowedValues() */
1921
public static function defineAllowedValues($namespace, $name, $allowed_values) {
1922
HTMLPurifier_ConfigSchema::deprecated(__METHOD__);
1924
foreach ($allowed_values as $value) {
1925
$allowed[$value] = true;
1927
$def = HTMLPurifier_ConfigSchema::instance();
1928
$def->addAllowedValues($namespace, $name, $allowed);
1931
/** @see HTMLPurifier_ConfigSchema->addAlias() */
1932
public static function defineAlias($namespace, $name, $new_namespace, $new_name) {
1933
HTMLPurifier_ConfigSchema::deprecated(__METHOD__);
1934
$def = HTMLPurifier_ConfigSchema::instance();
1935
$def->addAlias($namespace, $name, $new_namespace, $new_name);
1938
/** @deprecated, use HTMLPurifier_VarParser->parse() */
1939
public function validate($a, $b, $c = false) {
1940
trigger_error("HTMLPurifier_ConfigSchema->validate deprecated, use HTMLPurifier_VarParser->parse instead", E_USER_NOTICE);
1941
$parser = new HTMLPurifier_VarParser();
1942
return $parser->parse($a, $b, $c);
1946
* Throws an E_USER_NOTICE stating that a method is deprecated.
1948
private static function deprecated($method) {
1949
trigger_error("Static HTMLPurifier_ConfigSchema::$method deprecated, use add*() method instead", E_USER_NOTICE);
1961
class HTMLPurifier_ContentSets
1965
* List of content set strings (pipe separators) indexed by name.
1967
public $info = array();
1970
* List of content set lookups (element => true) indexed by name.
1971
* @note This is in HTMLPurifier_HTMLDefinition->info_content_sets
1973
public $lookup = array();
1976
* Synchronized list of defined content sets (keys of info)
1978
protected $keys = array();
1980
* Synchronized list of defined content values (values of info)
1982
protected $values = array();
1985
* Merges in module's content sets, expands identifiers in the content
1986
* sets and populates the keys, values and lookup member variables.
1987
* @param $modules List of HTMLPurifier_HTMLModule
1989
public function __construct($modules) {
1990
if (!is_array($modules)) $modules = array($modules);
1991
// populate content_sets based on module hints
1992
// sorry, no way of overloading
1993
foreach ($modules as $module_i => $module) {
1994
foreach ($module->content_sets as $key => $value) {
1995
$temp = $this->convertToLookup($value);
1996
if (isset($this->lookup[$key])) {
1997
// add it into the existing content set
1998
$this->lookup[$key] = array_merge($this->lookup[$key], $temp);
2000
$this->lookup[$key] = $temp;
2004
$old_lookup = false;
2005
while ($old_lookup !== $this->lookup) {
2006
$old_lookup = $this->lookup;
2007
foreach ($this->lookup as $i => $set) {
2009
foreach ($set as $element => $x) {
2010
if (isset($this->lookup[$element])) {
2011
$add += $this->lookup[$element];
2012
unset($this->lookup[$i][$element]);
2015
$this->lookup[$i] += $add;
2019
foreach ($this->lookup as $key => $lookup) {
2020
$this->info[$key] = implode(' | ', array_keys($lookup));
2022
$this->keys = array_keys($this->info);
2023
$this->values = array_values($this->info);
2027
* Accepts a definition; generates and assigns a ChildDef for it
2028
* @param $def HTMLPurifier_ElementDef reference
2029
* @param $module Module that defined the ElementDef
2031
public function generateChildDef(&$def, $module) {
2032
if (!empty($def->child)) return; // already done!
2033
$content_model = $def->content_model;
2034
if (is_string($content_model)) {
2035
// Assume that $this->keys is alphanumeric
2036
$def->content_model = preg_replace_callback(
2037
'/\b(' . implode('|', $this->keys) . ')\b/',
2038
array($this, 'generateChildDefCallback'),
2041
//$def->content_model = str_replace(
2042
// $this->keys, $this->values, $content_model);
2044
$def->child = $this->getChildDef($def, $module);
2047
public function generateChildDefCallback($matches) {
2048
return $this->info[$matches[0]];
2052
* Instantiates a ChildDef based on content_model and content_model_type
2053
* member variables in HTMLPurifier_ElementDef
2054
* @note This will also defer to modules for custom HTMLPurifier_ChildDef
2055
* subclasses that need content set expansion
2056
* @param $def HTMLPurifier_ElementDef to have ChildDef extracted
2057
* @return HTMLPurifier_ChildDef corresponding to ElementDef
2059
public function getChildDef($def, $module) {
2060
$value = $def->content_model;
2061
if (is_object($value)) {
2063
'Literal object child definitions should be stored in '.
2064
'ElementDef->child not ElementDef->content_model',
2069
switch ($def->content_model_type) {
2071
return new HTMLPurifier_ChildDef_Required($value);
2073
return new HTMLPurifier_ChildDef_Optional($value);
2075
return new HTMLPurifier_ChildDef_Empty();
2077
return new HTMLPurifier_ChildDef_Custom($value);
2079
// defer to its module
2081
if ($module->defines_child_def) { // save a func call
2082
$return = $module->getChildDef($def);
2084
if ($return !== false) return $return;
2087
'Could not determine which ChildDef class to instantiate',
2094
* Converts a string list of elements separated by pipes into
2096
* @param $string List of elements
2097
* @return Lookup array of elements
2099
protected function convertToLookup($string) {
2100
$array = explode('|', str_replace(' ', '', $string));
2102
foreach ($array as $i => $k) {
2114
* Registry object that contains information about the current context.
2115
* @warning Is a bit buggy when variables are set to null: it thinks
2116
* they don't exist! So use false instead, please.
2117
* @note Since the variables Context deals with may not be objects,
2118
* references are very important here! Do not remove!
2120
class HTMLPurifier_Context
2124
* Private array that stores the references.
2126
private $_storage = array();
2129
* Registers a variable into the context.
2130
* @param $name String name
2131
* @param $ref Reference to variable to be registered
2133
public function register($name, &$ref) {
2134
if (isset($this->_storage[$name])) {
2135
trigger_error("Name $name produces collision, cannot re-register",
2139
$this->_storage[$name] =& $ref;
2143
* Retrieves a variable reference from the context.
2144
* @param $name String name
2145
* @param $ignore_error Boolean whether or not to ignore error
2147
public function &get($name, $ignore_error = false) {
2148
if (!isset($this->_storage[$name])) {
2149
if (!$ignore_error) {
2150
trigger_error("Attempted to retrieve non-existent variable $name",
2153
$var = null; // so we can return by reference
2156
return $this->_storage[$name];
2160
* Destroys a variable in the context.
2161
* @param $name String name
2163
public function destroy($name) {
2164
if (!isset($this->_storage[$name])) {
2165
trigger_error("Attempted to destroy non-existent variable $name",
2169
unset($this->_storage[$name]);
2173
* Checks whether or not the variable exists.
2174
* @param $name String name
2176
public function exists($name) {
2177
return isset($this->_storage[$name]);
2181
* Loads a series of variables from an associative array
2182
* @param $context_array Assoc array of variables to load
2184
public function loadArray($context_array) {
2185
foreach ($context_array as $key => $discard) {
2186
$this->register($key, $context_array[$key]);
2196
* Abstract class representing Definition cache managers that implements
2197
* useful common methods and is a factory.
2198
* @todo Create a separate maintenance file advanced users can use to
2199
* cache their custom HTMLDefinition, which can be loaded
2200
* via a configuration directive
2201
* @todo Implement memcached
2203
abstract class HTMLPurifier_DefinitionCache
2209
* @param $type Type of definition objects this instance of the
2210
* cache will handle.
2212
public function __construct($type) {
2213
$this->type = $type;
2217
* Generates a unique identifier for a particular configuration
2218
* @param $config Instance of HTMLPurifier_Config
2220
public function generateKey($config) {
2221
return $config->version . ',' . // possibly replace with function calls
2222
$config->getBatchSerial($this->type) . ',' .
2223
$config->get($this->type, 'DefinitionRev');
2227
* Tests whether or not a key is old with respect to the configuration's
2228
* version and revision number.
2229
* @param $key Key to test
2230
* @param $config Instance of HTMLPurifier_Config to test against
2232
public function isOld($key, $config) {
2233
if (substr_count($key, ',') < 2) return true;
2234
list($version, $hash, $revision) = explode(',', $key, 3);
2235
$compare = version_compare($version, $config->version);
2236
// version mismatch, is always old
2237
if ($compare != 0) return true;
2238
// versions match, ids match, check revision number
2240
$hash == $config->getBatchSerial($this->type) &&
2241
$revision < $config->get($this->type, 'DefinitionRev')
2247
* Checks if a definition's type jives with the cache's type
2248
* @note Throws an error on failure
2249
* @param $def Definition object to check
2250
* @return Boolean true if good, false if not
2252
public function checkDefType($def) {
2253
if ($def->type !== $this->type) {
2254
trigger_error("Cannot use definition of type {$def->type} in cache for {$this->type}");
2261
* Adds a definition object to the cache
2263
abstract public function add($def, $config);
2266
* Unconditionally saves a definition object to the cache
2268
abstract public function set($def, $config);
2271
* Replace an object in the cache
2273
abstract public function replace($def, $config);
2276
* Retrieves a definition object from the cache
2278
abstract public function get($config);
2281
* Removes a definition object from the cache
2283
abstract public function remove($config);
2286
* Clears all objects from cache
2288
abstract public function flush($config);
2291
* Clears all expired (older version or revision) objects from cache
2292
* @note Be careful implementing this method as flush. Flush must
2293
* not interfere with other Definition types, and cleanup()
2294
* should not be repeatedly called by userland code.
2296
abstract public function cleanup($config);
2304
* Responsible for creating definition caches.
2306
class HTMLPurifier_DefinitionCacheFactory
2309
protected $caches = array('Serializer' => array());
2310
protected $implementations = array();
2311
protected $decorators = array();
2314
* Initialize default decorators
2316
public function setup() {
2317
$this->addDecorator('Cleanup');
2321
* Retrieves an instance of global definition cache factory.
2323
public static function instance($prototype = null) {
2325
if ($prototype !== null) {
2326
$instance = $prototype;
2327
} elseif ($instance === null || $prototype === true) {
2328
$instance = new HTMLPurifier_DefinitionCacheFactory();
2335
* Registers a new definition cache object
2336
* @param $short Short name of cache object, for reference
2337
* @param $long Full class name of cache object, for construction
2339
public function register($short, $long) {
2340
$this->implementations[$short] = $long;
2344
* Factory method that creates a cache object based on configuration
2345
* @param $type Name of definitions handled by cache
2346
* @param $config Instance of HTMLPurifier_Config
2348
public function create($type, $config) {
2349
$method = $config->get('Cache', 'DefinitionImpl');
2350
if ($method === null) {
2351
return new HTMLPurifier_DefinitionCache_Null($type);
2353
if (!empty($this->caches[$method][$type])) {
2354
return $this->caches[$method][$type];
2357
isset($this->implementations[$method]) &&
2358
class_exists($class = $this->implementations[$method], false)
2360
$cache = new $class($type);
2362
if ($method != 'Serializer') {
2363
trigger_error("Unrecognized DefinitionCache $method, using Serializer instead", E_USER_WARNING);
2365
$cache = new HTMLPurifier_DefinitionCache_Serializer($type);
2367
foreach ($this->decorators as $decorator) {
2368
$new_cache = $decorator->decorate($cache);
2369
// prevent infinite recursion in PHP 4
2371
$cache = $new_cache;
2373
$this->caches[$method][$type] = $cache;
2374
return $this->caches[$method][$type];
2378
* Registers a decorator to add to all new cache objects
2381
public function addDecorator($decorator) {
2382
if (is_string($decorator)) {
2383
$class = "HTMLPurifier_DefinitionCache_Decorator_$decorator";
2384
$decorator = new $class;
2386
$this->decorators[$decorator->name] = $decorator;
2395
* Represents a document type, contains information on which modules
2396
* need to be loaded.
2397
* @note This class is inspected by Printer_HTMLDefinition->renderDoctype.
2398
* If structure changes, please update that function.
2400
class HTMLPurifier_Doctype
2403
* Full name of doctype
2408
* List of standard modules (string identifiers or literal objects)
2409
* that this doctype uses
2411
public $modules = array();
2414
* List of modules to use for tidying up code
2416
public $tidyModules = array();
2419
* Is the language derived from XML (i.e. XHTML)?
2424
* List of aliases for this doctype
2426
public $aliases = array();
2429
* Public DTD identifier
2434
* System DTD identifier
2438
public function __construct($name = null, $xml = true, $modules = array(),
2439
$tidyModules = array(), $aliases = array(), $dtd_public = null, $dtd_system = null
2441
$this->name = $name;
2443
$this->modules = $modules;
2444
$this->tidyModules = $tidyModules;
2445
$this->aliases = $aliases;
2446
$this->dtdPublic = $dtd_public;
2447
$this->dtdSystem = $dtd_system;
2454
class HTMLPurifier_DoctypeRegistry
2458
* Hash of doctype names to doctype objects
2460
protected $doctypes;
2463
* Lookup table of aliases to real doctype names
2468
* Registers a doctype to the registry
2469
* @note Accepts a fully-formed doctype object, or the
2470
* parameters for constructing a doctype object
2471
* @param $doctype Name of doctype or literal doctype object
2472
* @param $modules Modules doctype will load
2473
* @param $tidy_modules Modules doctype will load for certain modes
2474
* @param $aliases Alias names for doctype
2475
* @return Editable registered doctype
2477
public function register($doctype, $xml = true, $modules = array(),
2478
$tidy_modules = array(), $aliases = array(), $dtd_public = null, $dtd_system = null
2480
if (!is_array($modules)) $modules = array($modules);
2481
if (!is_array($tidy_modules)) $tidy_modules = array($tidy_modules);
2482
if (!is_array($aliases)) $aliases = array($aliases);
2483
if (!is_object($doctype)) {
2484
$doctype = new HTMLPurifier_Doctype(
2485
$doctype, $xml, $modules, $tidy_modules, $aliases, $dtd_public, $dtd_system
2488
$this->doctypes[$doctype->name] = $doctype;
2489
$name = $doctype->name;
2491
foreach ($doctype->aliases as $alias) {
2492
if (isset($this->doctypes[$alias])) continue;
2493
$this->aliases[$alias] = $name;
2495
// remove old aliases
2496
if (isset($this->aliases[$name])) unset($this->aliases[$name]);
2501
* Retrieves reference to a doctype of a certain name
2502
* @note This function resolves aliases
2503
* @note When possible, use the more fully-featured make()
2504
* @param $doctype Name of doctype
2505
* @return Editable doctype object
2507
public function get($doctype) {
2508
if (isset($this->aliases[$doctype])) $doctype = $this->aliases[$doctype];
2509
if (!isset($this->doctypes[$doctype])) {
2510
trigger_error('Doctype ' . htmlspecialchars($doctype) . ' does not exist', E_USER_ERROR);
2511
$anon = new HTMLPurifier_Doctype($doctype);
2514
return $this->doctypes[$doctype];
2518
* Creates a doctype based on a configuration object,
2519
* will perform initialization on the doctype
2520
* @note Use this function to get a copy of doctype that config
2521
* can hold on to (this is necessary in order to tell
2522
* Generator whether or not the current document is XML
2525
public function make($config) {
2526
return clone $this->get($this->getDoctypeFromConfig($config));
2530
* Retrieves the doctype from the configuration object
2532
public function getDoctypeFromConfig($config) {
2534
$doctype = $config->get('HTML', 'Doctype');
2535
if (!empty($doctype)) return $doctype;
2536
$doctype = $config->get('HTML', 'CustomDoctype');
2537
if (!empty($doctype)) return $doctype;
2538
// backwards-compatibility
2539
if ($config->get('HTML', 'XHTML')) {
2540
$doctype = 'XHTML 1.0';
2542
$doctype = 'HTML 4.01';
2544
if ($config->get('HTML', 'Strict')) {
2545
$doctype .= ' Strict';
2547
$doctype .= ' Transitional';
2558
* Structure that stores an HTML element definition. Used by
2559
* HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
2560
* @note This class is inspected by HTMLPurifier_Printer_HTMLDefinition.
2561
* Please update that class too.
2563
class HTMLPurifier_ElementDef
2567
* Does the definition work by itself, or is it created solely
2568
* for the purpose of merging into another definition?
2570
public $standalone = true;
2573
* Associative array of attribute name to HTMLPurifier_AttrDef
2574
* @note Before being processed by HTMLPurifier_AttrCollections
2575
* when modules are finalized during
2576
* HTMLPurifier_HTMLDefinition->setup(), this array may also
2577
* contain an array at index 0 that indicates which attribute
2578
* collections to load into the full array. It may also
2579
* contain string identifiers in lieu of HTMLPurifier_AttrDef,
2580
* see HTMLPurifier_AttrTypes on how they are expanded during
2581
* HTMLPurifier_HTMLDefinition->setup() processing.
2583
public $attr = array();
2586
* Indexed list of tag's HTMLPurifier_AttrTransform to be done before validation
2588
public $attr_transform_pre = array();
2591
* Indexed list of tag's HTMLPurifier_AttrTransform to be done after validation
2593
public $attr_transform_post = array();
2596
* HTMLPurifier_ChildDef of this tag.
2601
* Abstract string representation of internal ChildDef rules. See
2602
* HTMLPurifier_ContentSets for how this is parsed and then transformed
2603
* into an HTMLPurifier_ChildDef.
2604
* @warning This is a temporary variable that is not available after
2605
* being processed by HTMLDefinition
2607
public $content_model;
2610
* Value of $child->type, used to determine which ChildDef to use,
2611
* used in combination with $content_model.
2612
* @warning This must be lowercase
2613
* @warning This is a temporary variable that is not available after
2614
* being processed by HTMLDefinition
2616
public $content_model_type;
2621
* Does the element have a content model (#PCDATA | Inline)*? This
2622
* is important for chameleon ins and del processing in
2623
* HTMLPurifier_ChildDef_Chameleon. Dynamically set: modules don't
2624
* have to worry about this one.
2626
public $descendants_are_inline = false;
2629
* List of the names of required attributes this element has. Dynamically
2630
* populated by HTMLPurifier_HTMLDefinition::getElement
2632
public $required_attr = array();
2635
* Lookup table of tags excluded from all descendants of this tag.
2636
* @note SGML permits exclusions for all descendants, but this is
2637
* not possible with DTDs or XML Schemas. W3C has elected to
2638
* use complicated compositions of content_models to simulate
2639
* exclusion for children, but we go the simpler, SGML-style
2640
* route of flat-out exclusions, which correctly apply to
2641
* all descendants and not just children. Note that the XHTML
2642
* Modularization Abstract Modules are blithely unaware of such
2645
public $excludes = array();
2648
* Low-level factory constructor for creating new standalone element defs
2650
public static function create($content_model, $content_model_type, $attr) {
2651
$def = new HTMLPurifier_ElementDef();
2652
$def->content_model = $content_model;
2653
$def->content_model_type = $content_model_type;
2659
* Merges the values of another element definition into this one.
2660
* Values from the new element def take precedence if a value is
2663
public function mergeIn($def) {
2665
// later keys takes precedence
2666
foreach($def->attr as $k => $v) {
2668
// merge in the includes
2669
// sorry, no way to override an include
2670
foreach ($v as $v2) {
2671
$this->attr[0][] = $v2;
2676
if (isset($this->attr[$k])) unset($this->attr[$k]);
2679
$this->attr[$k] = $v;
2681
$this->_mergeAssocArray($this->attr_transform_pre, $def->attr_transform_pre);
2682
$this->_mergeAssocArray($this->attr_transform_post, $def->attr_transform_post);
2683
$this->_mergeAssocArray($this->excludes, $def->excludes);
2685
if(!empty($def->content_model)) {
2686
$this->content_model .= ' | ' . $def->content_model;
2687
$this->child = false;
2689
if(!empty($def->content_model_type)) {
2690
$this->content_model_type = $def->content_model_type;
2691
$this->child = false;
2693
if(!is_null($def->child)) $this->child = $def->child;
2694
if($def->descendants_are_inline) $this->descendants_are_inline = $def->descendants_are_inline;
2699
* Merges one array into another, removes values which equal false
2700
* @param $a1 Array by reference that is merged into
2701
* @param $a2 Array that merges into $a1
2703
private function _mergeAssocArray(&$a1, $a2) {
2704
foreach ($a2 as $k => $v) {
2706
if (isset($a1[$k])) unset($a1[$k]);
2720
* A UTF-8 specific character encoder that handles cleaning and transforming.
2721
* @note All functions in this class should be static.
2723
class HTMLPurifier_Encoder
2727
* Constructor throws fatal error if you attempt to instantiate class
2729
private function __construct() {
2730
trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
2734
* Error-handler that mutes errors, alternative to shut-up operator.
2736
private static function muteErrorHandler() {}
2739
* Cleans a UTF-8 string for well-formedness and SGML validity
2741
* It will parse according to UTF-8 and return a valid UTF8 string, with
2742
* non-SGML codepoints excluded.
2744
* @note Just for reference, the non-SGML code points are 0 to 31 and
2745
* 127 to 159, inclusive. However, we allow code points 9, 10
2746
* and 13, which are the tab, line feed and carriage return
2747
* respectively. 128 and above the code points map to multibyte
2748
* UTF-8 representations.
2750
* @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
2751
* hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
2752
* LGPL license. Notes on what changed are inside, but in general,
2753
* the original code transformed UTF-8 text into an array of integer
2754
* Unicode codepoints. Understandably, transforming that back to
2755
* a string would be somewhat expensive, so the function was modded to
2756
* directly operate on the string. However, this discourages code
2757
* reuse, and the logic enumerated here would be useful for any
2758
* function that needs to be able to understand UTF-8 characters.
2759
* As of right now, only smart lossless character encoding converters
2760
* would need that, and I'm probably not going to implement them.
2761
* Once again, PHP 6 should solve all our problems.
2763
public static function cleanUTF8($str, $force_php = false) {
2765
// UTF-8 validity is checked since PHP 4.3.5
2766
// This is an optimization: if the string is already valid UTF-8, no
2767
// need to do PHP stuff. 99% of the time, this will be the case.
2768
// The regexp matches the XML char production, as well as excluding
2769
// non-SGML codepoints U+007F to U+009F
2770
if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
2774
$mState = 0; // cached expected number of octets after the current octet
2775
// until the beginning of the next UTF8 character sequence
2776
$mUcs4 = 0; // cached Unicode character
2777
$mBytes = 1; // cached expected number of octets in the current sequence
2779
// original code involved an $out that was an array of Unicode
2780
// codepoints. Instead of having to convert back into UTF-8, we've
2781
// decided to directly append valid UTF-8 characters onto a string
2782
// $out once they're done. $char accumulates raw bytes, while $mUcs4
2783
// turns into the Unicode code point, so there's some redundancy.
2788
$len = strlen($str);
2789
for($i = 0; $i < $len; $i++) {
2790
$in = ord($str{$i});
2791
$char .= $str[$i]; // append byte to char
2793
// When mState is zero we expect either a US-ASCII character
2794
// or a multi-octet sequence.
2795
if (0 == (0x80 & ($in))) {
2796
// US-ASCII, pass straight through.
2797
if (($in <= 31 || $in == 127) &&
2798
!($in == 9 || $in == 13 || $in == 10) // save \r\t\n
2800
// control characters, remove
2807
} elseif (0xC0 == (0xE0 & ($in))) {
2808
// First octet of 2 octet sequence
2810
$mUcs4 = ($mUcs4 & 0x1F) << 6;
2813
} elseif (0xE0 == (0xF0 & ($in))) {
2814
// First octet of 3 octet sequence
2816
$mUcs4 = ($mUcs4 & 0x0F) << 12;
2819
} elseif (0xF0 == (0xF8 & ($in))) {
2820
// First octet of 4 octet sequence
2822
$mUcs4 = ($mUcs4 & 0x07) << 18;
2825
} elseif (0xF8 == (0xFC & ($in))) {
2826
// First octet of 5 octet sequence.
2828
// This is illegal because the encoded codepoint must be
2830
// (a) not the shortest form or
2831
// (b) outside the Unicode range of 0-0x10FFFF.
2832
// Rather than trying to resynchronize, we will carry on
2833
// until the end of the sequence and let the later error
2834
// handling code catch it.
2836
$mUcs4 = ($mUcs4 & 0x03) << 24;
2839
} elseif (0xFC == (0xFE & ($in))) {
2840
// First octet of 6 octet sequence, see comments for 5
2843
$mUcs4 = ($mUcs4 & 1) << 30;
2847
// Current octet is neither in the US-ASCII range nor a
2848
// legal first octet of a multi-octet sequence.
2855
// When mState is non-zero, we expect a continuation of the
2856
// multi-octet sequence
2857
if (0x80 == (0xC0 & ($in))) {
2858
// Legal continuation.
2859
$shift = ($mState - 1) * 6;
2861
$tmp = ($tmp & 0x0000003F) << $shift;
2864
if (0 == --$mState) {
2865
// End of the multi-octet sequence. mUcs4 now contains
2866
// the final Unicode codepoint to be output
2868
// Check for illegal sequences and codepoints.
2870
// From Unicode 3.1, non-shortest form is illegal
2871
if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
2872
((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
2873
((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
2875
// From Unicode 3.2, surrogate characters = illegal
2876
(($mUcs4 & 0xFFFFF800) == 0xD800) ||
2877
// Codepoints outside the Unicode range are illegal
2881
} elseif (0xFEFF != $mUcs4 && // omit BOM
2882
// check for valid Char unicode codepoints
2887
(0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
2888
// 7F-9F is not strictly prohibited by XML,
2889
// but it is non-SGML, and thus we don't allow it
2890
(0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
2891
(0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
2896
// initialize UTF8 cache (reset)
2903
// ((0xC0 & (*in) != 0x80) && (mState != 0))
2904
// Incomplete multi-octet sequence.
2905
// used to result in complete fail, but we'll reset
2917
* Translates a Unicode codepoint into its corresponding UTF-8 character.
2918
* @note Based on Feyd's function at
2919
* <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
2920
* which is in public domain.
2921
* @note While we're going to do code point parsing anyway, a good
2922
* optimization would be to refuse to translate code points that
2923
* are non-SGML characters. However, this could lead to duplication.
2924
* @note This is very similar to the unichr function in
2925
* maintenance/generate-entity-file.php (although this is superior,
2926
* due to its sanity checks).
2929
// +----------+----------+----------+----------+
2930
// | 33222222 | 22221111 | 111111 | |
2931
// | 10987654 | 32109876 | 54321098 | 76543210 | bit
2932
// +----------+----------+----------+----------+
2933
// | | | | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
2934
// | | | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
2935
// | | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
2936
// | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
2937
// +----------+----------+----------+----------+
2938
// | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
2939
// | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
2940
// +----------+----------+----------+----------+
2942
public static function unichr($code) {
2943
if($code > 1114111 or $code < 0 or
2944
($code >= 55296 and $code <= 57343) ) {
2945
// bits are set outside the "valid" range as defined
2950
$x = $y = $z = $w = 0;
2952
// regular ASCII character
2955
// set up bits for UTF-8
2956
$x = ($code & 63) | 128;
2958
$y = (($code & 2047) >> 6) | 192;
2960
$y = (($code & 4032) >> 6) | 128;
2962
$z = (($code >> 12) & 15) | 224;
2964
$z = (($code >> 12) & 63) | 128;
2965
$w = (($code >> 18) & 7) | 240;
2969
// set up the actual character
2971
if($w) $ret .= chr($w);
2972
if($z) $ret .= chr($z);
2973
if($y) $ret .= chr($y);
2980
* Converts a string to UTF-8 based on configuration.
2982
public static function convertToUTF8($str, $config, $context) {
2983
$encoding = $config->get('Core', 'Encoding');
2984
if ($encoding === 'utf-8') return $str;
2985
static $iconv = null;
2986
if ($iconv === null) $iconv = function_exists('iconv');
2987
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
2988
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
2989
$str = iconv($encoding, 'utf-8//IGNORE', $str);
2990
if ($str === false) {
2991
// $encoding is not a valid encoding
2992
restore_error_handler();
2993
trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
2996
// If the string is bjorked by Shift_JIS or a similar encoding
2997
// that doesn't support all of ASCII, convert the naughty
2998
// characters to their true byte-wise ASCII/UTF-8 equivalents.
2999
$str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
3000
restore_error_handler();
3002
} elseif ($encoding === 'iso-8859-1') {
3003
$str = utf8_encode($str);
3004
restore_error_handler();
3007
trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
3011
* Converts a string from UTF-8 based on configuration.
3012
* @note Currently, this is a lossy conversion, with unexpressable
3013
* characters being omitted.
3015
public static function convertFromUTF8($str, $config, $context) {
3016
$encoding = $config->get('Core', 'Encoding');
3017
if ($encoding === 'utf-8') return $str;
3018
static $iconv = null;
3019
if ($iconv === null) $iconv = function_exists('iconv');
3020
if ($escape = $config->get('Core', 'EscapeNonASCIICharacters')) {
3021
$str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
3023
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
3024
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
3025
// Undo our previous fix in convertToUTF8, otherwise iconv will barf
3026
$ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
3027
if (!$escape && !empty($ascii_fix)) {
3028
$clear_fix = array();
3029
foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
3030
$str = strtr($str, $clear_fix);
3032
$str = strtr($str, array_flip($ascii_fix));
3034
$str = iconv('utf-8', $encoding . '//IGNORE', $str);
3035
restore_error_handler();
3037
} elseif ($encoding === 'iso-8859-1') {
3038
$str = utf8_decode($str);
3039
restore_error_handler();
3042
trigger_error('Encoding not supported', E_USER_ERROR);
3046
* Lossless (character-wise) conversion of HTML to ASCII
3047
* @param $str UTF-8 string to be converted to ASCII
3048
* @returns ASCII encoded string with non-ASCII character entity-ized
3049
* @warning Adapted from MediaWiki, claiming fair use: this is a common
3050
* algorithm. If you disagree with this license fudgery,
3051
* implement it yourself.
3052
* @note Uses decimal numeric entities since they are best supported.
3053
* @note This is a DUMB function: it has no concept of keeping
3054
* character entities that the projected character encoding
3055
* can allow. We could possibly implement a smart version
3056
* but that would require it to also know which Unicode
3057
* codepoints the charset supported (not an easy task).
3058
* @note Sort of with cleanUTF8() but it assumes that $str is
3061
public static function convertToASCIIDumbLossless($str) {
3065
$len = strlen($str);
3066
for( $i = 0; $i < $len; $i++ ) {
3067
$bytevalue = ord( $str[$i] );
3068
if( $bytevalue <= 0x7F ) { //0xxx xxxx
3069
$result .= chr( $bytevalue );
3071
} elseif( $bytevalue <= 0xBF ) { //10xx xxxx
3072
$working = $working << 6;
3073
$working += ($bytevalue & 0x3F);
3075
if( $bytesleft <= 0 ) {
3076
$result .= "&#" . $working . ";";
3078
} elseif( $bytevalue <= 0xDF ) { //110x xxxx
3079
$working = $bytevalue & 0x1F;
3081
} elseif( $bytevalue <= 0xEF ) { //1110 xxxx
3082
$working = $bytevalue & 0x0F;
3084
} else { //1111 0xxx
3085
$working = $bytevalue & 0x07;
3093
* This expensive function tests whether or not a given character
3094
* encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
3095
* fail this test, and require special processing. Variable width
3096
* encodings shouldn't ever fail.
3098
* @param string $encoding Encoding name to test, as per iconv format
3099
* @param bool $bypass Whether or not to bypass the precompiled arrays.
3100
* @return Array of UTF-8 characters to their corresponding ASCII,
3101
* which can be used to "undo" any overzealous iconv action.
3103
public static function testEncodingSupportsASCII($encoding, $bypass = false) {
3104
static $encodings = array();
3106
if (isset($encodings[$encoding])) return $encodings[$encoding];
3107
$lenc = strtolower($encoding);
3110
return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
3112
return array("\xE2\x82\xA9" => '\\');
3114
if (strpos($lenc, 'iso-8859-') === 0) return array();
3117
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
3118
if (iconv('UTF-8', $encoding, 'a') === false) return false;
3119
for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
3121
if (iconv('UTF-8', "$encoding//IGNORE", $c) === '') {
3122
// Reverse engineer: what's the UTF-8 equiv of this byte
3123
// sequence? This assumes that there's no variable width
3124
// encoding that doesn't support ASCII.
3125
$ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
3128
restore_error_handler();
3129
$encodings[$encoding] = $ret;
3140
* Object that provides entity lookup table from entity name to character
3142
class HTMLPurifier_EntityLookup {
3145
* Assoc array of entity name to character represented.
3150
* Sets up the entity lookup table from the serialized file contents.
3151
* @note The serialized contents are versioned, but were generated
3152
* using the maintenance script generate_entity_file.php
3153
* @warning This is not in constructor to help enforce the Singleton
3155
public function setup($file = false) {
3157
$file = HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser';
3159
$this->table = unserialize(file_get_contents($file));
3163
* Retrieves sole instance of the object.
3164
* @param $prototype Optional prototype of custom lookup table to overload with.
3166
public static function instance($prototype = false) {
3167
// no references, since PHP doesn't copy unless modified
3168
static $instance = null;
3170
$instance = $prototype;
3171
} elseif (!$instance) {
3172
$instance = new HTMLPurifier_EntityLookup();
3183
// if want to implement error collecting here, we'll need to use some sort
3184
// of global data (probably trigger_error) because it's impossible to pass
3185
// $config or $context to the callback functions.
3188
* Handles referencing and dereferencing character entities
3190
class HTMLPurifier_EntityParser
3194
* Reference to entity lookup table.
3196
protected $_entity_lookup;
3199
* Callback regex string for parsing entities.
3201
protected $_substituteEntitiesRegex =
3202
'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
3203
// 1. hex 2. dec 3. string (XML style)
3207
* Decimal to parsed string conversion table for special entities.
3209
protected $_special_dec2str =
3219
* Stripped entity names to decimal conversion table for special entities.
3221
protected $_special_ent2dec =
3230
* Substitutes non-special entities with their parsed equivalents. Since
3231
* running this whenever you have parsed character is t3h 5uck, we run
3232
* it before everything else.
3234
* @param $string String to have non-special entities parsed.
3235
* @returns Parsed string.
3237
public function substituteNonSpecialEntities($string) {
3238
// it will try to detect missing semicolons, but don't rely on it
3239
return preg_replace_callback(
3240
$this->_substituteEntitiesRegex,
3241
array($this, 'nonSpecialEntityCallback'),
3247
* Callback function for substituteNonSpecialEntities() that does the work.
3249
* @param $matches PCRE matches array, with 0 the entire match, and
3250
* either index 1, 2 or 3 set with a hex value, dec value,
3251
* or string (respectively).
3252
* @returns Replacement string.
3255
protected function nonSpecialEntityCallback($matches) {
3256
// replaces all but big five
3257
$entity = $matches[0];
3258
$is_num = (@$matches[0][1] === '#');
3260
$is_hex = (@$entity[2] === 'x');
3261
$code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
3263
// abort for special characters
3264
if (isset($this->_special_dec2str[$code])) return $entity;
3266
return HTMLPurifier_Encoder::unichr($code);
3268
if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
3269
if (!$this->_entity_lookup) {
3270
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
3272
if (isset($this->_entity_lookup->table[$matches[3]])) {
3273
return $this->_entity_lookup->table[$matches[3]];
3281
* Substitutes only special entities with their parsed equivalents.
3283
* @notice We try to avoid calling this function because otherwise, it
3284
* would have to be called a lot (for every parsed section).
3286
* @param $string String to have special entities parsed.
3287
* @returns Parsed string.
3289
/**
 * Replaces only the special entities with their parsed equivalents.
 *
 * Uses the same entity pattern as substituteNonSpecialEntities() but
 * dispatches every match to specialEntityCallback().
 *
 * @param $string String whose special entities should be parsed.
 * @return String with the substitutions applied.
 */
public function substituteSpecialEntities($string) {
    $callback = array($this, 'specialEntityCallback');
    return preg_replace_callback(
        $this->_substituteEntitiesRegex,
        $callback,
        $string
    );
}
3297
* Callback function for substituteSpecialEntities() that does the work.
3299
* This callback has same syntax as nonSpecialEntityCallback().
3301
* @param $matches PCRE-style matches array, with 0 the entire match, and
3302
* either index 1, 2 or 3 set with a hex value, dec value,
3303
* or string (respectively).
3304
* @return Replacement string.
3306
/**
 * Callback for substituteSpecialEntities(): resolves a single entity.
 *
 * Both numeric and named references are first reduced to a decimal code
 * point and then translated through $_special_dec2str. Previously the named
 * branch returned the value stored in $_special_ent2dec directly, i.e. the
 * decimal code point itself, so the number rather than the character was
 * spliced into the output.
 *
 * @param $matches PCRE-style matches array: index 0 is the whole entity, and
 *                 one of index 1 (hex digits), 2 (decimal digits) or 3
 *                 (entity name) carries the captured value.
 * @return Replacement string; the untouched entity when it is not special.
 */
protected function specialEntityCallback($matches) {
    $entity = $matches[0];
    $is_num = (@$matches[0][1] === '#');
    if ($is_num) {
        $is_hex = (@$entity[2] === 'x');
        $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
    } else {
        // names that are not in the special table are passed through
        if (!isset($this->_special_ent2dec[$matches[3]])) {
            return $entity;
        }
        // name -> decimal code point; the character lookup happens below
        $int = $this->_special_ent2dec[$matches[3]];
    }
    return isset($this->_special_dec2str[$int]) ?
        $this->_special_dec2str[$int] :
        $entity;
}
3328
* Error collection class that enables HTML Purifier to report HTML
3329
* problems back to the user
3331
class HTMLPurifier_ErrorCollector
3335
* Identifiers for the returned error array. These are purposely numeric
3336
* so list() can be used.
3344
protected $_current;
3345
protected $_stacks = array(array());
3347
protected $generator;
3350
protected $lines = array();
3352
/**
 * Binds the collector to a context and initialises the error stack.
 *
 * @param $context Context object; must be able to supply 'Locale'.
 */
public function __construct($context) {
    $this->context = $context;
    $this->locale =& $context->get('Locale');

    // the error list and the write cursor both alias the first stack frame
    $this->errors   =& $this->_stacks[0];
    $this->_current =& $this->_stacks[0];
}
3360
* Sends an error message to the collector for later use
3361
* @param $severity int Error severity, PHP error style (don't use E_USER_)
3362
* @param $msg string Error message text
3363
* @param $subst1 string First substitution for $msg
3364
* @param $subst2 string ...
3366
// Records one error. The message is resolved through the locale, stored in
// the flat list behind $this->_current, and also registered on an
// HTMLPurifier_ErrorStruct filed in $this->lines under the error's position.
// Arguments beyond $severity and $msg are picked up via func_get_args().
public function send($severity, $msg) {
3369
if (func_num_args() > 2) {
3370
$args = func_get_args();
3375
// Position comes from the current token when one is set, otherwise from
// the context's CurrentLine / CurrentCol entries.
$token = $this->context->get('CurrentToken', true);
3376
$line = $token ? $token->line : $this->context->get('CurrentLine', true);
3377
$col = $token ? $token->col : $this->context->get('CurrentCol', true);
3378
$attr = $this->context->get('CurrentAttr', true);
3380
// perform special substitutions, also add custom parameters
3382
if (!is_null($token)) {
3383
$args['CurrentToken'] = $token;
3385
if (!is_null($attr)) {
3386
$subst['$CurrentAttr.Name'] = $attr;
3387
if (isset($token->attr[$attr])) $subst['$CurrentAttr.Value'] = $token->attr[$attr];
3391
$msg = $this->locale->getMessage($msg);
3393
$msg = $this->locale->formatMessage($msg, $args);
3396
// $subst placeholders are replaced literally after locale formatting
if (!empty($subst)) $msg = strtr($msg, $subst);
3398
// (numerically indexed)
3400
self::LINENO => $line,
3401
self::SEVERITY => $severity,
3402
self::MESSAGE => $msg,
3403
self::CHILDREN => array()
3405
$this->_current[] = $error;
3408
// NEW CODE BELOW ...
3411
// Top-level errors are either:
3412
// TOKEN type, if $value is set appropriately, or
3413
// "syntax" type, if $value is null
3414
$new_struct = new HTMLPurifier_ErrorStruct();
3415
$new_struct->type = HTMLPurifier_ErrorStruct::TOKEN;
3416
// the struct keeps its own copy of the token
if ($token) $new_struct->value = clone $token;
3417
// Errors with an integer line and column are filed under
// $this->lines[$line][$col]; an existing struct at that position is reused.
if (is_int($line) && is_int($col)) {
3418
if (isset($this->lines[$line][$col])) {
3419
$struct = $this->lines[$line][$col];
3421
$struct = $this->lines[$line][$col] = $new_struct;
3423
// These ksorts may present a performance problem
3424
ksort($this->lines[$line], SORT_NUMERIC);
3426
// Key -1 is the slot used when no integer position is available.
if (isset($this->lines[-1])) {
3427
$struct = $this->lines[-1];
3429
$struct = $this->lines[-1] = $new_struct;
3432
ksort($this->lines, SORT_NUMERIC);
3434
// Now, check if we need to operate on a lower structure
3435
if (!empty($attr)) {
3436
$struct = $struct->getChild(HTMLPurifier_ErrorStruct::ATTR, $attr);
3437
if (!$struct->value) {
3438
$struct->value = array($attr, 'PUT VALUE HERE');
3441
// NOTE(review): $cssprop is not assigned in any visible line of this
// method, so this branch looks unreachable for now - confirm.
if (!empty($cssprop)) {
3442
$struct = $struct->getChild(HTMLPurifier_ErrorStruct::CSSPROP, $cssprop);
3443
if (!$struct->value) {
3444
// if we tokenize CSS this might be a little more difficult to do
3445
$struct->value = array($cssprop, 'PUT VALUE HERE');
3449
// Ok, structs are all setup, now time to register the error
3450
$struct->addError($severity, $msg);
3454
* Retrieves raw error data for custom formatter to use
3455
* @return List of arrays in format of array(line of error,
3456
* error severity, error message,
3457
* recursive sub-errors array)
3459
/**
 * Hands back the collected errors untouched, for use by a custom formatter.
 *
 * @return The error list held in $this->errors.
 */
public function getRaw() {
    $raw = $this->errors;
    return $raw;
}
3464
* Default HTML formatting implementation for error messages
3465
* @param $config Configuration array, vital for HTML output nature
3466
* @param $errors Errors array to display; used for recursion.
3468
/**
 * Default HTML rendering of the collected errors.
 *
 * @param $config Configuration passed on to the HTMLPurifier_Generator that
 *                is used for escaping.
 * @param $errors Errors array used only to decide whether anything was
 *                reported; defaults to the collector's own list.
 * @return HTML string: a <ul> of rendered errors, or a paragraph carrying
 *         the locale's "no errors" message.
 */
public function getHTMLFormatted($config, $errors = null) {
    $ret = array();

    $this->generator = new HTMLPurifier_Generator($config, $this->context);
    if ($errors === null) {
        $errors = $this->errors;
    }

    // 'At line' message needs to be removed

    // generation code for new structure goes here. It needs to be recursive.
    foreach ($this->lines as $line => $col_array) {
        // key -1 is rendered separately, after all positioned errors
        if ($line == -1) {
            continue;
        }
        foreach ($col_array as $col => $struct) {
            $this->_renderStruct($ret, $struct, $line, $col);
        }
    }
    if (isset($this->lines[-1])) {
        $this->_renderStruct($ret, $this->lines[-1]);
    }

    if (!empty($errors)) {
        return '<ul><li>' . implode('</li><li>', $ret) . '</li></ul>';
    }
    return '<p>' . $this->locale->getMessage('ErrorCollector: No errors') . '</p>';
}
3495
/**
 * Renders one error struct and all of its descendants into HTML snippets.
 *
 * The tree is walked iteratively with an explicit stack; a parallel stack
 * carries the chain of ancestor structs for each pending entry.
 *
 * @param $ret    Array (by reference) that receives one HTML string per error.
 * @param $struct Root struct to render.
 * @param $line   Line number for the location label, or null.
 * @param $col    Column number for the location label, or null.
 */
private function _renderStruct(&$ret, $struct, $line = null, $col = null) {
    $stack = array($struct);
    $context_stack = array(array());
    $has_position = !is_null($line) && !is_null($col);

    while ($current = array_pop($stack)) {
        $context = array_pop($context_stack);

        foreach ($current->errors as $entry) {
            list($severity, $msg) = $entry;
            // W3C uses an icon to indicate the severity of the error.
            $label = $this->locale->getErrorName($severity);

            $string  = '';
            $string .= '<div>';
            $string .= "<span class=\"error e$severity\"><strong>$label</strong></span> ";
            $string .= $has_position
                ? "<em class=\"location\">Line $line, Column $col: </em> "
                : '<em class="location">End of Document: </em> ';
            $string .= '<strong class="description">' . $this->generator->escape($msg) . '</strong> ';
            $string .= '</div>';
            // Here, have a marker for the character on the column appropriate.
            // Be sure to clip extremely long lines.
            $ret[] = $string;
        }

        foreach ($current->children as $type => $array) {
            $context[] = $current;
            // reversed so that children come off the stack in original order
            $stack = array_merge($stack, array_reverse($array, true));
            // one context entry per child pushed above
            $pending = count($array);
            while ($pending > 0) {
                $context_stack[] = $context;
                $pending--;
            }
        }
    }
}
3537
* Records errors for particular segments of an HTML document such as tokens,
3538
* attributes or CSS properties. They can contain error structs (which apply
3539
* to components of what they represent), but their main purpose is to hold
3540
* errors applying to whatever struct is being used.
3542
class HTMLPurifier_ErrorStruct
3546
* Possible values for $children first-key. Note that top-level structures
3547
* are automatically token-level.
3554
* Type of this struct.
3559
* Value of the struct we are recording errors for. There are various
3561
* - TOKEN: Instance of HTMLPurifier_Token
3562
* - ATTR: array('attr-name', 'value')
3563
* - CSSPROP: array('prop-name', 'value')
3568
* Errors registered for this structure.
3570
public $errors = array();
3573
* Child ErrorStructs that are from this structure. For example, a TOKEN
3574
* ErrorStruct would contain ATTR ErrorStructs. This is a multi-dimensional
3575
* array in structure: [TYPE]['identifier']
3577
public $children = array();
3579
/**
 * Returns the child struct of the given type and identifier, creating and
 * registering it first when it does not exist yet.
 *
 * @param $type First-level key in $children; also stored as the child's type.
 * @param $id   Identifier of the child within that type.
 * @return HTMLPurifier_ErrorStruct for the requested slot.
 */
public function getChild($type, $id) {
    if (isset($this->children[$type][$id])) {
        return $this->children[$type][$id];
    }
    $child = new HTMLPurifier_ErrorStruct();
    $child->type = $type;
    $this->children[$type][$id] = $child;
    return $child;
}
3587
/**
 * Appends an error to this struct.
 *
 * @param $severity Severity value stored as the first element of the entry.
 * @param $message  Message text stored as the second element of the entry.
 */
public function addError($severity, $message) {
    $entry = array($severity, $message);
    $this->errors[] = $entry;
}
3596
* Global exception class for HTML Purifier; any exceptions we throw
3599
// Library-wide exception type; derives directly from PHP's built-in Exception.
class HTMLPurifier_Exception extends Exception
3608
* Represents a pre or post processing filter on HTML Purifier's output
3610
* Sometimes, a little ad-hoc fixing of HTML has to be done before
3611
* it gets sent through HTML Purifier: you can use filters to achieve
3612
* this effect. For instance, YouTube videos can be preserved using
3613
* this manner. You could have used a decorator for this task, but
3614
* PHP's support for them is not terribly robust, so we're going
3615
* to just loop through the filters.
3617
* Filters should be exited first in, last out. If there are three filters,
3618
* named 1, 2 and 3, the order of execution should go 1->preFilter,
3619
* 2->preFilter, 3->preFilter, purify, 3->postFilter, 2->postFilter,
3622
* @note Methods are not declared abstract as it is perfectly legitimate
3623
* for an implementation not to want anything to happen on a step
3626
class HTMLPurifier_Filter
3630
* Name of the filter for identification purposes
3635
* Pre-processor function, handles HTML before HTML Purifier
3637
// Hook that receives the HTML, the configuration and the context.
// NOTE(review): the body is not visible here; presumably the base
// implementation hands $html back unchanged - confirm.
public function preFilter($html, $config, $context) {
3642
* Post-processor function, handles HTML after HTML Purifier
3644
// Hook with the same parameters as preFilter().
// NOTE(review): the body is not visible here; presumably the base
// implementation hands $html back unchanged - confirm.
public function postFilter($html, $config, $context) {
3654
* Generates HTML from tokens.
3655
* @todo Refactor interface so that configuration/context is determined
3656
* upon instantiation, no need for messy generateFromTokens() calls
3657
* @todo Make some of the more internal functions protected, and have
3658
* unit tests work around that
3660
class HTMLPurifier_Generator
3664
* Whether or not generator should produce XML output
3666
private $_xhtml = true;
3669
* :HACK: Whether or not generator should comment the insides of <script> tags
3671
private $_scriptFix = false;
3674
* Cache of HTMLDefinition during HTML output to determine whether or
3675
* not attributes should be minimized.
3680
* Cache of %Output.SortAttr
3685
* Configuration for the generator
3690
* @param $config Instance of HTMLPurifier_Config
3691
* @param $context Instance of HTMLPurifier_Context
3693
/**
 * Caches everything the generator needs from the configuration.
 *
 * @param $config  Instance of HTMLPurifier_Config
 * @param $context Instance of HTMLPurifier_Context (not read by the
 *                 constructor itself)
 */
public function __construct($config, $context) {
    $this->config     = $config;
    $this->_scriptFix = $config->get('Output', 'CommentScriptContents');
    $this->_sortAttr  = $config->get('Output', 'SortAttr');

    $definition   = $config->getHTMLDefinition();
    $this->_def   = $definition;
    // the doctype's xml flag decides between XHTML and HTML output
    $this->_xhtml = $definition->doctype->xml;
}
3702
* Generates HTML from an array of tokens.
3703
* @param $tokens Array of HTMLPurifier_Token
3704
* @note Configuration is supplied to the constructor; this method only takes $tokens.
3705
* @return Generated HTML
3707
// Serialises a token array to HTML by appending the output of
// generateFromToken() for each token, with special handling for script
// contents, optional Tidy formatting and newline normalisation.
public function generateFromTokens($tokens) {
3708
// an empty token list produces an empty string
if (!$tokens) return '';
3712
for ($i = 0, $size = count($tokens); $i < $size; $i++) {
3713
// Applies only when %Output.CommentScriptContents is on and the token two
// positions ahead is an end tag.
if ($this->_scriptFix && $tokens[$i]->name === 'script'
3714
&& $i + 2 < $size && $tokens[$i+2] instanceof HTMLPurifier_Token_End) {
3715
// script special case
3716
// the contents of the script block must be ONE token
3717
// for this to work.
3718
// both calls advance $i, so the token after the script contents is the
// one emitted by the generic call below
$html .= $this->generateFromToken($tokens[$i++]);
3719
$html .= $this->generateScriptFromToken($tokens[$i++]);
3721
$html .= $this->generateFromToken($tokens[$i]);
3725
// Tidy formatting needs both the tidy extension and %Output.TidyFormat
if (extension_loaded('tidy') && $this->config->get('Output', 'TidyFormat')) {
3727
$tidy->parseString($html, array(
3729
'output-xhtml' => $this->_xhtml,
3730
'show-body-only' => true,
3731
'indent-spaces' => 2,
3734
$tidy->cleanRepair();
3735
$html = (string) $tidy; // explicit cast necessary
3738
// Normalize newlines to system defined value
3739
$nl = $this->config->get('Output', 'Newline');
3740
// a null setting means "use the platform default"
if ($nl === null) $nl = PHP_EOL;
3741
if ($nl !== "\n") $html = str_replace("\n", $nl, $html);
3746
* Generates HTML from a single token.
3747
* @param $token HTMLPurifier_Token object.
3748
* @return Generated HTML
3750
/**
 * Generates HTML from a single token.
 *
 * @param $token HTMLPurifier_Token object.
 * @return Generated HTML; an empty string for unsupported input (a warning
 *         is raised when the argument is not a token at all).
 */
public function generateFromToken($token) {
    if (!$token instanceof HTMLPurifier_Token) {
        trigger_error('Cannot generate HTML from non-HTMLPurifier_Token object', E_USER_WARNING);
        return '';
    }

    if ($token instanceof HTMLPurifier_Token_Start) {
        $attr = $this->generateAttributes($token->attr, $token->name);
        return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
    }

    if ($token instanceof HTMLPurifier_Token_End) {
        return '</' . $token->name . '>';
    }

    if ($token instanceof HTMLPurifier_Token_Empty) {
        $attr = $this->generateAttributes($token->attr, $token->name);
        $closer = $this->_xhtml ? ' /' : ''; // <br /> v. <br>
        return '<' . $token->name . ($attr ? ' ' : '') . $attr . $closer . '>';
    }

    if ($token instanceof HTMLPurifier_Token_Text) {
        // text nodes do not need their quotes escaped
        return $this->escape($token->data, ENT_NOQUOTES);
    }

    if ($token instanceof HTMLPurifier_Token_Comment) {
        return '<!--' . $token->data . '-->';
    }

    return '';
}
3780
* Special case processor for the contents of script tags
3781
* @warning This runs into problems if there's already a literal
3782
* --> somewhere inside the script contents.
3784
/**
 * Special case processor for the contents of script tags: wraps the text in
 * a comment/CDATA guard.
 *
 * @param $token Token holding the script contents; anything other than a
 *               text token is delegated to generateFromToken().
 * @return Generated HTML.
 * @warning This runs into problems if there's already a literal
 *          --> somewhere inside the script contents.
 */
public function generateScriptFromToken($token) {
    if ($token instanceof HTMLPurifier_Token_Text) {
        // Thanks <http://lachy.id.au/log/2005/05/script-comments>
        // drop a trailing "//" so it does not double up with the closing guard
        $script = preg_replace('#//\s*$#', '', $token->data);
        $script = trim($script);
        return '<!--//--><![CDATA[//><!--' . "\n" . $script . "\n" . '//--><!]]>';
    }
    return $this->generateFromToken($token);
}
3792
* Generates attribute declarations from attribute array.
3793
* @note This does not include the leading or trailing space.
3794
* @param $assoc_array_of_attributes Attribute array
3795
* @param $element Name of element attributes are for, used to check
3796
* attribute minimization.
3797
* @return Generate HTML fragment for insertion.
3799
/**
 * Generates attribute declarations from an attribute array.
 *
 * @note The result has no leading or trailing space.
 * @param $assoc_array_of_attributes Attribute array (name => value).
 * @param $element Name of the element the attributes belong to; used to
 *                 check attribute minimization in non-XHTML output.
 * @return Generated HTML fragment for insertion.
 */
public function generateAttributes($assoc_array_of_attributes, $element = false) {
    $html = '';
    if ($this->_sortAttr) {
        ksort($assoc_array_of_attributes);
    }
    $plain_html = !$this->_xhtml;

    foreach ($assoc_array_of_attributes as $name => $value) {
        if ($plain_html) {
            // Remove namespaced attributes
            if (strpos($name, ':') !== false) {
                continue;
            }
            // Check if we should minimize the attribute: val="val" -> val
            $minimize = $element
                && !empty($this->_def->info[$element]->attr[$name]->minimized);
            if ($minimize) {
                $html .= $name . ' ';
                continue;
            }
        }
        $html .= $name . '="' . $this->escape($value) . '" ';
    }

    // every piece was appended with a trailing space; strip the last one
    return rtrim($html);
}
3818
* Escapes raw text data.
3819
* @todo This really ought to be protected, but until we have a facility
3820
* for properly generating HTML here w/o using tokens, it stays
3822
* @param $string String data to escape for HTML.
3823
* @param $quote Quoting style, like htmlspecialchars. ENT_NOQUOTES is
3824
* permissible for non-attribute output.
3825
* @return String escaped data.
3827
/**
 * Escapes raw text data for HTML output, treating the input as UTF-8.
 *
 * @param $string String data to escape for HTML.
 * @param $quote  Quoting style as understood by htmlspecialchars();
 *                ENT_NOQUOTES is permissible for non-attribute output.
 * @return String escaped data.
 */
public function escape($string, $quote = ENT_COMPAT) {
    $escaped = htmlspecialchars($string, $quote, 'UTF-8');
    return $escaped;
}
3837
* Definition of the purified HTML that describes allowed children,
3838
* attributes, and many other things.
3842
* All member variables that are prefixed with info
3843
* (including the main $info array) are used by HTML Purifier internals
3844
* and should not be directly edited when customizing the HTMLDefinition.
3845
* They can usually be set via configuration directives or custom
3848
* On the other hand, member variables without the info prefix are used
3849
* internally by the HTMLDefinition and MUST NOT be used by other HTML
3850
* Purifier internals. Many of them, however, are public, and may be
3851
* edited by userspace code to tweak the behavior of HTMLDefinition.
3853
* @note This class is inspected by Printer_HTMLDefinition; please
3854
* update that class if things here change.
3856
* @warning Directives that change this object's structure must be in
3857
* the HTML or Attr namespace!
3859
class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
3862
// FULLY-PUBLIC VARIABLES ---------------------------------------------
3865
* Associative array of element names to HTMLPurifier_ElementDef
3867
public $info = array();
3870
* Associative array of global attribute name to attribute definition.
3872
public $info_global_attr = array();
3875
* String name of parent element HTML will be going into.
3877
public $info_parent = 'div';
3880
* Definition for parent element, allows parent element to be a
3881
* tag that's not allowed inside the HTML fragment.
3883
public $info_parent_def;
3886
* String name of element used to wrap inline elements in block context
3887
* @note This is rarely used except for BLOCKQUOTEs in strict mode
3889
public $info_block_wrapper = 'p';
3892
* Associative array of deprecated tag name to HTMLPurifier_TagTransform
3894
public $info_tag_transform = array();
3897
* Indexed list of HTMLPurifier_AttrTransform to be performed before validation.
3899
public $info_attr_transform_pre = array();
3902
* Indexed list of HTMLPurifier_AttrTransform to be performed after validation.
3904
public $info_attr_transform_post = array();
3907
* Nested lookup array of content set name (Block, Inline) to
3908
* element name to whether or not it belongs in that content set.
3910
public $info_content_sets = array();
3913
* Indexed list of HTMLPurifier_Injector to be used.
3915
public $info_injector = array();
3924
// RAW CUSTOMIZATION STUFF --------------------------------------------
3927
* Adds a custom attribute to a pre-existing element
3928
* @note This is strictly convenience, and does not have a corresponding
3929
* method in HTMLPurifier_HTMLModule
3930
* @param $element_name String element name to add attribute to
3931
* @param $attr_name String name of attribute
3932
* @param $def Attribute definition, can be string or object, see
3933
* HTMLPurifier_AttrTypes for details
3935
/**
 * Adds a custom attribute to an element through the anonymous module.
 *
 * @param $element_name String element name to add attribute to
 * @param $attr_name    String name of attribute
 * @param $def          Attribute definition, string or object
 */
public function addAttribute($element_name, $attr_name, $def) {
    $module = $this->getAnonymousModule();

    // reuse the module's entry for the element, or start a blank one
    $element = isset($module->info[$element_name])
        ? $module->info[$element_name]
        : $module->addBlankElement($element_name);

    $element->attr[$attr_name] = $def;
}
3946
* Adds a custom element to your HTML definition
3947
* @note See HTMLPurifier_HTMLModule::addElement for detailed
3948
* parameter and return value descriptions.
3950
/**
 * Adds a custom element to the HTML definition through the anonymous module.
 *
 * The last two parameters are now optional, matching the defaults of
 * HTMLPurifier_HTMLModule::addElement() that this method forwards to;
 * existing five-argument calls behave exactly as before.
 *
 * @param $element_name     Name of element to add
 * @param $type             Content set to register the element to, or false
 * @param $contents         Allowed children, see HTMLPurifier_HTMLModule::addElement
 * @param $attr_collections Attribute collections to register to (optional)
 * @param $attributes       Unique attributes the element defines (optional)
 * @return Created element definition object
 */
public function addElement($element_name, $type, $contents, $attr_collections = array(), $attributes = array()) {
    $module = $this->getAnonymousModule();
    // assume that if the user is calling this, the element
    // is safe. This may not be a good idea
    $element = $module->addElement($element_name, $type, $contents, $attr_collections, $attributes);
    return $element;
}
3959
* Adds a blank element to your HTML definition, for overriding
3961
* @note See HTMLPurifier_HTMLModule::addBlankElement for detailed
3962
* parameter and return value descriptions.
3964
/**
 * Adds a blank element to the HTML definition through the anonymous module.
 *
 * @param $element_name Name of element to create
 * @return Element definition returned by the module
 */
public function addBlankElement($element_name) {
    $element = $this->getAnonymousModule()->addBlankElement($element_name);
    return $element;
}
3971
* Retrieves a reference to the anonymous module, so you can
3972
* bust out advanced features without having to make your own
3975
/**
 * Returns the anonymous module, creating it on first use.
 *
 * @return HTMLPurifier_HTMLModule named 'Anonymous'
 */
public function getAnonymousModule() {
    if ($this->_anonModule) {
        return $this->_anonModule;
    }
    $module = new HTMLPurifier_HTMLModule();
    $module->name = 'Anonymous';
    $this->_anonModule = $module;
    return $this->_anonModule;
}
3983
private $_anonModule;
3986
// PUBLIC BUT INTERNAL VARIABLES --------------------------------------
3988
public $type = 'HTML';
3989
public $manager; /**< Instance of HTMLPurifier_HTMLModuleManager */
3992
* Performs low-cost, preliminary initialization.
3994
/**
 * Performs low-cost, preliminary initialization: only the module manager
 * is created here.
 */
public function __construct() {
    $manager = new HTMLPurifier_HTMLModuleManager();
    $this->manager = $manager;
}
3998
/**
 * Builds the definition: pulls data out of the modules, applies the
 * configuration, then discards what is no longer needed.
 *
 * @param $config Instance of HTMLPurifier_Config
 */
protected function doSetup($config) {
    $this->processModules($config);
    $this->setupConfigStuff($config);
    unset($this->manager);

    // cleanup some of the element definitions
    foreach (array_keys($this->info) as $name) {
        unset(
            $this->info[$name]->content_model,
            $this->info[$name]->content_model_type
        );
    }
}
4011
* Extract out the information from the manager
4013
/**
 * Extracts the information held by the module manager into this definition.
 *
 * @param $config Instance of HTMLPurifier_Config, forwarded to the manager
 */
protected function processModules($config) {

    if ($this->_anonModule) {
        // for user specific changes
        // this is late-loaded so we don't have to deal with PHP4
        // reference wonky-ness
        $this->manager->addModule($this->_anonModule);
        unset($this->_anonModule);
    }

    $this->manager->setup($config);
    $this->doctype = $this->manager->doctype;

    // These four tables are merged the same way: a value of false removes
    // the key, anything else overwrites it.
    $merged_tables = array(
        'info_tag_transform',
        'info_attr_transform_pre',
        'info_attr_transform_post',
        'info_injector',
    );
    foreach ($this->manager->modules as $module) {
        foreach ($merged_tables as $table) {
            foreach ($module->$table as $k => $v) {
                if ($v === false) {
                    unset($this->{$table}[$k]);
                } else {
                    $this->{$table}[$k] = $v;
                }
            }
        }
    }

    $this->info = $this->manager->getElements();
    $this->info_content_sets = $this->manager->contentSets->lookup;
}
4051
* Sets up stuff based on config. We need a better way of doing this.
4053
// Applies the HTML.* configuration directives to the definition: block
// wrapper, parent element, allowed elements/attributes, forbidden
// elements/attributes, and finally the injector list.
protected function setupConfigStuff($config) {
4055
// the block wrapper must be a member of the 'Block' content set
$block_wrapper = $config->get('HTML', 'BlockWrapper');
4056
if (isset($this->info_content_sets['Block'][$block_wrapper])) {
4057
$this->info_block_wrapper = $block_wrapper;
4059
trigger_error('Cannot use non-block element as block wrapper',
4063
// the parent's definition is looked up through the module manager
$parent = $config->get('HTML', 'Parent');
4064
$def = $this->manager->getElement($parent, true);
4066
$this->info_parent = $parent;
4067
$this->info_parent_def = $def;
4069
trigger_error('Cannot use unrecognized element as parent',
4071
$this->info_parent_def = $this->manager->getElement($this->info_parent, true);
4074
// support template text
4075
$support = "(for information on implementing this, see the ".
4078
// setup allowed elements -----------------------------------------
4080
$allowed_elements = $config->get('HTML', 'AllowedElements');
4081
$allowed_attributes = $config->get('HTML', 'AllowedAttributes'); // retrieve early
4083
// HTML.Allowed is consulted only when neither explicit list is an array
if (!is_array($allowed_elements) && !is_array($allowed_attributes)) {
4084
$allowed = $config->get('HTML', 'Allowed');
4085
if (is_string($allowed)) {
4086
list($allowed_elements, $allowed_attributes) = $this->parseTinyMCEAllowedList($allowed);
4090
if (is_array($allowed_elements)) {
4091
// Drop every known element that is not allowed. Names that were matched are
// removed from the list, so what remains afterwards are unknown names.
foreach ($this->info as $name => $d) {
4092
if(!isset($allowed_elements[$name])) unset($this->info[$name]);
4093
unset($allowed_elements[$name]);
4096
foreach ($allowed_elements as $element => $d) {
4097
$element = htmlspecialchars($element); // PHP doesn't escape errors, be careful!
4098
trigger_error("Element '$element' is not supported $support", E_USER_WARNING);
4102
// setup allowed attributes ---------------------------------------
4104
// the copy is consumed as keys are matched; leftovers are reported below
$allowed_attributes_mutable = $allowed_attributes; // by copy!
4105
if (is_array($allowed_attributes)) {
4107
// This actually doesn't do anything, since we went away from
4108
// global attributes. It's possible that userland code uses
4109
// it, but HTMLModuleManager doesn't!
4110
foreach ($this->info_global_attr as $attr => $x) {
4111
// accepted spellings for a global attribute
$keys = array($attr, "*@$attr", "*.$attr");
4113
foreach ($keys as $key) {
4114
if ($delete && isset($allowed_attributes[$key])) {
4117
if (isset($allowed_attributes_mutable[$key])) {
4118
unset($allowed_attributes_mutable[$key]);
4121
if ($delete) unset($this->info_global_attr[$attr]);
4124
foreach ($this->info as $tag => $info) {
4125
foreach ($info->attr as $attr => $x) {
4126
// accepted spellings for an element attribute, both '@' and '.' forms
$keys = array("$tag@$attr", $attr, "*@$attr", "$tag.$attr", "*.$attr");
4128
foreach ($keys as $key) {
4129
if ($delete && isset($allowed_attributes[$key])) {
4132
if (isset($allowed_attributes_mutable[$key])) {
4133
unset($allowed_attributes_mutable[$key]);
4136
if ($delete) unset($this->info[$tag]->attr[$attr]);
4140
// whatever is left in the mutable copy matched nothing
foreach ($allowed_attributes_mutable as $elattr => $d) {
4141
// split "element.attr" / "element@attr" into at most two parts
$bits = preg_split('/[.@]/', $elattr, 2);
4145
if ($bits[0] !== '*') {
4146
$element = htmlspecialchars($bits[0]);
4147
$attribute = htmlspecialchars($bits[1]);
4148
if (!isset($this->info[$element])) {
4149
// NOTE(review): no error level is passed here, unlike the neighbouring calls
trigger_error("Cannot allow attribute '$attribute' if element '$element' is not allowed/supported $support");
4151
trigger_error("Attribute '$attribute' in element '$element' not supported $support",
4156
// otherwise fall through
4158
$attribute = htmlspecialchars($bits[0]);
4159
trigger_error("Global attribute '$attribute' is not ".
4160
"supported in any elements $support",
4168
// setup forbidden elements ---------------------------------------
4170
$forbidden_elements = $config->get('HTML', 'ForbiddenElements');
4171
$forbidden_attributes = $config->get('HTML', 'ForbiddenAttributes');
4173
foreach ($this->info as $tag => $info) {
4174
if (isset($forbidden_elements[$tag])) {
4175
unset($this->info[$tag]);
4178
foreach ($info->attr as $attr => $x) {
4180
isset($forbidden_attributes["$tag@$attr"]) ||
4181
isset($forbidden_attributes["*@$attr"]) ||
4182
isset($forbidden_attributes[$attr])
4184
unset($this->info[$tag]->attr[$attr]);
4186
} // this segment might get removed eventually
4187
elseif (isset($forbidden_attributes["$tag.$attr"])) {
4188
// $tag.$attr are not user supplied, so no worries!
4189
trigger_error("Error with $tag.$attr: tag.attr syntax not supported for HTML.ForbiddenAttributes; use tag@attr instead", E_USER_WARNING);
4193
// warn about the unsupported "*.attr" spelling
foreach ($forbidden_attributes as $key => $v) {
4194
if (strlen($key) < 2) continue;
4195
if ($key[0] != '*') continue;
4196
if ($key[1] == '.') {
4197
trigger_error("Error with $key: *.attr syntax not supported for HTML.ForbiddenAttributes; use attr instead", E_USER_WARNING);
4201
// setup injectors -----------------------------------------------------
4202
foreach ($this->info_injector as $i => $injector) {
4203
// any checkNeeded() result other than false removes the injector
if ($injector->checkNeeded($config) !== false) {
4204
// remove injector that does not have its required
4205
// elements/attributes present, and is thus not needed.
4206
unset($this->info_injector[$i]);
4212
* Parses a TinyMCE-flavored Allowed Elements and Attributes list into
4213
* separate lists for processing. Format is element[attr1|attr2],element2...
4214
* @warning Although it's largely drawn from TinyMCE's implementation,
4215
* it is different, and you'll probably have to modify your lists
4216
* @param $list String list to parse
4217
* @return array($allowed_elements, $allowed_attributes)
4218
* @todo Give this its own class, probably static interface
4220
/**
 * Parses a TinyMCE-flavored allowed elements/attributes list into two
 * lookup arrays. Format is element[attr1|attr2],element2...
 *
 * @param $list String list to parse; entries are separated by commas or
 *              line breaks, spaces and tabs are ignored.
 * @return array($allowed_elements, $allowed_attributes), both keyed lookups
 *         with true as value; attribute keys have the form "element.attr".
 */
public function parseTinyMCEAllowedList($list) {

    $list = str_replace(array(' ', "\t"), '', $list);

    $elements = array();
    $attributes = array();

    foreach (preg_split('/(,|[\n\r]+)/', $list) as $chunk) {
        if (empty($chunk)) {
            continue;
        }

        // remove TinyMCE element control characters
        if (!strpos($chunk, '[')) {
            // no attribute list attached to this entry
            $element = $chunk;
            $attr = false;
        } else {
            $pieces = explode('[', $chunk);
            $element = $pieces[0];
            $attr = $pieces[1];
        }

        // "*" only carries attributes, it is not an element itself
        if ($element !== '*') {
            $elements[$element] = true;
        }
        if (!$attr) {
            continue;
        }

        $attr = substr($attr, 0, strlen($attr) - 1); // remove trailing ]
        foreach (explode('|', $attr) as $key) {
            $attributes["$element.$key"] = true;
        }
    }

    return array($elements, $attributes);
}
4258
* Represents an XHTML 1.1 module, with information on elements, tags
4260
* @note Even though this is technically XHTML 1.1, it is also used for
4261
* regular HTML parsing. We are using modulization as a convenient
4262
* way to represent the internals of HTMLDefinition, and our
4263
* implementation is by no means conforming and does not directly
4264
* use the normative DTDs or XML schemas.
4265
* @note The public variables in a module should almost directly
4266
* correspond to the variables in HTMLPurifier_HTMLDefinition.
4267
* However, the prefix info carries no special meaning in these
4268
* objects (include it anyway if that's the correspondence though).
4269
* @todo Consider making some member functions protected
4272
class HTMLPurifier_HTMLModule
4275
// -- Overloadable ----------------------------------------------------
4278
* Short unique string identifier of the module
4283
* Informally, a list of elements this module changes. Not used in
4284
* any significant way.
4286
public $elements = array();
4289
* Associative array of element names to element definitions.
4290
* Some definitions may be incomplete, to be merged in later
4291
* with the full definition.
4293
public $info = array();
4296
* Associative array of content set names to content set additions.
4297
* This is commonly used to, say, add an A element to the Inline
4298
* content set. This corresponds to an internal variable $content_sets
4299
* and NOT info_content_sets member variable of HTMLDefinition.
4301
public $content_sets = array();
4304
* Associative array of attribute collection names to attribute
4305
* collection additions. More rarely used for adding attributes to
4306
* the global collections. Example is the StyleAttribute module adding
4307
* the style attribute to the Core. Corresponds to HTMLDefinition's
4308
* attr_collections->info, since the object's data is only info,
4309
* with extra behavior associated with it.
4311
public $attr_collections = array();
4314
* Associative array of deprecated tag name to HTMLPurifier_TagTransform
4316
public $info_tag_transform = array();
4319
* List of HTMLPurifier_AttrTransform to be performed before validation.
4321
public $info_attr_transform_pre = array();
4324
* List of HTMLPurifier_AttrTransform to be performed after validation.
4326
public $info_attr_transform_post = array();
4329
* List of HTMLPurifier_Injector to be performed during well-formedness fixing.
4330
* An injector will only be invoked if all of its prerequisites are met;
4331
* if an injector fails setup, there will be no error; it will simply be
4332
* silently disabled.
4334
public $info_injector = array();
4337
* Boolean flag that indicates whether or not getChildDef is implemented.
4338
* For optimization reasons: may save a call to a function. Be sure
4339
* to set it if you do implement getChildDef(), otherwise it will have
4342
public $defines_child_def = false;
4345
* Boolean flag whether or not this module is safe. If it is not safe, all
4346
* of its members are unsafe. Modules are safe by default (this might be
4347
* slightly dangerous, but it doesn't make much sense to force HTML Purifier,
4348
* which is based off of safe HTML, to explicitly say, "This is safe," even
4349
* though there are modules which are "unsafe")
4351
* @note Previously, safety could be applied at an element level granularity.
4352
* We've removed this ability, so in order to add "unsafe" elements
4353
* or attributes, a dedicated module with this property set to false
4356
public $safe = true;
4359
* Retrieves a proper HTMLPurifier_ChildDef subclass based on
4360
* content_model and content_model_type member variables of
4361
* the HTMLPurifier_ElementDef class. There is a similar function
4362
* in HTMLPurifier_HTMLDefinition.
4363
* @param $def HTMLPurifier_ElementDef instance
4364
* @return HTMLPurifier_ChildDef subclass
4366
/**
 * Hook for modules that supply their own HTMLPurifier_ChildDef subclass.
 * The base implementation supplies none.
 *
 * @param $def HTMLPurifier_ElementDef instance
 * @return false in this base implementation
 */
public function getChildDef($def) {
    return false;
}
4368
// -- Convenience -----------------------------------------------------
4371
* Convenience function that sets up a new element
4372
* @param $element Name of element to add
4373
* @param $type What content set should element be registered to?
4374
* Set as false to skip this step.
4375
* @param $contents Allowed children in form of:
4376
* "$content_model_type: $content_model"
4377
* @param $attr_includes What attribute collections to register to
4379
* @param $attr What unique attributes does the element define?
4380
* @note See ElementDef for in-depth descriptions of these parameters.
4381
* @return Created element definition object, so you
4382
* can set advanced parameters
4384
/**
 * Convenience function that sets up a new element in this module.
 *
 * @param $element       Name of element to add
 * @param $type          Content set to register the element to; a false
 *                       value skips registration.
 * @param $contents      Allowed children as "$content_model_type: $content_model",
 *                       or a non-string value that is used as the child
 *                       definition directly.
 * @param $attr_includes Attribute collections to merge into $attr
 * @param $attr          Unique attributes the element defines
 * @return Created element definition object, so you can set advanced
 *         parameters
 */
public function addElement($element, $type, $contents, $attr_includes = array(), $attr = array()) {
    $this->elements[] = $element;

    // parse content_model
    list($content_model_type, $content_model) = $this->parseContents($contents);

    // merge in attribute inclusions
    $this->mergeInAttrIncludes($attr, $attr_includes);

    // add element to content sets
    if ($type) {
        $this->addElementToContentSet($element, $type);
    }

    // create element
    $definition = HTMLPurifier_ElementDef::create(
        $content_model, $content_model_type, $attr
    );
    $this->info[$element] = $definition;

    // literal object $contents means direct child manipulation
    if (!is_string($contents)) {
        $this->info[$element]->child = $contents;
    }

    return $this->info[$element];
}
4402
* Convenience function that creates a totally blank, non-standalone
4404
* @param $element Name of element to create
4405
* @return Created element
4407
/**
 * Convenience function that creates a totally blank, non-standalone
 * element definition.
 * @param $element Name of element to create
 * @return The definition stored in $this->info[$element]; when one already
 *         existed it is returned untouched after an error is triggered
 */
public function addBlankElement($element) {
    if (!isset($this->info[$element])) {
        $this->elements[] = $element;
        $this->info[$element] = new HTMLPurifier_ElementDef();
        $this->info[$element]->standalone = false;
    } else {
        // refuse to overwrite: report and fall through to return the existing one
        trigger_error("Definition for $element already exists in module, cannot redefine");
    }
    return $this->info[$element];
}
4419
* Convenience function that registers an element to a content set
4420
* @param Element to register
4421
* @param Name content set (warning: case sensitive, usually upper-case
4424
/**
 * Convenience function that registers an element to a content set.
 * Content sets are kept as ' | '-separated strings of element names.
 * @param $element Element to register
 * @param $type Name of the content set (used as an array key, so it is
 *              case sensitive)
 */
public function addElementToContentSet($element, $type) {
    // first member starts the string; later members are joined with ' | '
    if (!isset($this->content_sets[$type])) $this->content_sets[$type] = '';
    else $this->content_sets[$type] .= ' | ';
    $this->content_sets[$type] .= $element;
}
4431
* Convenience function that transforms single-string contents
4432
* into separate content model and content model type
4433
* @param $contents Allowed children in form of:
4434
* "$content_model_type: $content_model"
4435
* @note If contents is an object, an array of two nulls will be
4436
* returned, and the caller needs to take the original $contents
4437
* and use it directly.
4439
/**
 * Convenience function that transforms single-string contents
 * into separate content model type and content model.
 * @param $contents Allowed children in form of:
 *              "$content_model_type: $content_model"
 *              or one of the shorthand forms handled below
 * @return array($content_model_type, $content_model); array(null, null)
 *         when $contents is not a string, in which case the caller is
 *         expected to use the original $contents directly
 */
public function parseContents($contents) {
    if (!is_string($contents)) return array(null, null); // defer
    switch ($contents) {
        // check for shorthand content model forms
        case 'Empty':
            return array('empty', '');
        case 'Inline':
            return array('optional', 'Inline | #PCDATA');
        case 'Flow':
            return array('optional', 'Flow | #PCDATA');
    }
    // long form: split on the colon, then normalise both halves
    list($content_model_type, $content_model) = explode(':', $contents);
    $content_model_type = strtolower(trim($content_model_type));
    $content_model = trim($content_model);
    return array($content_model_type, $content_model);
}
4457
* Convenience function that merges a list of attribute includes into
4458
* an attribute array.
4459
* @param $attr Reference to attr array to modify
4460
* @param $attr_includes Array of includes / string include to merge in
4462
/**
 * Convenience function that merges a list of attribute includes into
 * an attribute array. The includes are stored under index 0 of $attr.
 * @param $attr Reference to attr array to modify
 * @param $attr_includes Array of includes, a single include, or an empty
 *              value (treated as no includes)
 */
public function mergeInAttrIncludes(&$attr, $attr_includes) {
    if (!is_array($attr_includes)) {
        // normalise a scalar argument into list form
        if (empty($attr_includes)) $attr_includes = array();
        else $attr_includes = array($attr_includes);
    }
    $attr[0] = $attr_includes;
}
4471
* Convenience function that generates a lookup table with boolean
4473
* @param $list List of values to turn into a lookup
4474
* @note You can also pass an arbitrary number of arguments in
4475
* place of the regular argument
4476
* @return Lookup array equivalent of list
4478
/**
 * Convenience function that generates a lookup table with boolean
 * true values.
 * @param $list List of values to turn into a lookup
 * @note When the first argument is a string, all arguments passed to the
 *       call are used as the list instead
 * @return Lookup array: each non-null value becomes a key mapped to true
 */
public function makeLookup($list) {
    if (is_string($list)) $list = func_get_args();
    $ret = array();
    foreach ($list as $value) {
        if (is_null($value)) continue;
        $ret[$value] = true;
    }
    return $ret;
}
4489
* Lazy load construction of the module after determining whether
4490
* or not it's needed, and also when a finalized configuration object
4492
* @param $config Instance of HTMLPurifier_Config
4494
/**
 * Lazy-load hook that receives the configuration object.
 * The base implementation intentionally does nothing.
 * @param $config Instance of HTMLPurifier_Config
 */
public function setup($config)
{
}
4501
class HTMLPurifier_HTMLModuleManager
4505
* Instance of HTMLPurifier_DoctypeRegistry
4510
* Instance of current doctype
4515
* Instance of HTMLPurifier_AttrTypes
4520
* Active instances of modules for the specified doctype are
4521
* indexed, by name, in this array.
4523
public $modules = array();
4526
* Array of recognized HTMLPurifier_Module instances, indexed by
4527
* module's class name. This array is usually lazy loaded, but a
4528
* user can overload a module by pre-emptively registering it.
4530
public $registeredModules = array();
4533
* List of extra modules that were added by the user using addModule().
4534
* These get unconditionally merged into the current doctype, whatever
4537
public $userModules = array();
4540
* Associative array of element name to list of modules that have
4541
* definitions for the element; this array is dynamically filled.
4543
public $elementLookup = array();
4545
/** List of prefixes we should use for registering small names */
4546
public $prefixes = array('HTMLPurifier_HTMLModule_');
4548
public $contentSets; /**< Instance of HTMLPurifier_ContentSets */
4549
public $attrCollections; /**< Instance of HTMLPurifier_AttrCollections */
4551
/** If set to true, unsafe elements and attributes will be allowed */
4552
public $trusted = false;
4554
/**
 * Creates the editable internal registries and registers the built-in
 * doctypes together with their module and tidy-module lists.
 */
public function __construct() {

    // editable internal objects
    $this->attrTypes = new HTMLPurifier_AttrTypes();
    $this->doctypes  = new HTMLPurifier_DoctypeRegistry();

    // setup basic modules
    $common = array(
        'CommonAttributes', 'Text', 'Hypertext', 'List',
        'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
        'StyleAttribute',
        // Unsafe:
        'Scripting', 'Object', 'Forms',
        // Sorta legacy, but present in strict:
        'Name',
    );
    $transitional = array('Legacy', 'Target');
    $xml = array('XMLCommonAttributes');
    $non_xml = array('NonXMLCommonAttributes');

    // setup basic doctypes
    $this->doctypes->register(
        'HTML 4.01 Transitional', false,
        array_merge($common, $transitional, $non_xml),
        array('Tidy_Transitional', 'Tidy_Proprietary'),
        array(),
        '-//W3C//DTD HTML 4.01 Transitional//EN',
        'http://www.w3.org/TR/html4/loose.dtd'
    );

    $this->doctypes->register(
        'HTML 4.01 Strict', false,
        array_merge($common, $non_xml),
        array('Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
        array(),
        '-//W3C//DTD HTML 4.01//EN',
        'http://www.w3.org/TR/html4/strict.dtd'
    );

    $this->doctypes->register(
        'XHTML 1.0 Transitional', true,
        array_merge($common, $transitional, $xml, $non_xml),
        array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Name'),
        array(),
        '-//W3C//DTD XHTML 1.0 Transitional//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
    );

    $this->doctypes->register(
        'XHTML 1.0 Strict', true,
        array_merge($common, $xml, $non_xml),
        array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
        array(),
        '-//W3C//DTD XHTML 1.0 Strict//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
    );

    $this->doctypes->register(
        'XHTML 1.1', true,
        array_merge($common, $xml, array('Ruby')),
        array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict', 'Tidy_Name'), // Tidy_XHTML1_1
        array(),
        '-//W3C//DTD XHTML 1.1//EN',
        'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
    );

}
4623
* Registers a module to the recognized module list, useful for
4624
* overloading pre-existing modules.
4625
* @param $module Mixed: string module name, with or without
4626
* HTMLPurifier_HTMLModule prefix, or instance of
4627
* subclass of HTMLPurifier_HTMLModule.
4628
* @param $overload Boolean whether or not to overload previous modules.
4629
* If this is not set, and you do overload a module,
4630
* HTML Purifier will complain with a warning.
4631
* @note This function will not call autoload, you must instantiate
4632
* (and thus invoke) autoload outside the method.
4633
* @note If a string is passed as a module name, different variants
4634
* will be tested in this order:
4635
* - Check for HTMLPurifier_HTMLModule_$name
4636
* - Check all prefixes with $name in order they were added
4637
* - Check for literal object name
4638
* - Throw fatal error
4639
* If your object name collides with an internal class, specify
4640
* your module manually. All modules must have been included
4641
* externally: registerModule will not perform inclusions for you!
4643
/**
 * Registers a module to the recognized module list, useful for
 * overloading pre-existing modules.
 * @param $module Mixed: string module name, or a module instance.
 *              A string is resolved by trying every entry of
 *              $this->prefixes in order, then the literal class name;
 *              the resolved class is instantiated without arguments.
 * @param $overload Boolean whether or not overloading an already
 *              registered module is intended. When false, overloading
 *              still happens but an E_USER_WARNING is raised.
 * @note Raises E_USER_ERROR and registers nothing when no class can be
 *       found for a string name; raises an error and registers nothing
 *       when the module instance has an empty name.
 */
public function registerModule($module, $overload = false) {
    if (is_string($module)) {
        // attempt to load the module
        $original_module = $module;
        $ok = false;
        foreach ($this->prefixes as $prefix) {
            $module = $prefix . $original_module;
            if (class_exists($module)) {
                $ok = true;
                break;
            }
        }
        if (!$ok) {
            // no prefixed variant exists: fall back to the literal name
            $module = $original_module;
            if (!class_exists($module)) {
                trigger_error($original_module . ' module does not exist',
                    E_USER_ERROR);
                return;
            }
        }
        $module = new $module();
    }
    if (empty($module->name)) {
        trigger_error('Module instance of ' . get_class($module) . ' must have name');
        return;
    }
    if (!$overload && isset($this->registeredModules[$module->name])) {
        trigger_error('Overloading ' . $module->name . ' without explicit overload parameter', E_USER_WARNING);
    }
    $this->registeredModules[$module->name] = $module;
}
4676
* Adds a module to the current doctype by first registering it,
4677
* and then tacking it on to the active doctype
4679
/**
 * Adds a module to the current doctype by first registering it,
 * and then recording its name in the list of user modules.
 * @param $module String module name or module instance; for an instance
 *              its name property is what gets recorded
 */
public function addModule($module) {
    $this->registerModule($module);
    if (is_object($module)) $module = $module->name;
    $this->userModules[] = $module;
}
4686
* Adds a class prefix that registerModule() will use to resolve a
4687
* string name to a concrete class
4689
/**
 * Adds a class prefix that registerModule() will use to resolve a
 * string name to a concrete class. Prefixes are tried in the order
 * they were added.
 * @param $prefix Class name prefix to append to the prefix list
 */
public function addPrefix($prefix) {
    $this->prefixes[] = $prefix;
}
4694
* Performs processing on modules, after being called you may
4695
* use getElement() and getElements()
4696
* @param $config Instance of HTMLPurifier_Config
4698
/**
 * Performs processing on modules; after being called you may
 * use getElement() and getElements().
 *
 * Steps: resolve the doctype, filter its module list against the
 * HTML.AllowedModules / HTML.CoreModules directives, append optional and
 * user modules, activate and set up every module (tidy modules included),
 * instantiate injectors, build the element lookup table, and finally
 * create the content set and attribute collection helpers.
 * @param $config Instance of HTMLPurifier_Config
 */
public function setup($config) {

    $this->trusted = $config->get('HTML', 'Trusted');

    // generate
    $this->doctype = $this->doctypes->make($config);
    $modules = $this->doctype->modules;

    // take out the default modules that aren't allowed
    $lookup = $config->get('HTML', 'AllowedModules');
    $special_cases = $config->get('HTML', 'CoreModules');

    if (is_array($lookup)) {
        foreach ($modules as $k => $m) {
            if (isset($special_cases[$m])) continue;
            if (!isset($lookup[$m])) unset($modules[$k]);
        }
    }

    // add proprietary module (this gets special treatment because
    // it is completely removed from doctypes, etc.)
    if ($config->get('HTML', 'Proprietary')) {
        $modules[] = 'Proprietary';
    }

    // add SafeObject/Safeembed modules
    if ($config->get('HTML', 'SafeObject')) {
        $modules[] = 'SafeObject';
    }
    if ($config->get('HTML', 'SafeEmbed')) {
        $modules[] = 'SafeEmbed';
    }

    // merge in custom modules
    $modules = array_merge($modules, $this->userModules);

    foreach ($modules as $module) {
        $this->processModule($module);
        $this->modules[$module]->setup($config);
    }

    foreach ($this->doctype->tidyModules as $module) {
        $this->processModule($module);
        $this->modules[$module]->setup($config);
    }

    // prepare any injectors: replace name strings by instances and
    // re-key each module's list by injector name
    foreach ($this->modules as $module) {
        $n = array();
        foreach ($module->info_injector as $i => $injector) {
            if (!is_object($injector)) {
                $class = "HTMLPurifier_Injector_$injector";
                $injector = new $class;
            }
            $n[$injector->name] = $injector;
        }
        $module->info_injector = $n;
    }

    // setup lookup table based on all valid modules
    foreach ($this->modules as $module) {
        foreach ($module->info as $name => $def) {
            if (!isset($this->elementLookup[$name])) {
                $this->elementLookup[$name] = array();
            }
            $this->elementLookup[$name][] = $module->name;
        }
    }

    // note the different choice
    $this->contentSets = new HTMLPurifier_ContentSets(
        // content set assembly deals with all possible modules,
        // not just ones deemed to be "safe"
        $this->modules
    );
    $this->attrCollections = new HTMLPurifier_AttrCollections(
        $this->attrTypes,
        // there is no way to directly disable a global attribute,
        // but using AllowedAttributes or simply not including
        // the module in your custom doctype should be sufficient
        $this->modules
    );
}
4783
* Takes a module and adds it to the active module collection,
4784
* registering it if necessary.
4786
/**
 * Takes a module and adds it to the active module collection,
 * registering it if necessary.
 * @param $module String module name, or a module instance. An instance is
 *              always passed to registerModule() and is then tracked
 *              under its name property.
 * @note The previous implementation used the instance itself as an array
 *       offset, which PHP rejects, so passing an object never activated
 *       the module. String arguments behave exactly as before.
 */
public function processModule($module) {
    if (is_object($module)) {
        $this->registerModule($module);
        // from here on the module is addressed by name, like string input
        $module = $module->name;
    } elseif (!isset($this->registeredModules[$module])) {
        $this->registerModule($module);
    }
    $this->modules[$module] = $this->registeredModules[$module];
}
4794
* Retrieves merged element definitions.
4795
* @return Array of HTMLPurifier_ElementDef
4797
/**
 * Retrieves merged element definitions for every element declared by
 * an active module.
 * @return Array of element name => merged definition as returned by
 *         getElement(); elements for which getElement() returned false
 *         are left out
 */
public function getElements() {

    $elements = array();
    foreach ($this->modules as $module) {
        // unsafe modules contribute nothing unless trusted mode is on
        if (!$this->trusted && !$module->safe) continue;
        foreach ($module->info as $name => $v) {
            if (isset($elements[$name])) continue;
            $elements[$name] = $this->getElement($name);
        }
    }

    // remove dud elements, this happens when an element that
    // appeared to be safe actually wasn't
    foreach ($elements as $n => $v) {
        if ($v === false) unset($elements[$n]);
    }

    return $elements;

}
4819
* Retrieves a single merged element definition
4820
* @param $name Name of element
4821
* @param $trusted Boolean trusted overriding parameter: set to true
4822
* if you want the full version of an element
4823
* @return Merged HTMLPurifier_ElementDef
4824
* @note You may notice that modules are getting iterated over twice (once
4825
* in getElements() and once here). This
4828
/**
 * Retrieves a single merged element definition.
 * @param $name Name of element
 * @param $trusted Boolean trusted overriding parameter: set to true
 *              if you want the full version of an element; null (the
 *              default) uses $this->trusted
 * @return Merged element definition, or false when the element is not in
 *         the lookup table or no usable definition could be assembled
 */
public function getElement($name, $trusted = null) {

    if (!isset($this->elementLookup[$name])) {
        return false;
    }

    // setup global state variables
    $def = false;
    if ($trusted === null) $trusted = $this->trusted;

    // iterate through each module that has registered itself to this
    // element
    foreach ($this->elementLookup[$name] as $module_name) {

        $module = $this->modules[$module_name];

        // refuse to create/merge from a module that is deemed unsafe--
        // pretend the module doesn't exist--when trusted mode is not on.
        if (!$trusted && !$module->safe) {
            continue;
        }

        // clone is used because, ideally speaking, the original
        // definition should not be modified. Usually, this will
        // make no difference, but for consistency's sake
        $new_def = clone $module->info[$name];

        if (!$def && $new_def->standalone) {
            $def = $new_def;
        } elseif ($def) {
            // This will occur even if $new_def is standalone. In practice,
            // this will usually result in a full replacement.
            $def->mergeIn($new_def);
        } else {
            // :TODO:
            // non-standalone definitions that don't have a standalone
            // to merge into could be deferred to the end
            continue;
        }

        // attribute value expansions
        $this->attrCollections->performInclusions($def->attr);
        $this->attrCollections->expandIdentifiers($def->attr, $this->attrTypes);

        // descendants_are_inline, for ChildDef_Chameleon
        if (is_string($def->content_model) &&
            strpos($def->content_model, 'Inline') !== false) {
            if ($name != 'del' && $name != 'ins') {
                // this is for you, ins/del
                $def->descendants_are_inline = true;
            }
        }

        $this->contentSets->generateChildDef($def, $module);
    }

    // This can occur if there is a blank definition, but no base to
    // mix it in with
    if (!$def) return false;

    // add information on required attributes
    foreach ($def->attr as $attr_name => $attr_def) {
        if ($attr_def->required) {
            $def->required_attr[] = $attr_name;
        }
    }

    return $def;

}
4906
* Component of HTMLPurifier_AttrContext that accumulates IDs to prevent dupes
4907
* @note In Slashdot-speak, dupe means duplicate.
4908
* @note The default constructor does not accept $config or $context objects:
4909
* you must use the static build() factory method to perform initialization.
4911
class HTMLPurifier_IDAccumulator
{

    /**
     * Lookup table of IDs we've accumulated (ID => true).
     */
    public $ids = array();

    /**
     * Builds an IDAccumulator, also initializing the default blacklist
     * from the Attr.IDBlacklist directive.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context (not used here)
     * @return Fully initialized HTMLPurifier_IDAccumulator
     */
    public static function build($config, $context) {
        $id_accumulator = new HTMLPurifier_IDAccumulator();
        $id_accumulator->load($config->get('Attr', 'IDBlacklist'));
        return $id_accumulator;
    }

    /**
     * Add an ID to the lookup table.
     * @param $id ID to be added.
     * @return Bool status, true if success, false if there's a dupe
     */
    public function add($id) {
        if (isset($this->ids[$id])) return false;
        // the assignment expression evaluates to true, which is returned
        return $this->ids[$id] = true;
    }

    /**
     * Load a list of IDs into the lookup table
     * @param $array_of_ids Array of IDs to load
     * @note This function doesn't care about duplicates
     */
    public function load($array_of_ids) {
        foreach ($array_of_ids as $id) {
            $this->ids[$id] = true;
        }
    }

}
4959
* Injects tokens into the document while parsing for well-formedness.
4960
* This enables "formatter-like" functionality such as auto-paragraphing,
4961
* smiley-ification and linkification to take place.
4963
* A note on how handlers create changes; this is done by assigning a new
4964
* value to the $token reference. These values can take a variety of forms and
4965
* are best described HTMLPurifier_Strategy_MakeWellFormed->processToken()
4968
* @todo Allow injectors to request a re-run on their output. This
4969
* would help if an operation is recursive.
4971
abstract class HTMLPurifier_Injector
4975
* Advisory name of injector, this is for friendly error messages
4980
* Instance of HTMLPurifier_HTMLDefinition
4982
protected $htmlDefinition;
4985
* Reference to CurrentNesting variable in Context. This is an array
4986
* list of tokens that we are currently "inside"
4988
protected $currentNesting;
4991
* Reference to InputTokens variable in Context. This is an array
4992
* list of the input tokens that are being processed.
4994
protected $inputTokens;
4997
* Reference to InputIndex variable in Context. This is an integer
4998
* array index for $this->inputTokens that indicates what token
4999
* is currently being processed.
5001
protected $inputIndex;
5004
* Array of elements and attributes this injector creates and therefore
5005
* need to be allowed by the definition. Takes form of
5006
* array('element' => array('attr', 'attr2'), 'element2')
5008
public $needed = array();
5011
* Index of inputTokens to rewind to.
5013
protected $rewind = false;
5016
* Rewind to a spot to re-perform processing. This is useful if you
5017
* deleted a node, and now need to see if this change affected any
5018
* earlier nodes. Rewinding does not affect other injectors, and can
5019
* result in infinite loops if not used carefully.
5020
* @warning HTML Purifier will prevent you from fast-forwarding with this
5023
/**
 * Records an index of inputTokens to rewind to, so processing can be
 * re-performed from that spot. The value is only stored here; it is
 * handed out (and cleared) by getRewind().
 * @param $index Index of inputTokens to rewind to
 */
public function rewind($index) {
    $this->rewind = $index;
}
5028
* Retrieves rewind, and then unsets it.
5030
/**
 * Retrieves rewind, and then unsets it.
 * @return The pending rewind index, or false when none was requested
 */
public function getRewind() {
    $r = $this->rewind;
    $this->rewind = false;
    return $r;
}
5037
* Prepares the injector by giving it the config and context objects:
5038
* this allows references to important variables to be made within
5039
* the injector. This function also checks if the HTML environment
5040
* will work with the Injector (see checkNeeded()).
5041
* @param $config Instance of HTMLPurifier_Config
5042
* @param $context Instance of HTMLPurifier_Context
5043
* @return Boolean false if success, string of missing needed element/attribute if failure
5045
/**
 * Prepares the injector by giving it the config and context objects:
 * the HTML definition is cached and references to the CurrentNesting,
 * InputTokens and InputIndex context variables are bound. Binding only
 * happens when checkNeeded() reports success.
 * @param $config Instance of HTMLPurifier_Config
 * @param $context Instance of HTMLPurifier_Context
 * @return Boolean false if success, string of missing needed element/attribute if failure
 */
public function prepare($config, $context) {
    $this->htmlDefinition = $config->getHTMLDefinition();
    // Even though this might fail, some unit tests ignore this and
    // still test checkNeeded, so be careful. Maybe get rid of that
    // dependency.
    $result = $this->checkNeeded($config);
    if ($result !== false) return $result;
    $this->currentNesting =& $context->get('CurrentNesting');
    $this->inputTokens    =& $context->get('InputTokens');
    $this->inputIndex     =& $context->get('InputIndex');
    return false;
}
5059
* This function checks if the HTML environment
5060
* will work with the Injector: if p tags are not allowed, the
5061
* Auto-Paragraphing injector should not be enabled.
5062
* @param $config Instance of HTMLPurifier_Config
5063
* @param $context Instance of HTMLPurifier_Context
5064
* @return Boolean false if success, string of missing needed element/attribute if failure
5066
/**
 * Checks whether every element and attribute listed in $this->needed is
 * present in the HTML definition.
 * @param $config Instance of HTMLPurifier_Config
 * @return Boolean false if success, string of missing needed element/attribute if failure
 *         (attributes are reported as "element.attribute")
 */
public function checkNeeded($config) {
    $def = $config->getHTMLDefinition();
    foreach ($this->needed as $element => $attributes) {
        // integer key: the entry is a bare element name with no attributes
        if (is_int($element)) $element = $attributes;
        if (!isset($def->info[$element])) return $element;
        if (!is_array($attributes)) continue;
        foreach ($attributes as $name) {
            if (!isset($def->info[$element]->attr[$name])) return "$element.$name";
        }
    }
    return false;
}
5080
* Tests if the context node allows a certain element
5081
* @param $name Name of element to test for
5082
* @return True if element is allowed, false if it is not
5084
/**
 * Tests if the context node allows a certain element
 * @param $name Name of element to test for
 * @return True if element is allowed, false if it is not
 */
public function allowsElement($name) {
    if (!empty($this->currentNesting)) {
        // peek at the innermost open element: pop it, then push it back
        $parent_token = array_pop($this->currentNesting);
        $this->currentNesting[] = $parent_token;
        $parent = $this->htmlDefinition->info[$parent_token->name];
    } else {
        // nothing is open: fall back to the definition's parent element
        $parent = $this->htmlDefinition->info_parent_def;
    }
    if (!isset($parent->child->elements[$name]) || isset($parent->excludes[$name])) {
        return false;
    }
    return true;
}
5099
* Iterator function, which starts with the next token and continues until
5100
* you reach the end of the input tokens.
5101
* @warning Please prevent previous references from interfering with this
5102
* functions by setting $i = null beforehand!
5103
* @param &$i Current integer index variable for inputTokens
5104
* @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
5106
/**
 * Iterator function, which starts with the next token and continues until
 * you reach the end of the input tokens.
 * @warning Please prevent previous references from interfering with this
 *          function by setting $i = null beforehand!
 * @param &$i Current integer index variable for inputTokens
 * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
 * @return True when $current was set, false when no token exists at the new index
 */
protected function forward(&$i, &$current) {
    if ($i === null) $i = $this->inputIndex + 1;
    else $i++;
    if (!isset($this->inputTokens[$i])) return false;
    $current = $this->inputTokens[$i];
    return true;
}
5115
* Similar to forward(), but accepts a third parameter $nesting (which
5116
* should be initialized at 0) and stops when we hit the end tag
5117
* for the node $this->inputIndex starts in.
5119
/**
 * Like forward(), but additionally tracks nesting depth and stops when
 * an end token is met while the depth is zero or less.
 * @param &$i Current integer index variable for inputTokens
 * @param &$current Current token variable
 * @param &$nesting Nesting depth counter; null is treated as 0
 * @return True while iteration may continue, false when the input ran out
 *         or the closing end token was reached
 */
protected function forwardUntilEndToken(&$i, &$current, &$nesting) {
    $result = $this->forward($i, $current);
    if (!$result) return false;
    if ($nesting === null) $nesting = 0;
    if ($current instanceof HTMLPurifier_Token_Start) $nesting++;
    elseif ($current instanceof HTMLPurifier_Token_End) {
        if ($nesting <= 0) return false;
        $nesting--;
    }
    return true;
}
5132
* Iterator function, starts with the previous token and continues until
5133
* you reach the beginning of input tokens.
5134
* @warning Please prevent previous references from interfering with this
5135
* functions by setting $i = null beforehand!
5136
* @param &$i Current integer index variable for inputTokens
5137
* @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
5139
/**
 * Iterator function, starts with the previous token and continues until
 * you reach the beginning of input tokens.
 * @warning Please prevent previous references from interfering with this
 *          function by setting $i = null beforehand!
 * @param &$i Current integer index variable for inputTokens
 * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
 * @return True when $current was set, false once the index drops below zero
 */
protected function backward(&$i, &$current) {
    if ($i === null) $i = $this->inputIndex - 1;
    else $i--;
    if ($i < 0) return false;
    $current = $this->inputTokens[$i];
    return true;
}
5148
* Initializes the iterator at the current position. Use in a do {} while;
5149
* loop to force the _forward and _backward functions to start at the
5151
* @warning Please prevent previous references from interfering with this
5152
* functions by setting $i = null beforehand!
5153
* @param &$i Current integer index variable for inputTokens
5154
* @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
5156
/**
 * Initializes the iterator at the current position. Use in a do {} while;
 * loop to make forward() / backward() start from the current token.
 * @warning Please prevent previous references from interfering with this
 *          function by setting $i = null beforehand!
 * @param &$i Current integer index variable for inputTokens; set to
 *            $this->inputIndex when null
 * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
 */
protected function current(&$i, &$current) {
    if ($i === null) $i = $this->inputIndex;
    $current = $this->inputTokens[$i];
}
5162
* Handler that is called when a text token is processed
5164
/**
 * Handler that is called when a text token is processed.
 * No-op in this base class.
 * @param &$token Token being processed, passed by reference
 */
public function handleText(&$token)
{
}
5167
* Handler that is called when a start or empty token is processed
5169
/**
 * Handler that is called when a start or empty token is processed.
 * No-op in this base class.
 * @param &$token Token being processed, passed by reference
 */
public function handleElement(&$token)
{
}
5172
* Handler that is called when an end token is processed
5174
/**
 * Handler that is called when an end token is processed.
 * The base implementation simply forwards the token to notifyEnd().
 * @param &$token End token being processed, passed by reference
 */
public function handleEnd(&$token) {
    $this->notifyEnd($token);
}
5179
* Notifier that is called when an end token is processed
5180
* @note This differs from handlers in that the token is read-only
5183
/**
 * Notifier that is called when an end token is processed.
 * Unlike the handlers, the token is received by value. No-op in this
 * base class.
 * @param $token End token that was processed
 */
public function notifyEnd($token)
{
}
5192
* Represents a language and defines localizable string formatting and
5193
* other functions, as well as the localized messages for HTML Purifier.
5195
class HTMLPurifier_Language
5199
* ISO 639 language code of language. Prefers shortest possible version
5201
public $code = 'en';
5204
* Fallback language code
5206
public $fallback = false;
5209
* Array of localizable messages
5211
public $messages = array();
5214
* Array of localizable error codes
5216
public $errorNames = array();
5219
* True if no message file was found for this language, so English
5220
* is being used instead. Check this if you'd like to notify the
5221
* user that they've used a non-supported language.
5223
public $error = false;
5226
* Has the language object been loaded yet?
5227
* @todo Make it private, fix usage in HTMLPurifier_LanguageTest
5229
public $_loaded = false;
5232
* Instances of HTMLPurifier_Config and HTMLPurifier_Context
5234
protected $config, $context;
5236
/**
 * Stores the configuration and context objects for later use.
 * @param $config Instance of HTMLPurifier_Config
 * @param $context Instance of HTMLPurifier_Context
 */
public function __construct($config, $context) {
    $this->config  = $config;
    $this->context = $context;
}
5242
* Loads language object with necessary info from factory cache
5243
* @note This is a lazy loader
5245
/**
 * Loads language object with necessary info from factory cache.
 * Every key listed in the factory's $keys is copied from the factory
 * cache entry for $this->code onto the same-named property of this object.
 * @note This is a lazy loader: repeated calls return immediately
 */
public function load() {
    if ($this->_loaded) return;
    $factory = HTMLPurifier_LanguageFactory::instance();
    $factory->loadLanguage($this->code);
    foreach ($factory->keys as $key) {
        $this->$key = $factory->cache[$this->code][$key];
    }
    $this->_loaded = true;
}
5256
* Retrieves a localised message.
5257
* @param $key string identifier of message
5258
* @return string localised message
5260
/**
 * Retrieves a localised message.
 * @param $key string identifier of message
 * @return string localised message, or "[$key]" when no message with
 *         that identifier exists
 */
public function getMessage($key) {
    if (!$this->_loaded) $this->load();
    if (!isset($this->messages[$key])) return "[$key]";
    return $this->messages[$key];
}
5267
* Retrieves a localised error name.
5268
* @param $int integer error number, corresponding to PHP's error
5270
* @return string localised message
5272
/**
 * Retrieves a localised error name.
 * @param $int integer error number
 * @return string localised error name, or "[Error: $int]" when the
 *         number is unknown
 */
public function getErrorName($int) {
    if (!$this->_loaded) $this->load();
    if (!isset($this->errorNames[$int])) return "[Error: $int]";
    return $this->errorNames[$int];
}
5279
* Converts an array list into a string readable representation
5281
/**
 * Converts an array list into a string readable representation.
 * Items are joined with the 'Item separator' message, except that the
 * final item is preceded by the 'Item separator last' message.
 * @param $array List with consecutive integer keys starting at 0
 * @return string joined representation ('' for an empty list)
 */
public function listify($array) {
    $sep      = $this->getMessage('Item separator');
    $sep_last = $this->getMessage('Item separator last');
    $ret = '';
    for ($i = 0, $c = count($array); $i < $c; $i++) {
        if ($i == 0) {
            // first item: nothing precedes it
        } elseif ($i + 1 < $c) {
            $ret .= $sep;
        } else {
            $ret .= $sep_last;
        }
        $ret .= $array[$i];
    }
    return $ret;
}
5298
* Formats a localised message with passed parameters
5299
* @param $key string identifier of message
5300
* @param $args Parameters to substitute in
5301
* @return string localised message
5302
* @todo Implement conditionals? Right now, some messages make
5303
* reference to line numbers, but those aren't always available
5305
/**
 * Formats a localised message with passed parameters.
 *
 * For argument key $i the following placeholders are substituted:
 *  - scalar value: '$i'
 *  - HTMLPurifier_Token: '$i.Name', '$i.Data' (when set), '$i.Serialized',
 *    '$i.Compact' (serialisation without attributes) and '$i.Line'
 *  - list array: '$i' (listified)
 *  - associative array: '$i.Keys' and '$i.Values' (listified)
 * Objects that are not tokens produce no substitution.
 * @param $key string identifier of message
 * @param $args Parameters to substitute in
 * @return string localised message, or "[$key]" when the message is unknown
 */
public function formatMessage($key, $args = array()) {
    if (!$this->_loaded) $this->load();
    if (!isset($this->messages[$key])) return "[$key]";
    $raw = $this->messages[$key];
    $subst = array();
    $generator = false;
    foreach ($args as $i => $value) {
        if (is_object($value)) {
            if ($value instanceof HTMLPurifier_Token) {
                // factor this out some time
                if (!$generator) $generator = $this->context->get('Generator');
                if (isset($value->name)) $subst['$'.$i.'.Name'] = $value->name;
                if (isset($value->data)) $subst['$'.$i.'.Data'] = $value->data;
                $subst['$'.$i.'.Compact'] =
                $subst['$'.$i.'.Serialized'] = $generator->generateFromToken($value);
                // a more complex algorithm for compact representation
                // could be introduced for all types of tokens. This
                // may need to be factored out into a dedicated class
                if (!empty($value->attr)) {
                    $stripped_token = clone $value;
                    $stripped_token->attr = array();
                    $subst['$'.$i.'.Compact'] = $generator->generateFromToken($stripped_token);
                }
                $subst['$'.$i.'.Line'] = $value->line ? $value->line : 'unknown';
            }
            continue;
        } elseif (is_array($value)) {
            $keys = array_keys($value);
            // keys equal to 0..n-1 in order means the array is a plain list
            if (array_keys($keys) === $keys) {
                // list
                $subst['$'.$i] = $this->listify($value);
            } else {
                // associative array
                // no $i implementation yet, sorry
                $subst['$'.$i.'.Keys'] = $this->listify($keys);
                $subst['$'.$i.'.Values'] = $this->listify(array_values($value));
            }
            continue;
        }
        $subst['$' . $i] = $value;
    }
    return strtr($raw, $subst);
}
5355
* Class responsible for generating HTMLPurifier_Language objects, managing
5356
* caching and fallbacks.
5357
* @note Thanks to MediaWiki for the general logic, although this version
5358
* has been entirely rewritten
5359
* @todo Serialized cache for languages
5361
class HTMLPurifier_LanguageFactory
5365
* Cache of language code information used to load HTMLPurifier_Language objects
5366
* Structure is: $factory->cache[$language_code][$key] = $value
5372
* Valid keys in the HTMLPurifier_Language object. Designates which
5373
* variables to slurp out of a message file.
5376
public $keys = array('fallback', 'messages', 'errorNames');
5379
* Instance of HTMLPurifier_AttrDef_Lang to validate language codes
5380
* @value object HTMLPurifier_AttrDef_Lang
5382
protected $validator;
5385
* Cached copy of dirname(__FILE__), directory of current file without
5387
* @value string filename
5392
* Keys whose contents are a hash map and can be merged
5393
* @value array lookup
5395
protected $mergeable_keys_map = array('messages' => true, 'errorNames' => true);
5398
* Keys whose contents are a list and can be merged
5399
* @value array lookup
5401
protected $mergeable_keys_list = array();
5404
* Retrieve sole instance of the factory.
5405
* @param $prototype Optional prototype to overload sole instance with,
5406
* or bool true to reset to default factory.
5408
/**
 * Retrieve sole instance of the factory.
 * @param $prototype Optional prototype to overload sole instance with,
 *              or bool true to reset to default factory.
 * @return The current factory instance
 * @note Previously the reset branch could never run: any non-null
 *       $prototype, including true, was installed as the instance, so
 *       instance(true) stored and returned the boolean. true is now
 *       checked first and rebuilds a default factory as documented.
 */
public static function instance($prototype = null) {
    static $instance = null;
    if ($prototype === true) {
        // explicit reset request: discard whatever is currently installed
        $instance = new HTMLPurifier_LanguageFactory();
        $instance->setup();
    } elseif ($prototype !== null) {
        $instance = $prototype;
    } elseif ($instance === null) {
        // first use: lazily build the default factory
        $instance = new HTMLPurifier_LanguageFactory();
        $instance->setup();
    }
    return $instance;
}
5420
* Sets up the singleton, much like a constructor
5421
* @note Prevents people from getting this outside of the singleton
5423
/**
 * Sets up the singleton, much like a constructor: creates the language
 * code validator and records the library directory derived from
 * HTMLPURIFIER_PREFIX.
 */
public function setup() {
    $this->validator = new HTMLPurifier_AttrDef_Lang();
    $this->dir = HTMLPURIFIER_PREFIX . '/HTMLPurifier';
}
5429
* Creates a language object, handles class fallbacks
5430
* @param $config Instance of HTMLPurifier_Config
5431
* @param $context Instance of HTMLPurifier_Context
5432
* @param $code Code to override configuration with. Private parameter.
5434
/**
 * Creates a language object, handles class fallbacks.
 *
 * 'en' always yields the base HTMLPurifier_Language. Other codes use
 * HTMLPurifier_Language_<code> when its class file exists or the class is
 * already loaded; otherwise the object is created for the language's
 * fallback ('en' when there is none, in which case ->error is set).
 * The returned object's ->code is always the requested, validated code.
 * @param $config Instance of HTMLPurifier_Config
 * @param $context Instance of HTMLPurifier_Context
 * @param $code Code to override configuration with. Private parameter.
 * @return Language object
 */
public function create($config, $context, $code = false) {

    // validate language code
    if ($code === false) {
        $code = $this->validator->validate(
            $config->get('Core', 'Language'), $config, $context
        );
    } else {
        $code = $this->validator->validate($code, $config, $context);
    }
    if ($code === false) $code = 'en'; // malformed code becomes English

    $pcode = str_replace('-', '_', $code); // make valid PHP classname
    static $depth = 0; // recursion protection

    if ($code == 'en') {
        $lang = new HTMLPurifier_Language($config, $context);
    } else {
        $class = 'HTMLPurifier_Language_' . $pcode;
        $file  = $this->dir . '/Language/classes/' . $code . '.php';
        if (file_exists($file) || class_exists($class, false)) {
            $lang = new $class($config, $context);
        } else {
            // Go fallback
            $raw_fallback = $this->getFallbackFor($code);
            $fallback = $raw_fallback ? $raw_fallback : 'en';
            $depth++;
            $lang = $this->create($config, $context, $fallback);
            if (!$raw_fallback) {
                $lang->error = true;
            }
            $depth--;
        }
    }

    $lang->code = $code;

    return $lang;

}
5476
* Returns the fallback language for language
5477
* @note Loads the original language into cache
5478
* @param $code string language code
5480
/**
 * Returns the fallback language for language
 * @note Loads the original language into cache
 * @param $code string language code
 * @return The 'fallback' entry cached for $code
 */
public function getFallbackFor($code) {
    $this->loadLanguage($code);
    return $this->cache[$code]['fallback'];
}
5486
* Loads language into the cache, handles message file and fallbacks
5487
* @param $code string language code
5489
/**
 * Loads language into the cache, handles message file and fallbacks.
 *
 * The message file for $code is included when it exists and the variables
 * named in $this->keys are collected from it. The fallback language is
 * then loaded recursively and merged in: mergeable map keys are unioned
 * (own entries win), mergeable list keys are concatenated, and keys the
 * language does not define are copied from the fallback.
 * @param $code string language code
 * @note Raises E_USER_ERROR on a circular fallback chain and then falls
 *       back to 'en'.
 * @note The recursion guard used to write to a misspelled variable
 *       ($language_seen), so it never detected anything. It now marks a
 *       language only while its fallback chain is being resolved, so
 *       loading the same code again later (e.g. through a fresh factory)
 *       is not misreported as circular.
 */
public function loadLanguage($code) {
    static $languages_seen = array(); // recursion guard: codes currently being resolved

    // abort if we've already loaded it
    if (isset($this->cache[$code])) return;

    // generate filename
    $filename = $this->dir . '/Language/messages/' . $code . '.php';

    // default fallback : may be overwritten by the ensuing include
    $fallback = ($code != 'en') ? 'en' : false;

    // load primary localisation
    if (!file_exists($filename)) {
        // skip the include: will rely solely on fallback
        $filename = $this->dir . '/Language/messages/en.php';
        $cache = array();
    } else {
        include $filename;
        $cache = compact($this->keys);
    }

    // load fallback localisation
    if (!empty($fallback)) {

        // infinite recursion guard
        if (isset($languages_seen[$code])) {
            trigger_error('Circular fallback reference in language ' .
                $code, E_USER_ERROR);
            $fallback = 'en';
        }
        $languages_seen[$code] = true;

        // load the fallback recursively
        $this->loadLanguage($fallback);
        $fallback_cache = $this->cache[$fallback];

        // resolution of this code's chain is finished
        unset($languages_seen[$code]);

        // merge fallback with current language
        foreach ($this->keys as $key) {
            if (isset($cache[$key]) && isset($fallback_cache[$key])) {
                if (isset($this->mergeable_keys_map[$key])) {
                    $cache[$key] = $cache[$key] + $fallback_cache[$key];
                } elseif (isset($this->mergeable_keys_list[$key])) {
                    $cache[$key] = array_merge($fallback_cache[$key], $cache[$key]);
                }
            } else {
                $cache[$key] = $fallback_cache[$key];
            }
        }

    }

    // save to cache for later retrieval
    $this->cache[$code] = $cache;

    return;
}
/**
 * Represents a measurable length, with a string numeric magnitude
 * and a unit. This object is immutable.
 */
class HTMLPurifier_Length
{

    /**
     * String numeric magnitude.
     */
    protected $n;

    /**
     * String unit. False is permitted if $n = 0.
     */
    protected $unit;

    /**
     * Whether or not this length is valid. Null if not calculated yet.
     */
    protected $isValid;

    /**
     * Lookup array of units recognized by CSS 2.1
     */
    protected static $allowedUnits = array(
        'em' => true, 'ex' => true, 'px' => true, 'in' => true,
        'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true
    );

    /**
     * @param number $n Magnitude
     * @param string $u Unit
     */
    public function __construct($n = '0', $u = false) {
        $this->n = (string) $n;
        $this->unit = $u !== false ? (string) $u : false;
    }

    /**
     * @param string $s Unit string, like '2em' or '3.4in'
     * @warning Does not perform validation.
     */
    static public function make($s) {
        if ($s instanceof HTMLPurifier_Length) return $s;
        // leading run of digits, signs and dots is the magnitude, rest is the unit
        $n_length = strspn($s, '1234567890.+-');
        $n = substr($s, 0, $n_length);
        $unit = substr($s, $n_length);
        if ($unit === '') $unit = false;
        return new HTMLPurifier_Length($n, $unit);
    }

    /**
     * Validates the number and unit.
     * @return True if valid, false otherwise. May normalize $n and $unit.
     */
    protected function validate() {
        // Special case: a bare zero needs no unit
        if ($this->n === '+0' || $this->n === '-0') $this->n = '0';
        if ($this->n === '0' && $this->unit === false) return true;
        if (!ctype_lower($this->unit)) $this->unit = strtolower($this->unit);
        if (!isset(HTMLPurifier_Length::$allowedUnits[$this->unit])) return false;
        // Hack: delegate magnitude validation to the CSS number definition
        $def = new HTMLPurifier_AttrDef_CSS_Number();
        $result = $def->validate($this->n, false, false);
        if ($result === false) return false;
        $this->n = $result;
        return true;
    }

    /**
     * Returns string representation of number.
     * @return Magnitude followed by unit, or false if the length is invalid.
     */
    public function toString() {
        if (!$this->isValid()) return false;
        return $this->n . $this->unit;
    }

    /**
     * Retrieves string numeric magnitude.
     */
    public function getN() {return $this->n;}

    /**
     * Retrieves string unit.
     */
    public function getUnit() {return $this->unit;}

    /**
     * Returns true if this length unit is valid.
     */
    public function isValid() {
        // validation result is computed once and memoized
        if ($this->isValid === null) $this->isValid = $this->validate();
        return $this->isValid;
    }

    /**
     * Compares two lengths, and returns 1 if greater, -1 if less and 0 if equal.
     * @warning If both values are too large or small, this calculation will
     *          not work properly
     * @param $l HTMLPurifier_Length to compare against, or false
     * @return 1, -1 or 0; false if $l is false or cannot be converted to
     *         this length's unit.
     */
    public function compareTo($l) {
        if ($l === false) return false;
        if ($l->unit !== $this->unit) {
            $converter = new HTMLPurifier_UnitConverter();
            $l = $converter->convert($l, $this->unit);
            if ($l === false) return false;
        }
        // Compare as floats and return only the sign, as documented. The
        // explicit casts also avoid arithmetic on non-numeric strings.
        $mine   = (float) $this->n;
        $theirs = (float) $l->n;
        if ($mine == $theirs) return 0;
        return ($mine > $theirs) ? 1 : -1;
    }

}
/**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 *
 * A lexer parses a string of SGML-style markup and converts them into
 * corresponding tokens.  It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
 * from.
 *
 * A lexer is HTML-oriented: it might work with XML, but it's not
 * recommended, as we adhere to a subset of the specification for optimization
 * reasons. This might change in the future. Also, most tokenizers are not
 * expected to handle DTDs or PIs.
 *
 * This class should not be directly instantiated, but you may use create() to
 * retrieve a default copy of the lexer.  Being a supertype, this class
 * does not actually define any implementation, but offers commonly used
 * convenience functions for subclasses.
 *
 * @note The unit tests will instantiate this class for testing purposes, as
 *       many of the utility functions require a class to be instantiated.
 *       This means that, even though this class is not runnable, it will
 *       not be declared abstract.
 *
 * @note
 * We use tokens rather than create a DOM representation because DOM would:
 *
 *  -# Require more processing and memory to create,
 *  -# Is not streamable, and
 *  -# Has the entire document structure (html and body not needed).
 *
 * However, DOM is helpful in that it makes it easy to move around nodes
 * without a lot of lookaheads to see when a tag is closed. This is a
 * limitation of the token system and some workarounds would be nice.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implements line-number/column-number tracking.
     * If it does, set to true.
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser created by the constructor. Declared explicitly so
     * that it is not created as a dynamic property; kept public because
     * an undeclared property was publicly accessible.
     */
    public $_entity_parser;

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
     * a few exceptions involving special features that only DirectLex
     * implements.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @param $config Instance of HTMLPurifier_Config
     * @return Concrete lexer.
     */
    public static function create($config) {

        if (!($config instanceof HTMLPurifier_Config)) {
            // deprecated calling convention: $config is the lexer itself
            $lexer = $config;
            trigger_error("Passing a prototype to
              HTMLPurifier_Lexer::create() is deprecated, please instead
              use %Core.LexerImpl", E_USER_WARNING);
            // there is no configuration to consult in this case; calling
            // get() on the prototype would be a fatal error
            $needs_tracking = false;
        } else {
            $lexer = $config->get('Core', 'LexerImpl');
            $needs_tracking =
                $config->get('Core', 'MaintainLineNumbers') ||
                $config->get('Core', 'CollectErrors');
        }

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {

            if (is_null($lexer)) { do {
                // auto-detection algorithm

                if ($needs_tracking) {
                    $lexer = 'DirectLex';
                    break;
                }

                if (
                    class_exists('DOMDocument') &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // check for DOM support, because while it's part of the
                    // core, it can be disabled compile time. Also, the PECL
                    // domxml extension overrides the default DOM, and is evil
                    // and nasty and we shan't bother to support it
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }

            } while(0); } // do..while so we can break

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
            }
        }

        if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated');

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
        }

        return $inst;

    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    public function __construct() {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     */
    protected $_special_entity2str =
            array(
                    '&quot;' => '"',
                    '&amp;'  => '&',
                    '&lt;'   => '<',
                    '&gt;'   => '>',
                    '&#39;'  => "'",
                    '&#039;' => "'",
                    '&#x27;' => "'"
            );

    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones.
     *
     * @warning
     * You should be able to treat the output of this function as
     * completely parsed, but that's only because all other entities should
     * have been handled previously in substituteNonSpecialEntities()
     *
     * @param $string String character data to be parsed.
     * @returns Parsed character data.
     */
    public function parseData($string) {

        // following functions require at least one character
        if ($string === '') return '';

        // subtracts amps that cannot possibly be escaped
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);

        if (!$num_amp) return $string; // abort if no entities
        // each '&amp;' leaves one literal '&' behind after translation;
        // counting every '&' here would make the check below always pass
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);

        // only ampersands produced by '&amp;' remain: nothing left to decode
        if ($num_amp_2 <= $num_esc_amp) return $string;

        // hmm... now we have some uncommon entities. Use the callback.
        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * @param $string String HTML.
     * @return HTMLPurifier_Token array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context) {
        // subclasses are expected to override this method
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     *
     * @param $string HTML string to process.
     * @returns HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string) {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     */
    protected static function escapeCommentedCDATA($string) {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() that does the work.
     *
     * @warning Calling it directly is not recommended.
     * @params $matches PCRE matches array, with index 0 the entire match
     *                  and 1 the inside of the CDATA section.
     * @returns Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches) {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context) {

        // normalize newlines to \n
        $html = str_replace("\r\n", "\n", $html);
        $html = str_replace("\r", "\n", $html);

        if ($config->get('HTML', 'Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        // extract body from document if applicable
        if ($config->get('Core', 'ConvertDocumentToFragment')) {
            $html = $this->extractBody($html);
        }

        // expand entities that aren't the big five
        $html = $this->_entity_parser->substituteNonSpecialEntities($html);

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of the first non-empty body element, or the input unchanged when no
     * such element is found.
     * @todo Consider making protected
     */
    public function extractBody($html) {
        $matches = array();
        $result = preg_match('!<body[^>]*>(.+?)</body>!is', $html, $matches);
        if ($result) {
            return $matches[1];
        } else {
            return $html;
        }
    }

}
/**
 * Class that handles operations involving percent-encoding in URIs.
 *
 * @warning
 *      Be careful when reusing instances of PercentEncoder. The object
 *      you use for normalize() SHOULD NOT be used for encode(), or
 *      vice-versa.
 */
class HTMLPurifier_PercentEncoder
{

    /**
     * Reserved characters to preserve when using encode().
     * Lookup array keyed by byte value.
     */
    protected $preserve = array();

    /**
     * @param $preserve String of characters that should be preserved
     *                  while using encode(), or false for none.
     */
    public function __construct($preserve = false) {
        // unreserved letters, ought to const-ify
        for ($i = 48; $i <= 57;  $i++) $this->preserve[$i] = true; // digits
        for ($i = 65; $i <= 90;  $i++) $this->preserve[$i] = true; // upper-case
        for ($i = 97; $i <= 122; $i++) $this->preserve[$i] = true; // lower-case
        $this->preserve[45] = true; // Dash         -
        $this->preserve[46] = true; // Period       .
        $this->preserve[95] = true; // Underscore   _
        $this->preserve[126]= true; // Tilde        ~

        // extra letters not to escape
        if ($preserve !== false) {
            for ($i = 0, $c = strlen($preserve); $i < $c; $i++) {
                $this->preserve[ord($preserve[$i])] = true;
            }
        }
    }

    /**
     * Our replacement for urlencode: every byte is percent-encoded except
     * '%' itself and the bytes registered in $preserve (the unreserved set
     * plus any extra characters passed to the constructor).
     * @note
     *      Assumes that the string has already been normalized, making any
     *      and all percent escape sequences valid. Percents will not be
     *      re-escaped, regardless of their status in $preserve
     * @param $string String to be encoded
     * @return Encoded string.
     */
    public function encode($string) {
        $ret = '';
        for ($i = 0, $c = strlen($string); $i < $c; $i++) {
            if ($string[$i] !== '%' && !isset($this->preserve[$int = ord($string[$i])]) ) {
                $ret .= '%' . sprintf('%02X', $int);
            } else {
                $ret .= $string[$i];
            }
        }
        return $ret;
    }

    /**
     * Fix up percent-encoding by decoding unreserved characters and normalizing.
     * @warning This function is affected by $preserve, even though the
     *          usual desired behavior is for this not to preserve those
     *          characters. Be careful when reusing instances of PercentEncoder!
     * @param $string String to normalize
     * @return Normalized string.
     */
    public function normalize($string) {
        if ($string == '') return '';
        $parts = explode('%', $string);
        // text before the first '%' is copied through untouched
        $ret = array_shift($parts);
        foreach ($parts as $part) {
            $length = strlen($part);
            if ($length < 2) {
                // too short to be an escape: encode the stray percent sign
                $ret .= '%25' . $part;
                continue;
            }
            $encoding = substr($part, 0, 2);
            $text     = substr($part, 2);
            if (!ctype_xdigit($encoding)) {
                // not two hex digits: encode the stray percent sign
                $ret .= '%25' . $part;
                continue;
            }
            $int = hexdec($encoding);
            if (isset($this->preserve[$int])) {
                // escape of a preserved character: decode it
                $ret .= chr($int) . $text;
                continue;
            }
            // valid escape: normalize hex digits to upper-case
            $encoding = strtoupper($encoding);
            $ret .= '%' . $encoding . $text;
        }
        return $ret;
    }

}
/**
 * Supertype for classes that define a strategy for modifying/purifying tokens.
 *
 * While HTMLPurifier's core purpose is fixing HTML into something proper,
 * strategies provide plug points for extra configuration or even extra
 * features, such as custom tags, custom parsing of text, etc.
 */
abstract class HTMLPurifier_Strategy
{

    /**
     * Executes the strategy on the tokens.
     *
     * @param $tokens Array of HTMLPurifier_Token objects to be operated on.
     * @param $config Configuration options
     * @param $context Context object passed along with the configuration
     * @returns Processed array of token objects.
     */
    abstract public function execute($tokens, $config, $context);

}
/**
 * This is in almost every respect equivalent to an array except
 * that it keeps track of which keys were accessed.
 *
 * @warning For the sake of backwards compatibility with early versions
 *     of PHP 5, you must not use the $hash[$key] syntax; if you do
 *     our version of offsetGet is never called.
 */
class HTMLPurifier_StringHash extends ArrayObject
{
    /**
     * Lookup array of accessed indexes, in form array($index => true).
     */
    protected $accessed = array();

    /**
     * Retrieves a value, and logs the access.
     */
    // The attribute below silences the return-type deprecation on newer
    // PHP versions; older versions parse the line as a comment.
    #[\ReturnTypeWillChange]
    public function offsetGet($index) {
        $this->accessed[$index] = true;
        return parent::offsetGet($index);
    }

    /**
     * Returns a lookup array of all array indexes that have been accessed.
     * @return Array in form array($index => true).
     */
    public function getAccessed() {
        return $this->accessed;
    }

    /**
     * Resets the access array.
     */
    public function resetAccessed() {
        $this->accessed = array();
    }
}
/**
 * Parses string hash files. File format is as such:
 *
 *      DefaultKeyValue
 *      KEY: Value
 *      KEY2: Value2
 *      --MULTILINE-KEY--
 *      Multiline
 *      value.
 *
 * Which would output something similar to:
 *
 *      array(
 *          'ID' => 'DefaultKeyValue',
 *          'KEY' => 'Value',
 *          'KEY2' => 'Value2',
 *          'MULTILINE-KEY' => "Multiline\nvalue.\n",
 *      )
 *
 * We use this as an easy to use file-format for configuration schema
 * files, but the class itself is usage agnostic.
 *
 * You can use ---- to forcibly terminate parsing of a single string-hash;
 * this marker is used in multi string-hashes to delimit boundaries.
 */
class HTMLPurifier_StringHashParser
{

    /**
     * Key used for a single-line value that has no 'KEY:' prefix.
     */
    public $default = 'ID';

    /**
     * Parses a file that contains a single string-hash.
     * @param $file Path of the file to parse
     * @return Array of key => value, or false if the file is missing or
     *         cannot be opened.
     */
    public function parseFile($file) {
        if (!file_exists($file)) return false;
        $fh = fopen($file, 'r');
        if (!$fh) return false;
        $ret = $this->parseHandle($fh);
        fclose($fh);
        return $ret;
    }

    /**
     * Parses a file that contains multiple string-hashes delimited by '----'
     * @param $file Path of the file to parse
     * @return List of string-hash arrays, or false if the file is missing
     *         or cannot be opened.
     */
    public function parseMultiFile($file) {
        if (!file_exists($file)) return false;
        $ret = array();
        $fh = fopen($file, 'r');
        if (!$fh) return false;
        while (!feof($fh)) {
            $ret[] = $this->parseHandle($fh);
        }
        fclose($fh);
        return $ret;
    }

    /**
     * Internal parser that acepts a file handle.
     * @note While it's possible to simulate in-memory parsing by using
     *       custom stream wrappers, if such a use-case arises we should
     *       factor out the file handle into its own class.
     * @param $fh File handle with pointer at start of valid string-hash
     *            block.
     * @return Array of key => value for one string-hash.
     */
    protected function parseHandle($fh) {
        $state   = false; // key currently being filled, false if none
        $single  = false; // true while handling a single-line declaration
        $ret     = array();
        do {
            $line = fgets($fh);
            if ($line === false) break;
            $line = rtrim($line, "\n\r");
            if (!$state && $line === '') continue;
            if ($line === '----') break;
            if (strncmp('--', $line, 2) === 0) {
                // Multiline declaration
                $state = trim($line, '- ');
                if (!isset($ret[$state])) $ret[$state] = '';
                continue;
            } elseif (!$state) {
                $single = true;
                if (strpos($line, ':') !== false) {
                    // Single-line declaration. Split on the colon alone so
                    // that 'KEY:value' (no space) is handled as well, then
                    // drop the single separating space if there is one.
                    list($state, $line) = explode(':', $line, 2);
                    if ($line !== '' && $line[0] === ' ') {
                        $line = (string) substr($line, 1);
                    }
                } else {
                    // Use default declaration
                    $state  = $this->default;
                }
            }
            if ($single) {
                $ret[$state] = $line;
                $single = false;
                $state  = false;
            } else {
                $ret[$state] .= "$line\n";
            }
        } while (!feof($fh));
        return $ret;
    }

}
/**
 * Defines a mutation of an obsolete tag into a valid tag.
 */
abstract class HTMLPurifier_TagTransform
{

    /**
     * Tag name to transform the tag to.
     */
    public $transform_to;

    /**
     * Transforms the obsolete tag into the valid tag.
     * @param $tag Tag to be transformed.
     * @param $config Mandatory HTMLPurifier_Config object
     * @param $context Mandatory HTMLPurifier_Context object
     */
    abstract public function transform($tag, $config, $context);

    /**
     * Prepends CSS properties to the style attribute, creating the
     * attribute if it doesn't exist.
     * @warning Copied over from AttrTransform, be sure to keep in sync
     * @param $attr Attribute array to process (passed by reference)
     * @param $css CSS to prepend
     */
    protected function prependCSS(&$attr, $css) {
        $attr['style'] = isset($attr['style']) ? $attr['style'] : '';
        $attr['style'] = $css . $attr['style'];
    }

}
/**
 * Abstract base token class that all others inherit from.
 */
class HTMLPurifier_Token {
    public $line; /**< Line number node was on in source document. Null if unknown. */
    public $col;  /**< Column of line node was on in source document. Null if unknown. */

    /**
     * Lookup array of processing that this token is exempt from.
     * Currently, valid values are "ValidateAttributes" and
     * "MakeWellFormed_TagClosedError"
     */
    public $armor = array();

    /**
     * Used during MakeWellFormed.
     */
    public $skip;
    public $rewind;

    /**
     * Backwards-compatibility accessor for the deprecated 'type' property.
     * Emits an E_USER_NOTICE and maps the concrete token class to its old
     * type string; any other property name yields null.
     */
    public function __get($n) {
      if ($n === 'type') {
        trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE);
        switch (get_class($this)) {
          case 'HTMLPurifier_Token_Start':      return 'start';
          case 'HTMLPurifier_Token_Empty':      return 'empty';
          case 'HTMLPurifier_Token_End':        return 'end';
          case 'HTMLPurifier_Token_Text':       return 'text';
          case 'HTMLPurifier_Token_Comment':    return 'comment';
          default: return null;
        }
      }
    }

    /**
     * Sets the position of the token in the source document.
     */
    public function position($l = null, $c = null) {
        $this->line = $l;
        $this->col  = $c;
    }

    /**
     * Convenience function for DirectLex settings line/col position.
     */
    public function rawPosition($l, $c) {
        // a column of -1 bumps the line number by one; the column itself
        // is stored unchanged
        if ($c === -1) $l++;
        $this->line = $l;
        $this->col  = $c;
    }

}
/**
 * Factory for token generation.
 *
 * @note Doing some benchmarking indicates that the new operator is much
 *       slower than the clone operator (even discounting the cost of the
 *       constructor).  This class is for that optimization.
 *       Other then that, there's not much point as we don't
 *       maintain parallel HTMLPurifier_Token hierarchies (the main reason why
 *       you'd want to use an abstract factory).
 * @todo Port DirectLex to use this
 */
class HTMLPurifier_TokenFactory
{

    /**
     * Prototypes that will be cloned.
     */
    // p stands for prototype
    private $p_start, $p_end, $p_empty, $p_text, $p_comment;

    /**
     * Generates blank prototypes for cloning.
     */
    public function __construct() {
        $this->p_start  = new HTMLPurifier_Token_Start('', array());
        $this->p_end    = new HTMLPurifier_Token_End('');
        $this->p_empty  = new HTMLPurifier_Token_Empty('', array());
        $this->p_text   = new HTMLPurifier_Token_Text('');
        $this->p_comment= new HTMLPurifier_Token_Comment('');
    }

    /**
     * Creates a HTMLPurifier_Token_Start.
     * @param $name Tag name
     * @param $attr Associative array of attributes
     * @return Generated HTMLPurifier_Token_Start
     */
    public function createStart($name, $attr = array()) {
        // clone the prototype, then re-run its constructor on the copy
        $p = clone $this->p_start;
        $p->__construct($name, $attr);
        return $p;
    }

    /**
     * Creates a HTMLPurifier_Token_End.
     * @param $name Tag name
     * @return Generated HTMLPurifier_Token_End
     */
    public function createEnd($name) {
        $p = clone $this->p_end;
        $p->__construct($name);
        return $p;
    }

    /**
     * Creates a HTMLPurifier_Token_Empty.
     * @param $name Tag name
     * @param $attr Associative array of attributes
     * @return Generated HTMLPurifier_Token_Empty
     */
    public function createEmpty($name, $attr = array()) {
        $p = clone $this->p_empty;
        $p->__construct($name, $attr);
        return $p;
    }

    /**
     * Creates a HTMLPurifier_Token_Text.
     * @param $data Data of text token
     * @return Generated HTMLPurifier_Token_Text
     */
    public function createText($data) {
        $p = clone $this->p_text;
        $p->__construct($data);
        return $p;
    }

    /**
     * Creates a HTMLPurifier_Token_Comment.
     * @param $data Data of comment token
     * @return Generated HTMLPurifier_Token_Comment
     */
    public function createComment($data) {
        $p = clone $this->p_comment;
        $p->__construct($data);
        return $p;
    }

}
/**
 * HTML Purifier's internal representation of a URI.
 * @note
 *      Internal data-structures are completely escaped. If the data needs
 *      to be used in a non-URI context (which is very unlikely), be sure
 *      to decode it first. The URI may not necessarily be well-formed until
 *      validate() is called.
 */
class HTMLPurifier_URI
{

    public $scheme, $userinfo, $host, $port, $path, $query, $fragment;

    /**
     * @note Automatically normalizes scheme and port
     */
    public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment) {
        // scheme is lower-cased, port is cast to int; null is kept as null
        $this->scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme);
        $this->userinfo = $userinfo;
        $this->host = $host;
        $this->port = is_null($port) ? $port : (int) $port;
        $this->path = $path;
        $this->query = $query;
        $this->fragment = $fragment;
    }

    /**
     * Retrieves a scheme object corresponding to the URI's scheme/default
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return Scheme object appropriate for validating this URI, or false
     *         if no scheme object could be obtained.
     */
    public function getSchemeObj($config, $context) {
        $registry = HTMLPurifier_URISchemeRegistry::instance();
        if ($this->scheme !== null) {
            $scheme_obj = $registry->getScheme($this->scheme, $config, $context);
            if (!$scheme_obj) return false; // invalid scheme, clean it out
        } else {
            // no scheme: retrieve the default one
            $def = $config->getDefinition('URI');
            $scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context);
            if (!$scheme_obj) {
                // something funky happened to the default scheme object
                trigger_error(
                    'Default scheme object "' . $def->defaultScheme . '" was not readable',
                    E_USER_WARNING
                );
                return false;
            }
        }
        return $scheme_obj;
    }

    /**
     * Generic validation method applicable for all schemes. May modify
     * this URI in order to get it into a compliant form.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return True if validation/filtering succeeds, false if failure
     */
    public function validate($config, $context) {

        // ABNF definitions from RFC 3986
        $chars_sub_delims = '!$&\'()*+,;=';
        $chars_pchar = $chars_sub_delims . ':@';

        // validate scheme (MUST BE FIRST!)
        // a scheme equal to the default one is dropped when there is no host
        if (!is_null($this->scheme) && is_null($this->host)) {
            $def = $config->getDefinition('URI');
            if ($def->defaultScheme === $this->scheme) {
                $this->scheme = null;
            }
        }

        // validate host
        if (!is_null($this->host)) {
            $host_def = new HTMLPurifier_AttrDef_URI_Host();
            $this->host = $host_def->validate($this->host, $config, $context);
            if ($this->host === false) $this->host = null;
        }

        // validate username
        if (!is_null($this->userinfo)) {
            $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
            $this->userinfo = $encoder->encode($this->userinfo);
        }

        // validate port
        if (!is_null($this->port)) {
            if ($this->port < 1 || $this->port > 65535) $this->port = null;
        }

        // validate path
        $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
        if (!is_null($this->host)) {
            // path-abempty (hier and relative)
            $this->path = $segments_encoder->encode($this->path);
        } elseif ($this->path !== '' && $this->path[0] === '/') {
            // path-absolute (hier and relative)
            if (strlen($this->path) >= 2 && $this->path[1] === '/') {
                // This shouldn't ever happen!
                $this->path = '';
            } else {
                $this->path = $segments_encoder->encode($this->path);
            }
        } elseif (!is_null($this->scheme) && $this->path !== '') {
            // path-rootless (hier)
            // Short circuit evaluation means we don't need to check nz
            $this->path = $segments_encoder->encode($this->path);
        } elseif (is_null($this->scheme) && $this->path !== '') {
            // path-noscheme (relative)
            // (once again, not checking nz)
            // the first segment may not contain a colon, so it gets an
            // encoder that does not preserve ':'
            $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
            $c = strpos($this->path, '/');
            if ($c !== false) {
                $this->path =
                    $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
                    $segments_encoder->encode(substr($this->path, $c));
            } else {
                $this->path = $segment_nc_encoder->encode($this->path);
            }
        } else {
            // path-empty (hier and relative)
            $this->path = ''; // just to be safe
        }

        // qf = query and fragment
        $qf_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/?');

        if (!is_null($this->query)) {
            $this->query = $qf_encoder->encode($this->query);
        }

        if (!is_null($this->fragment)) {
            $this->fragment = $qf_encoder->encode($this->fragment);
        }

        return true;

    }

    /**
     * Convert URI back to string
     * @return String URI appropriate for output
     */
    public function toString() {
        // reconstruct authority
        $authority = null;
        if (!is_null($this->host)) {
            $authority = '';
            if(!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
            $authority .= $this->host;
            if(!is_null($this->port))     $authority .= ':' . $this->port;
        }

        // reconstruct the result
        $result = '';
        if (!is_null($this->scheme))    $result .= $this->scheme . ':';
        if (!is_null($authority))       $result .=  '//' . $authority;
        $result .= $this->path;
        if (!is_null($this->query))     $result .= '?' . $this->query;
        if (!is_null($this->fragment))  $result .= '#' . $this->fragment;

        return $result;
    }

}
/**
 * Definition holding the URI filters and the base/host/default-scheme
 * settings read from the URI configuration namespace.
 */
class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
{

    public $type = 'URI';
    protected $filters = array();
    protected $postFilters = array();
    protected $registeredFilters = array();

    /**
     * HTMLPurifier_URI object of the base specified at %URI.Base
     */
    public $base;

    /**
     * String host to consider "home" base, derived off of $base
     */
    public $host;

    /**
     * Name of default scheme based on %URI.DefaultScheme and %URI.Base
     */
    public $defaultScheme;

    public function __construct() {
        $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal());
        $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources());
        $this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist());
        $this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute());
        $this->registerFilter(new HTMLPurifier_URIFilter_Munge());
    }

    /**
     * Makes a filter available under its name; it only becomes active if
     * setupFilters() finds the matching configuration value set.
     */
    public function registerFilter($filter) {
        $this->registeredFilters[$filter->name] = $filter;
    }

    /**
     * Prepares a filter and adds it to the pre- or post-filter list,
     * depending on its $post flag. A filter whose prepare() returns false
     * is skipped.
     */
    public function addFilter($filter, $config) {
        $r = $filter->prepare($config);
        if ($r === false) return; // null is ok, for backwards compat
        if ($filter->post) {
            $this->postFilters[$filter->name] = $filter;
        } else {
            $this->filters[$filter->name] = $filter;
        }
    }

    protected function doSetup($config) {
        $this->setupMemberVariables($config);
        $this->setupFilters($config);
    }

    /**
     * Activates every registered filter whose URI.<name> configuration
     * value is neither false nor null.
     */
    protected function setupFilters($config) {
        foreach ($this->registeredFilters as $name => $filter) {
            $conf = $config->get('URI', $name);
            if ($conf !== false && $conf !== null) {
                $this->addFilter($filter, $config);
            }
        }
        unset($this->registeredFilters);
    }

    /**
     * Reads host, base and default scheme from the configuration.
     */
    protected function setupMemberVariables($config) {
        $this->host = $config->get('URI', 'Host');
        $base_uri = $config->get('URI', 'Base');
        if (!is_null($base_uri)) {
            $parser = new HTMLPurifier_URIParser();
            $base = $parser->parse($base_uri);
            // parse() returns false for an unparseable URI; in that case
            // leave $base unset instead of dereferencing a boolean
            if ($base !== false) {
                $this->base = $base;
                $this->defaultScheme = $this->base->scheme;
                if (is_null($this->host)) $this->host = $this->base->host;
            }
        }
        if (is_null($this->defaultScheme)) $this->defaultScheme = $config->get('URI', 'DefaultScheme');
    }

    /**
     * Runs the pre-validation filters in order.
     * @return False as soon as one filter rejects the URI, true otherwise
     */
    public function filter(&$uri, $config, $context) {
        foreach ($this->filters as $name => $f) {
            $result = $f->filter($uri, $config, $context);
            if (!$result) return false;
        }
        return true;
    }

    /**
     * Runs the post-validation filters in order.
     * @return False as soon as one filter rejects the URI, true otherwise
     */
    public function postFilter(&$uri, $config, $context) {
        foreach ($this->postFilters as $name => $f) {
            $result = $f->filter($uri, $config, $context);
            if (!$result) return false;
        }
        return true;
    }

}
/**
 * Chainable filters for custom URI processing.
 *
 * These filters can perform custom actions on a URI filter object,
 * including transformation or blacklisting.
 *
 * @warning This filter is called before scheme object validation occurs.
 *          Make sure, if you require a specific scheme object, you
 *          you check that it exists. This allows filters to convert
 *          proprietary URI schemes into regular ones.
 */
abstract class HTMLPurifier_URIFilter
{

    /**
     * Unique identifier of filter
     */
    public $name;

    /**
     * True if this filter should be run after scheme validation.
     */
    public $post = false;

    /**
     * Performs initialization for the filter
     */
    public function prepare($config) {return true;}

    /**
     * Filter a URI object
     * @param $uri Reference to URI object variable
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return bool Whether or not to continue processing: false indicates
     *         URL is no good, true indicates continue processing. Note that
     *         all changes are committed directly on the URI object
     */
    abstract public function filter(&$uri, $config, $context);

}
/**
 * Parses a URI into the components and fragment identifier as specified
 * by RFC 3986.
 */
class HTMLPurifier_URIParser
{

    /**
     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
     */
    protected $percentEncoder;

    public function __construct() {
        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
    }

    /**
     * Parses a URI.
     * @param $uri string URI to parse
     * @return HTMLPurifier_URI representation of URI. This representation has
     *         not been validated yet and may not conform to RFC. False is
     *         returned when the URI does not match the pattern at all.
     */
    public function parse($uri) {

        $uri = $this->percentEncoder->normalize($uri);

        // Regexp is as per Appendix B.
        // Note that ["<>] are an addition to the RFC's recommended
        // characters, because they represent external delimeters.
        $r_URI = '!'.
            '(([^:/?#"<>]+):)?'. // 2. Scheme
            '(//([^/?#"<>]*))?'. // 4. Authority
            '([^?#"<>]*)'.       // 5. Path
            '(\?([^#"<>]*))?'.   // 7. Query
            '(#([^"<>]*))?'.     // 8. Fragment
            '!';

        $matches = array();
        $result = preg_match($r_URI, $uri, $matches);

        if (!$result) return false; // *really* invalid URI

        // seperate out parts
        // the outer group tells whether a component is present at all, the
        // inner group holds its value (the fragment value is group 9)
        $scheme     = !empty($matches[1]) ? $matches[2] : null;
        $authority  = !empty($matches[3]) ? $matches[4] : null;
        $path       = $matches[5]; // always present, can be empty
        $query      = !empty($matches[6]) ? $matches[7] : null;
        $fragment   = !empty($matches[8]) ? $matches[9] : null;

        // further parse authority
        if ($authority !== null) {
            $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
            $matches = array();
            preg_match($r_authority, $authority, $matches);
            $userinfo   = !empty($matches[1]) ? $matches[2] : null;
            $host       = !empty($matches[3]) ? $matches[3] : '';
            $port       = !empty($matches[4]) ? (int) $matches[5] : null;
        } else {
            $port = $host = $userinfo = null;
        }

        return new HTMLPurifier_URI(
            $scheme, $userinfo, $host, $port, $path, $query, $fragment);
    }

}
6796
/**
 * Validator for the components of a URI for a specific scheme
 */
class HTMLPurifier_URIScheme
{

    /**
     * Scheme's default port (integer)
     */
    public $default_port = null;

    /**
     * Whether or not URIs of this scheme are locatable by a browser;
     * http and ftp are accessible, while mailto and news are not.
     */
    public $browsable = false;

    /**
     * Whether or not the URI always uses <hier_part>, resolves edge cases
     * with making relative URIs absolute
     */
    public $hierarchical = false;

    /**
     * Validates the components of a URI
     * @note This implementation should be called by children if they define
     *       a default port, as it does port processing.
     * @param $uri Instance of HTMLPurifier_URI
     * @param $config HTMLPurifier_Config object
     * @param $context HTMLPurifier_Context object
     * @return Bool success or failure
     */
    public function validate(&$uri, $config, $context) {
        // A port equal (loosely) to the scheme default is redundant: drop it
        if ($uri->port == $this->default_port) {
            $uri->port = null;
        }
        return true;
    }

}
6838
/**
 * Registry for retrieving specific URI scheme validator objects.
 */
class HTMLPurifier_URISchemeRegistry
{

    /**
     * Retrieve sole instance of the registry.
     * @param $prototype Optional prototype to overload sole instance with,
     *                   or bool true to reset to default registry.
     * @note Pass a registry object $prototype with a compatible interface and
     *       the function will copy it and return it all further times.
     */
    public static function instance($prototype = null) {
        static $instance = null;
        if ($prototype === true) {
            // Explicit reset. This must be tested before the generic
            // "prototype supplied" branch, otherwise the boolean itself
            // would be stored as the registry.
            $instance = new HTMLPurifier_URISchemeRegistry();
        } elseif ($prototype !== null) {
            $instance = $prototype;
        } elseif ($instance === null) {
            $instance = new HTMLPurifier_URISchemeRegistry();
        }
        return $instance;
    }

    /**
     * Cache of retrieved schemes.
     */
    protected $schemes = array();

    /**
     * Retrieves a scheme validator object
     * @param $scheme String scheme name like http or mailto
     * @param $config HTMLPurifier_Config object
     * @param $context HTMLPurifier_Context object
     * @return Scheme validator object, or null if the scheme is not allowed
     *         or no validator class exists for it
     */
    public function getScheme($scheme, $config, $context) {
        if (!$config) $config = HTMLPurifier_Config::createDefault();

        // important, otherwise attacker could include arbitrary file
        $allowed_schemes = $config->get('URI', 'AllowedSchemes');
        $allowed = isset($allowed_schemes[$scheme]);
        if (!$allowed && !$config->get('URI', 'OverrideAllowedSchemes')) {
            return null;
        }

        // With the override on, registered (cached) schemes are served even
        // when not whitelisted; anything not cached must still be allowed.
        if (isset($this->schemes[$scheme])) return $this->schemes[$scheme];
        if (!$allowed) return null;

        $class = 'HTMLPurifier_URIScheme_' . $scheme;
        if (!class_exists($class)) return null;
        $this->schemes[$scheme] = new $class();
        return $this->schemes[$scheme];
    }

    /**
     * Registers a custom scheme to the cache, bypassing reflection.
     * @param $scheme Scheme name
     * @param $scheme_obj HTMLPurifier_URIScheme object
     */
    public function register($scheme, $scheme_obj) {
        $this->schemes[$scheme] = $scheme_obj;
    }

}
6908
/**
 * Class for converting between different unit-lengths as specified by
 * CSS.
 */
class HTMLPurifier_UnitConverter
{

    const ENGLISH = 1;
    const METRIC = 2;
    const DIGITAL = 3;

    /**
     * Units information array. Units are grouped into measuring systems
     * (English, Metric), and are assigned an integer representing
     * the conversion factor between that unit and the smallest unit in
     * the system. Numeric indexes are actually magical constants that
     * encode conversion data from one system to the next, with a O(n^2)
     * constraint on memory (this is generally not a problem, since
     * the number of measuring systems is small.)
     */
    protected static $units = array(
        self::ENGLISH => array(
            'px' => 3, // This is as per CSS 2.1 and Firefox. Your mileage may vary
            'pt' => 4,
            'pc' => 48,
            'in' => 288,
            self::METRIC => array('pt', '0.352777778', 'mm'),
        ),
        self::METRIC => array(
            'mm' => 1,
            'cm' => 10,
            self::ENGLISH => array('mm', '2.83464567', 'pt'),
        ),
    );

    /**
     * Minimum bcmath precision for output.
     */
    protected $outputPrecision;

    /**
     * Bcmath precision for internal calculations.
     */
    protected $internalPrecision;

    /**
     * Whether or not BCMath is available
     */
    private $bcmath;

    public function __construct($output_precision = 4, $internal_precision = 10, $force_no_bcmath = false) {
        $this->outputPrecision = $output_precision;
        $this->internalPrecision = $internal_precision;
        $this->bcmath = !$force_no_bcmath && function_exists('bcmul');
    }

    /**
     * Converts a length object of one unit into another unit.
     * @param HTMLPurifier_Length $length
     *      Instance of HTMLPurifier_Length to convert. You must validate()
     *      it before passing it here!
     * @param string $to_unit
     *      Unit to convert to.
     * @return HTMLPurifier_Length on success, false if the length is
     *      invalid or either unit is unknown to the units table
     * @note
     *      About precision: This conversion function pays very special
     *      attention to the incoming precision of values and attempts
     *      to maintain a number of significant figure. Results are
     *      fairly accurate up to nine digits. Some caveats:
     *          - If a number is zero-padded as a result of this significant
     *            figure tracking, the zeroes will be eliminated.
     *          - If a number contains less than four sigfigs ($outputPrecision)
     *            and this causes some decimals to be excluded, those
     *            decimals will be added on.
     */
    public function convert($length, $to_unit) {

        if (!$length->isValid()) {
            return false;
        }

        $value = $length->getN();
        $unit = $length->getUnit();

        if ($value === '0' || $unit === false) {
            return new HTMLPurifier_Length('0', false);
        }

        // Find the measuring system of the source and destination units
        $system = false;
        $target_system = false;
        foreach (self::$units as $id => $table) {
            if (isset($table[$unit])) {
                $system = $id;
            }
            if (isset($table[$to_unit])) {
                $target_system = $id;
            }
        }
        if (!$system || !$target_system) {
            return false;
        }

        // Some calculations about the initial precision of the number;
        // this will be useful when we need to do final rounding.
        $sigfigs = $this->getSigFigs($value);
        if ($sigfigs < $this->outputPrecision) {
            $sigfigs = $this->outputPrecision;
        }

        // BCMath's internal precision deals only with decimals. Use
        // our default if the initial number has no decimals, or increase
        // it by how ever many decimals, thus, the number of guard digits
        // will always be greater than or equal to internalPrecision.
        $log = (int) floor(log(abs($value), 10));
        $precision = $this->internalPrecision;
        if ($log < 0) {
            $precision = $this->internalPrecision - $log;
        }

        for ($pass = 0; $pass < 2; $pass++) {

            $same_system = ($target_system === $system);

            // Unit IN THIS SYSTEM to convert to: the requested unit for a
            // simple conversion, otherwise the bridge unit used for the
            // pending system shift
            $step_unit = $same_system
                ? $to_unit
                : self::$units[$system][$target_system][0];

            // Do the conversion if necessary
            if ($step_unit !== $unit) {
                $factor = $this->div(
                    self::$units[$system][$unit],
                    self::$units[$system][$step_unit],
                    $precision
                );
                $value = $this->mul($value, $factor, $precision);
                $unit = $step_unit;
            }

            // Output was zero, so bail out early. Shouldn't ever happen.
            if ($value === '') {
                $value = '0';
                $unit = $to_unit;
                break;
            }

            // It was a simple conversion, so bail out
            if ($same_system) {
                break;
            }

            // Conversion failed! Apparently, the system we forwarded
            // to didn't have this unit. This should never happen!
            if ($pass !== 0) {
                return false;
            }

            // Pre-condition: $pass == 0

            // Perform conversion to next system of units, then go one more
            // loop around to convert the unit in the new system.
            $bridge = self::$units[$system][$target_system];
            $value = $this->mul($value, $bridge[1], $precision);
            $unit = $bridge[2];
            $system = $target_system;

        }

        // Post-condition: $unit == $to_unit
        if ($unit !== $to_unit) {
            return false;
        }

        $value = $this->round($value, $sigfigs);
        if (strpos($value, '.') !== false) {
            $value = rtrim($value, '0');
        }
        $value = rtrim($value, '.');

        return new HTMLPurifier_Length($value, $unit);
    }

    /**
     * Returns the number of significant figures in a string number.
     * @param string $n Decimal number
     * @return int number of sigfigs
     */
    public function getSigFigs($n) {
        $digits = ltrim($n, '0+-');
        $point = strpos($digits, '.'); // decimal position
        if ($point === false) {
            // No decimal point: trailing zeroes are not counted
            return strlen(rtrim($digits, '0'));
        }
        $count = strlen(ltrim($digits, '0.'));
        if ($point !== 0) {
            // The decimal point was not stripped, so it was counted above
            $count--;
        }
        return $count;
    }

    /**
     * Adds two numbers, using arbitrary precision when available.
     */
    private function add($s1, $s2, $scale) {
        return $this->bcmath
            ? bcadd($s1, $s2, $scale)
            : $this->scale($s1 + $s2, $scale);
    }

    /**
     * Multiplies two numbers, using arbitrary precision when available.
     */
    private function mul($s1, $s2, $scale) {
        return $this->bcmath
            ? bcmul($s1, $s2, $scale)
            : $this->scale($s1 * $s2, $scale);
    }

    /**
     * Divides two numbers, using arbitrary precision when available.
     */
    private function div($s1, $s2, $scale) {
        return $this->bcmath
            ? bcdiv($s1, $s2, $scale)
            : $this->scale($s1 / $s2, $scale);
    }

    /**
     * Rounds a number according to the number of sigfigs it should have,
     * using arbitrary precision when available.
     */
    private function round($n, $sigfigs) {
        $new_log = (int) floor(log(abs($n), 10)); // Number of digits left of decimal - 1
        $rp = $sigfigs - $new_log - 1; // Number of decimal places needed
        $neg = $n < 0 ? '-' : ''; // Negative sign

        if (!$this->bcmath) {
            return $this->scale(round($n, $sigfigs - $new_log - 1), $rp + 1);
        }

        if ($rp >= 0) {
            // Add half a unit in the last kept place, then truncate
            $n = bcadd($n, $neg . '0.' . str_repeat('0', $rp) . '5', $rp + 1);
            return bcdiv($n, '1', $rp);
        }

        // This algorithm partially depends on the standardized
        // form of numbers that comes out of bcmath.
        $n = bcadd($n, $neg . '5' . str_repeat('0', $new_log - $sigfigs), 0);
        return substr($n, 0, $sigfigs + strlen($neg)) . str_repeat('0', $new_log - $sigfigs + 1);
    }

    /**
     * Scales a float to $scale digits right of decimal point, like BCMath.
     */
    private function scale($r, $scale) {
        if ($scale >= 0) {
            return sprintf('%.' . $scale . 'f', (float) $r);
        }
        // The f sprintf type doesn't support negative numbers, so we
        // need to cludge things manually. First get the string.
        $r = sprintf('%.0f', (float) $r);
        // Due to floating point precision loss, $r will more than likely
        // look something like 4652999999999.9234. We grab one more digit
        // than we need to precise from $r and then use that to round
        // appropriately.
        $precise = (string) round(substr($r, 0, strlen($r) + $scale), -1);
        // Now we return it, truncating the zero that was rounded off.
        return substr($precise, 0, -1) . str_repeat('0', -$scale + 1);
    }

}
7161
/**
 * Parses string representations into their corresponding native PHP
 * variable type. The base implementation does a simple type-check.
 */
class HTMLPurifier_VarParser
{

    const STRING    = 1;
    const ISTRING   = 2;
    const TEXT      = 3;
    const ITEXT     = 4;
    const INT       = 5;
    const FLOAT     = 6;
    const BOOL      = 7;
    const LOOKUP    = 8;
    const ALIST     = 9;
    const HASH      = 10;
    const MIXED     = 11;

    /**
     * Lookup table of allowed types. Mainly for backwards compatibility, but
     * also convenient for transforming string type names to the integer constants.
     */
    static public $types = array(
        'string'    => self::STRING,
        'istring'   => self::ISTRING,
        'text'      => self::TEXT,
        'itext'     => self::ITEXT,
        'int'       => self::INT,
        'float'     => self::FLOAT,
        'bool'      => self::BOOL,
        'lookup'    => self::LOOKUP,
        'list'      => self::ALIST,
        'hash'      => self::HASH,
        'mixed'     => self::MIXED
    );

    /**
     * Lookup table of types that are string, and can have aliases or
     * allowed value lists.
     */
    static public $stringTypes = array(
        self::STRING    => true,
        self::ISTRING   => true,
        self::TEXT      => true,
        self::ITEXT     => true,
    );

    /**
     * Validate a variable according to type. Throws
     * HTMLPurifier_VarParserException if invalid.
     * It may return NULL as a valid type if $allow_null is true.
     *
     * @param $var Variable to validate
     * @param $type Type of variable, see HTMLPurifier_VarParser->types
     * @param $allow_null Whether or not to permit null as a value
     * @return Validated and type-coerced variable
     */
    final public function parse($var, $type, $allow_null = false) {
        // Translate a type name into its integer constant
        if (is_string($type)) {
            if (!isset(HTMLPurifier_VarParser::$types[$type])) {
                throw new HTMLPurifier_VarParserException("Invalid type '$type'");
            }
            $type = HTMLPurifier_VarParser::$types[$type];
        }

        $var = $this->parseImplementation($var, $type, $allow_null);
        if ($allow_null && $var === null) {
            return null;
        }

        // These are basic checks, to make sure nothing horribly wrong
        // happened in our implementations. Any case that breaks out of
        // the switch falls through to the generic error below.
        switch ($type) {
            case (self::STRING):
            case (self::ISTRING):
            case (self::TEXT):
            case (self::ITEXT):
                if (is_string($var)) {
                    if ($type == self::ISTRING || $type == self::ITEXT) {
                        $var = strtolower($var);
                    }
                    return $var;
                }
                break;
            case (self::INT):
                if (is_int($var)) {
                    return $var;
                }
                break;
            case (self::FLOAT):
                if (is_float($var)) {
                    return $var;
                }
                break;
            case (self::BOOL):
                if (is_bool($var)) {
                    return $var;
                }
                break;
            case (self::LOOKUP):
            case (self::ALIST):
            case (self::HASH):
                if (!is_array($var)) {
                    break;
                }
                if ($type === self::LOOKUP) {
                    foreach ($var as $flag) {
                        if ($flag !== true) {
                            $this->error('Lookup table contains value other than true');
                        }
                    }
                } elseif ($type === self::ALIST) {
                    // A list's keys must be exactly 0..n-1, in order
                    $keys = array_keys($var);
                    if (array_keys($keys) !== $keys) {
                        $this->error('Indices for list are not uniform');
                    }
                }
                return $var;
            case (self::MIXED):
                return $var;
            default:
                $this->errorInconsistent(get_class($this), $type);
        }
        $this->errorGeneric($var, $type);
    }

    /**
     * Actually implements the parsing. Base implementation is to not
     * do anything to $var. Subclasses should overload this!
     */
    protected function parseImplementation($var, $type, $allow_null) {
        return $var;
    }

    /**
     * Throws an exception.
     */
    protected function error($msg) {
        throw new HTMLPurifier_VarParserException($msg);
    }

    /**
     * Throws an inconsistency exception.
     * @note This should not ever be called. It would be called if we
     *       extend the allowed values of HTMLPurifier_VarParser without
     *       updating subclasses.
     */
    protected function errorInconsistent($class, $type) {
        throw new HTMLPurifier_Exception("Inconsistency in $class: ".HTMLPurifier_VarParser::getTypeName($type)." not implemented");
    }

    /**
     * Generic error for if a type didn't work.
     */
    protected function errorGeneric($var, $type) {
        $vtype = gettype($var);
        $this->error("Expected type ".HTMLPurifier_VarParser::getTypeName($type).", got $vtype");
    }

    /**
     * Maps a type constant back to its string name.
     * @param $type Integer type constant
     * @return string Name from $types, or 'unknown' if there is none
     */
    static public function getTypeName($type) {
        static $lookup;
        if (!$lookup) {
            // Lazy load the alternative lookup table
            $lookup = array_flip(HTMLPurifier_VarParser::$types);
        }
        return isset($lookup[$type]) ? $lookup[$type] : 'unknown';
    }

}
7314
/**
 * Exception type thrown by HTMLPurifier_VarParser when a value fails
 * validation.
 */
class HTMLPurifier_VarParserException extends HTMLPurifier_Exception
{

}
7324
/**
 * Validates the HTML attribute style, otherwise known as CSS.
 * @note We don't implement the whole CSS specification, so it might be
 *       difficult to reuse this component in the context of validating
 *       actual stylesheet declarations.
 * @note If we were really serious about validating the CSS, we would
 *       tokenize the styles and then parse the tokens. Obviously, we
 *       are not doing that. Doing that could seriously harm performance,
 *       but would make these components a lot more viable for a CSS
 *       filtering solution.
 */
class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
{

    /**
     * Validates a style attribute declaration by declaration.
     * @return Rebuilt "prop:value;" string of the accepted declarations,
     *         or false if none survived
     */
    public function validate($css, $config, $context) {

        $css = $this->parseCDATA($css);

        $definition = $config->getCSSDefinition();

        // we're going to break the spec and explode by semicolons.
        // This is because semicolon rarely appears in escaped form
        // Doing this is generally flaky but fast
        // IT MIGHT APPEAR IN URIs, see HTMLPurifier_AttrDef_CSSURI
        // for details

        $accepted = array();

        /**
         * Name of the current CSS property being validated. This exact
         * variable is what gets registered in the context, so it must be
         * the one assigned inside the loop below.
         */
        $property = false;
        $context->register('CurrentCSSProperty', $property);

        foreach (explode(';', $css) as $declaration) {
            // skip empty chunks and anything without a name before ':'
            if (!$declaration || !strpos($declaration, ':')) {
                continue;
            }
            list($property, $value) = explode(':', $declaration, 2);
            $property = trim($property);
            $value    = trim($value);

            // Look the property up as written; retry lowercased only if
            // it is not already all-lowercase
            $known = isset($definition->info[$property]);
            if (!$known && !ctype_lower($property)) {
                $property = strtolower($property);
                $known = isset($definition->info[$property]);
            }
            if (!$known) {
                continue;
            }

            // inefficient call, since the validator will do this again
            if (strtolower(trim($value)) === 'inherit') {
                // inherit works for everything (but only on the base property)
                $result = 'inherit';
            } else {
                $result = $definition->info[$property]->validate(
                    $value, $config, $context
                );
            }
            if ($result === false) {
                continue;
            }
            $accepted[$property] = $result;
        }

        $context->destroy('CurrentCSSProperty');

        // procedure does not write the new CSS simultaneously, so it's
        // slightly inefficient, but it's the only way of getting rid of
        // duplicates. Perhaps config to optimize it, but not now.

        $new_declarations = '';
        foreach ($accepted as $prop => $value) {
            $new_declarations .= "$prop:$value;";
        }

        return $new_declarations ? $new_declarations : false;

    }

}
7410
// Enum = Enumerated
/**
 * Validates a keyword against a list of valid values.
 * @warning The case-insensitive compare of this function uses PHP's
 *          built-in strtolower and ctype_lower functions, which may
 *          cause problems with international comparisons
 */
class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
{

    /**
     * Lookup table of valid values.
     * @todo Make protected
     */
    public $valid_values = array();

    /**
     * Bool indicating whether or not enumeration is case sensitive.
     * @note In general this is always case insensitive.
     */
    protected $case_sensitive = false; // values according to W3C spec

    /**
     * @param $valid_values List of valid values
     * @param $case_sensitive Bool indicating whether or not case sensitive
     */
    public function __construct(
        $valid_values = array(), $case_sensitive = false
    ) {
        // flipped so that membership is a single isset() on the key
        $this->valid_values = array_flip($valid_values);
        $this->case_sensitive = $case_sensitive;
    }

    /**
     * @return The trimmed (and, when case-insensitive, lowercased) keyword
     *         if it is in the list, false otherwise
     */
    public function validate($string, $config, $context) {
        $keyword = trim($string);
        if (!$this->case_sensitive && !ctype_lower($keyword)) {
            // we may want to do full case-insensitive libraries
            $keyword = strtolower($keyword);
        }
        if (isset($this->valid_values[$keyword])) {
            return $keyword;
        }
        return false;
    }

    /**
     * @param $string In form of comma-delimited list of case-insensitive
     *      valid values. Example: "foo,bar,baz". Prepend "s:" to make
     *      case sensitive
     */
    public function make($string) {
        $sensitive = strlen($string) > 2 && substr($string, 0, 2) === 's:';
        if ($sensitive) {
            $string = substr($string, 2);
        }
        return new HTMLPurifier_AttrDef_Enum(explode(',', $string), $sensitive);
    }

}
7476
/**
 * Validates an integer.
 * @note While this class was modeled off the CSS definition, no currently
 *       allowed CSS uses this type. The properties that do are: widows,
 *       orphans, z-index, counter-increment, counter-reset. Some of the
 *       HTML attributes, however, find use for a non-negative version of this.
 */
class HTMLPurifier_AttrDef_Integer extends HTMLPurifier_AttrDef
{

    /**
     * Bool indicating whether or not negative values are allowed
     */
    protected $negative = true;

    /**
     * Bool indicating whether or not zero is allowed
     */
    protected $zero = true;

    /**
     * Bool indicating whether or not positive values are allowed
     */
    protected $positive = true;

    /**
     * @param $negative Bool indicating whether or not negative values are allowed
     * @param $zero Bool indicating whether or not zero is allowed
     * @param $positive Bool indicating whether or not positive values are allowed
     */
    public function __construct(
        $negative = true, $zero = true, $positive = true
    ) {
        $this->negative = $negative;
        $this->zero     = $zero;
        $this->positive = $positive;
    }

    /**
     * @return The integer as a string (a leading plus removed, "-0"
     *         reduced to "0"), or false if it is not an allowed integer
     */
    public function validate($integer, $config, $context) {

        $integer = $this->parseCDATA($integer);
        if ($integer === '') {
            return false;
        }

        // we could possibly simply typecast it to integer, but there are
        // certain fringe cases that must not return an integer.

        // clip leading sign
        $lead = $integer[0];
        if ($this->negative && $lead === '-') {
            $digits = substr($integer, 1);
            if ($digits === '0') {
                $integer = '0'; // rm minus sign for zero
            }
        } elseif ($this->positive && $lead === '+') {
            $integer = substr($integer, 1); // rm unnecessary plus
            $digits = $integer;
        } else {
            $digits = $integer;
        }

        // test if it's numeric
        if (!ctype_digit($digits)) {
            return false;
        }

        // perform scope tests
        if ($integer == 0 && !$this->zero) {
            return false;
        }
        if ($integer > 0 && !$this->positive) {
            return false;
        }
        if ($integer < 0 && !$this->negative) {
            return false;
        }

        return $integer;

    }

}
7549
/**
 * Validates the HTML attribute lang, effectively a language code.
 * @note Built according to RFC 3066, which obsoleted RFC 1766
 */
class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
{

    /**
     * @return Lowercased language code, cut off before the first malformed
     *         subtag, or false if the primary subtag itself is unacceptable
     */
    public function validate($string, $config, $context) {

        $string = trim($string);
        if (!$string) {
            return false;
        }

        $subtags = explode('-', $string);
        $num_subtags = count($subtags);

        if ($num_subtags == 0) {
            return false; // sanity check
        }

        // process primary subtag: a lone "x" or "i", or 2-3 letters
        $primary = $subtags[0];
        $length = strlen($primary);
        if ($length == 1) {
            if (!in_array($primary, array('x', 'i'), true)) {
                return false;
            }
        } elseif ($length == 2 || $length == 3) {
            if (!ctype_alpha($primary)) {
                return false;
            }
            if (!ctype_lower($primary)) {
                $primary = strtolower($primary);
            }
        } else {
            // empty, or longer than three characters
            return false;
        }

        $new_string = $primary;
        if ($num_subtags == 1) {
            return $new_string;
        }

        // process second subtag: 2-8 alphanumerics, or the single letter "x"
        $second = $subtags[1];
        $length = strlen($second);
        $malformed = $length == 0
            || ($length == 1 && $second != 'x')
            || $length > 8
            || !ctype_alnum($second);
        if ($malformed) {
            return $new_string;
        }
        if (!ctype_lower($second)) {
            $second = strtolower($second);
        }

        $new_string .= '-' . $second;
        if ($num_subtags == 2) {
            return $new_string;
        }

        // process all other subtags, index 2 and up
        foreach (array_slice($subtags, 2) as $subtag) {
            $length = strlen($subtag);
            if ($length == 0 || $length > 8 || !ctype_alnum($subtag)) {
                return $new_string;
            }
            if (!ctype_lower($subtag)) {
                $subtag = strtolower($subtag);
            }
            $new_string .= '-' . $subtag;
        }

        return $new_string;

    }

}
7622
/**
 * Decorator that, depending on a token, switches between two definitions.
 */
class HTMLPurifier_AttrDef_Switch
{

    protected $tag;
    protected $withTag, $withoutTag;

    /**
     * @param string $tag Tag name to switch upon
     * @param HTMLPurifier_AttrDef $with_tag Call if token matches tag
     * @param HTMLPurifier_AttrDef $without_tag Call if token doesn't match, or there is no token
     */
    public function __construct($tag, $with_tag, $without_tag) {
        $this->tag        = $tag;
        $this->withTag    = $with_tag;
        $this->withoutTag = $without_tag;
    }

    /**
     * Delegates to one of the two wrapped definitions, chosen by the name
     * of the 'CurrentToken' entry fetched from the context.
     */
    public function validate($string, $config, $context) {
        $token = $context->get('CurrentToken', true);
        $matches = $token && $token->name === $this->tag;
        $delegate = $matches ? $this->withTag : $this->withoutTag;
        return $delegate->validate($string, $config, $context);
    }

}
7655
/**
 * Validates arbitrary text according to the HTML spec.
 */
class HTMLPurifier_AttrDef_Text extends HTMLPurifier_AttrDef
{

    /**
     * @return The input after parseCDATA() processing; nothing else is
     *         checked here
     */
    public function validate($string, $config, $context) {
        $text = $this->parseCDATA($string);
        return $text;
    }

}
7670
/**
 * Validates a URI as defined by RFC 3986.
 * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
 */
class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
{

    protected $parser;
    protected $embedsResource;

    /**
     * @param $embeds_resource Does the URI here result in an extra HTTP request?
     */
    public function __construct($embeds_resource = false) {
        $this->parser = new HTMLPurifier_URIParser();
        $this->embedsResource = (bool) $embeds_resource;
    }

    /**
     * @param $string Cast to bool and used as the embeds-resource flag
     */
    public function make($string) {
        return new HTMLPurifier_AttrDef_URI((bool) $string);
    }

    /**
     * @return Validated URI string, or false if URIs are disabled, the
     *         URI cannot be parsed, or any validation stage rejects it
     */
    public function validate($uri, $config, $context) {

        if ($config->get('URI', 'Disable')) {
            return false;
        }

        // parse the URI
        $uri = $this->parser->parse($this->parseCDATA($uri));
        if ($uri === false) {
            return false;
        }

        // add embedded flag to context for validators
        $context->register('EmbeddedURI', $this->embedsResource);

        // Each stage runs only if the previous one passed; the context
        // entry is destroyed afterwards whatever the outcome.
        $ok = false;
        // generic validation
        if ($uri->validate($config, $context)) {
            // chained filtering
            $uri_def = $config->getDefinition('URI');
            if ($uri_def->filter($uri, $config, $context)) {
                // scheme-specific validation, then post chained filtering
                $scheme_obj = $uri->getSchemeObj($config, $context);
                if ($scheme_obj
                    && !($this->embedsResource && !$scheme_obj->browsable)
                    && $scheme_obj->validate($uri, $config, $context)
                    && $uri_def->postFilter($uri, $config, $context)
                ) {
                    // survived gauntlet
                    $ok = true;
                }
            }
        }

        $context->destroy('EmbeddedURI');
        if (!$ok) {
            return false;
        }

        // back to string
        return $uri->toString();

    }

}
7748
/**
 * Validates a number as defined by the CSS spec.
 */
class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
{

    /**
     * Bool indicating whether or not only positive values allowed.
     */
    protected $non_negative = false;

    /**
     * @param $non_negative Bool indicating whether negatives are forbidden
     */
    public function __construct($non_negative = false) {
        $this->non_negative = $non_negative;
    }

    /**
     * @warning Some contexts do not pass $config, $context. These
     *          variables should not be used without checking HTMLPurifier_Length
     * @return Normalized number string (plus sign and redundant zeroes
     *         removed), or false if the input is not a number
     */
    public function validate($number, $config, $context) {

        $number = $this->parseCDATA($number);

        if ($number === '') {
            return false;
        }
        if ($number === '0') {
            return '0';
        }

        // Peel off a leading sign; only a minus is kept for the output
        $sign = '';
        $lead = $number[0];
        if ($lead === '-') {
            if ($this->non_negative) {
                return false;
            }
            $sign = '-';
            $number = substr($number, 1);
        } elseif ($lead === '+') {
            $number = substr($number, 1);
        }

        // Plain integer
        if (ctype_digit($number)) {
            $number = ltrim($number, '0');
            return $number ? $sign . $number : '0';
        }

        // Period is the only non-numeric character allowed
        if (strpos($number, '.') === false) {
            return false;
        }

        list($left, $right) = explode('.', $number, 2);

        if ($left === '' && $right === '') {
            return false;
        }
        if ($left !== '' && !ctype_digit($left)) {
            return false;
        }

        $left  = ltrim($left, '0');
        $right = rtrim($right, '0');

        if ($right === '') {
            // nothing significant after the point: integer form
            return $left ? $sign . $left : '0';
        }
        if (!ctype_digit($right)) {
            return false;
        }

        return $sign . $left . '.' . $right;

    }

}
7816
/**
 * Validates a CSS number and clamps it into the range 0 to 1.
 */
class HTMLPurifier_AttrDef_CSS_AlphaValue extends HTMLPurifier_AttrDef_CSS_Number
{

    public function __construct() {
        parent::__construct(false); // opacity is non-negative, but we will clamp it
    }

    /**
     * @return '0' or '1' when the number lies outside the range, the
     *         parent's normalized number otherwise, or false if invalid
     */
    public function validate($number, $config, $context) {
        $result = parent::validate($number, $config, $context);
        if ($result === false) {
            return $result;
        }
        $float = (float) $result;
        if ($float < 0.0) {
            return '0';
        }
        if ($float > 1.0) {
            return '1';
        }
        return $result;
    }

}
7837
/**
 * Validates shorthand CSS property background.
 * @warning Does not support url tokens that have internal spaces.
 */
class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
{

    /**
     * Local copy of component validators.
     * @note See HTMLPurifier_AttrDef_Font::$info for a similar impl.
     */
    protected $info;

    public function __construct($config) {
        $def = $config->getCSSDefinition();
        $this->info['background-color'] = $def->info['background-color'];
        $this->info['background-image'] = $def->info['background-image'];
        $this->info['background-repeat'] = $def->info['background-repeat'];
        $this->info['background-attachment'] = $def->info['background-attachment'];
        $this->info['background-position'] = $def->info['background-position'];
    }

    /**
     * @return Space-joined validated components in the order color, image,
     *         repeat, attachment, position; false if nothing was accepted
     */
    public function validate($string, $config, $context) {

        // regular pre-processing
        $string = $this->parseCDATA($string);
        if ($string === '') {
            return false;
        }

        // munge rgb() decl if necessary
        $string = $this->mungeRgb($string);

        // assumes URI doesn't have spaces in it
        $bits = explode(' ', strtolower($string)); // bits to process

        // Slots in the order they are tried; position must stay last
        // because it accepts any bit the others turned down.
        $caught = array(
            'color'      => false,
            'image'      => false,
            'repeat'     => false,
            'attachment' => false,
            'position'   => false,
        );

        $i = 0; // number of catches

        foreach ($bits as $bit) {
            if ($bit === '') {
                continue;
            }
            foreach ($caught as $key => $status) {
                if ($key == 'position') {
                    // collected raw here, validated as a whole further down
                    if ($caught['position'] === false) {
                        $caught['position'] = '';
                    }
                    $caught['position'] .= $bit . ' ';
                } else {
                    if ($status !== false) {
                        continue; // slot already filled
                    }
                    $r = $this->info['background-' . $key]->validate($bit, $config, $context);
                    if ($r === false) {
                        continue;
                    }
                    $caught[$key] = $r;
                }
                $i++;
                break;
            }
        }

        if (!$i) {
            return false;
        }
        if ($caught['position'] !== false) {
            $caught['position'] = $this->info['background-position']->
                validate($caught['position'], $config, $context);
        }

        $ret = array();
        foreach ($caught as $value) {
            if ($value !== false) {
                $ret[] = $value;
            }
        }

        if (empty($ret)) {
            return false;
        }
        return implode(' ', $ret);

    }

}
7924
[ // adjective and number must be in correct order, even if
7925
// you could switch them without introducing ambiguity.
7926
// some browsers support that syntax
7928
<percentage> | <length> | left | center | right
7931
<percentage> | <length> | top | center | bottom
7934
[ // this signifies that the vertical and horizontal adjectives
7935
// can be arbitrarily ordered, however, there can only be two,
7936
// one of each, or none at all
7938
left | center | right
7941
top | center | bottom
7945
center, (none) = 50%
7946
bottom, right = 100%
7950
keyword + length/percentage must be ordered correctly, as per W3C
7952
Internet Explorer and Opera, however, support arbitrary ordering. We
7955
Minor issue though, not strictly necessary.
7958
// control freaks may appreciate the ability to convert these to
7959
// percentages or something, but it's not necessary
7962
/**
 * Validates the value of background-position.
 */
class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef
{

    protected $length;
    protected $percentage;

    public function __construct() {
        $this->length     = new HTMLPurifier_AttrDef_CSS_Length();
        $this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage();
    }

    /**
     * @return Up to two space-separated components, the first chosen from
     *         left/right, a measure or center and the second from
     *         top/bottom, a measure or center; false if nothing was caught
     */
    public function validate($string, $config, $context) {
        $string = $this->parseCDATA($string);

        $keywords = array(
            'h' => false, // left, right
            'v' => false, // top, bottom
            'c' => false, // center
        );
        $measures = array();

        $i = 0; // number of values caught

        $lookup = array(
            'top' => 'v',
            'bottom' => 'v',
            'left' => 'h',
            'right' => 'h',
            'center' => 'c'
        );

        foreach (explode(' ', $string) as $bit) {
            if ($bit === '') {
                continue;
            }

            // test for keyword
            $lbit = ctype_lower($bit) ? $bit : strtolower($bit);
            if (isset($lookup[$lbit])) {
                $keywords[$lookup[$lbit]] = $lbit;
                $i++;
            }

            // test for length
            $as_length = $this->length->validate($bit, $config, $context);
            if ($as_length !== false) {
                $measures[] = $as_length;
                $i++;
            }

            // test for percentage
            $as_percentage = $this->percentage->validate($bit, $config, $context);
            if ($as_percentage !== false) {
                $measures[] = $as_percentage;
                $i++;
            }

        }

        if (!$i) {
            return false; // no valid values were caught
        }

        $ret = array();

        // first keyword
        if ($keywords['h']) {
            $ret[] = $keywords['h'];
        } elseif (count($measures)) {
            $ret[] = array_shift($measures);
        } elseif ($keywords['c']) {
            $ret[] = $keywords['c'];
            $keywords['c'] = false; // prevent re-use: center = center center
        }

        // second keyword
        if ($keywords['v']) {
            $ret[] = $keywords['v'];
        } elseif (count($measures)) {
            $ret[] = array_shift($measures);
        } elseif ($keywords['c']) {
            $ret[] = $keywords['c'];
        }

        if (empty($ret)) {
            return false;
        }
        return implode(' ', $ret);

    }

}
8050
* Validates the border property as defined by CSS.
8052
// Validator for the CSS border shorthand. Each space-separated token is
// offered to the border-width, border-style and border-top-color validators.
class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
8056
* Local copy of properties this property is shorthand for.
8058
protected $info = array();
8060
// Copies the three component validators out of the CSS definition
// obtained from $config.
public function __construct($config) {
8061
$def = $config->getCSSDefinition();
8062
$this->info['border-width'] = $def->info['border-width'];
8063
$this->info['border-style'] = $def->info['border-style'];
8064
$this->info['border-top-color'] = $def->info['border-top-color'];
8067
// Validates a border shorthand string; the return statement is not shown
// in this listing.
public function validate($string, $config, $context) {
8068
$string = $this->parseCDATA($string);
8069
$string = $this->mungeRgb($string);
8070
$bits = explode(' ', $string);
8071
$done = array(); // segments we've finished
8072
$ret = ''; // return value
8073
foreach ($bits as $bit) {
8074
foreach ($this->info as $propname => $validator) {
8075
// each component property may be matched at most once
if (isset($done[$propname])) continue;
8076
$r = $validator->validate($bit, $config, $context);
8079
$done[$propname] = true;
8093
* Validates Color as defined by CSS.
8095
// Validator for CSS colors: named keywords, rgb() literals and hexadecimal
// values. The listing is sparse, so the final return is not shown.
class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
8098
public function validate($color, $config, $context) {
8100
// keyword table is fetched from the config once and cached statically
static $colors = null;
8101
if ($colors === null) $colors = $config->get('Core', 'ColorKeywords');
8103
$color = trim($color);
8104
if ($color === '') return false;
8106
// keywords are looked up case-insensitively
$lower = strtolower($color);
8107
if (isset($colors[$lower])) return $colors[$lower];
8109
if (strpos($color, 'rgb(') !== false) {
8110
// rgb literal handling
8111
$length = strlen($color);
8112
// the first ')' must be the final character
if (strpos($color, ')') !== $length - 1) return false;
8113
$triad = substr($color, 4, $length - 4 - 1);
8114
$parts = explode(',', $triad);
8115
if (count($parts) !== 3) return false;
8116
$type = false; // to ensure that they're all the same type
8117
$new_parts = array();
8118
foreach ($parts as $part) {
8119
$part = trim($part);
8120
if ($part === '') return false;
8121
$length = strlen($part);
8122
if ($part[$length - 1] === '%') {
8125
$type = 'percentage';
8126
} elseif ($type !== 'percentage') {
8129
// percentage components are clamped to the range 0..100
$num = (float) substr($part, 0, $length - 1);
8130
if ($num < 0) $num = 0;
8131
if ($num > 100) $num = 100;
8132
$new_parts[] = "$num%";
8137
} elseif ($type !== 'integer') {
8141
// integer components are clamped to the range 0..255
if ($num < 0) $num = 0;
8142
if ($num > 255) $num = 255;
8143
$new_parts[] = (string) $num;
8146
$new_triad = implode(',', $new_parts);
8147
$color = "rgb($new_triad)";
8149
// hexadecimal handling
8150
if ($color[0] === '#') {
8151
$hex = substr($color, 1);
8154
// presumably the branch for input without a leading '#' (the branch
// header is not shown in this listing)
$color = '#' . $color;
8156
// only 3- or 6-digit hexadecimal values are accepted
$length = strlen($hex);
8157
if ($length !== 3 && $length !== 6) return false;
8158
if (!ctype_xdigit($hex)) return false;
8171
* Allows multiple validators to attempt to validate attribute.
8173
* Composite is just what it sounds like: a composite of many validators.
8174
* This means that multiple HTMLPurifier_AttrDef objects will have a whack
8175
* at the string. If one of them passes, that's what is returned. This is
8176
* especially useful for CSS values, which often are a choice between
8177
* an enumerated set of predefined values or a flexible data type.
8179
// Tries a list of validators in order and returns the first result that is
// not false.
class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef
8183
* List of HTMLPurifier_AttrDef objects that may process strings
8184
* @todo Make protected
8189
* @param $defs List of HTMLPurifier_AttrDef objects
8191
public function __construct($defs) {
8192
$this->defs = $defs;
8195
// Returns the first non-false validation result; the fall-through return
// is not shown in this listing.
public function validate($string, $config, $context) {
8196
foreach ($this->defs as $i => $def) {
8197
$result = $this->defs[$i]->validate($string, $config, $context);
8198
if ($result !== false) return $result;
8209
* Decorator which enables CSS properties to be disabled for specific elements.
8211
// Wraps another validator and rejects every value when the token currently
// being processed is the denied element.
class HTMLPurifier_AttrDef_CSS_DenyElementDecorator extends HTMLPurifier_AttrDef
8213
protected $def, $element;
8216
* @param $def Definition to wrap
8217
* @param $element Element to deny
8219
public function __construct($def, $element) {
8221
$this->element = $element;
8224
* Checks if CurrentToken is set and equal to $this->element
8226
// Returns false for the denied element, otherwise the wrapped validator's
// result.
public function validate($string, $config, $context) {
8227
$token = $context->get('CurrentToken', true);
8228
if ($token && $token->name == $this->element) return false;
8229
return $this->def->validate($string, $config, $context);
8236
* Microsoft's proprietary filter: CSS property
8237
* @note Currently supports the alpha filter. In the future, this will
8238
* probably need an extensible framework
8240
// Validator for Microsoft's proprietary `filter` property. Only the alpha
// filter is recognised, and of its parameters only `opacity` is kept
// (clamped to 0..100).
// NOTE(review): this listing is sparse (bare numbers are original line
// numbers and some lines are not shown); comments cover visible code only.
class HTMLPurifier_AttrDef_CSS_Filter extends HTMLPurifier_AttrDef
8243
protected $intValidator;
8245
// Creates the integer validator used for the opacity parameter.
public function __construct() {
8246
$this->intValidator = new HTMLPurifier_AttrDef_Integer();
8249
// Returns 'none' unchanged, or the rebuilt "function(params)" string.
public function validate($value, $config, $context) {
8250
$value = $this->parseCDATA($value);
8251
if ($value === 'none') return $value;
8252
// if we looped this we could support multiple filters
8253
$function_length = strcspn($value, '(');
8254
$function = trim(substr($value, 0, $function_length));
8255
if ($function !== 'alpha' &&
8256
$function !== 'Alpha' &&
8257
$function !== 'progid:DXImageTransform.Microsoft.Alpha'
8259
// parameters are the text between '(' and the next ')'
$cursor = $function_length + 1;
8260
$parameters_length = strcspn($value, ')', $cursor);
8261
$parameters = substr($value, $cursor, $parameters_length);
8262
$params = explode(',', $parameters);
8263
$ret_params = array();
8265
foreach ($params as $param) {
8266
// A parameter without '=' (e.g. the empty one produced by "alpha()")
// has no value; skip it instead of reading a missing explode() index.
if (strpos($param, '=') === false) continue;
list($key, $value) = explode('=', $param);
8268
$value = trim($value);
8269
// each key is emitted at most once
if (isset($lookup[$key])) continue;
8270
if ($key !== 'opacity') continue;
8271
$value = $this->intValidator->validate($value, $config, $context);
8272
if ($value === false) continue;
8273
// clamp opacity to 0..100
$int = (int) $value;
8274
if ($int > 100) $value = '100';
8275
if ($int < 0) $value = '0';
8276
$ret_params[] = "$key=$value";
8277
$lookup[$key] = true;
8279
$ret_parameters = implode(',', $ret_params);
8280
$ret_function = "$function($ret_parameters)";
8281
return $ret_function;
8289
* Validates shorthand CSS property font.
8291
// Validator for the CSS font shorthand. System-font keywords are accepted
// as-is; otherwise tokens are processed in stages: style/variant/weight,
// then font-size with optional line-height, then font-family.
// NOTE(review): this listing is sparse (bare numbers are original line
// numbers and some lines are not shown); comments cover visible code only.
class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef
8295
* Local copy of component validators.
8297
* @note If we moved specific CSS property definitions to their own
8298
* classes instead of having them be assembled at run time by
8299
* CSSDefinition, this wouldn't be necessary. We'd instantiate
8302
protected $info = array();
8304
// Copies the six component validators out of the CSS definition obtained
// from $config.
public function __construct($config) {
8305
$def = $config->getCSSDefinition();
8306
$this->info['font-style'] = $def->info['font-style'];
8307
$this->info['font-variant'] = $def->info['font-variant'];
8308
$this->info['font-weight'] = $def->info['font-weight'];
8309
$this->info['font-size'] = $def->info['font-size'];
8310
$this->info['line-height'] = $def->info['line-height'];
8311
$this->info['font-family'] = $def->info['font-family'];
8314
// Returns the lower-cased system-font keyword, the rebuilt shorthand
// (right-trimmed), or false for an empty string.
public function validate($string, $config, $context) {
8316
static $system_fonts = array(
8320
'message-box' => true,
8321
'small-caption' => true,
8322
'status-bar' => true
8325
// regular pre-processing
8326
$string = $this->parseCDATA($string);
8327
if ($string === '') return false;
8329
// check if it's one of the keywords
8330
$lowercase_string = strtolower($string);
8331
if (isset($system_fonts[$lowercase_string])) {
8332
return $lowercase_string;
8335
$bits = explode(' ', $string); // bits to process
8336
$stage = 0; // this indicates what we're looking for
8337
$caught = array(); // which stage 0 properties have we caught?
8338
$stage_1 = array('font-style', 'font-variant', 'font-weight');
8339
$final = ''; // output
8341
for ($i = 0, $size = count($bits); $i < $size; $i++) {
8342
if ($bits[$i] === '') continue;
8345
// attempting to catch font-style, font-variant or font-weight
8347
foreach ($stage_1 as $validator_name) {
8348
// each of the three properties may be matched at most once
if (isset($caught[$validator_name])) continue;
8349
$r = $this->info[$validator_name]->validate(
8350
$bits[$i], $config, $context);
8353
$caught[$validator_name] = true;
8357
// all three caught, continue on
8358
if (count($caught) >= 3) $stage = 1;
8359
if ($r !== false) break;
8361
// attempting to catch font-size and perhaps line-height
8363
$found_slash = false;
8364
// "size/height" may arrive as a single token
if (strpos($bits[$i], '/') !== false) {
8365
list($font_size, $line_height) =
8366
explode('/', $bits[$i]);
8367
if ($line_height === '') {
8368
// ooh, there's a space after the slash!
8369
$line_height = false;
8370
$found_slash = true;
8373
$font_size = $bits[$i];
8374
$line_height = false;
8376
$r = $this->info['font-size']->validate(
8377
$font_size, $config, $context);
8380
// attempt to catch line-height
8381
if ($line_height === false) {
8382
// we need to scroll forward
8383
for ($j = $i + 1; $j < $size; $j++) {
8384
if ($bits[$j] === '') continue;
8385
if ($bits[$j] === '/') {
8389
$found_slash = true;
8393
$line_height = $bits[$j];
8397
// slash already found
8398
$found_slash = true;
8403
$r = $this->info['line-height']->validate(
8404
$line_height, $config, $context);
8415
// attempting to catch font-family
8418
// all remaining tokens are re-joined; the assignment target of this
// expression is on a line not shown in this listing
implode(' ', array_slice($bits, $i, $size - $i));
8419
$r = $this->info['font-family']->validate(
8420
$font_family, $config, $context);
8423
// processing completed successfully
8424
return rtrim($final);
8438
* Validates a font family list according to CSS spec
8439
* @todo whitelisting allowed fonts would be nice
8441
// Validator for a comma-separated CSS font-family list. Generic names and
// purely alphanumeric names pass through unquoted; other names are
// unescaped, re-escaped and wrapped in single quotes.
// NOTE(review): this listing is sparse (bare numbers are original line
// numbers and some lines are not shown); comments cover visible code only.
class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
8444
public function validate($string, $config, $context) {
8445
static $generic_names = array(
8447
'sans-serif' => true,
8448
'monospace' => true,
8453
// assume that no font names contain commas in them
8454
$fonts = explode(',', $string);
8456
foreach($fonts as $font) {
8457
$font = trim($font);
8458
if ($font === '') continue;
8459
// match a generic name
8460
if (isset($generic_names[$font])) {
8461
$final .= $font . ', ';
8464
// match a quoted name
8465
if ($font[0] === '"' || $font[0] === "'") {
8466
$length = strlen($font);
8467
// a quoted name needs at least one character between the quotes
if ($length <= 2) continue;
8469
// closing quote must match the opening one
if ($font[$length - 1] !== $quote) continue;
8470
$font = substr($font, 1, $length - 2);
8473
// walk the name, resolving backslash escapes
for ($i = 0, $c = strlen($font); $i < $c; $i++) {
8474
if ($font[$i] === '\\') {
8480
if (ctype_xdigit($font[$i])) {
8482
// consume further hex digits of the escape; $a counts from 1 and stops at 6
for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
8483
if (!ctype_xdigit($font[$i])) break;
8486
// We have to be extremely careful when adding
8487
// new characters, to make sure we're not breaking
8489
$char = HTMLPurifier_Encoder::unichr(hexdec($code));
8490
// skip the character when cleanUTF8() reduces it to an empty string
if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
8492
// step back unless the character after the escape is whitespace
if ($i < $c && trim($font[$i]) !== '') $i--;
8495
if ($font[$i] === "\n") continue;
8497
$new_font .= $font[$i];
8502
// $font is a pure representation of the font name
8504
if (ctype_alnum($font) && $font !== '') {
8505
// very simple font, allow it in unharmed
8506
$final .= $font . ', ';
8510
// complicated font, requires quoting
8512
// armor single quotes and new lines
8513
$font = str_replace("\\", "\\\\", $font);
8514
$font = str_replace("'", "\\'", $font);
8515
$final .= "'$font', ";
8517
// strip the trailing separator; an empty list is invalid
$final = rtrim($final, ', ');
8518
if ($final === '') return false;
8528
* Decorator which enables !important to be used in CSS values.
8530
// Wraps another validator so that a trailing "!important" is stripped before
// validation and, when allowed, re-appended to a successfully validated value.
// NOTE(review): this listing is sparse (bare numbers are original line
// numbers and some lines are not shown); comments cover visible code only.
class HTMLPurifier_AttrDef_CSS_ImportantDecorator extends HTMLPurifier_AttrDef
8532
protected $def, $allow;
8535
* @param $def Definition to wrap
8536
* @param $allow Whether or not to allow !important
8538
public function __construct($def, $allow = false) {
8540
$this->allow = $allow;
8543
* Intercepts and removes !important if necessary
8545
public function validate($string, $config, $context) {
8546
// test for ! and important tokens
8547
$string = trim($string);
8548
$is_important = false;
8549
// :TODO: optimization: test directly for !important and ! important
8550
if (strlen($string) >= 9 && substr($string, -9) === 'important') {
8551
$temp = rtrim(substr($string, 0, -9));
8552
// use a temp, because we might want to restore important
8553
if (strlen($temp) >= 1 && substr($temp, -1) === '!') {
8554
$string = rtrim(substr($temp, 0, -1));
8555
$is_important = true;
8558
$string = $this->def->validate($string, $config, $context);
8559
// Only re-attach !important to a value the wrapped validator accepted;
// appending to false would turn a rejection into the string " !important".
if ($string !== false && $this->allow && $is_important) $string .= ' !important';
8567
* Represents a Length as defined by CSS.
8569
// Validator for a CSS length, optionally bounded by a minimum and a maximum.
// NOTE(review): this listing is sparse (bare numbers are original line
// numbers and some lines are not shown); comments cover visible code only.
class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
8572
protected $min, $max;
8575
* @param HTMLPurifier_Length $min Minimum length, or null for no bound. String is also acceptable.
8576
* @param HTMLPurifier_Length $max Maximum length, or null for no bound. String is also acceptable.
8578
// Non-null bounds are converted with HTMLPurifier_Length::make().
public function __construct($min = null, $max = null) {
8579
$this->min = $min !== null ? HTMLPurifier_Length::make($min) : null;
8580
$this->max = $max !== null ? HTMLPurifier_Length::make($max) : null;
8583
// Returns the length's string form, '0' for a bare zero, or false when the
// value is empty, malformed or outside the bounds.
public function validate($string, $config, $context) {
8584
$string = $this->parseCDATA($string);
8587
if ($string === '') return false;
8588
if ($string === '0') return '0';
8589
// any other single character cannot be a number plus a unit
if (strlen($string) === 1) return false;
8591
$length = HTMLPurifier_Length::make($string);
8592
if (!$length->isValid()) return false;
8595
// reject when not comparable with, or smaller than, the minimum
$c = $length->compareTo($this->min);
8596
if ($c === false) return false;
8597
if ($c < 0) return false;
8600
// reject when not comparable with, or larger than, the maximum
$c = $length->compareTo($this->max);
8601
if ($c === false) return false;
8602
if ($c > 0) return false;
8605
return $length->toString();
8614
* Validates shorthand CSS property list-style.
8615
* @warning Does not support url tokens that have internal spaces.
8617
// Validator for the CSS list-style shorthand. Each token is offered to the
// list-style-type, -position and -image validators; caught components are
// re-emitted in the order type, image, position.
// NOTE(review): this listing is sparse (bare numbers are original line
// numbers and some lines are not shown); comments cover visible code only.
class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef
8621
* Local copy of component validators.
8622
* @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
8626
// Copies the three component validators out of the CSS definition obtained
// from $config.
public function __construct($config) {
8627
$def = $config->getCSSDefinition();
8628
$this->info['list-style-type'] = $def->info['list-style-type'];
8629
$this->info['list-style-position'] = $def->info['list-style-position'];
8630
$this->info['list-style-image'] = $def->info['list-style-image'];
8633
// Returns the space-joined caught components, or false when nothing valid
// was found.
public function validate($string, $config, $context) {
8635
// regular pre-processing
8636
$string = $this->parseCDATA($string);
8637
if ($string === '') return false;
8639
// assumes URI doesn't have spaces in it
8640
$bits = explode(' ', strtolower($string)); // bits to process
8643
$caught['type'] = false;
8644
$caught['position'] = false;
8645
$caught['image'] = false;
8647
$i = 0; // number of catches
8650
foreach ($bits as $bit) {
8651
// All three components are caught: stop scanning, but still fall through
// to assemble the result. (A bare `return;` here yielded null and threw
// away the components already validated.)
if ($i >= 3) break; // optimization bit
8652
if ($bit === '') continue;
8653
foreach ($caught as $key => $status) {
8654
// skip components that already have a value
if ($status !== false) continue;
8655
$r = $this->info['list-style-' . $key]->validate($bit, $config, $context);
8656
if ($r === false) continue;
8657
if ($r === 'none') {
8658
if ($none) continue;
8660
if ($key == 'image') continue;
8668
if (!$i) return false;
8673
if ($caught['type']) $ret[] = $caught['type'];
8676
if ($caught['image']) $ret[] = $caught['image'];
8678
// construct position
8679
if ($caught['position']) $ret[] = $caught['position'];
8681
if (empty($ret)) return false;
8682
return implode(' ', $ret);
8692
* Framework class for strings that involve multiple values.
8694
* Certain CSS properties such as border-width and margin allow multiple
8695
* lengths to be specified. This class can take a vanilla border-width
8696
* definition and multiply it, usually into a max of four.
8698
* @note Even though the CSS specification isn't clear about it, inherit
8699
* can only be used alone: it will never manifest as part of a multi
8700
* shorthand declaration. Thus, this class does not allow inherit.
8702
// Applies a single-value validator to each space-separated part of a value,
// with an upper limit ($max) tracked by the loop counter $num.
class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
8706
* Instance of component definition to defer validation to.
8707
* @todo Make protected
8712
* Max number of values allowed.
8713
* @todo Make protected
8718
* @param $single HTMLPurifier_AttrDef to multiply
8719
* @param $max Max number of values allowed (usually four)
8721
public function __construct($single, $max = 4) {
8722
$this->single = $single;
8726
// Returns the accepted parts joined by single spaces, or false when the
// input is empty or no part validated.
public function validate($string, $config, $context) {
8727
$string = $this->parseCDATA($string);
8728
if ($string === '') return false;
8729
$parts = explode(' ', $string); // parseCDATA replaced \r, \t and \n
8730
$length = count($parts);
8732
// stop at the end of the parts or once $num reaches $this->max
for ($i = 0, $num = 0; $i < $length && $num < $this->max; $i++) {
8733
if (ctype_space($parts[$i])) continue;
8734
$result = $this->single->validate($parts[$i], $config, $context);
8735
if ($result !== false) {
8736
$final .= $result . ' ';
8740
if ($final === '') return false;
8741
return rtrim($final);
8750
* Validates a Percentage as defined by the CSS spec.
8752
// Validator for a CSS percentage: a number (checked by a CSS number
// validator) followed by '%'.
class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
8756
* Instance of HTMLPurifier_AttrDef_CSS_Number to defer number validation
8758
protected $number_def;
8761
* @param Bool indicating whether to forbid negative values
8763
public function __construct($non_negative = false) {
8764
$this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
8767
// Returns false for empty input, single-character input, a missing '%'
// suffix or an invalid number; the success return is not shown in this
// listing.
public function validate($string, $config, $context) {
8769
$string = $this->parseCDATA($string);
8771
if ($string === '') return false;
8772
$length = strlen($string);
8773
// a lone character cannot hold both a digit and '%'
if ($length === 1) return false;
8774
if ($string[$length - 1] !== '%') return false;
8776
// validate everything before the '%' as a number
$number = substr($string, 0, $length - 1);
8777
$number = $this->number_def->validate($number, $config, $context);
8779
if ($number === false) return false;
8790
* Validates the value for the CSS property text-decoration
8791
* @note This class could be generalized into a version that acts sort of
8792
* like Enum except you can compound the allowed values.
8794
// Validator for the CSS text-decoration value: 'none', or a space-separated
// combination of the allowed keywords.
class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
8797
public function validate($string, $config, $context) {
8799
static $allowed_values = array(
8800
'line-through' => true,
8802
'underline' => true,
8805
// comparison is case-insensitive: the value is lower-cased up front
$string = strtolower($this->parseCDATA($string));
8807
if ($string === 'none') return $string;
8809
$parts = explode(' ', $string);
8811
// keep only recognised keywords; other parts are dropped
foreach ($parts as $part) {
8812
if (isset($allowed_values[$part])) {
8813
$final .= $part . ' ';
8816
$final = rtrim($final);
8817
if ($final === '') return false;
8828
* Validates a URI in CSS syntax, which uses url('http://example.com')
8829
* @note While theoretically speaking a URI in a CSS document could
8830
* be non-embedded, as of CSS2 there is no such usage so we're
8831
* generalizing it. This may need to be changed in the future.
8832
* @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
8833
* the separator, you cannot put a literal semicolon in
8834
* the URI. Try percent encoding it, in that case.
8836
// Validator for a CSS url(...) token: unwraps the URI, delegates to the
// parent URI validator, then re-escapes and re-wraps the result.
// NOTE(review): this listing is sparse (bare numbers are original line
// numbers and some lines are not shown); comments cover visible code only.
class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
8839
public function __construct() {
8840
parent::__construct(true); // always embedded
8843
// Returns "url(<validated uri>)" or false.
public function validate($uri_string, $config, $context) {
8844
// parse the URI out of the string and then pass it onto
8845
// the parent object
8847
$uri_string = $this->parseCDATA($uri_string);
8848
// value must start with 'url('
if (strpos($uri_string, 'url(') !== 0) return false;
8849
$uri_string = substr($uri_string, 4);
8850
$new_length = strlen($uri_string) - 1;
8851
// NOTE(review): for the input "url(" the remainder is empty and
// $new_length is -1, so this reads an offset of an empty string;
// consider an explicit length guard.
if ($uri_string[$new_length] != ')') return false;
8852
$uri = trim(substr($uri_string, 0, $new_length));
8854
// optional surrounding quotes must match
if (!empty($uri) && ($uri[0] == "'" || $uri[0] == '"')) {
8856
$new_length = strlen($uri) - 1;
8857
if ($uri[$new_length] !== $quote) return false;
8858
$uri = substr($uri, 1, $new_length - 1);
8861
$keys = array( '(', ')', ',', ' ', '"', "'");
8862
$values = array('\\(', '\\)', '\\,', '\\ ', '\\"', "\\'");
8863
// undo CSS backslash escapes before URI validation
$uri = str_replace($values, $keys, $uri);
8865
$result = parent::validate($uri, $config, $context);
8867
if ($result === false) return false;
8869
// escape necessary characters according to CSS spec
8870
// except for the comma, none of these should appear in the
8872
$result = str_replace($keys, $values, $result);
8874
return "url($result)";
8884
* Validates a boolean attribute
8886
// Validator for a boolean HTML attribute, identified by its name.
class HTMLPurifier_AttrDef_HTML_Bool extends HTMLPurifier_AttrDef
8890
public $minimized = true;
8892
// $name defaults to false when no attribute name is given.
public function __construct($name = false) {$this->name = $name;}
8894
// Rejects values that PHP's empty() treats as empty; the success return
// is not shown in this listing.
public function validate($string, $config, $context) {
8895
if (empty($string)) return false;
8900
* @param $string Name of attribute
8902
// Factory: builds a new Bool validator for the given attribute name.
public function make($string) {
8903
return new HTMLPurifier_AttrDef_HTML_Bool($string);
8912
* Validates a color according to the HTML spec.
8914
// Validator for HTML color attributes: a named keyword, or a 3- or 6-digit
// hexadecimal value with or without a leading '#'.
// NOTE(review): this listing is sparse (bare numbers are original line
// numbers and some lines are not shown); comments cover visible code only.
class HTMLPurifier_AttrDef_HTML_Color extends HTMLPurifier_AttrDef
8917
public function validate($string, $config, $context) {
8919
// keyword table is fetched from the config once and cached statically
static $colors = null;
8920
if ($colors === null) $colors = $config->get('Core', 'ColorKeywords');
8922
$string = trim($string);
8924
if (empty($string)) return false;
8925
if (isset($colors[$string])) return $colors[$string];
// Color names are case-insensitive: fall back to a lower-cased lookup,
// matching how the CSS color validator consults the same keyword table.
$lower = strtolower($string);
if (isset($colors[$lower])) return $colors[$lower];
8926
if ($string[0] === '#') $hex = substr($string, 1);
8927
else $hex = $string;
8929
$length = strlen($hex);
8930
if ($length !== 3 && $length !== 6) return false;
8931
if (!ctype_xdigit($hex)) return false;
8932
// expand shorthand "abc" to "aabbcc"
if ($length === 3) $hex = $hex[0].$hex[0].$hex[1].$hex[1].$hex[2].$hex[2];
8944
* Special-case enum attribute definition that lazy loads allowed frame targets
8946
// Enum validator whose allowed values are read from the config on first use.
class HTMLPurifier_AttrDef_HTML_FrameTarget extends HTMLPurifier_AttrDef_Enum
8949
public $valid_values = false; // uninitialized value
8950
protected $case_sensitive = false;
8952
// Intentionally empty: allowed values are loaded in validate().
public function __construct() {}
8954
// Loads the 'Attr', 'AllowedFrameTargets' config value on the first call,
// then defers to the parent validator.
public function validate($string, $config, $context) {
8955
if ($this->valid_values === false) $this->valid_values = $config->get('Attr', 'AllowedFrameTargets');
8956
return parent::validate($string, $config, $context);
8965
* Validates the HTML attribute ID.
8966
* @warning Even though this is the id processor, it
8967
* will ignore the directive Attr:IDBlacklist, since it will only
8968
* go according to the ID accumulator. Since the accumulator is
8969
* automatically generated, it will have already absorbed the
8970
* blacklist. If you're hacking around, make sure you use load()!
8973
// Validator for the HTML id attribute: applies the configured prefix,
// rejects IDs already present in the accumulator, checks the character
// set, and registers accepted IDs.
// NOTE(review): this listing is sparse (bare numbers are original line
// numbers and some lines are not shown); comments cover visible code only.
class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
8976
// ref functionality disabled, since we also have to verify
8977
// whether or not the ID it refers to exists
8979
// Returns the (possibly prefixed) ID, or false when it is rejected.
public function validate($id, $config, $context) {
8981
// IDs are rejected unless the 'Attr', 'EnableID' config value is truthy
if (!$config->get('Attr', 'EnableID')) return false;
8983
$id = trim($id); // trim it first
8985
if ($id === '') return false;
8987
$prefix = $config->get('Attr', 'IDPrefix');
8988
if ($prefix !== '') {
8989
$prefix .= $config->get('Attr', 'IDPrefixLocal');
8990
// prevent re-appending the prefix
8991
if (strpos($id, $prefix) !== 0) $id = $prefix . $id;
8992
} elseif ($config->get('Attr', 'IDPrefixLocal') !== '') {
8993
trigger_error('%Attr.IDPrefixLocal cannot be used unless '.
8994
'%Attr.IDPrefix is set', E_USER_WARNING);
8997
//if (!$this->ref) {
8998
// reject IDs already recorded in the accumulator
$id_accumulator =& $context->get('IDAccumulator');
8999
if (isset($id_accumulator->ids[$id])) return false;
9002
// we purposely avoid using regex, hopefully this is faster
9004
if (ctype_alpha($id)) {
9007
// first character must be a letter
if (!ctype_alpha(@$id[0])) return false;
9008
$trim = trim( // primitive style of regexps, I suppose
9012
$result = ($trim === '');
9015
$regexp = $config->get('Attr', 'IDBlacklistRegexp');
9016
if ($regexp && preg_match($regexp, $id)) {
9020
// record accepted IDs in the accumulator
if (/*!$this->ref && */$result) $id_accumulator->add($id);
9022
// if no change was made to the ID, return the result
9023
// else, return the new id if stripping whitespace made it
9024
// valid, or return false.
9025
return $result ? $id : false;
9035
* Validates an integer representation of pixels according to the HTML spec.
9037
// Validator for an integer pixel count, with an optional upper bound.
class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
9042
// $max is the optional upper bound (null by default); the constructor
// body is not shown in this listing.
public function __construct($max = null) {
9046
// Returns the integer as a string (negative values become '0', values
// above the bound become the bound), or false for empty or non-numeric
// input.
public function validate($string, $config, $context) {
9048
$string = trim($string);
9049
if ($string === '0') return $string;
9050
if ($string === '') return false;
9051
$length = strlen($string);
9052
// a trailing "px" unit is stripped
if (substr($string, $length - 2) == 'px') {
9053
$string = substr($string, 0, $length - 2);
9055
if (!is_numeric($string)) return false;
9056
$int = (int) $string;
9058
if ($int < 0) return '0';
9060
// upper-bound value, extremely high values can
9061
// crash operating systems, see <http://ha.ckers.org/imagecrash.html>
9062
// WARNING, above link WILL crash you if you're using Windows
9064
if ($this->max !== null && $int > $this->max) return (string) $this->max;
9066
return (string) $int;
9070
// Factory: builds a validator of the same class as $this; an empty string
// means no upper bound, otherwise the string is cast to an integer bound.
public function make($string) {
9071
if ($string === '') $max = null;
9072
else $max = (int) $string;
9073
$class = get_class($this);
9074
return new $class($max);
9083
* Validates the HTML type length (not to be confused with CSS's length).
9085
* This accepts integer pixels or percentages as lengths for certain
9089
// Validator for the HTML length type: an integer pixel count (handled by
// the parent) or a percentage clamped to 0%..100%.
class HTMLPurifier_AttrDef_HTML_Length extends HTMLPurifier_AttrDef_HTML_Pixels
9092
// Returns the parent's pixel result when it validates, otherwise a
// normalized percentage string, or false.
public function validate($string, $config, $context) {
9094
$string = trim($string);
9095
if ($string === '') return false;
9097
// try the pixel form first
$parent_result = parent::validate($string, $config, $context);
9098
if ($parent_result !== false) return $parent_result;
9100
$length = strlen($string);
9101
$last_char = $string[$length - 1];
9103
if ($last_char !== '%') return false;
9105
$points = substr($string, 0, $length - 1);
9107
if (!is_numeric($points)) return false;
9109
$points = (int) $points;
9111
// clamp to the range 0..100
if ($points < 0) return '0%';
9112
if ($points > 100) return '100%';
9114
return ((string) $points) . '%';
9124
* Validates a rel/rev link attribute against a directive of allowed values
9125
* @note We cannot use Enum because link types allow multiple
9127
* @note Assumes link types are ASCII text
9129
// Validator for rel/rev link types: keeps only the space-separated values
// present in the configured allow-list, without duplicates.
class HTMLPurifier_AttrDef_HTML_LinkTypes extends HTMLPurifier_AttrDef
9132
/** Name config attribute to pull. */
9135
// Maps the attribute name ('rel' or 'rev') to its config directive name;
// any other name triggers an E_USER_ERROR.
public function __construct($name) {
9136
$configLookup = array(
9137
'rel' => 'AllowedRel',
9138
'rev' => 'AllowedRev'
9140
if (!isset($configLookup[$name])) {
9141
trigger_error('Unrecognized attribute name for link '.
9142
'relationship.', E_USER_ERROR);
9145
$this->name = $configLookup[$name];
9148
// Returns false when the allow-list is empty or no part is allowed; the
// success return is not shown in this listing.
public function validate($string, $config, $context) {
9150
$allowed = $config->get('Attr', $this->name);
9151
if (empty($allowed)) return false;
9153
$string = $this->parseCDATA($string);
9154
$parts = explode(' ', $string);
9156
// lookup to prevent duplicates
9157
$ret_lookup = array();
9158
foreach ($parts as $part) {
9159
// comparison is case-insensitive: parts are lower-cased
$part = strtolower(trim($part));
9160
if (!isset($allowed[$part])) continue;
9161
$ret_lookup[$part] = true;
9164
if (empty($ret_lookup)) return false;
9165
$string = implode(' ', array_keys($ret_lookup));
9177
* Validates a MultiLength as defined by the HTML spec.
9179
* A multilength is either an integer (pixel count), a percentage, or
9180
* a relative number.
9182
// Validator for the HTML MultiLength type: a pixel count or percentage
// (handled by the parent) or a relative length such as "2*".
class HTMLPurifier_AttrDef_HTML_MultiLength extends HTMLPurifier_AttrDef_HTML_Length
9185
// Returns the parent's result when it validates, otherwise a normalized
// relative length, or false.
public function validate($string, $config, $context) {
9187
$string = trim($string);
9188
if ($string === '') return false;
9190
// try the pixel/percentage forms first
$parent_result = parent::validate($string, $config, $context);
9191
if ($parent_result !== false) return $parent_result;
9193
$length = strlen($string);
9194
$last_char = $string[$length - 1];
9196
if ($last_char !== '*') return false;
9198
$int = substr($string, 0, $length - 1);
9200
// a bare "*" is returned as-is
if ($int == '') return '*';
9201
if (!is_numeric($int)) return false;
9205
if ($int < 0) return false;
9206
// "0*" is returned as '0' and "1*" as '*'
if ($int == 0) return '0';
9207
if ($int == 1) return '*';
9208
return ((string) $int) . '*';
9218
* Validates contents based on NMTOKENS attribute type.
9219
* @note The only current use for this is the class attribute in HTML
9220
* @note Could have some functionality factored out into Nmtoken class
9221
* @warning We cannot assume this class will be used only for 'class'
9222
* attributes. Not sure how to hook in magic behavior, then.
9224
// Validator for NMTOKENS-style attribute values: keeps the
// whitespace-delimited tokens that match the name pattern and re-joins them
// with single spaces.
class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
9227
// Returns false when the input is falsy or no token matches; the success
// return is not shown in this listing.
public function validate($string, $config, $context) {
9229
$string = trim($string);
9231
// early abort: '' and '0' (strings that convert to false) are invalid
9232
if (!$string) return false;
9235
// do the preg_match, capture all subpatterns for reformulation
9237
// we don't support U+00A1 and up codepoints or
9238
// escaping because I don't know how to do that with regexps
9239
// and plus it would complicate optimization efforts (you never
9240
// see that anyway).
9242
$pattern = '/(?:(?<=\s)|\A)'. // look behind for space or string start
9243
'((?:--|-?[A-Za-z_])[A-Za-z_\-0-9]*)'.
9244
'(?:(?=\s)|\z)/'; // look ahead for space or string end
9245
preg_match_all($pattern, $string, $matches);
9247
if (empty($matches[1])) return false;
9249
// reconstruct string
9251
foreach ($matches[1] as $token) {
9252
$new_string .= $token . ' ';
9254
$new_string = rtrim($new_string);
9265
// Abstract base class for e-mail address validators.
abstract class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
9269
* Unpacks a mailbox into its display-name and address
9271
// Placeholder: per the inline note below, this is not implemented yet.
function unpack($string) {
9272
// needs to be implemented
9277
// sub-implementations
9282
* Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
9284
// Validator for a URI host: a bracketed IPv6 literal, an IPv4 address, or a
// regular domain name.
// NOTE(review): this listing is sparse (bare numbers are original line
// numbers and some lines are not shown); comments cover visible code only.
class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
9288
* Instance of HTMLPurifier_AttrDef_URI_IPv4 sub-validator
9293
* Instance of HTMLPurifier_AttrDef_URI_IPv6 sub-validator
9297
// Creates the IPv4 and IPv6 sub-validators.
public function __construct() {
9298
$this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
9299
$this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
9302
// Returns '' for an empty host, the bracketed IPv6 or plain IPv4 form when
// those validate, or false when the domain-name pattern does not match.
public function validate($string, $config, $context) {
9303
$length = strlen($string);
9304
if ($string === '') return '';
9305
// "[...]" is treated as an IPv6 literal
if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') {
9307
$ip = substr($string, 1, $length - 2);
9308
$valid = $this->ipv6->validate($ip, $config, $context);
9309
if ($valid === false) return false;
9310
return '['. $valid . ']';
9313
// need to do checks on unusual encodings too
9314
$ipv4 = $this->ipv4->validate($string, $config, $context);
9315
if ($ipv4 !== false) return $ipv4;
9317
// A regular domain name.
9319
// This breaks I18N domain names, but we don't have proper IRI support,
9320
// so force users to insert Punycode. If there's complaining we'll
9321
// try to fix things into an international friendly form.
9323
// The productions describing this are:
9324
$a = '[a-z]'; // alpha
9325
$an = '[a-z0-9]'; // alphanum
9326
$and = '[a-z0-9-]'; // alphanum | "-"
9327
// domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
9328
$domainlabel = "$an($and*$an)?";
9329
// toplabel = alpha | alpha *( alphanum | "-" ) alphanum
9330
$toplabel = "$a($and*$an)?";
9331
// hostname = *( domainlabel "." ) toplabel [ "." ]
// The D modifier makes '$' match only at the very end of the subject, so a
// host followed by a trailing newline is no longer accepted.
9332
$match = preg_match("/^($domainlabel\.)*$toplabel\.?$/iD", $string);
9333
if (!$match) return false;
9344
* Validates an IPv4 address
9345
* @author Feyd @ forums.devnetwork.net (public domain)
9347
// Validator for a dotted-quad IPv4 address.
// NOTE(review): this listing is sparse (bare numbers are original line
// numbers and some lines are not shown); comments cover visible code only.
class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef
9351
* IPv4 regex, protected so that IPv6 can reuse it
9355
public function validate($aIP, $config, $context) {
9357
// build the pattern on first use
if (!$this->ip4) $this->_loadRegex();
9359
// The D modifier makes '$' match only at the very end of the subject, so
// an address followed by a trailing newline is no longer accepted.
if (preg_match('#^' . $this->ip4 . '$#sD', $aIP))
9369
* Lazy load function to prevent regex from being stuffed in
9372
// Builds the IPv4 pattern: four 0-255 octets separated by literal dots.
protected function _loadRegex() {
9373
$oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255
9374
$this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})";
9383
* Validates an IPv6 address.
9384
* @author Feyd @ forums.devnetwork.net (public domain)
9385
* @note This function requires brackets to have been removed from address
9388
// Validator for an IPv6 address (brackets already removed). Handles an
// optional /prefix, an embedded IPv4 tail and '::' compression, then checks
// that every piece is a hexadecimal group.
// NOTE(review): this listing is sparse (bare numbers are original line
// numbers and some lines are not shown); comments cover visible code only.
// All '$' anchors below carry the D modifier so they match only at the very
// end of the subject, never before a trailing newline.
class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4
9391
public function validate($aIP, $config, $context) {
9393
// build the inherited IPv4 pattern on first use
if (!$this->ip4) $this->_loadRegex();
9397
$hex = '[0-9a-fA-F]';
9398
$blk = '(?:' . $hex . '{1,4})';
9399
$pre = '(?:/(?:12[0-8]|1[0-1][0-9]|[1-9][0-9]|[0-9]))'; // /0 - /128
9402
// strip a trailing /prefix when present
if (strpos($aIP, '/') !== false)
9404
if (preg_match('#' . $pre . '$#sD', $aIP, $find))
9406
$aIP = substr($aIP, 0, 0-strlen($find[0]));
9415
// IPv4-compatibility check
9416
if (preg_match('#(?<=:'.')' . $this->ip4 . '$#sD', $aIP, $find))
9418
// replace the dotted-quad tail with two hexadecimal groups
$aIP = substr($aIP, 0, 0-strlen($find[0]));
9419
$ip = explode('.', $find[0]);
9420
$ip = array_map('dechex', $ip);
9421
$aIP .= $ip[0] . $ip[1] . ':' . $ip[2] . $ip[3];
9425
// compression check
9426
$aIP = explode('::', $aIP);
9434
list($first, $second) = $aIP;
9435
$first = explode(':', $first);
9436
$second = explode(':', $second);
9438
if (count($first) + count($second) > 8)
9443
// pad to eight groups, then overlay the groups that followed '::'
while(count($first) < 8)
9445
array_push($first, '0');
9448
array_splice($first, 8 - count($second), 8, $second);
9450
unset($first,$second);
9454
$aIP = explode(':', $aIP[0]);
9463
// All the pieces should be 16-bit hex strings. Are they?
9464
foreach ($aIP as $piece)
9466
if (!preg_match('#^[0-9a-fA-F]{4}$#sD', sprintf('%04s', $piece)))
/**
 * Primitive email validation class based on the regexp found at
 * http://www.regular-expressions.info/email.html
 */
class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_URI_Email
{

    /**
     * Validates a bare mailbox address of the form local@domain.tld.
     * @param $string String candidate address
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return The trimmed address if it matches, false otherwise
     */
    public function validate($string, $config, $context) {
        // no support for named mailboxes i.e. "Bob <bob@example.com>"
        // that needs more percent encoding to be done
        if ($string == '') return false;
        $string = trim($string);
        // the final label may be 2-63 letters: the DNS label limit, so
        // long TLDs such as .museum are no longer rejected
        $result = preg_match('/^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,63}$/i', $string);
        return $result ? $string : false;
    }

}
/**
 * Pre-transform that changes proprietary background attribute to CSS.
 */
class HTMLPurifier_AttrTransform_Background extends HTMLPurifier_AttrTransform {

    /**
     * Replaces a background attribute with a background-image declaration.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes, with background moved into CSS
     */
    public function transform($attr, $config, $context) {

        if (!isset($attr['background'])) return $attr;

        $background = $this->confiscateAttr($attr, 'background');
        // some validation should happen here

        $this->prependCSS($attr, "background-image:url($background);");

        return $attr;

    }

}
// this MUST be placed in post, as it assumes that any value in dir is valid

/**
 * Post-transform that ensures that bdo tags have the dir attribute set.
 */
class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform
{

    /**
     * Supplies %Attr.DefaultTextDir as dir when the attribute is absent.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object, read for Attr.DefaultTextDir
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes with dir guaranteed to be set
     */
    public function transform($attr, $config, $context) {
        if (isset($attr['dir'])) return $attr;
        $attr['dir'] = $config->get('Attr', 'DefaultTextDir');
        return $attr;
    }

}
/**
 * Pre-transform that changes deprecated bgcolor attribute to CSS.
 */
class HTMLPurifier_AttrTransform_BgColor extends HTMLPurifier_AttrTransform {

    /**
     * Replaces a bgcolor attribute with a background-color declaration.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes, with bgcolor moved into CSS
     */
    public function transform($attr, $config, $context) {

        if (!isset($attr['bgcolor'])) return $attr;

        $bgcolor = $this->confiscateAttr($attr, 'bgcolor');
        // some validation should happen here

        $this->prependCSS($attr, "background-color:$bgcolor;");

        return $attr;

    }

}
/**
 * Pre-transform that converts a boolean attribute to fixed CSS
 */
class HTMLPurifier_AttrTransform_BoolToCSS extends HTMLPurifier_AttrTransform {

    /**
     * Name of boolean attribute that is trigger
     */
    protected $attr;

    /**
     * CSS declarations to add to style, needs trailing semicolon
     */
    protected $css;

    /**
     * @param $attr string attribute name to convert from
     * @param $css string CSS declarations to add to style (needs semicolon)
     */
    public function __construct($attr, $css) {
        $this->attr = $attr;
        $this->css  = $css;
    }

    /**
     * Drops the trigger attribute, if present, and adds the fixed CSS.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes after the conversion
     */
    public function transform($attr, $config, $context) {
        if (!isset($attr[$this->attr])) return $attr;
        unset($attr[$this->attr]);
        $this->prependCSS($attr, $this->css);
        return $attr;
    }

}
/**
 * Pre-transform that changes deprecated border attribute to CSS.
 */
class HTMLPurifier_AttrTransform_Border extends HTMLPurifier_AttrTransform {

    /**
     * Replaces a border attribute with a "border:Npx solid;" declaration.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes, with border moved into CSS
     */
    public function transform($attr, $config, $context) {
        if (!isset($attr['border'])) return $attr;
        $border_width = $this->confiscateAttr($attr, 'border');
        // some validation should happen here
        $this->prependCSS($attr, "border:{$border_width}px solid;");
        return $attr;
    }

}
/**
 * Generic pre-transform that converts an attribute with a fixed number of
 * values (enumerated) to CSS.
 */
class HTMLPurifier_AttrTransform_EnumToCSS extends HTMLPurifier_AttrTransform {

    /**
     * Name of attribute to transform from
     */
    protected $attr;

    /**
     * Lookup array of attribute values to CSS
     */
    protected $enumToCSS = array();

    /**
     * Case sensitivity of the matching
     * @warning Currently can only be guaranteed to work with ASCII
     *          values.
     */
    protected $caseSensitive = false;

    /**
     * @param $attr String attribute name to transform from
     * @param $enum_to_css Lookup array of attribute values to CSS
     * @param $case_sensitive Boolean case sensitivity indicator, default false
     */
    public function __construct($attr, $enum_to_css, $case_sensitive = false) {
        $this->attr = $attr;
        $this->enumToCSS = $enum_to_css;
        $this->caseSensitive = (bool) $case_sensitive;
    }

    /**
     * Removes the enumerated attribute and, when its value is a known
     * key of the lookup, adds the mapped CSS. Unknown values are dropped
     * without adding any CSS.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes after the conversion
     */
    public function transform($attr, $config, $context) {

        if (!isset($attr[$this->attr])) return $attr;

        $value = trim($attr[$this->attr]);
        unset($attr[$this->attr]);

        if (!$this->caseSensitive) $value = strtolower($value);

        if (!isset($this->enumToCSS[$value])) {
            return $attr;
        }

        $this->prependCSS($attr, $this->enumToCSS[$value]);

        return $attr;

    }

}
// must be called POST validation

/**
 * Transform that supplies default values for the src and alt attributes
 * in img tags, as well as prevents the img tag from being removed
 * because of a missing alt tag. This needs to be registered as both
 * a pre and post attribute transform.
 */
class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
{

    /**
     * Fills in missing src and alt attributes from configuration.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object, read for Core.RemoveInvalidImg,
     *        Attr.DefaultInvalidImage, Attr.DefaultImageAlt and
     *        Attr.DefaultInvalidImageAlt
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes; returned unchanged when src is
     *         missing and Core.RemoveInvalidImg is on
     */
    public function transform($attr, $config, $context) {

        // remembers whether the image came with its own src
        $src = true;
        if (!isset($attr['src'])) {
            if ($config->get('Core', 'RemoveInvalidImg')) return $attr;
            $attr['src'] = $config->get('Attr', 'DefaultInvalidImage');
            $src = false;
        }

        if (!isset($attr['alt'])) {
            if ($src) {
                $alt = $config->get('Attr', 'DefaultImageAlt');
                if ($alt === null) {
                    // no configured default: fall back to the file name
                    $attr['alt'] = basename($attr['src']);
                } else {
                    $attr['alt'] = $alt;
                }
            } else {
                $attr['alt'] = $config->get('Attr', 'DefaultInvalidImageAlt');
            }
        }

        return $attr;

    }

}
/**
 * Pre-transform that changes deprecated hspace and vspace attributes to CSS
 */
class HTMLPurifier_AttrTransform_ImgSpace extends HTMLPurifier_AttrTransform {

    /**
     * Name of the attribute this instance converts (hspace or vspace)
     */
    protected $attr;

    /**
     * Lookup of attribute name to the margin sides it controls
     */
    protected $css = array(
        'hspace' => array('left', 'right'),
        'vspace' => array('top', 'bottom')
    );

    /**
     * @param $attr String attribute name; an error is triggered when it
     *        is not a key of $css
     */
    public function __construct($attr) {
        $this->attr = $attr;
        if (!isset($this->css[$attr])) {
            trigger_error(htmlspecialchars($attr) . ' is not valid space attribute');
        }
    }

    /**
     * Replaces the space attribute with one margin declaration per side.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes after the conversion
     */
    public function transform($attr, $config, $context) {

        if (!isset($attr[$this->attr])) return $attr;

        $width = $this->confiscateAttr($attr, $this->attr);
        // some validation could happen here

        // instance was constructed with an unsupported attribute name
        if (!isset($this->css[$this->attr])) return $attr;

        $style = '';
        foreach ($this->css[$this->attr] as $suffix) {
            $property = "margin-$suffix";
            $style .= "$property:{$width}px;";
        }

        $this->prependCSS($attr, $style);

        return $attr;

    }

}
/**
 * Performs miscellaneous cross attribute validation and filtering for
 * input elements. This is meant to be a post-transform.
 */
class HTMLPurifier_AttrTransform_Input extends HTMLPurifier_AttrTransform {

    /**
     * Pixel-length validator applied to size on non-text inputs
     */
    protected $pixels;

    public function __construct() {
        $this->pixels = new HTMLPurifier_AttrDef_HTML_Pixels();
    }

    /**
     * Removes or normalizes attributes that do not fit the input's type.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object, passed on to the size validator
     * @param $context Context object, passed on to the size validator
     * @return Assoc array of attributes after filtering
     */
    public function transform($attr, $config, $context) {
        // a missing type is treated as a text input
        if (!isset($attr['type'])) $t = 'text';
        else $t = strtolower($attr['type']);
        if (isset($attr['checked']) && $t !== 'radio' && $t !== 'checkbox') {
            unset($attr['checked']);
        }
        if (isset($attr['maxlength']) && $t !== 'text' && $t !== 'password') {
            unset($attr['maxlength']);
        }
        // outside text/password inputs, size must validate as pixels
        if (isset($attr['size']) && $t !== 'text' && $t !== 'password') {
            $result = $this->pixels->validate($attr['size'], $config, $context);
            if ($result === false) unset($attr['size']);
            else $attr['size'] = $result;
        }
        if (isset($attr['src']) && $t !== 'image') {
            unset($attr['src']);
        }
        if (!isset($attr['value']) && ($t === 'radio' || $t === 'checkbox')) {
            $attr['value'] = '';
        }
        return $attr;
    }

}
/**
 * Post-transform that copies lang's value to xml:lang (and vice-versa)
 * @note Theoretically speaking, this could be a pre-transform, but putting
 *       post is more efficient.
 */
class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform
{

    /**
     * Synchronizes lang and xml:lang; when both are set, xml:lang wins.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes with the two values aligned
     */
    public function transform($attr, $config, $context) {

        $lang     = isset($attr['lang']) ? $attr['lang'] : false;
        $xml_lang = isset($attr['xml:lang']) ? $attr['xml:lang'] : false;

        if ($lang !== false && $xml_lang === false) {
            $attr['xml:lang'] = $lang;
        } elseif ($xml_lang !== false) {
            $attr['lang'] = $xml_lang;
        }

        return $attr;

    }

}
/**
 * Class for handling width/height length attribute transformations to CSS
 */
class HTMLPurifier_AttrTransform_Length extends HTMLPurifier_AttrTransform
{

    /**
     * Name of the attribute to convert
     */
    protected $name;

    /**
     * Name of the CSS property to emit
     */
    protected $cssName;

    /**
     * @param $name String attribute name to convert from
     * @param $css_name String CSS property name; the attribute name is
     *        used when this is omitted or empty
     */
    public function __construct($name, $css_name = null) {
        $this->name = $name;
        $this->cssName = $css_name ? $css_name : $name;
    }

    /**
     * Replaces the length attribute with the matching CSS declaration.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes after the conversion
     */
    public function transform($attr, $config, $context) {
        if (!isset($attr[$this->name])) return $attr;
        $length = $this->confiscateAttr($attr, $this->name);
        // values made only of digits get a px unit appended
        if(ctype_digit($length)) $length .= 'px';
        $this->prependCSS($attr, $this->cssName . ":$length;");
        return $attr;
    }

}
/**
 * Pre-transform that changes deprecated name attribute to ID if necessary
 */
class HTMLPurifier_AttrTransform_Name extends HTMLPurifier_AttrTransform
{

    /**
     * Removes name and reuses its value as id unless an id already exists.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes without name
     */
    public function transform($attr, $config, $context) {
        if (!isset($attr['name'])) return $attr;
        $id = $this->confiscateAttr($attr, 'name');
        // an explicit id takes precedence over the confiscated name
        if ( isset($attr['id']))   return $attr;
        $attr['id'] = $id;
        return $attr;
    }

}
/**
 * Forces the attributes of an embed element to fixed Flash settings.
 */
class HTMLPurifier_AttrTransform_SafeEmbed extends HTMLPurifier_AttrTransform
{

    public $name = "SafeEmbed";

    /**
     * Overwrites allowscriptaccess, allownetworking and type.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes with the three values fixed
     */
    public function transform($attr, $config, $context) {
        $attr['allowscriptaccess'] = 'never';
        $attr['allownetworking'] = 'internal';
        $attr['type'] = 'application/x-shockwave-flash';
        return $attr;
    }

}
/**
 * Writes default type for all objects. Currently only supports flash.
 */
class HTMLPurifier_AttrTransform_SafeObject extends HTMLPurifier_AttrTransform
{

    public $name = "SafeObject";

    /**
     * Defaults a missing type attribute to Flash.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes with type guaranteed to be set
     */
    public function transform($attr, $config, $context) {
        if (!isset($attr['type'])) $attr['type'] = 'application/x-shockwave-flash';
        return $attr;
    }

}
/**
 * Validates name/value pairs in param tags to be used in safe objects. This
 * will only allow name values it recognizes, and pre-fill certain attributes
 * with required values.
 *
 * @note
 *      This class only supports Flash. In the future, Quicktime support
 *      may be added.
 *
 * @warning
 *      This class expects an injector to add the necessary parameters tags.
 */
class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform
{

    public $name = "SafeParam";

    /**
     * URI validator applied to the movie parameter
     */
    private $uri;

    public function __construct() {
        $this->uri = new HTMLPurifier_AttrDef_URI(true); // embedded
    }

    /**
     * Forces known parameters to safe values and blanks unknown ones.
     * @param $attr Assoc array of attributes; name and value are read
     * @param $config Configuration object, passed on to the URI validator
     * @param $context Context object, passed on to the URI validator
     * @return Assoc array of attributes; name and value are both null
     *         for an unrecognized parameter name
     */
    public function transform($attr, $config, $context) {
        // If we add support for other objects, we'll need to alter the
        // transforms.
        switch ($attr['name']) {
            // application/x-shockwave-flash
            // Keep this synchronized with Injector/SafeObject.php
            case 'allowScriptAccess':
                $attr['value'] = 'never';
                break;
            case 'allowNetworking':
                $attr['value'] = 'internal';
                break;
            case 'wmode':
                $attr['value'] = 'window';
                break;
            case 'movie':
                $attr['value'] = $this->uri->validate($attr['value'], $config, $context);
                break;
            // add other cases to support other param name/value pairs
            default:
                $attr['name'] = $attr['value'] = null;
        }
        return $attr;
    }

}
/**
 * Implements required attribute stipulation for <script>
 */
class HTMLPurifier_AttrTransform_ScriptRequired extends HTMLPurifier_AttrTransform
{

    /**
     * Defaults a missing type attribute to text/javascript.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes with type guaranteed to be set
     */
    public function transform($attr, $config, $context) {
        if (!isset($attr['type'])) {
            $attr['type'] = 'text/javascript';
        }
        return $attr;
    }

}
/**
 * Sets height/width defaults for <textarea>
 */
class HTMLPurifier_AttrTransform_Textarea extends HTMLPurifier_AttrTransform
{

    /**
     * Supplies default cols and rows when either is missing.
     * @param $attr Assoc array of attributes
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Assoc array of attributes with cols and rows set
     */
    public function transform($attr, $config, $context) {
        // Calculated from Firefox
        if (!isset($attr['cols'])) $attr['cols'] = '22';
        if (!isset($attr['rows'])) $attr['rows'] = '3';
        return $attr;
    }

}
/**
 * Definition that uses different definitions depending on context.
 *
 * The del and ins tags are notable because they allow different types of
 * elements depending on whether or not they're in a block or inline context.
 * Chameleon allows this behavior to happen by using two different
 * definitions depending on context.  While this somewhat generalized,
 * it is specifically intended for those two tags.
 */
class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
{

    /**
     * Instance of the definition object to use when inline. Usually stricter.
     */
    public $inline;

    /**
     * Instance of the definition object to use when block.
     */
    public $block;

    public $type = 'chameleon';

    /**
     * @param $inline List of elements to allow when inline.
     * @param $block List of elements to allow when block.
     */
    public function __construct($inline, $block) {
        $this->inline = new HTMLPurifier_ChildDef_Optional($inline);
        $this->block  = new HTMLPurifier_ChildDef_Optional($block);
        // the block definition's lookup is exposed as this definition's own
        $this->elements = $this->block->elements;
    }

    /**
     * Delegates to the block definition only when the context variable
     * IsInline is exactly false; every other value uses the inline one.
     * @param $tokens_of_children Array of child tokens to validate
     * @param $config Configuration object, passed through
     * @param $context Context object, read for IsInline and passed through
     * @return Whatever the selected definition's validateChildren() returns
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        if ($context->get('IsInline') === false) {
            return $this->block->validateChildren(
                $tokens_of_children, $config, $context);
        } else {
            return $this->inline->validateChildren(
                $tokens_of_children, $config, $context);
        }
    }

}
/**
 * Custom validation class, accepts DTD child definitions
 *
 * @warning Currently this class is an all or nothing proposition, that is,
 *          it will only give a bool return value.
 */
class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
{

    public $type = 'custom';
    public $allow_empty = false;

    /**
     * Allowed child pattern as defined by the DTD
     */
    public $dtd_regex;

    /**
     * PCRE regex derived from $dtd_regex
     */
    private $_pcre_regex;

    /**
     * @param $dtd_regex Allowed child pattern from the DTD
     */
    public function __construct($dtd_regex) {
        $this->dtd_regex = $dtd_regex;
        $this->_compileRegex();
    }

    /**
     * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
     */
    protected function _compileRegex() {
        $raw = str_replace(' ', '', $this->dtd_regex);
        // square-bracket offset: the curly form $raw{0} is a parse error
        // on PHP 8; the empty check avoids reading offset 0 of ''
        if ($raw === '' || $raw[0] != '(') {
            $raw = "($raw)";
        }
        $el = '[#a-zA-Z0-9_.-]+';
        $reg = $raw;

        // COMPLICATED! AND MIGHT BE BUGGY! I HAVE NO CLUE WHAT I'M
        // DOING! Seriously: if there's problems, please report them.

        // collect all elements into the $elements array
        preg_match_all("/$el/", $reg, $matches);
        foreach ($matches[0] as $match) {
            $this->elements[$match] = true;
        }

        // setup all elements as parentheticals with leading commas
        $reg = preg_replace("/$el/", '(,\\0)', $reg);

        // remove commas when they were not solicited
        $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg);

        // remove all non-paranthetical commas: they are handled by first regex
        $reg = preg_replace("/,\(/", '(', $reg);

        $this->_pcre_regex = $reg;
    }

    /**
     * Matches the comma-separated names of the direct children against
     * the compiled pattern.
     * @param $tokens_of_children Array of child tokens to validate
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Bool true if the children satisfy the pattern
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        $list_of_children = '';
        $nesting = 0; // depth into the nest
        foreach ($tokens_of_children as $token) {
            if (!empty($token->is_whitespace)) continue;

            // sampled before the depth changes, so a start tag at depth 0
            // counts as a direct child while its end tag does not
            $is_child = ($nesting == 0); // direct

            if ($token instanceof HTMLPurifier_Token_Start) {
                $nesting++;
            } elseif ($token instanceof HTMLPurifier_Token_End) {
                $nesting--;
            }

            if ($is_child) {
                $list_of_children .= $token->name . ',';
            }
        }
        // add leading comma to deal with stray comma declarations
        $list_of_children = ',' . rtrim($list_of_children, ',');
        $okay =
            preg_match(
                '/^,?'.$this->_pcre_regex.'$/',
                $list_of_children
            );

        return (bool) $okay;
    }

}
/**
 * Definition that disallows all elements.
 * @warning validateChildren() in this class is actually never called, because
 *          empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed
 *          before child definitions are parsed in earnest by
 *          HTMLPurifier_Strategy_FixNesting.
 */
class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
{

    public $allow_empty = true;
    public $type = 'empty';

    public function __construct() {}

    /**
     * Discards every child.
     * @param $tokens_of_children Array of child tokens (ignored)
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return Empty array, i.e. no children are kept
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        return array();
    }

}
/**
 * Definition that allows a set of elements, but disallows empty children.
 */
class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
{

    /**
     * Lookup table of allowed elements.
     */
    public $elements = array();

    /**
     * @param $elements List of allowed element names (lowercase).
     */
    public function __construct($elements) {
        if (is_string($elements)) {
            $elements = str_replace(' ', '', $elements);
            $elements = explode('|', $elements);
        }
        // a plain list (keys 0..n-1) is turned into a name => true lookup
        $keys = array_keys($elements);
        if ($keys == array_keys($keys)) {
            $elements = array_flip($elements);
            foreach ($elements as $i => $x) {
                $elements[$i] = true;
                if (empty($i)) unset($elements[$i]); // remove blank
            }
        }
        $this->elements = $elements;
    }

    public $allow_empty = false;
    public $type = 'required';

    /**
     * Filters the children down to the allowed elements.
     * @param $tokens_of_children Array of child tokens to validate
     * @param $config Configuration object, read for
     *        Core.EscapeInvalidChildren
     * @param $context Context object, handed to the generator
     * @return False when nothing (or only whitespace) is left, true when
     *         the children are unchanged, otherwise the filtered array
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        // if there are no tokens, delete parent node
        if (empty($tokens_of_children)) return false;

        // the new set of children
        $result = array();

        // current depth into the nest
        $nesting = 0;

        // whether or not we're deleting a node
        $is_deleting = false;

        // whether or not parsed character data is allowed
        // this controls whether or not we silently drop a tag
        // or generate escaped HTML from it
        $pcdata_allowed = isset($this->elements['#PCDATA']);

        // a little sanity check to make sure it's not ALL whitespace
        $all_whitespace = true;

        // some configuration
        $escape_invalid_children = $config->get('Core', 'EscapeInvalidChildren');

        // generator
        $gen = new HTMLPurifier_Generator($config, $context);

        foreach ($tokens_of_children as $token) {
            if (!empty($token->is_whitespace)) {
                $result[] = $token;
                continue;
            }
            $all_whitespace = false; // phew, we're not talking about whitespace

            // sampled before the depth changes for this token
            $is_child = ($nesting == 0);

            if ($token instanceof HTMLPurifier_Token_Start) {
                $nesting++;
            } elseif ($token instanceof HTMLPurifier_Token_End) {
                $nesting--;
            }

            if ($is_child) {
                // each direct child decides afresh whether its subtree
                // is being removed
                $is_deleting = false;
                if (!isset($this->elements[$token->name])) {
                    $is_deleting = true;
                    if ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text) {
                        $result[] = $token;
                    } elseif ($pcdata_allowed && $escape_invalid_children) {
                        $result[] = new HTMLPurifier_Token_Text(
                            $gen->generateFromToken($token)
                        );
                    }
                    continue;
                }
            }
            if (!$is_deleting || ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text)) {
                $result[] = $token;
            } elseif ($pcdata_allowed && $escape_invalid_children) {
                $result[] =
                    new HTMLPurifier_Token_Text(
                        $gen->generateFromToken($token)
                    );
            } else {
                // drop silently
            }
        }
        if (empty($result)) return false;
        if ($all_whitespace) return false;
        if ($tokens_of_children == $result) return true;
        return $result;
    }

}
/**
 * Definition that allows a set of elements, and allows no children.
 * @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required,
 *       really, one shouldn't inherit from the other.  Only altered behavior
 *       is to overload a returned false with an array.  Thus, it will never
 *       return false.
 */
class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
{

    public $allow_empty = true;
    public $type = 'optional';

    /**
     * Same filtering as the parent, except that a parent result of false
     * never propagates.
     * @param $tokens_of_children Array of child tokens to validate
     * @param $config Configuration object, passed to the parent
     * @param $context Context object, passed to the parent
     * @return True when there were no children to begin with, an empty
     *         array when the parent rejected them all, otherwise the
     *         parent's result
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        $result = parent::validateChildren($tokens_of_children, $config, $context);
        if ($result === false) {
            if (empty($tokens_of_children)) return true;
            else return array();
        }
        return $result;
    }

}
/**
 * Takes the contents of blockquote when in strict and reformats for validation.
 */
class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required
{

    protected $real_elements;
    protected $fake_elements;
    public $allow_empty = true;
    public $type = 'strictblockquote';
    protected $init = false;

    /**
     * @note We don't want MakeWellFormed to auto-close inline elements since
     *       they might be allowed.
     */
    public function getNonAutoCloseElements($config) {
        $this->init($config);
        return $this->fake_elements;
    }

    /**
     * Filters the children against the Flow content set, then wraps each
     * run of top-level inline content in the definition's block wrapper.
     * @param $tokens_of_children Array of child tokens to validate
     * @param $config Configuration object, read for the HTML definition
     * @param $context Context object, passed to the parent
     * @return Array of tokens (possibly empty); never true or false
     */
    public function validateChildren($tokens_of_children, $config, $context) {

        $this->init($config);

        // trick the parent class into thinking it allows more
        $this->elements = $this->fake_elements;
        $result = parent::validateChildren($tokens_of_children, $config, $context);
        $this->elements = $this->real_elements;

        if ($result === false) return array();
        if ($result === true) $result = $tokens_of_children;

        $def = $config->getHTMLDefinition();
        $block_wrap_start = new HTMLPurifier_Token_Start($def->info_block_wrapper);
        $block_wrap_end   = new HTMLPurifier_Token_End(  $def->info_block_wrapper);
        $is_inline = false; // currently inside an opened wrapper?
        $depth = 0;         // nesting depth below the blockquote
        $ret = array();

        // assuming that there are no comment tokens
        foreach ($result as $token) {
            // ifs are nested for readability
            if (!$is_inline) {
                if (!$depth) {
                     if (
                        ($token instanceof HTMLPurifier_Token_Text && !$token->is_whitespace) ||
                        (!$token instanceof HTMLPurifier_Token_Text && !isset($this->elements[$token->name]))
                     ) {
                        // top-level inline content begins: open a wrapper
                        $is_inline = true;
                        $ret[] = $block_wrap_start;
                     }
                }
            } else {
                if (!$depth) {
                    // starting tokens have been inline text / empty
                    if ($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) {
                        if (isset($this->elements[$token->name])) {
                            // ended
                            $ret[] = $block_wrap_end;
                            $is_inline = false;
                        }
                    }
                }
            }
            $ret[] = $token;
            if ($token instanceof HTMLPurifier_Token_Start) $depth++;
            if ($token instanceof HTMLPurifier_Token_End)   $depth--;
        }
        // close a wrapper left open by trailing inline content
        if ($is_inline) $ret[] = $block_wrap_end;
        return $ret;
    }

    /**
     * Captures the real element lookup and builds the widened one
     * (Flow content set plus #PCDATA) the first time it is needed.
     * @param $config Configuration object, read for the HTML definition
     */
    private function init($config) {
        if (!$this->init) {
            $def = $config->getHTMLDefinition();
            // allow all inline elements
            $this->real_elements = $this->elements;
            $this->fake_elements = $def->info_content_sets['Flow'];
            $this->fake_elements['#PCDATA'] = true;
            $this->init = true;
        }
    }

}
/**
 * Definition for tables
 */
class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
{

    public $allow_empty = false;
    public $type = 'table';
    public $elements = array('tr' => true, 'tbody' => true, 'thead' => true,
        'tfoot' => true, 'caption' => true, 'colgroup' => true, 'col' => true);

    public function __construct() {}

    /**
     * Regroups a table's children as caption, cols, thead, tfoot, then
     * body content, dropping anything that is not a table child.
     * @param $tokens_of_children Array of child tokens to validate
     * @param $config Configuration object (not read by this method)
     * @param $context Context object (not read by this method)
     * @return False when there is no row content, true when the children
     *         are already in that order, otherwise the reordered array
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        if (empty($tokens_of_children)) return false;

        // this ensures that the loop gets run one last time before closing
        // up. It's a little bit of a hack, but it works! Just make sure you
        // get rid of the token later.
        $tokens_of_children[] = false;

        // only one of these elements is allowed in a table
        $caption = false;
        $thead   = false;
        $tfoot   = false;

        // as many of these as you want
        $cols    = array();
        $content = array();

        $nesting = 0; // current depth so we can determine nodes
        $is_collecting = false; // are we globbing together tokens to package
                                // into one of the collectors?
        $collection = array(); // collected nodes
        $tag_index = 0; // the first node might be whitespace,
                        // so this tells us where the start tag is

        foreach ($tokens_of_children as $token) {
            $is_child = ($nesting == 0);

            if ($token === false) {
                // terminating sequence started
            } elseif ($token instanceof HTMLPurifier_Token_Start) {
                $nesting++;
            } elseif ($token instanceof HTMLPurifier_Token_End) {
                $nesting--;
            }

            // handle node collection
            if ($is_collecting) {
                if ($is_child) {
                    // okay, let's stash the tokens away
                    // first token tells us the type of the collection
                    switch ($collection[$tag_index]->name) {
                        case 'tr':
                        case 'tbody':
                            $content[] = $collection;
                            break;
                        case 'caption':
                            // only the first caption is kept
                            if ($caption !== false) break;
                            $caption = $collection;
                            break;
                        case 'thead':
                        case 'tfoot':
                            // access the appropriate variable, $thead or $tfoot
                            $var = $collection[$tag_index]->name;
                            if ($$var === false) {
                                $$var = $collection;
                            } else {
                                // transmutate the first and less entries into
                                // tbody tags, and then put into content
                                $collection[$tag_index]->name = 'tbody';
                                $collection[count($collection)-1]->name = 'tbody';
                                $content[] = $collection;
                            }
                            break;
                         case 'colgroup':
                            $cols[] = $collection;
                            break;
                    }
                    $collection = array();
                    $is_collecting = false;
                    $tag_index = 0;
                } else {
                    // add the node to the collection
                    $collection[] = $token;
                }
            }

            // terminate
            if ($token === false) break;

            if ($is_child) {
                // determine what we're dealing with
                if ($token->name == 'col') {
                    // the only empty tag in the possie, we can handle it
                    // immediately
                    $cols[] = array_merge($collection, array($token));
                    $collection = array();
                    $tag_index = 0;
                    continue;
                }
                // "break" rather than "continue": inside a switch the two
                // do the same thing, but "continue" warns on PHP 7.3+.
                // Nothing follows the switch in the loop body.
                switch($token->name) {
                    case 'caption':
                    case 'colgroup':
                    case 'thead':
                    case 'tfoot':
                    case 'tbody':
                    case 'tr':
                        $is_collecting = true;
                        $collection[] = $token;
                        break;
                    default:
                        if ($token instanceof HTMLPurifier_Token_Text && $token->is_whitespace) {
                            // leading whitespace travels with the next node
                            $collection[] = $token;
                            $tag_index++;
                        }
                        break;
                }
            }
        }

        if (empty($content)) return false;

        $ret = array();
        if ($caption !== false) $ret = array_merge($ret, $caption);
        if ($cols !== false)    foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
        if ($thead !== false)   $ret = array_merge($ret, $thead);
        if ($tfoot !== false)   $ret = array_merge($ret, $tfoot);
        foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
        if (!empty($collection) && $is_collecting == false){
            // grab the trailing space
            $ret = array_merge($ret, $collection);
        }

        array_pop($tokens_of_children); // remove phantom token

        return ($ret === $tokens_of_children) ? true : $ret;

    }

}
/**
 * Base decorator for definition caches: every cache operation is
 * forwarded to the wrapped cache object.
 */
class HTMLPurifier_DefinitionCache_Decorator extends HTMLPurifier_DefinitionCache
{

    /**
     * Cache object we are decorating
     */
    public $cache;

    public function __construct() {}

    /**
     * Lazy decorator function
     * @param $cache Reference to cache object to decorate
     * @return A copy of this decorator bound to $cache, with $cache's type
     */
    public function decorate(&$cache) {
        $decorator = $this->copy();
        // reference is necessary for mocks in PHP 4
        $decorator->cache =& $cache;
        $decorator->type  = $cache->type;
        return $decorator;
    }

    /**
     * Cross-compatible clone substitute
     */
    public function copy() {
        return new HTMLPurifier_DefinitionCache_Decorator();
    }

    public function add($def, $config) {
        return $this->cache->add($def, $config);
    }

    public function set($def, $config) {
        return $this->cache->set($def, $config);
    }

    public function replace($def, $config) {
        return $this->cache->replace($def, $config);
    }

    public function get($config) {
        return $this->cache->get($config);
    }

    public function remove($config) {
        return $this->cache->remove($config);
    }

    public function flush($config) {
        return $this->cache->flush($config);
    }

    public function cleanup($config) {
        return $this->cache->cleanup($config);
    }

}
/**
 * Null cache object to use when no caching is on.
 * Every operation reports failure without storing or reading anything.
 */
class HTMLPurifier_DefinitionCache_Null extends HTMLPurifier_DefinitionCache
{

    public function add($def, $config) {
        return false;
    }

    public function set($def, $config) {
        return false;
    }

    public function replace($def, $config) {
        return false;
    }

    public function remove($config) {
        return false;
    }

    public function get($config) {
        return false;
    }

    public function flush($config) {
        return false;
    }

    public function cleanup($config) {
        return false;
    }

}
/**
 * Definition cache that stores each definition as a serialized file
 * (<key>.ser) in a per-type directory below %Cache.SerializerPath.
 */
class HTMLPurifier_DefinitionCache_Serializer extends
      HTMLPurifier_DefinitionCache
{

    /**
     * Stores a definition only if no cache file exists for it yet.
     * @return Bytes written, false on failure or if the file exists,
     *         null when the definition type check fails
     */
    public function add($def, $config) {
        if (!$this->checkDefType($def)) return;
        $file = $this->generateFilePath($config);
        if (file_exists($file)) return false;
        if (!$this->_prepareDir($config)) return false;
        return $this->_write($file, serialize($def));
    }

    /**
     * Stores a definition, overwriting any existing cache file.
     * @return Bytes written, false on failure, null when the definition
     *         type check fails
     */
    public function set($def, $config) {
        if (!$this->checkDefType($def)) return;
        $file = $this->generateFilePath($config);
        if (!$this->_prepareDir($config)) return false;
        return $this->_write($file, serialize($def));
    }

    /**
     * Stores a definition only if a cache file already exists for it.
     * @return Bytes written, false on failure or if there is no file,
     *         null when the definition type check fails
     */
    public function replace($def, $config) {
        if (!$this->checkDefType($def)) return;
        $file = $this->generateFilePath($config);
        if (!file_exists($file)) return false;
        if (!$this->_prepareDir($config)) return false;
        return $this->_write($file, serialize($def));
    }

    /**
     * Loads the cached definition for a configuration.
     * @return The unserialized definition, or false if no file exists
     */
    public function get($config) {
        $file = $this->generateFilePath($config);
        if (!file_exists($file)) return false;
        // NOTE(review): unserialize() trusts the file contents; the cache
        // directory must not be writable by untrusted parties.
        return unserialize(file_get_contents($file));
    }

    /**
     * Deletes the cache file for a configuration.
     * @return Result of unlink(), or false if no file exists
     */
    public function remove($config) {
        $file = $this->generateFilePath($config);
        if (!file_exists($file)) return false;
        return unlink($file);
    }

    /**
     * Deletes every non-dot file in this type's cache directory.
     * @return False if the directory cannot be prepared or opened
     */
    public function flush($config) {
        if (!$this->_prepareDir($config)) return false;
        $dir = $this->generateDirectoryPath($config);
        $dh  = opendir($dir);
        if ($dh === false) return false;
        while (false !== ($filename = readdir($dh))) {
            if (empty($filename)) continue;
            if ($filename[0] === '.') continue;
            unlink($dir . '/' . $filename);
        }
        closedir($dh); // release the directory handle
    }

    /**
     * Deletes the cache files whose key isOld() reports as outdated.
     * @return False if the directory cannot be prepared or opened
     */
    public function cleanup($config) {
        if (!$this->_prepareDir($config)) return false;
        $dir = $this->generateDirectoryPath($config);
        $dh  = opendir($dir);
        if ($dh === false) return false;
        while (false !== ($filename = readdir($dh))) {
            if (empty($filename)) continue;
            if ($filename[0] === '.') continue;
            // strip the 4-character ".ser" extension to recover the key
            $key = substr($filename, 0, strlen($filename) - 4);
            if ($this->isOld($key, $config)) unlink($dir . '/' . $filename);
        }
        closedir($dh); // release the directory handle
    }

    /**
     * Generates the file path to the serial file corresponding to
     * the configuration and definition name
     * @todo Make protected
     */
    public function generateFilePath($config) {
        $key = $this->generateKey($config);
        return $this->generateDirectoryPath($config) . '/' . $key . '.ser';
    }

    /**
     * Generates the path to the directory contain this cache's serial files
     * @note No trailing slash
     * @todo Make protected
     */
    public function generateDirectoryPath($config) {
        $base = $this->generateBaseDirectoryPath($config);
        return $base . '/' . $this->type;
    }

    /**
     * Generates path to base directory that contains all definition type
     * serials
     * @todo Make protected
     */
    public function generateBaseDirectoryPath($config) {
        $base = $config->get('Cache', 'SerializerPath');
        $base = is_null($base) ? HTMLPURIFIER_PREFIX . '/HTMLPurifier/DefinitionCache/Serializer' : $base;
        return $base;
    }

    /**
     * Convenience wrapper function for file_put_contents
     * @param $file File name to write to
     * @param $data Data to write into file
     * @return Number of bytes written if success, or false if failure.
     */
    private function _write($file, $data) {
        return file_put_contents($file, $data);
    }

    /**
     * Prepares the directory that this type stores the serials in
     * @return True if successful
     */
    private function _prepareDir($config) {
        // NOTE(review): the E_USER_ERROR severities in this method and in
        // _testPermissions() are assumed; confirm against the canonical
        // source.
        $directory = $this->generateDirectoryPath($config);
        if (!is_dir($directory)) {
            $base = $this->generateBaseDirectoryPath($config);
            if (!is_dir($base)) {
                trigger_error('Base directory '.$base.' does not exist,
                    please create or change using %Cache.SerializerPath',
                    E_USER_ERROR);
                return false;
            } elseif (!$this->_testPermissions($base)) {
                return false;
            }
            $old = umask(0022); // disable group and world writes
            mkdir($directory);
            umask($old);
        } elseif (!$this->_testPermissions($directory)) {
            return false;
        }
        return true;
    }

    /**
     * Tests permissions on a directory and throws out friendly
     * error messages and attempts to chmod it itself if possible
     * @param $dir Directory path to test
     * @return True if the directory is (or was made) usable, else false
     */
    private function _testPermissions($dir) {
        // early abort, if it is writable, everything is hunky-dory
        if (is_writable($dir)) return true;
        if (!is_dir($dir)) {
            // generally, you'll want to handle this beforehand
            // so a more specific error message can be given
            trigger_error('Directory '.$dir.' does not exist',
                E_USER_ERROR);
            return false;
        }
        if (function_exists('posix_getuid')) {
            // POSIX system, we can give more specific advice
            if (fileowner($dir) === posix_getuid()) {
                // we can chmod it ourselves
                chmod($dir, 0755);
                return true;
            } elseif (filegroup($dir) === posix_getgid()) {
                $chmod = '775';
            } else {
                // PHP's probably running as nobody, so we'll
                // need to give global permissions
                $chmod = '777';
            }
            trigger_error('Directory '.$dir.' not writable, '.
                'please chmod to ' . $chmod,
                E_USER_ERROR);
        } else {
            // generic error message
            trigger_error('Directory '.$dir.' not writable, '.
                'please alter file permissions',
                E_USER_ERROR);
        }
        return false;
    }

}
/**
 * Definition cache decorator class that cleans up the cache
 * whenever there is a cache miss.
 */
class HTMLPurifier_DefinitionCache_Decorator_Cleanup extends
      HTMLPurifier_DefinitionCache_Decorator
{

    public $name = 'Cleanup';

    public function copy() {
        return new HTMLPurifier_DefinitionCache_Decorator_Cleanup();
    }

    /**
     * Forwards add(); a falsy result triggers cleanup().
     * @return The wrapped cache's result
     */
    public function add($def, $config) {
        $status = parent::add($def, $config);
        if (!$status) parent::cleanup($config);
        return $status;
    }

    /**
     * Forwards set(); a falsy result triggers cleanup().
     * @return The wrapped cache's result
     */
    public function set($def, $config) {
        $status = parent::set($def, $config);
        if (!$status) parent::cleanup($config);
        return $status;
    }

    /**
     * Forwards replace(); a falsy result triggers cleanup().
     * @return The wrapped cache's result
     */
    public function replace($def, $config) {
        $status = parent::replace($def, $config);
        if (!$status) parent::cleanup($config);
        return $status;
    }

    /**
     * Forwards get(); a falsy result (cache miss) triggers cleanup().
     * @return The wrapped cache's result
     */
    public function get($config) {
        $ret = parent::get($config);
        if (!$ret) parent::cleanup($config);
        return $ret;
    }

}
10824
* Definition cache decorator class that saves all cache retrievals
10825
* to PHP's memory; good for unit tests or circumstances where
10826
* there are lots of configuration objects floating around.
10828
class HTMLPurifier_DefinitionCache_Decorator_Memory extends
10829
HTMLPurifier_DefinitionCache_Decorator
10832
protected $definitions;
10833
public $name = 'Memory';
10835
public function copy() {
10836
return new HTMLPurifier_DefinitionCache_Decorator_Memory();
10839
public function add($def, $config) {
10840
$status = parent::add($def, $config);
10841
if ($status) $this->definitions[$this->generateKey($config)] = $def;
10845
public function set($def, $config) {
10846
$status = parent::set($def, $config);
10847
if ($status) $this->definitions[$this->generateKey($config)] = $def;
10851
public function replace($def, $config) {
10852
$status = parent::replace($def, $config);
10853
if ($status) $this->definitions[$this->generateKey($config)] = $def;
10857
public function get($config) {
10858
$key = $this->generateKey($config);
10859
if (isset($this->definitions[$key])) return $this->definitions[$key];
10860
$this->definitions[$key] = parent::get($config);
10861
return $this->definitions[$key];
10870
* XHTML 1.1 Bi-directional Text Module, defines elements that
10871
* declare directionality of content. Text Extension Module.
10873
class HTMLPurifier_HTMLModule_Bdo extends HTMLPurifier_HTMLModule
10876
public $name = 'Bdo';
10877
public $attr_collections = array(
10878
'I18N' => array('dir' => false)
10881
public function setup($config) {
10882
$bdo = $this->addElement(
10883
'bdo', 'Inline', 'Inline', array('Core', 'Lang'),
10885
'dir' => 'Enum#ltr,rtl', // required
10886
// The Abstract Module specification has the attribute
10887
// inclusions wrong for bdo: bdo allows Lang
10890
$bdo->attr_transform_post['required-dir'] = new HTMLPurifier_AttrTransform_BdoDir();
10892
$this->attr_collections['I18N']['dir'] = 'Enum#ltr,rtl';
10900
class HTMLPurifier_HTMLModule_CommonAttributes extends HTMLPurifier_HTMLModule
10902
public $name = 'CommonAttributes';
10904
public $attr_collections = array(
10906
0 => array('Style'),
10907
// 'xml:space' => false,
10908
'class' => 'NMTOKENS',
10910
'title' => 'CDATA',
10914
0 => array('Lang'), // proprietary, for xml:lang/lang
10917
0 => array('Core', 'I18N')
10926
* XHTML 1.1 Edit Module, defines editing-related elements. Text Extension
10929
class HTMLPurifier_HTMLModule_Edit extends HTMLPurifier_HTMLModule
10932
public $name = 'Edit';
10934
public function setup($config) {
10935
$contents = 'Chameleon: #PCDATA | Inline ! #PCDATA | Flow';
10938
// 'datetime' => 'Datetime', // not implemented
10940
$this->addElement('del', 'Inline', $contents, 'Common', $attr);
10941
$this->addElement('ins', 'Inline', $contents, 'Common', $attr);
10944
// HTML 4.01 specifies that ins/del must not contain block
10945
// elements when used in an inline context, chameleon is
10946
// a complicated workaround to achieve this effect
10948
// Inline context ! Block context (exclamation mark is
10949
// separator, see getChildDef for parsing)
10951
public $defines_child_def = true;
10952
public function getChildDef($def) {
10953
if ($def->content_model_type != 'chameleon') return false;
10954
$value = explode('!', $def->content_model);
10955
return new HTMLPurifier_ChildDef_Chameleon($value[0], $value[1]);
10964
* XHTML 1.1 Forms module, defines all form-related elements found in HTML 4.
10966
class HTMLPurifier_HTMLModule_Forms extends HTMLPurifier_HTMLModule
10968
public $name = 'Forms';
10969
public $safe = false;
10971
public $content_sets = array(
10973
'Inline' => 'Formctrl',
10976
public function setup($config) {
10977
$form = $this->addElement('form', 'Form',
10978
'Required: Heading | List | Block | fieldset', 'Common', array(
10979
'accept' => 'ContentTypes',
10980
'accept-charset' => 'Charsets',
10981
'action*' => 'URI',
10982
'method' => 'Enum#get,post',
10983
// really ContentType, but these two are the only ones used today
10984
'enctype' => 'Enum#application/x-www-form-urlencoded,multipart/form-data',
10986
$form->excludes = array('form' => true);
10988
$input = $this->addElement('input', 'Formctrl', 'Empty', 'Common', array(
10989
'accept' => 'ContentTypes',
10990
'accesskey' => 'Character',
10992
'checked' => 'Bool#checked',
10993
'disabled' => 'Bool#disabled',
10994
'maxlength' => 'Number',
10996
'readonly' => 'Bool#readonly',
10997
'size' => 'Number',
10998
'src' => 'URI#embeds',
10999
'tabindex' => 'Number',
11000
'type' => 'Enum#text,password,checkbox,button,radio,submit,reset,file,hidden,image',
11001
'value' => 'CDATA',
11003
$input->attr_transform_post[] = new HTMLPurifier_AttrTransform_Input();
11005
$this->addElement('select', 'Formctrl', 'Required: optgroup | option', 'Common', array(
11006
'disabled' => 'Bool#disabled',
11007
'multiple' => 'Bool#multiple',
11009
'size' => 'Number',
11010
'tabindex' => 'Number',
11013
$this->addElement('option', false, 'Optional: #PCDATA', 'Common', array(
11014
'disabled' => 'Bool#disabled',
11016
'selected' => 'Bool#selected',
11017
'value' => 'CDATA',
11019
// It's illegal for there to be more than one selected, but not
11020
// be multiple. Also, no selected means undefined behavior. This might
11021
// be difficult to implement; perhaps an injector, or a context variable.
11023
$textarea = $this->addElement('textarea', 'Formctrl', 'Optional: #PCDATA', 'Common', array(
11024
'accesskey' => 'Character',
11025
'cols*' => 'Number',
11026
'disabled' => 'Bool#disabled',
11028
'readonly' => 'Bool#readonly',
11029
'rows*' => 'Number',
11030
'tabindex' => 'Number',
11032
$textarea->attr_transform_pre[] = new HTMLPurifier_AttrTransform_Textarea();
11034
$button = $this->addElement('button', 'Formctrl', 'Optional: #PCDATA | Heading | List | Block | Inline', 'Common', array(
11035
'accesskey' => 'Character',
11036
'disabled' => 'Bool#disabled',
11038
'tabindex' => 'Number',
11039
'type' => 'Enum#button,submit,reset',
11040
'value' => 'CDATA',
11043
// For exclusions, ideally we'd specify content sets, not literal elements
11044
$button->excludes = $this->makeLookup(
11045
'form', 'fieldset', // Form
11046
'input', 'select', 'textarea', 'label', 'button', // Formctrl
11047
'a' // as per HTML 4.01 spec, this is omitted by modularization
11050
// Extra exclusion: img usemap="" is not permitted within this element.
11051
// We'll omit this for now, since we don't have any good way of
11052
// indicating it yet.
11054
// This is HIGHLY user-unfriendly; we need a custom child-def for this
11055
$this->addElement('fieldset', 'Form', 'Custom: (#WS?,legend,(Flow|#PCDATA)*)', 'Common');
11057
$label = $this->addElement('label', 'Formctrl', 'Optional: #PCDATA | Inline', 'Common', array(
11058
'accesskey' => 'Character',
11059
// 'for' => 'IDREF', // IDREF not implemented, cannot allow
11061
$label->excludes = array('label' => true);
11063
$this->addElement('legend', false, 'Optional: #PCDATA | Inline', 'Common', array(
11064
'accesskey' => 'Character',
11067
$this->addElement('optgroup', false, 'Required: option', 'Common', array(
11068
'disabled' => 'Bool#disabled',
11069
'label*' => 'Text',
11072
// Don't forget an injector for <isindex>. This one's a little complex
11073
// because it maps to multiple elements.
11082
* XHTML 1.1 Hypertext Module, defines hypertext links. Core Module.
11084
class HTMLPurifier_HTMLModule_Hypertext extends HTMLPurifier_HTMLModule
11087
public $name = 'Hypertext';
11089
public function setup($config) {
11090
$a = $this->addElement(
11091
'a', 'Inline', 'Inline', 'Common',
11093
// 'accesskey' => 'Character',
11094
// 'charset' => 'Charset',
11096
// 'hreflang' => 'LanguageCode',
11097
'rel' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rel'),
11098
'rev' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rev'),
11099
// 'tabindex' => 'Number',
11100
// 'type' => 'ContentType',
11103
$a->excludes = array('a' => true);
11112
* XHTML 1.1 Image Module provides basic image embedding.
11113
* @note There is specialized code for removing empty images in
11114
* HTMLPurifier_Strategy_RemoveForeignElements
11116
class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
11119
public $name = 'Image';
11121
public function setup($config) {
11122
$max = $config->get('HTML', 'MaxImgLength');
11123
$img = $this->addElement(
11124
'img', 'Inline', 'Empty', 'Common',
11127
// According to the spec, it's Length, but percents can
11128
// be abused, so we allow only Pixels.
11129
'height' => 'Pixels#' . $max,
11130
'width' => 'Pixels#' . $max,
11131
'longdesc' => 'URI',
11132
'src*' => new HTMLPurifier_AttrDef_URI(true), // embedded
11135
if ($max === null || $config->get('HTML', 'Trusted')) {
11136
$img->attr['height'] =
11137
$img->attr['width'] = 'Length';
11140
// kind of strange, but splitting things up would be inefficient
11141
$img->attr_transform_pre[] =
11142
$img->attr_transform_post[] =
11143
new HTMLPurifier_AttrTransform_ImgRequired();
11152
* XHTML 1.1 Legacy module defines elements that were previously
11155
* @note Not all legacy elements have been implemented yet, which
11156
* is a bit of a reverse problem as compared to browsers! In
11157
* addition, this legacy module may implement a bit more than
11158
* mandated by XHTML 1.1.
11160
* This module can be used in combination with TransformToStrict in order
11161
* to transform as many deprecated elements as possible, but retain
11162
* questionably deprecated elements that do not have good alternatives
11163
* as well as transform elements that don't have an implementation.
11164
* See docs/ref-strictness.txt for more details.
11167
class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule
11170
public $name = 'Legacy';
11172
public function setup($config) {
11174
$this->addElement('basefont', 'Inline', 'Empty', false, array(
11175
'color' => 'Color',
11176
'face' => 'Text', // extremely broad, we should
11177
'size' => 'Text', // tighten it
11180
$this->addElement('center', 'Block', 'Flow', 'Common');
11181
$this->addElement('dir', 'Block', 'Required: li', 'Common', array(
11182
'compact' => 'Bool#compact'
11184
$this->addElement('font', 'Inline', 'Inline', array('Core', 'I18N'), array(
11185
'color' => 'Color',
11186
'face' => 'Text', // extremely broad, we should
11187
'size' => 'Text', // tighten it
11189
$this->addElement('menu', 'Block', 'Required: li', 'Common', array(
11190
'compact' => 'Bool#compact'
11192
$this->addElement('s', 'Inline', 'Inline', 'Common');
11193
$this->addElement('strike', 'Inline', 'Inline', 'Common');
11194
$this->addElement('u', 'Inline', 'Inline', 'Common');
11196
// setup modifications to old elements
11198
$align = 'Enum#left,right,center,justify';
11200
$address = $this->addBlankElement('address');
11201
$address->content_model = 'Inline | #PCDATA | p';
11202
$address->content_model_type = 'optional';
11203
$address->child = false;
11205
$blockquote = $this->addBlankElement('blockquote');
11206
$blockquote->content_model = 'Flow | #PCDATA';
11207
$blockquote->content_model_type = 'optional';
11208
$blockquote->child = false;
11210
$br = $this->addBlankElement('br');
11211
$br->attr['clear'] = 'Enum#left,all,right,none';
11213
$caption = $this->addBlankElement('caption');
11214
$caption->attr['align'] = 'Enum#top,bottom,left,right';
11216
$div = $this->addBlankElement('div');
11217
$div->attr['align'] = $align;
11219
$dl = $this->addBlankElement('dl');
11220
$dl->attr['compact'] = 'Bool#compact';
11222
for ($i = 1; $i <= 6; $i++) {
11223
$h = $this->addBlankElement("h$i");
11224
$h->attr['align'] = $align;
11227
$hr = $this->addBlankElement('hr');
11228
$hr->attr['align'] = $align;
11229
$hr->attr['noshade'] = 'Bool#noshade';
11230
$hr->attr['size'] = 'Pixels';
11231
$hr->attr['width'] = 'Length';
11233
$img = $this->addBlankElement('img');
11234
$img->attr['align'] = 'Enum#top,middle,bottom,left,right';
11235
$img->attr['border'] = 'Pixels';
11236
$img->attr['hspace'] = 'Pixels';
11237
$img->attr['vspace'] = 'Pixels';
11239
// figure out this integer business
11241
$li = $this->addBlankElement('li');
11242
$li->attr['value'] = new HTMLPurifier_AttrDef_Integer();
11243
$li->attr['type'] = 'Enum#s:1,i,I,a,A,disc,square,circle';
11245
$ol = $this->addBlankElement('ol');
11246
$ol->attr['compact'] = 'Bool#compact';
11247
$ol->attr['start'] = new HTMLPurifier_AttrDef_Integer();
11248
$ol->attr['type'] = 'Enum#s:1,i,I,a,A';
11250
$p = $this->addBlankElement('p');
11251
$p->attr['align'] = $align;
11253
$pre = $this->addBlankElement('pre');
11254
$pre->attr['width'] = 'Number';
11258
$table = $this->addBlankElement('table');
11259
$table->attr['align'] = 'Enum#left,center,right';
11260
$table->attr['bgcolor'] = 'Color';
11262
$tr = $this->addBlankElement('tr');
11263
$tr->attr['bgcolor'] = 'Color';
11265
$th = $this->addBlankElement('th');
11266
$th->attr['bgcolor'] = 'Color';
11267
$th->attr['height'] = 'Length';
11268
$th->attr['nowrap'] = 'Bool#nowrap';
11269
$th->attr['width'] = 'Length';
11271
$td = $this->addBlankElement('td');
11272
$td->attr['bgcolor'] = 'Color';
11273
$td->attr['height'] = 'Length';
11274
$td->attr['nowrap'] = 'Bool#nowrap';
11275
$td->attr['width'] = 'Length';
11277
$ul = $this->addBlankElement('ul');
11278
$ul->attr['compact'] = 'Bool#compact';
11279
$ul->attr['type'] = 'Enum#square,disc,circle';
11289
* XHTML 1.1 List Module, defines list-oriented elements. Core Module.
11291
class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
11294
public $name = 'List';
11296
// According to the abstract schema, the List content set is a fully formed
11297
// one or more expr, but it invariably occurs in an optional declaration
11298
// so we're not going to do that subtlety. It might cause trouble
11299
// if a user defines "List" and expects that multiple lists are
11300
// allowed to be specified, but then again, that's not very intuitive.
11301
// Furthermore, the actual XML Schema may disagree. Regardless,
11302
// we don't have support for such nested expressions without using
11303
// the incredibly inefficient and draconic Custom ChildDef.
11305
public $content_sets = array('Flow' => 'List');
11307
public function setup($config) {
11308
$this->addElement('ol', 'List', 'Required: li', 'Common');
11309
$this->addElement('ul', 'List', 'Required: li', 'Common');
11310
$this->addElement('dl', 'List', 'Required: dt | dd', 'Common');
11312
$this->addElement('li', false, 'Flow', 'Common');
11314
$this->addElement('dd', false, 'Flow', 'Common');
11315
$this->addElement('dt', false, 'Inline', 'Common');
11323
class HTMLPurifier_HTMLModule_Name extends HTMLPurifier_HTMLModule
11326
public $name = 'Name';
11328
public function setup($config) {
11329
$elements = array('a', 'applet', 'form', 'frame', 'iframe', 'img', 'map');
11330
foreach ($elements as $name) {
11331
$element = $this->addBlankElement($name);
11332
$element->attr['name'] = 'ID';
11340
class HTMLPurifier_HTMLModule_NonXMLCommonAttributes extends HTMLPurifier_HTMLModule
11342
public $name = 'NonXMLCommonAttributes';
11344
public $attr_collections = array(
11346
'lang' => 'LanguageCode',
11355
* XHTML 1.1 Object Module, defines elements for generic object inclusion
11356
* @warning Users will commonly use <embed> to cater to legacy browsers: this
11357
* module does not allow this sort of behavior
11359
class HTMLPurifier_HTMLModule_Object extends HTMLPurifier_HTMLModule
11362
public $name = 'Object';
11363
public $safe = false;
11365
public function setup($config) {
11367
$this->addElement('object', 'Inline', 'Optional: #PCDATA | Flow | param', 'Common',
11369
'archive' => 'URI',
11370
'classid' => 'URI',
11371
'codebase' => 'URI',
11372
'codetype' => 'Text',
11374
'declare' => 'Bool#declare',
11375
'height' => 'Length',
11377
'standby' => 'Text',
11378
'tabindex' => 'Number',
11379
'type' => 'ContentType',
11380
'width' => 'Length'
11384
$this->addElement('param', false, 'Empty', false,
11390
'valuetype' => 'Enum#data,ref,object'
11402
* XHTML 1.1 Presentation Module, defines simple presentation-related
11403
* markup. Text Extension Module.
11404
* @note The official XML Schema and DTD specs further divide this into
11406
* - Block Presentation (hr)
11407
* - Inline Presentation (b, big, i, small, sub, sup, tt)
11408
* We have chosen not to heed this distinction, as content_sets
11409
* provides satisfactory disambiguation.
11411
class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule
11414
public $name = 'Presentation';
11416
public function setup($config) {
11417
$this->addElement('b', 'Inline', 'Inline', 'Common');
11418
$this->addElement('big', 'Inline', 'Inline', 'Common');
11419
$this->addElement('hr', 'Block', 'Empty', 'Common');
11420
$this->addElement('i', 'Inline', 'Inline', 'Common');
11421
$this->addElement('small', 'Inline', 'Inline', 'Common');
11422
$this->addElement('sub', 'Inline', 'Inline', 'Common');
11423
$this->addElement('sup', 'Inline', 'Inline', 'Common');
11424
$this->addElement('tt', 'Inline', 'Inline', 'Common');
11433
* Module defines proprietary tags and attributes in HTML.
11434
* @warning If this module is enabled, standards-compliance is off!
11436
class HTMLPurifier_HTMLModule_Proprietary extends HTMLPurifier_HTMLModule
11439
public $name = 'Proprietary';
11441
public function setup($config) {
11443
$this->addElement('marquee', 'Inline', 'Flow', 'Common',
11445
'direction' => 'Enum#left,right,up,down',
11446
'behavior' => 'Enum#alternate',
11447
'width' => 'Length',
11448
'height' => 'Length',
11449
'scrolldelay' => 'Number',
11450
'scrollamount' => 'Number',
11451
'loop' => 'Number',
11452
'bgcolor' => 'Color',
11453
'hspace' => 'Pixels',
11454
'vspace' => 'Pixels',
11466
* XHTML 1.1 Ruby Annotation Module, defines elements that indicate
11467
* short runs of text alongside base text for annotation or pronunciation.
11469
class HTMLPurifier_HTMLModule_Ruby extends HTMLPurifier_HTMLModule
11472
public $name = 'Ruby';
11474
public function setup($config) {
11475
$this->addElement('ruby', 'Inline',
11476
'Custom: ((rb, (rt | (rp, rt, rp))) | (rbc, rtc, rtc?))',
11478
$this->addElement('rbc', false, 'Required: rb', 'Common');
11479
$this->addElement('rtc', false, 'Required: rt', 'Common');
11480
$rb = $this->addElement('rb', false, 'Inline', 'Common');
11481
$rb->excludes = array('ruby' => true);
11482
$rt = $this->addElement('rt', false, 'Inline', 'Common', array('rbspan' => 'Number'));
11483
$rt->excludes = array('ruby' => true);
11484
$this->addElement('rp', false, 'Optional: #PCDATA', 'Common');
11493
* A "safe" embed module. See SafeObject. This is a proprietary element.
11495
class HTMLPurifier_HTMLModule_SafeEmbed extends HTMLPurifier_HTMLModule
11498
public $name = 'SafeEmbed';
11500
public function setup($config) {
11502
$max = $config->get('HTML', 'MaxImgLength');
11503
$embed = $this->addElement(
11504
'embed', 'Inline', 'Empty', 'Common',
11506
'src*' => 'URI#embedded',
11507
'type' => 'Enum#application/x-shockwave-flash',
11508
'width' => 'Pixels#' . $max,
11509
'height' => 'Pixels#' . $max,
11510
'allowscriptaccess' => 'Enum#never',
11511
'allownetworking' => 'Enum#internal',
11512
'wmode' => 'Enum#window',
11516
$embed->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeEmbed();
11525
* A "safe" object module. In theory, objects permitted by this module will
11526
* be safe, and untrusted users can be allowed to embed arbitrary flash objects
11527
* (maybe other types too, but only Flash is supported as of right now).
11528
* Highly experimental.
11530
class HTMLPurifier_HTMLModule_SafeObject extends HTMLPurifier_HTMLModule
11533
public $name = 'SafeObject';
11535
public function setup($config) {
11537
// These definitions are not intrinsically safe: the attribute transforms
11538
// are a vital part of ensuring safety.
11540
$max = $config->get('HTML', 'MaxImgLength');
11541
$object = $this->addElement(
11544
'Optional: param | Flow | #PCDATA',
11547
// While technically not required by the spec, we're forcing
11548
// it to this value.
11549
'type' => 'Enum#application/x-shockwave-flash',
11550
'width' => 'Pixels#' . $max,
11551
'height' => 'Pixels#' . $max,
11552
'data' => 'URI#embedded'
11555
$object->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeObject();
11557
$param = $this->addElement('param', false, 'Empty', false,
11564
$param->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeParam();
11565
$this->info_injector[] = 'SafeObject';
11575
WARNING: THIS MODULE IS EXTREMELY DANGEROUS AS IT ENABLES INLINE SCRIPTING
11576
INSIDE HTML PURIFIER DOCUMENTS. USE ONLY WITH TRUSTED USER INPUT!!!
11581
* XHTML 1.1 Scripting module, defines elements that are used to contain
11582
* information pertaining to executable scripts or the lack of support
11583
* for executable scripts.
11584
* @note This module does not contain inline scripting elements
11586
class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule
11588
public $name = 'Scripting';
11589
public $elements = array('script', 'noscript');
11590
public $content_sets = array('Block' => 'script | noscript', 'Inline' => 'script | noscript');
11591
public $safe = false;
11593
public function setup($config) {
11594
// TODO: create custom child-definition for noscript that
11595
// auto-wraps stray #PCDATA in a similar manner to
11596
// blockquote's custom definition (we would use it but
11597
// blockquote's contents are optional while noscript's contents
11600
// TODO: convert this to new syntax, main problem is getting
11601
// both content sets working
11603
// In theory, this could be safe, but I don't see any reason to
11605
$this->info['noscript'] = new HTMLPurifier_ElementDef();
11606
$this->info['noscript']->attr = array( 0 => array('Common') );
11607
$this->info['noscript']->content_model = 'Heading | List | Block';
11608
$this->info['noscript']->content_model_type = 'required';
11610
$this->info['script'] = new HTMLPurifier_ElementDef();
11611
$this->info['script']->attr = array(
11612
'defer' => new HTMLPurifier_AttrDef_Enum(array('defer')),
11613
'src' => new HTMLPurifier_AttrDef_URI(true),
11614
'type' => new HTMLPurifier_AttrDef_Enum(array('text/javascript'))
11616
$this->info['script']->content_model = '#PCDATA';
11617
$this->info['script']->content_model_type = 'optional';
11618
$this->info['script']->attr_transform_pre['type'] =
11619
$this->info['script']->attr_transform_post['type'] =
11620
new HTMLPurifier_AttrTransform_ScriptRequired();
11628
* XHTML 1.1 Style Attribute Module, defines the style attribute via the Style attribute collection.
11631
class HTMLPurifier_HTMLModule_StyleAttribute extends HTMLPurifier_HTMLModule
11634
public $name = 'StyleAttribute';
11635
public $attr_collections = array(
11636
// The inclusion routine differs from the Abstract Modules but
11637
// is in line with the DTD and XML Schemas.
11638
'Style' => array('style' => false), // see constructor
11639
'Core' => array(0 => array('Style'))
11642
public function setup($config) {
11643
$this->attr_collections['Style']['style'] = new HTMLPurifier_AttrDef_CSS();
11652
* XHTML 1.1 Tables Module, fully defines accessible table elements.
11654
class HTMLPurifier_HTMLModule_Tables extends HTMLPurifier_HTMLModule
11657
public $name = 'Tables';
11659
public function setup($config) {
11661
$this->addElement('caption', false, 'Inline', 'Common');
11663
$this->addElement('table', 'Block',
11664
new HTMLPurifier_ChildDef_Table(), 'Common',
11666
'border' => 'Pixels',
11667
'cellpadding' => 'Length',
11668
'cellspacing' => 'Length',
11669
'frame' => 'Enum#void,above,below,hsides,lhs,rhs,vsides,box,border',
11670
'rules' => 'Enum#none,groups,rows,cols,all',
11671
'summary' => 'Text',
11672
'width' => 'Length'
11676
// common attributes
11677
$cell_align = array(
11678
'align' => 'Enum#left,center,right,justify,char',
11679
'charoff' => 'Length',
11680
'valign' => 'Enum#top,middle,bottom,baseline',
11683
$cell_t = array_merge(
11686
'colspan' => 'Number',
11687
'rowspan' => 'Number',
11691
$this->addElement('td', false, 'Flow', 'Common', $cell_t);
11692
$this->addElement('th', false, 'Flow', 'Common', $cell_t);
11694
$this->addElement('tr', false, 'Required: td | th', 'Common', $cell_align);
11696
$cell_col = array_merge(
11698
'span' => 'Number',
11699
'width' => 'MultiLength',
11703
$this->addElement('col', false, 'Empty', 'Common', $cell_col);
11704
$this->addElement('colgroup', false, 'Optional: col', 'Common', $cell_col);
11706
$this->addElement('tbody', false, 'Required: tr', 'Common', $cell_align);
11707
$this->addElement('thead', false, 'Required: tr', 'Common', $cell_align);
11708
$this->addElement('tfoot', false, 'Required: tr', 'Common', $cell_align);
11718
* XHTML 1.1 Target Module, defines target attribute in link elements.
11720
class HTMLPurifier_HTMLModule_Target extends HTMLPurifier_HTMLModule
11723
public $name = 'Target';
11725
public function setup($config) {
11726
$elements = array('a');
11727
foreach ($elements as $name) {
11728
$e = $this->addBlankElement($name);
11730
'target' => new HTMLPurifier_AttrDef_HTML_FrameTarget()
11741
* XHTML 1.1 Text Module, defines basic text containers. Core Module.
11742
* @note In the normative XML Schema specification, this module
11743
* is further abstracted into the following modules:
11744
* - Block Phrasal (address, blockquote, pre, h1, h2, h3, h4, h5, h6)
11745
* - Block Structural (div, p)
11746
* - Inline Phrasal (abbr, acronym, cite, code, dfn, em, kbd, q, samp, strong, var)
11747
* - Inline Structural (br, span)
11748
* This module, functionally, does not distinguish between these
11749
* sub-modules, but the code is internally structured to reflect
11750
* these distinctions.
11752
class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule
11755
public $name = 'Text';
11756
public $content_sets = array(
11757
'Flow' => 'Heading | Block | Inline'
11760
public function setup($config) {
11762
// Inline Phrasal -------------------------------------------------
11763
$this->addElement('abbr', 'Inline', 'Inline', 'Common');
11764
$this->addElement('acronym', 'Inline', 'Inline', 'Common');
11765
$this->addElement('cite', 'Inline', 'Inline', 'Common');
11766
$this->addElement('code', 'Inline', 'Inline', 'Common');
11767
$this->addElement('dfn', 'Inline', 'Inline', 'Common');
11768
$this->addElement('em', 'Inline', 'Inline', 'Common');
11769
$this->addElement('kbd', 'Inline', 'Inline', 'Common');
11770
$this->addElement('q', 'Inline', 'Inline', 'Common', array('cite' => 'URI'));
11771
$this->addElement('samp', 'Inline', 'Inline', 'Common');
11772
$this->addElement('strong', 'Inline', 'Inline', 'Common');
11773
$this->addElement('var', 'Inline', 'Inline', 'Common');
11775
// Inline Structural ----------------------------------------------
11776
$this->addElement('span', 'Inline', 'Inline', 'Common');
11777
$this->addElement('br', 'Inline', 'Empty', 'Core');
11779
// Block Phrasal --------------------------------------------------
11780
$this->addElement('address', 'Block', 'Inline', 'Common');
11781
$this->addElement('blockquote', 'Block', 'Optional: Heading | Block | List', 'Common', array('cite' => 'URI') );
11782
$pre = $this->addElement('pre', 'Block', 'Inline', 'Common');
11783
$pre->excludes = $this->makeLookup(
11784
'img', 'big', 'small', 'object', 'applet', 'font', 'basefont' );
11785
$this->addElement('h1', 'Heading', 'Inline', 'Common');
11786
$this->addElement('h2', 'Heading', 'Inline', 'Common');
11787
$this->addElement('h3', 'Heading', 'Inline', 'Common');
11788
$this->addElement('h4', 'Heading', 'Inline', 'Common');
11789
$this->addElement('h5', 'Heading', 'Inline', 'Common');
11790
$this->addElement('h6', 'Heading', 'Inline', 'Common');
11792
// Block Structural -----------------------------------------------
11793
$this->addElement('p', 'Block', 'Inline', 'Common');
11794
$this->addElement('div', 'Block', 'Flow', 'Common');
11804
* Abstract class for a set of proprietary modules that clean up (tidy)
11805
* poorly written HTML.
11806
* @todo Figure out how to protect some of these methods/properties
11808
class HTMLPurifier_HTMLModule_Tidy extends HTMLPurifier_HTMLModule
11812
* List of supported levels. Index zero is a special case "no fixes"
11815
public $levels = array(0 => 'none', 'light', 'medium', 'heavy');
11818
* Default level to place all fixes in. Disabled by default
11820
public $defaultLevel = null;
11823
* Lists of fixes used by getFixesForLevel(). Format is:
11824
* HTMLModule_Tidy->fixesForLevel[$level] = array('fix-1', 'fix-2');
11826
public $fixesForLevel = array(
11827
'light' => array(),
11828
'medium' => array(),
11833
* Lazy load constructs the module by determining the necessary
11834
* fixes to create and then delegating to the populate() function.
11835
* @todo Wildcard matching and error reporting when an added or
11836
* subtracted fix has no effect.
11838
public function setup($config) {
11840
// create fixes, initialize fixesForLevel
11841
$fixes = $this->makeFixes();
11842
$this->makeFixesForLevel($fixes);
11844
// figure out which fixes to use
11845
$level = $config->get('HTML', 'TidyLevel');
11846
$fixes_lookup = $this->getFixesForLevel($level);
11848
// get custom fix declarations: these need namespace processing
11849
$add_fixes = $config->get('HTML', 'TidyAdd');
11850
$remove_fixes = $config->get('HTML', 'TidyRemove');
11852
foreach ($fixes as $name => $fix) {
11853
// needs to be refactored a little to implement globbing
11855
isset($remove_fixes[$name]) ||
11856
(!isset($add_fixes[$name]) && !isset($fixes_lookup[$name]))
11858
unset($fixes[$name]);
11862
// populate this module with necessary fixes
11863
$this->populate($fixes);
11868
* Retrieves all fixes per a level, returning fixes for that specific
11869
* level as well as all levels below it.
11870
* @param $level String level identifier, see $levels for valid values
11871
* @return Lookup table of fixes
11873
public function getFixesForLevel($level) {
11874
if ($level == $this->levels[0]) {
11877
$activated_levels = array();
11878
for ($i = 1, $c = count($this->levels); $i < $c; $i++) {
11879
$activated_levels[] = $this->levels[$i];
11880
if ($this->levels[$i] == $level) break;
11884
'Tidy level ' . htmlspecialchars($level) . ' not recognized',
11890
foreach ($activated_levels as $level) {
11891
foreach ($this->fixesForLevel[$level] as $fix) {
11899
* Dynamically populates the $fixesForLevel member variable using
11900
* the fixes array. It may be custom overloaded, used in conjunction
11901
* with $defaultLevel, or not used at all.
11903
public function makeFixesForLevel($fixes) {
11904
if (!isset($this->defaultLevel)) return;
11905
if (!isset($this->fixesForLevel[$this->defaultLevel])) {
11907
'Default level ' . $this->defaultLevel . ' does not exist',
11912
$this->fixesForLevel[$this->defaultLevel] = array_keys($fixes);
11916
* Populates the module with transforms and other special-case code
11917
* based on a list of fixes passed to it
11918
* @param $fixes Lookup table of fixes to activate
11920
public function populate($fixes) {
11921
foreach ($fixes as $name => $fix) {
11922
// determine what the fix is for
11923
list($type, $params) = $this->getFixType($name);
11925
case 'attr_transform_pre':
11926
case 'attr_transform_post':
11927
$attr = $params['attr'];
11928
if (isset($params['element'])) {
11929
$element = $params['element'];
11930
if (empty($this->info[$element])) {
11931
$e = $this->addBlankElement($element);
11933
$e = $this->info[$element];
11936
$type = "info_$type";
11939
// PHP does some weird parsing when I do
11940
// $e->$type[$attr], so I have to assign a ref.
11944
case 'tag_transform':
11945
$this->info_tag_transform[$params['element']] = $fix;
11948
case 'content_model_type':
11949
$element = $params['element'];
11950
if (empty($this->info[$element])) {
11951
$e = $this->addBlankElement($element);
11953
$e = $this->info[$element];
11958
trigger_error("Fix type $type not supported", E_USER_ERROR);
11965
/**
 * Parses a fix name and determines what kind of fix it is, as well
 * as other information defined by the fix.
 *
 * A fix name has the shape "element@attr#property", where every part is
 * optional, e.g. 'a@name', '@lang', 'font' or
 * 'blockquote#content_model_type'.
 *
 * @param $name String name of fix
 * @return array(string $fix_type, array $fix_parameters)
 * @note $fix_parameters is type dependent, see populate() for usage
 *       of these parameters. It may contain the keys 'element' and
 *       'attr'; it is an empty array when the name has neither part.
 */
public function getFixType($name) {

    // parse it: peel off the "#property" suffix first, then "@attr"
    $property = $attr = null;
    if (strpos($name, '#') !== false) list($name, $property) = explode('#', $name);
    if (strpos($name, '@') !== false) list($name, $attr) = explode('@', $name);

    // figure out the parameters; always start from an array so that a
    // name without element and attribute still yields a defined value
    $params = array();
    if ($name !== '') $params['element'] = $name;
    if (!is_null($attr)) $params['attr'] = $attr;

    // special case: attribute transform
    if (!is_null($attr)) {
        // no explicit "#pre"/"#post" suffix means a pre-transform
        if (is_null($property)) $property = 'pre';
        $type = 'attr_transform_' . $property;
        return array($type, $params);
    }

    // special case: tag transform
    if (is_null($property)) {
        return array('tag_transform', $params);
    }

    // anything else: the property itself names the fix type
    return array($property, $params);

}
12000
* Defines all fixes the module will perform in a compact
12001
* associative array of fix name to fix implementation.
12003
// Default implementation defines no fixes; subclasses override this to return their array of fix name => fix implementation.
public function makeFixes() {}
12011
class HTMLPurifier_HTMLModule_XMLCommonAttributes extends HTMLPurifier_HTMLModule
12013
public $name = 'XMLCommonAttributes';
12015
public $attr_collections = array(
12017
'xml:lang' => 'LanguageCode',
12026
* Name is deprecated, but allowed in strict doctypes, so onl
12028
class HTMLPurifier_HTMLModule_Tidy_Name extends HTMLPurifier_HTMLModule_Tidy
12030
public $name = 'Tidy_Name';
12031
public $defaultLevel = 'heavy';
12032
public function makeFixes() {
12036
// @name for img, a -----------------------------------------------
12037
// Technically, it's allowed even on strict, so we allow authors to use
12038
// it. However, it's deprecated in future versions of XHTML.
12040
$r['a@name'] = new HTMLPurifier_AttrTransform_Name();
12049
class HTMLPurifier_HTMLModule_Tidy_Proprietary extends HTMLPurifier_HTMLModule_Tidy
{

    public $name = 'Tidy_Proprietary';
    public $defaultLevel = 'light';

    /**
     * Registers a background attribute transform for the table-related
     * elements.
     * @return Associative array of fix name ("element@background") to
     *         fix implementation
     */
    public function makeFixes() {
        $r = array();
        // Same key order as before; every element gets its own transform instance.
        $elements = array('table', 'td', 'th', 'tr', 'thead', 'tfoot', 'tbody');
        foreach ($elements as $element) {
            $r[$element . '@background'] = new HTMLPurifier_AttrTransform_Background();
        }
        return $r;
    }

}
12072
class HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 extends HTMLPurifier_HTMLModule_Tidy
12075
public function makeFixes() {
12079
// == deprecated tag transforms ===================================
12081
$r['font'] = new HTMLPurifier_TagTransform_Font();
12082
$r['menu'] = new HTMLPurifier_TagTransform_Simple('ul');
12083
$r['dir'] = new HTMLPurifier_TagTransform_Simple('ul');
12084
$r['center'] = new HTMLPurifier_TagTransform_Simple('div', 'text-align:center;');
12085
$r['u'] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:underline;');
12086
$r['s'] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:line-through;');
12087
$r['strike'] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:line-through;');
12089
// == deprecated attribute transforms =============================
12091
$r['caption@align'] =
12092
new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
12093
// we're following IE's behavior, not Firefox's, due
12094
// to the fact that no one supports caption-side:right,
12095
// W3C included (with CSS 2.1). This is a slightly
12096
// unreasonable attribute!
12097
'left' => 'text-align:left;',
12098
'right' => 'text-align:right;',
12099
'top' => 'caption-side:top;',
12100
'bottom' => 'caption-side:bottom;' // not supported by IE
12103
// @align for img -------------------------------------------------
12105
new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
12106
'left' => 'float:left;',
12107
'right' => 'float:right;',
12108
'top' => 'vertical-align:top;',
12109
'middle' => 'vertical-align:middle;',
12110
'bottom' => 'vertical-align:baseline;',
12113
// @align for table -----------------------------------------------
12114
$r['table@align'] =
12115
new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
12116
'left' => 'float:left;',
12117
'center' => 'margin-left:auto;margin-right:auto;',
12118
'right' => 'float:right;'
12121
// @align for hr -----------------------------------------------
12123
new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
12124
// we use both text-align and margin because these work
12125
// for different browsers (IE and Firefox, respectively)
12126
// and the melange makes for a pretty cross-compatible
12128
'left' => 'margin-left:0;margin-right:auto;text-align:left;',
12129
'center' => 'margin-left:auto;margin-right:auto;text-align:center;',
12130
'right' => 'margin-left:auto;margin-right:0;text-align:right;'
12133
// @align for h1, h2, h3, h4, h5, h6, p, div ----------------------
12135
$align_lookup = array();
12136
$align_values = array('left', 'right', 'center', 'justify');
12137
foreach ($align_values as $v) $align_lookup[$v] = "text-align:$v;";
12147
new HTMLPurifier_AttrTransform_EnumToCSS('align', $align_lookup);
12149
// @bgcolor for table, tr, td, th ---------------------------------
12150
$r['table@bgcolor'] =
12153
new HTMLPurifier_AttrTransform_BgColor();
12155
// @border for img ------------------------------------------------
12156
$r['img@border'] = new HTMLPurifier_AttrTransform_Border();
12158
// @clear for br --------------------------------------------------
12160
new HTMLPurifier_AttrTransform_EnumToCSS('clear', array(
12161
'left' => 'clear:left;',
12162
'right' => 'clear:right;',
12163
'all' => 'clear:both;',
12164
'none' => 'clear:none;',
12167
// @height for td, th ---------------------------------------------
12170
new HTMLPurifier_AttrTransform_Length('height');
12172
// @hspace for img ------------------------------------------------
12173
$r['img@hspace'] = new HTMLPurifier_AttrTransform_ImgSpace('hspace');
12175
// @noshade for hr ------------------------------------------------
12176
// this transformation is not precise but often good enough.
12177
// different browsers use different styles to designate noshade
12179
new HTMLPurifier_AttrTransform_BoolToCSS(
12181
'color:#808080;background-color:#808080;border:0;'
12184
// @nowrap for td, th ---------------------------------------------
12187
new HTMLPurifier_AttrTransform_BoolToCSS(
12189
'white-space:nowrap;'
12192
// @size for hr --------------------------------------------------
12193
$r['hr@size'] = new HTMLPurifier_AttrTransform_Length('size', 'height');
12195
// @type for li, ol, ul -------------------------------------------
12198
'disc' => 'list-style-type:disc;',
12199
'square' => 'list-style-type:square;',
12200
'circle' => 'list-style-type:circle;'
12203
'1' => 'list-style-type:decimal;',
12204
'i' => 'list-style-type:lower-roman;',
12205
'I' => 'list-style-type:upper-roman;',
12206
'a' => 'list-style-type:lower-alpha;',
12207
'A' => 'list-style-type:upper-alpha;'
12209
$li_types = $ul_types + $ol_types;
12212
$r['ul@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ul_types);
12213
$r['ol@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ol_types, true);
12214
$r['li@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $li_types, true);
12216
// @vspace for img ------------------------------------------------
12217
$r['img@vspace'] = new HTMLPurifier_AttrTransform_ImgSpace('vspace');
12219
// @width for hr, td, th ------------------------------------------
12222
$r['hr@width'] = new HTMLPurifier_AttrTransform_Length('width');
12233
class HTMLPurifier_HTMLModule_Tidy_Strict extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
{

    public $name = 'Tidy_Strict';
    public $defaultLevel = 'light';

    /**
     * Takes the parent's fixes and adds a content-model fix for blockquote.
     * @return Associative array of fix name to fix implementation
     */
    public function makeFixes() {
        $r = parent::makeFixes();
        $r['blockquote#content_model_type'] = 'strictblockquote';
        return $r;
    }

    public $defines_child_def = true;

    /**
     * Supplies the child definition for the 'strictblockquote' content
     * model type; every other type is delegated to the parent.
     * @param $def Element definition whose content_model_type and
     *             content_model are read
     */
    public function getChildDef($def) {
        if ($def->content_model_type == 'strictblockquote') {
            return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model);
        }
        return parent::getChildDef($def);
    }

}
12253
/**
 * Tidy module that reuses the fixes of Tidy_XHTMLAndHTML4 unchanged; it
 * only sets its own module name and default level.
 */
class HTMLPurifier_HTMLModule_Tidy_Transitional extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
{

    public $name = 'Tidy_Transitional';
    public $defaultLevel = 'heavy';

}
12262
class HTMLPurifier_HTMLModule_Tidy_XHTML extends HTMLPurifier_HTMLModule_Tidy
{

    public $name = 'Tidy_XHTML';
    public $defaultLevel = 'medium';

    /**
     * Registers a single attribute transform for @lang; the fix name has
     * no element part.
     * @return Associative array of fix name to fix implementation
     */
    public function makeFixes() {
        $r = array();
        $r['@lang'] = new HTMLPurifier_AttrTransform_Lang();
        return $r;
    }

}
12280
* Injector that auto paragraphs text in the root node based on
12282
* @todo Ensure all states are unit tested, including variations as well.
12283
* @todo Make a graph of the flow control for this Injector.
12285
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
12288
public $name = 'AutoParagraph';
12289
public $needed = array('p');
12291
private function _pStart() {
12292
$par = new HTMLPurifier_Token_Start('p');
12293
$par->armor['MakeWellFormed_TagClosedError'] = true;
12297
public function handleText(&$token) {
12298
$text = $token->data;
12299
// Does the current parent allow <p> tags?
12300
if ($this->allowsElement('p')) {
12301
if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
12302
// Note that we have differing behavior when dealing with text
12303
// in the anonymous root node, or a node inside the document.
12304
// If the text has a double-newline, the treatment is the same;
12305
// if it doesn't, see the next if-block if you're in the document.
12307
$i = $nesting = null;
12308
if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
12309
// State 1.1: ... ^ (whitespace, then document end)
12311
// This is a degenerate case
12316
// State 1.3: PAR1\n\nPAR2
12319
// State 1.4: <div>PAR1\n\nPAR2 (see State 2)
12321
$token = array($this->_pStart());
12322
$this->_splitText($text, $token);
12325
// State 2: <div>PAR1... (similar to 1.4)
12328
// We're in an element that allows paragraph tags, but we're not
12329
// sure if we're going to need them.
12330
if ($this->_pLookAhead()) {
12331
// State 2.1: <div>PAR1<b>PAR1\n\nPAR2
12333
// Note: This will always be the first child, since any
12334
// previous inline element would have triggered this very
12335
// same routine, and found the double newline. One possible
12336
// exception would be a comment.
12337
$token = array($this->_pStart(), $token);
12339
// State 2.2.1: <div>PAR1<div>
12342
// State 2.2.2: <div>PAR1<b>PAR1</b></div>
12346
// Is the current parent a <p> tag?
12348
!empty($this->currentNesting) &&
12349
$this->currentNesting[count($this->currentNesting)-1]->name == 'p'
12351
// State 3.1: ...<p>PAR1
12354
// State 3.2: ...<p>PAR1\n\nPAR2
12357
$this->_splitText($text, $token);
12360
// State 4.1: ...<b>PAR1
12363
// State 4.2: ...<b>PAR1\n\nPAR2
12368
public function handleElement(&$token) {
12369
// We don't have to check if we're already in a <p> tag for block
12370
// tokens, because the tag would have been autoclosed by MakeWellFormed.
12371
if ($this->allowsElement('p')) {
12372
if (!empty($this->currentNesting)) {
12373
if ($this->_isInline($token)) {
12374
// State 1: <div>...<b>
12377
// Check if this token is adjacent to the parent token
12378
// (seek backwards until token isn't whitespace)
12380
$this->backward($i, $prev);
12382
if (!$prev instanceof HTMLPurifier_Token_Start) {
12383
// Token wasn't adjacent
12386
$prev instanceof HTMLPurifier_Token_Text &&
12387
substr($prev->data, -2) === "\n\n"
12389
// State 1.1.4: <div><p>PAR1</p>\n\n<b>
12392
// Quite frankly, this should be handled by splitText
12393
$token = array($this->_pStart(), $token);
12395
// State 1.1.1: <div><p>PAR1</p><b>
12398
// State 1.1.2: <div><br /><b>
12401
// State 1.1.3: <div>PAR<b>
12406
// State 1.2.1: <div><b>
12409
// Lookahead to see if <p> is needed.
12410
if ($this->_pLookAhead()) {
12411
// State 1.3.1: <div><b>PAR1\n\nPAR2
12413
$token = array($this->_pStart(), $token);
12415
// State 1.3.2: <div><b>PAR1</b></div>
12418
// State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
12423
// State 2.3: ...<div>
12427
if ($this->_isInline($token)) {
12430
// This is where the {p} tag is inserted, not reflected in
12431
// inputTokens yet, however.
12432
$token = array($this->_pStart(), $token);
12434
// State 3.2: <div>
12439
if ($this->backward($i, $prev)) {
12441
!$prev instanceof HTMLPurifier_Token_Text
12443
// State 3.1.1: ...</p>{p}<b>
12446
// State 3.2.1: ...</p><div>
12449
if (!is_array($token)) $token = array($token);
12450
array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
12452
// State 3.1.2: ...</p>\n\n{p}<b>
12455
// State 3.2.2: ...</p>\n\n<div>
12458
// Note: PAR<ELEM> cannot occur because PAR would have been
12459
// wrapped in <p> tags.
12464
// State 2.2: <ul><li>
12467
// State 2.4: <p><b>
12473
* Splits up a text in paragraph tokens and appends them
12474
* to the result stream that will replace the original
12475
* @param $data String text data that will be processed
12477
* @param $result Reference to array of tokens that the
12478
* tags will be appended onto
12479
* @param $config Instance of HTMLPurifier_Config
12480
* @param $context Instance of HTMLPurifier_Context
12482
private function _splitText($data, &$result) {
12483
$raw_paragraphs = explode("\n\n", $data);
12484
$paragraphs = array(); // without empty paragraphs
12485
$needs_start = false;
12486
$needs_end = false;
12488
$c = count($raw_paragraphs);
12490
// There were no double-newlines, abort quickly. In theory this
12491
// should never happen.
12492
$result[] = new HTMLPurifier_Token_Text($data);
12495
for ($i = 0; $i < $c; $i++) {
12496
$par = $raw_paragraphs[$i];
12497
if (trim($par) !== '') {
12498
$paragraphs[] = $par;
12501
// Double newline at the front
12502
if (empty($result)) {
12503
// The empty result indicates that the AutoParagraph
12504
// injector did not add any start paragraph tokens.
12505
// This means that we have been in a paragraph for
12506
// a while, and the newline means we should start a new one.
12507
$result[] = new HTMLPurifier_Token_End('p');
12508
$result[] = new HTMLPurifier_Token_Text("\n\n");
12509
// However, the start token should only be added if
12510
// there is more processing to be done (i.e. there are
12511
// real paragraphs in here). If there are none, the
12512
// next start paragraph tag will be handled by the
12513
// next call to the injector
12514
$needs_start = true;
12516
// We just started a new paragraph!
12517
// Reinstate a double-newline for presentation's sake, since
12518
// it was in the source code.
12519
array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
12521
} elseif ($i + 1 == $c) {
12522
// Double newline at the end
12523
// There should be a trailing </p> when we're finally done.
12529
// Check if this was just a giant blob of whitespace. Move this earlier,
12531
if (empty($paragraphs)) {
12535
// Add the start tag indicated by \n\n at the beginning of $data
12536
if ($needs_start) {
12537
$result[] = $this->_pStart();
12540
// Append the paragraphs onto the result
12541
foreach ($paragraphs as $par) {
12542
$result[] = new HTMLPurifier_Token_Text($par);
12543
$result[] = new HTMLPurifier_Token_End('p');
12544
$result[] = new HTMLPurifier_Token_Text("\n\n");
12545
$result[] = $this->_pStart();
12548
// Remove trailing start token; Injector will handle this later if
12549
// it was indeed needed. This prevents from needing to do a lookahead,
12550
// at the cost of a lookbehind later.
12551
array_pop($result);
12553
// If there is no need for an end tag, remove all of it and let
12554
// MakeWellFormed close it later.
12556
array_pop($result); // removes \n\n
12557
array_pop($result); // removes </p>
12563
* Returns true if passed token is inline (and, ergo, allowed in
12566
private function _isInline($token) {
12567
return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
12571
* Looks ahead in the token list and determines whether or not we need
12572
* to insert a <p> tag.
12574
private function _pLookAhead() {
12575
$this->current($i, $current);
12576
if ($current instanceof HTMLPurifier_Token_Start) $nesting = 1;
12579
while ($this->forwardUntilEndToken($i, $current, $nesting)) {
12580
$result = $this->_checkNeedsP($current);
12581
if ($result !== null) {
12590
* Determines if a particular token requires an earlier inline token
12591
* to get a paragraph. This should be used with forwardUntilEndToken
12593
private function _checkNeedsP($current) {
12594
if ($current instanceof HTMLPurifier_Token_Start){
12595
if (!$this->_isInline($current)) {
12598
// Terminate early, since we hit a block element
12601
} elseif ($current instanceof HTMLPurifier_Token_Text) {
12602
if (strpos($current->data, "\n\n") !== false) {
12603
// <div>PAR1<b>PAR1\n\nPAR2
12607
// <div>PAR1<b>PAR1...
12620
/**
 * Injector that displays the URL of an anchor instead of linking to it,
 * in addition to showing the text of the link.
 */
class HTMLPurifier_Injector_DisplayLinkURI extends HTMLPurifier_Injector
{

    public $name = 'DisplayLinkURI';
    public $needed = array('a');

    /**
     * Start/empty tokens are left untouched; all work happens in handleEnd().
     */
    public function handleElement(&$token) {
    }

    /**
     * If the start token paired with this end token carries an href, the
     * href is removed from the start token and " (url)" is emitted as a
     * text token right after the end token.
     * @param $token End token, replaced by array(end token, text token)
     *               when there is a URL to display
     */
    public function handleEnd(&$token) {
        if (!isset($token->start->attr['href'])) {
            // nothing to display
            return;
        }
        $url = $token->start->attr['href'];
        unset($token->start->attr['href']);
        $token = array($token, new HTMLPurifier_Token_Text(" ($url)"));
    }

}
12644
/**
 * Injector that converts http, https and ftp text URLs to actual links.
 */
class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector
{

    public $name = 'Linkify';
    public $needed = array('a' => array('href'));

    /**
     * Splits a text token around every URL it contains and replaces it
     * with alternating text tokens and <a href="URL">URL</a> token triples.
     * @param $token Text token; replaced by an array of tokens when the
     *               text contains '://', otherwise left untouched
     */
    public function handleText(&$token) {
        if (!$this->allowsElement('a')) return;

        if (strpos($token->data, '://') === false) {
            // our really quick heuristic failed, abort
            // this may not work so well if we want to match things like
            // "google.com", but then again, most people don't
            return;
        }

        // there is/are URL(s). Let's split the string:
        // Note: this regex is extremely permissive
        $bits = preg_split('#((?:https?|ftp)://[^\s\'"<>()]+)#S', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);

        // preg_split() signals failure with false; keep the original token then
        if ($bits === false) return;

        $token = array();

        // $i = index
        // $c = count
        // $l = is link (the split alternates: text, URL, text, URL, ...)
        for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) {
            if (!$l) {
                // plain text between URLs; empty pieces produce no token
                if ($bits[$i] === '') continue;
                $token[] = new HTMLPurifier_Token_Text($bits[$i]);
            } else {
                $token[] = new HTMLPurifier_Token_Start('a', array('href' => $bits[$i]));
                $token[] = new HTMLPurifier_Token_Text($bits[$i]);
                $token[] = new HTMLPurifier_Token_End('a');
            }
        }

    }

}
12690
/**
 * Injector that converts configuration directive syntax %Namespace.Directive
 * to links.
 */
class HTMLPurifier_Injector_PurifierLinkify extends HTMLPurifier_Injector
{

    public $name = 'PurifierLinkify';

    /** URL template read from configuration; '%s' is replaced by the directive name. */
    public $docURL;

    public $needed = array('a' => array('href'));

    /**
     * Reads the documentation URL template from the
     * AutoFormatParam.PurifierLinkifyDocURL directive, then defers to the
     * parent for the remaining preparation.
     */
    public function prepare($config, $context) {
        $this->docURL = $config->get('AutoFormatParam', 'PurifierLinkifyDocURL');
        return parent::prepare($config, $context);
    }

    /**
     * Splits a text token around every %Namespace.Directive reference and
     * replaces it with alternating text tokens and link token triples.
     * @param $token Text token; replaced by an array of tokens when the
     *               text contains '%', otherwise left untouched
     */
    public function handleText(&$token) {
        if (!$this->allowsElement('a')) return;
        if (strpos($token->data, '%') === false) return;

        $bits = preg_split('#%([a-z0-9]+\.[a-z0-9]+)#Si', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);

        // preg_split() signals failure with false; keep the original token then
        if ($bits === false) return;

        $token = array();

        // $i = index
        // $c = count
        // $l = is link (the split alternates: text, directive, text, ...)
        for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) {
            if (!$l) {
                // plain text between directives; empty pieces produce no token
                if ($bits[$i] === '') continue;
                $token[] = new HTMLPurifier_Token_Text($bits[$i]);
            } else {
                $token[] = new HTMLPurifier_Token_Start('a',
                    array('href' => str_replace('%s', $bits[$i], $this->docURL)));
                // the captured name excludes the leading '%', so put it back
                $token[] = new HTMLPurifier_Token_Text('%' . $bits[$i]);
                $token[] = new HTMLPurifier_Token_End('a');
            }
        }

    }

}
12734
class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
12737
private $context, $config;
12739
public function prepare($config, $context) {
12740
parent::prepare($config, $context);
12741
$this->config = $config;
12742
$this->context = $context;
12743
$this->attrValidator = new HTMLPurifier_AttrValidator();
12746
public function handleElement(&$token) {
12747
if (!$token instanceof HTMLPurifier_Token_Start) return;
12749
for ($i = $this->inputIndex + 1, $c = count($this->inputTokens); $i < $c; $i++) {
12750
$next = $this->inputTokens[$i];
12751
if ($next instanceof HTMLPurifier_Token_Text && $next->is_whitespace) continue;
12754
if (!$next || ($next instanceof HTMLPurifier_Token_End && $next->name == $token->name)) {
12755
if ($token->name == 'colgroup') return;
12756
$this->attrValidator->validateToken($token, $this->config, $this->context);
12757
$token->armor['ValidateAttributes'] = true;
12758
if (isset($token->attr['id']) || isset($token->attr['name'])) return;
12759
$token = $i - $this->inputIndex + 1;
12760
for ($b = $this->inputIndex - 1; $b > 0; $b--) {
12761
$prev = $this->inputTokens[$b];
12762
if ($prev instanceof HTMLPurifier_Token_Text && $prev->is_whitespace) continue;
12765
// This is safe because we removed the token that triggered this.
12766
$this->rewind($b - 1);
12776
* Adds important param elements to inside of object in order to make
12779
class HTMLPurifier_Injector_SafeObject extends HTMLPurifier_Injector
12781
public $name = 'SafeObject';
12782
public $needed = array('object', 'param');
12784
protected $objectStack = array();
12785
protected $paramStack = array();
12787
// Keep this synchronized with AttrTransform/SafeParam.php
12788
protected $addParam = array(
12789
'allowScriptAccess' => 'never',
12790
'allowNetworking' => 'internal',
12792
protected $allowedParam = array(
12797
public function prepare($config, $context) {
12798
parent::prepare($config, $context);
12801
public function handleElement(&$token) {
12802
if ($token->name == 'object') {
12803
$this->objectStack[] = $token;
12804
$this->paramStack[] = array();
12805
$new = array($token);
12806
foreach ($this->addParam as $name => $value) {
12807
$new[] = new HTMLPurifier_Token_Empty('param', array('name' => $name, 'value' => $value));
12810
} elseif ($token->name == 'param') {
12811
$nest = count($this->currentNesting) - 1;
12812
if ($nest >= 0 && $this->currentNesting[$nest]->name === 'object') {
12813
$i = count($this->objectStack) - 1;
12814
if (!isset($token->attr['name'])) {
12818
$n = $token->attr['name'];
12819
// We need this fix because YouTube doesn't supply a data
12820
// attribute, which we need if a type is specified. This is
12821
// *very* Flash specific.
12822
if (!isset($this->objectStack[$i]->attr['data']) && $token->attr['name'] == 'movie') {
12823
$this->objectStack[$i]->attr['data'] = $token->attr['value'];
12825
// Check if the parameter is the correct value but has not
12826
// already been added
12828
!isset($this->paramStack[$i][$n]) &&
12829
isset($this->addParam[$n]) &&
12830
$token->attr['name'] === $this->addParam[$n]
12832
// keep token, and add to param stack
12833
$this->paramStack[$i][$n] = true;
12834
} elseif (isset($this->allowedParam[$n])) {
12835
// keep token, don't do anything to it
12836
// (could possibly check for duplicates here)
12841
// not directly inside an object, DENY!
12847
public function handleEnd(&$token) {
12848
// This is the WRONG way of handling the object and param stacks;
12849
// we should be inserting them directly on the relevant object tokens
12850
// so that the global stack handling handles it.
12851
if ($token->name == 'object') {
12852
array_pop($this->objectStack);
12853
array_pop($this->paramStack);
12863
/**
 * Parser that uses PHP 5's DOM extension (part of the core).
 *
 * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
 * It gives us a forgiving HTML parser, which we use to transform the HTML
 * into a DOM, and then into the tokens. It is blazingly fast (for large
 * documents, it performs twenty times faster than
 * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
 *
 * @note Any empty elements will have empty tokens associated with them, even if
 *       this is prohibited by the spec. This cannot be fixed here:
 *       tokenizeDOM() emits an empty token for every element without child
 *       nodes and does not consult the HTML definition.
 *
 * @note PHP's DOM extension does not actually parse any entities, we use
 *       our own function to do that.
 *
 * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
 *          If this is a huge problem, due to the fact that HTML is hand
 *          edited and you are unable to get a parser cache that caches the
 *          output of HTML Purifier while keeping the original HTML lying
 *          around, you may want to run Tidy on the resulting output or use
 *          HTMLPurifier_DirectLex
 */
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /** Token factory used to create every token this lexer emits. */
    private $factory;

    public function __construct() {
        // setup the factory
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Turns an HTML fragment into a flat list of tokens by loading it into
     * a DOMDocument and walking the resulting tree.
     * @param $html String HTML fragment
     * @param $config Configuration object; Core.AggressivelyFixLt is read here
     * @param $context Context object, passed on to normalize() and wrapHTML()
     * @return Array-list of tokens
     */
    public function tokenizeHTML($html, $config, $context) {

        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core', 'AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            // 1. protect ampersands inside comments so step 3 can tell them
            //    apart from the entities introduced by step 2
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // 2. escape every "<" that is not followed by a letter, "!" or "/";
            //    repeat until stable, because one pass skips the second of two
            //    adjacent brackets ("<<")
            do {
                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            // 3. comments must stay verbatim, so undo steps 1 and 2 inside them
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // the parser reports malformed markup as PHP errors; silence them
        // for the duration of the load only
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        // walk down to the <div> that wrapHTML() put around the fragment
        $tokens = array();
        $this->tokenizeDOM(
            $doc->getElementsByTagName('html')->item(0)-> // <html>
                  getElementsByTagName('body')->item(0)-> //   <body>
                  getElementsByTagName('div')->item(0)    //     <div>
            , $tokens);
        return $tokens;
    }

    /**
     * Recursive function that tokenizes a node, putting it into an accumulator.
     *
     * @param $node     DOMNode to be tokenized.
     * @param $tokens   Array-list of already tokenized tokens.
     * @param $collect  Says whether or not start and close are collected, set to
     *                  false at first recursion because it's the implicit DIV
     *                  tag you're dealing with.
     * @returns Tokens of node appended to previously passed tokens.
     */
    protected function tokenizeDOM($node, &$tokens, $collect = false) {

        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
                // strip an HTML comment wrapper around the script/style body
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    $data = substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    } else {
                        // Highly suspicious! Not sure what to do...
                    }
                }
            }
            $tokens[] = $this->factory->createText($this->parseData($data));
            return;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this code is only invoked for comments in script/style in versions
            // of libxml pre-2.6.28 (regular comments, of course, are still
            // handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return;
        } elseif (
            // not-well tested: there may be other nodes we have to grab
            $node->nodeType !== XML_ELEMENT_NODE
        ) {
            return;
        }

        $attr = $node->hasAttributes() ?
            $this->transformAttrToAssoc($node->attributes) :
            array();

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
            }
        } else {
            // remember the name up front for the matching end token
            $tag_name = $node->tagName;
            if ($collect) { // don't wrap on first iteration
                $tokens[] = $this->factory->createStart($tag_name, $attr);
            }
            foreach ($node->childNodes as $child) {
                // remember, it's an accumulator. Otherwise, we'd have
                // to use array_merge
                $this->tokenizeDOM($child, $tokens, true);
            }
            if ($collect) {
                $tokens[] = $this->factory->createEnd($tag_name);
            }
        }

    }

    /**
     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
     *
     * @param $node_map DOMNamedNodeMap of DOMAttr objects.
     * @returns Associative array of attributes.
     */
    protected function transformAttrToAssoc($node_map) {
        // This relies on the node map being iterable and exposing a
        // ->length attribute
        if ($node_map->length === 0) return array();
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * An error handler that mutes all errors. tokenizeHTML() installs it
     * around DOMDocument::loadHTML() and removes it right afterwards.
     */
    public function muteErrorHandler($errno, $errstr) {}

    /**
     * Callback function for undoing escaping of stray angled brackets
     * in comments
     * @param $matches array(whole comment, comment body, terminator)
     */
    public function callbackUndoCommentSubst($matches) {
        return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
    }

    /**
     * Callback function that entity-izes ampersands in comments so that
     * callbackUndoCommentSubst doesn't clobber them
     * @param $matches array(whole comment, comment body, terminator)
     */
    public function callbackArmorCommentEntities($matches) {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in the necessary HTML
     * @param $html String HTML fragment
     * @param $config Configuration object supplying the HTML definition
     * @param $context Context object (not used by this implementation)
     * @return String complete document: optional doctype, a head declaring
     *         UTF-8, and the fragment inside <body><div>...</div></body>
     */
    protected function wrapHTML($html, $config, $context) {
        $def = $config->getDefinition('HTML');
        $ret = '';

        // emit a doctype only if the definition names a public or system id
        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // No protection if $html contains a stray </div>!
        $ret .= '</head><body><div>'.$html.'</div></body></html>';
        return $ret;
    }

}
13076
* Our in-house implementation of a parser.
13078
* A pure PHP parser, DirectLex has absolutely no dependencies, making
13079
* it a reasonably good default for PHP4. Written with efficiency in mind,
13080
* it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
13081
* pales in comparison to HTMLPurifier_Lexer_DOMLex.
13083
* @todo Reread XML spec and document differences.
13085
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
13088
public $tracksLineNumbers = true;
13091
* Whitespace characters for str(c)spn.
13093
protected $_whitespace = "\x20\x09\x0D\x0A";
13096
* Callback function for script CDATA fudge
13097
* @param $matches, in form of array(opening tag, contents, closing tag)
13099
protected function scriptCallback($matches) {
13100
return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
13103
/**
 * Lexes an HTML string into a flat array of token objects.
 *
 * The string is scanned left to right for '<' and '>' and split into Text,
 * Comment, Start, End and Empty tokens. While running, the context variables
 * 'CurrentLine' and 'CurrentCol' are registered; they are destroyed again
 * before returning. When line numbers are maintained (Core.MaintainLineNumbers,
 * or Core.CollectErrors if the former is null) every token is stamped via
 * rawPosition().
 *
 * @param string $html    Markup to tokenize.
 * @param object $config  Configuration object.
 * @param object $context Context object.
 * @return array List of tokens in document order.
 */
public function tokenizeHTML($html, $config, $context) {

    // special normalization for script tags without any armor
    // our "armor" heuristic is a < sign any number of whitespaces after
    // the first script tag
    if ($config->get('HTML', 'Trusted')) {
        $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
            array($this, 'scriptCallback'), $html);
    }

    $html = $this->normalize($html, $config, $context);

    $cursor = 0; // our location in the text
    $inside_tag = false; // whether or not we're parsing the inside of a tag
    $array = array(); // result array

    // This is also treated to mean maintain *column* numbers too
    $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');

    if ($maintain_line_numbers === null) {
        // automatically determine line numbering by checking
        // if error collection is on
        $maintain_line_numbers = $config->get('Core', 'CollectErrors');
    }

    if ($maintain_line_numbers) {
        $current_line = 1;
        $current_col  = 0;
        $length = strlen($html);
    } else {
        $current_line = false;
        $current_col  = false;
        $length = false;
    }
    $context->register('CurrentLine', $current_line);
    $context->register('CurrentCol',  $current_col);
    $nl = "\n";
    // how often to manually recalculate. This will ALWAYS be right,
    // but it's pretty wasteful. Set to 0 to turn off
    $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');

    $e = false;
    if ($config->get('Core', 'CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // for testing synchronization
    $loops = 0;

    while(++$loops) {

        // $cursor is either at the start of a token, or inside of
        // a tag (i.e. there was a < immediately before it), as indicated
        // by $inside_tag

        if ($maintain_line_numbers) {

            // $rcursor, however, is always at the start of a token.
            $rcursor = $cursor - (int) $inside_tag;

            // Column number is cheap, so we calculate it every round.
            // We're interested at the *end* of the newline string, so
            // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
            // from our "rcursor" position.
            $nl_pos = strrpos($html, $nl, $rcursor - $length);
            $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

            // recalculate lines
            if (
                $synchronize_interval &&  // synchronization is on
                $cursor > 0 &&            // cursor is further than zero
                $loops % $synchronize_interval === 0 // time to synchronize!
            ) {
                $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
            }

        }

        $position_next_lt = strpos($html, '<', $cursor);
        $position_next_gt = strpos($html, '>', $cursor);

        // triggers on "<b>asdf</b>" but not "asdf <b></b>"
        // special case to set up context
        if ($position_next_lt === $cursor) {
            $inside_tag = true;
            $cursor++;
        }

        if (!$inside_tag && $position_next_lt !== false) {
            // We are not inside tag and there still is another tag to parse
            $token = new
                HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr(
                            $html, $cursor, $position_next_lt - $cursor
                        )
                    )
                );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
            }
            $array[] = $token;
            $cursor  = $position_next_lt + 1;
            $inside_tag = true;
            continue;
        } elseif (!$inside_tag) {
            // We are not inside tag but there are no more tags
            // If we're already at the end, break
            if ($cursor === strlen($html)) break;
            // Create Text of rest of string
            $token = new
                HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr(
                            $html, $cursor
                        )
                    )
                );
            if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
            $array[] = $token;
            break;
        } elseif ($inside_tag && $position_next_gt !== false) {
            // We are in tag and it is well formed
            // Grab the internals of the tag
            $strlen_segment = $position_next_gt - $cursor;

            if ($strlen_segment < 1) {
                // "<>": there's nothing to process as a tag. Emit the "<"
                // as text and leave the cursor on the ">" so that it is
                // lexed as ordinary text on the next pass. (Creating the
                // token without appending it would silently lose input.)
                $token = new HTMLPurifier_Token_Text('<');
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                $inside_tag = false;
                continue;
            }

            $segment = substr($html, $cursor, $strlen_segment);

            if ($segment === false) {
                // somehow, we attempted to access beyond the end of
                // the string, defense-in-depth, reported by Nate Abele
                break;
            }

            // Check if it's a comment
            if (
                substr($segment, 0, 3) === '!--'
            ) {
                // re-determine segment length, looking for -->
                $position_comment_end = strpos($html, '-->', $cursor);
                if ($position_comment_end === false) {
                    // uh oh, we have a comment that extends to
                    // infinity. Can't be helped: set comment
                    // end position to end of string
                    if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
                    $position_comment_end = strlen($html);
                    $end = true;
                } else {
                    $end = false;
                }
                $strlen_segment = $position_comment_end - $cursor;
                $segment = substr($html, $cursor, $strlen_segment);
                $token = new
                    HTMLPurifier_Token_Comment(
                        substr(
                            $segment, 3, $strlen_segment - 3
                        )
                    );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                }
                $array[] = $token;
                $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                $inside_tag = false;
                continue;
            }

            // Check if it's an end tag
            $is_end_tag = (strpos($segment,'/') === 0);
            if ($is_end_tag) {
                $type = substr($segment, 1);
                $token = new HTMLPurifier_Token_End($type);
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Check leading character is alnum, if not, we may
            // have accidentally grabbed an emoticon. Translate into
            // text and go our merry way
            if (!ctype_alpha($segment[0])) {
                // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                $token = new HTMLPurifier_Token_Text('<');
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                continue;
            }

            // Check if it is explicitly self closing, if so, remove
            // trailing slash. Remember, we could have a tag like <br>, so
            // any later token processing scripts must convert improperly
            // classified EmptyTags from StartTags.
            $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
            if ($is_self_closing) {
                $strlen_segment--;
                $segment = substr($segment, 0, $strlen_segment);
            }

            // Check if there are any attributes
            $position_first_space = strcspn($segment, $this->_whitespace);

            if ($position_first_space >= $strlen_segment) {
                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($segment);
                } else {
                    $token = new HTMLPurifier_Token_Start($segment);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Grab out all the data
            $type = substr($segment, 0, $position_first_space);
            $attribute_string =
                trim(
                    substr(
                        $segment, $position_first_space
                    )
                );
            if ($attribute_string) {
                $attr = $this->parseAttributeString(
                                $attribute_string
                              , $config, $context
                          );
            } else {
                $attr = array();
            }

            if ($is_self_closing) {
                $token = new HTMLPurifier_Token_Empty($type, $attr);
            } else {
                $token = new HTMLPurifier_Token_Start($type, $attr);
            }
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_gt + 1;
            $inside_tag = false;
            continue;
        } else {
            // inside tag, but there's no ending > sign
            if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
            $token = new
                HTMLPurifier_Token_Text(
                    '<' .
                    $this->parseData(
                        substr($html, $cursor)
                    )
                );
            if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
            // no cursor scroll? Hmm...
            $array[] = $token;
            break;
        }
        break;
    }

    $context->destroy('CurrentLine');
    $context->destroy('CurrentCol');
    return $array;
}
13393
/**
 * PHP 5.0.x compatible substr_count that implements offset and length
 *
 * On PHP older than 5.1 the window is cut out with substr() first and then
 * counted; otherwise the offset and length are handed to substr_count()
 * directly. The version check is performed once and cached in a static.
 *
 * @param string $haystack String to search in.
 * @param string $needle   Substring to count.
 * @param int    $offset   Start of the window inside $haystack.
 * @param int    $length   Length of the window.
 * @return int Number of occurrences of $needle inside the window.
 */
protected function substrCount($haystack, $needle, $offset, $length) {
    static $needs_fallback;
    if ($needs_fallback === null) {
        $needs_fallback = version_compare(PHP_VERSION, '5.1', '<');
    }
    if (!$needs_fallback) {
        return substr_count($haystack, $needle, $offset, $length);
    }
    $window = substr($haystack, $offset, $length);
    return substr_count($window, $needle);
}
13409
/**
 * Takes the inside of an HTML tag and makes an assoc array of attributes.
 *
 * Strings without whitespace and with at most one '=' are handled by a fast
 * path; everything else is scanned attribute by attribute. Attribute values
 * are passed through parseData() on both paths. A key without a value is a
 * boolean attribute and maps to itself.
 *
 * @param $string Inside of tag excluding name.
 * @param $config Configuration object; Core.CollectErrors is consulted.
 * @param $context Context object; supplies 'ErrorCollector' when error
 *        collection is on.
 * @returns Assoc array of attributes.
 */
public function parseAttributeString($string, $config, $context) {
    $string = (string) $string; // quick typecast

    if ($string == '') return array(); // no attributes

    $e = false;
    if ($config->get('Core', 'CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // let's see if we can abort as quickly as possible
    // one equal sign, no whitespace => one attribute
    $num_equal = substr_count($string, '=');
    // Any of the lexer's whitespace characters separates attributes, and a
    // separator at offset 0 counts too (strpos() truthiness missed both).
    $has_space = strcspn($string, $this->_whitespace) < strlen($string);
    if ($num_equal === 0 && !$has_space) {
        // bool attribute
        return array($string => $string);
    } elseif ($num_equal === 1 && !$has_space) {
        // only one attribute
        list($key, $quoted_value) = explode('=', $string);
        $quoted_value = trim($quoted_value);
        if (!$key) {
            if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
            return array();
        }
        // strict comparison: an unquoted "0" is a real value, not "no value"
        if ($quoted_value === '') return array($key => '');
        $first_char = $quoted_value[0];
        $last_char  = $quoted_value[strlen($quoted_value)-1];

        $same_quote = ($first_char == $last_char);
        $open_quote = ($first_char == '"' || $first_char == "'");

        if ( $same_quote && $open_quote) {
            // well behaved
            $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
        } else {
            // not well behaved
            if ($open_quote) {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
                $value = substr($quoted_value, 1);
            } else {
                $value = $quoted_value;
            }
        }
        if ($value === false) $value = '';
        // decode the value exactly like the multi-attribute path below does,
        // so the result does not depend on how many attributes a tag has
        return array($key => $this->parseData($value));
    }

    // setup loop environment
    $array  = array(); // return assoc array of attributes
    $cursor = 0; // current position in string (moves forward)
    $size   = strlen($string); // size of the string (stays the same)

    // if we have unquoted attributes, the parser expects a terminating
    // space, so let's guarantee that there's always a terminating space.
    $string .= ' ';

    while(true) {

        if ($cursor >= $size) {
            break;
        }

        $cursor += strspn($string, $this->_whitespace, $cursor);
        // grab the key

        $key_begin = $cursor; //we're currently at the start of the key

        // scroll past all characters that are the key (not whitespace or =)
        $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

        $key_end = $cursor; // now at the end of the key

        $key = substr($string, $key_begin, $key_end - $key_begin);

        if (!$key) {
            if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
            $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
            continue; // empty key
        }

        // scroll past all whitespace
        $cursor += strspn($string, $this->_whitespace, $cursor);

        if ($cursor >= $size) {
            $array[$key] = $key;
            break;
        }

        // if the next character is an equal sign, we've got a regular
        // pair, otherwise, it's a bool attribute
        $first_char = @$string[$cursor];

        if ($first_char == '=') {
            // key="value"

            $cursor++;
            $cursor += strspn($string, $this->_whitespace, $cursor);

            // we might be in front of a quote right now
            // (the cursor may sit one past the padded string, hence the @)

            $char = @$string[$cursor];

            if ($char == '"' || $char == "'") {
                // it's quoted, end bound is $char
                $cursor++;
                $value_begin = $cursor;
                $cursor = strpos($string, $char, $cursor);
                $value_end = $cursor;
            } else {
                // it's not quoted, end bound is whitespace
                $value_begin = $cursor;
                $cursor += strcspn($string, $this->_whitespace, $cursor);
                $value_end = $cursor;
            }

            // we reached a premature end
            if ($cursor === false) {
                $cursor = $size;
                $value_end = $cursor;
            }

            $value = substr($string, $value_begin, $value_end - $value_begin);
            if ($value === false) $value = '';
            $array[$key] = $this->parseData($value);
            $cursor++;

        } else {
            // boolAttr
            if ($key !== '') {
                $array[$key] = $key;
            } else {
                // purely theoretical
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
            }

        }
    }
    return $array;
}
13566
/**
 * Composite strategy that runs multiple strategies on tokens.
 *
 * Concrete subclasses fill $strategies in their constructor; execute() then
 * feeds the token list through each of them in order.
 */
abstract class HTMLPurifier_Strategy_Composite extends HTMLPurifier_Strategy
{

    /**
     * List of strategies to run tokens through.
     */
    protected $strategies = array();

    /**
     * Subclasses populate $strategies here.
     */
    abstract public function __construct();

    /**
     * Runs every registered strategy over the tokens, in registration order.
     *
     * @param array  $tokens  Tokens to process.
     * @param object $config  Configuration object, passed to each strategy.
     * @param object $context Context object, passed to each strategy.
     * @return array Tokens as returned by the last strategy (the input
     *               unchanged when no strategy is registered).
     */
    public function execute($tokens, $config, $context) {
        foreach ($this->strategies as $strategy) {
            // each strategy receives the output of the previous one
            $tokens = $strategy->execute($tokens, $config, $context);
        }
        // hand the processed list back to the caller
        return $tokens;
    }

}
13591
/**
 * Core strategy composed of the big four strategies.
 *
 * The strategies are registered, and therefore run, in this order:
 * RemoveForeignElements, MakeWellFormed, FixNesting, ValidateAttributes.
 */
class HTMLPurifier_Strategy_Core extends HTMLPurifier_Strategy_Composite
{

    /**
     * Appends the four core strategies to the strategy list.
     */
    public function __construct() {
        array_push(
            $this->strategies,
            new HTMLPurifier_Strategy_RemoveForeignElements(),
            new HTMLPurifier_Strategy_MakeWellFormed(),
            new HTMLPurifier_Strategy_FixNesting(),
            new HTMLPurifier_Strategy_ValidateAttributes()
        );
    }

}
13609
/**
 * Takes a well formed list of tokens and fixes their nesting.
 *
 * HTML elements dictate which elements are allowed to be their children,
 * for example, you can't have a p tag in a span tag.  Other elements have
 * much more rigorous definitions: tables, for instance, require a specific
 * order for their elements.  There are also constraints not expressible by
 * document type definitions, such as the chameleon nature of ins/del
 * tags and global child exclusions.
 *
 * The first major objective of this strategy is to iterate through all the
 * nodes (not tokens) of the list of tokens and determine whether or not
 * their children conform to the element's definition.  If they do not, the
 * child definition may optionally supply an amended list of elements that
 * is valid or require that the entire node be deleted (and the previous
 * node rescanned).
 *
 * The second objective is to ensure that explicitly excluded elements of
 * an element do not appear in its children.  Code that accomplishes this
 * task is pervasive through the strategy, though the two are distinct tasks
 * and could, theoretically, be separated (although it's not recommended).
 *
 * @note Whether or not unrecognized children are silently dropped or
 *       translated into text depends on the child definitions.
 *
 * @todo Enable nodes to be bubbled out of the structure.
 */
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Validates the children of every node and repairs or removes nodes
     * whose contents are not allowed.
     *
     * While running, the context variables 'IsInline' and 'CurrentToken'
     * are registered; both are destroyed before returning.
     *
     * @param array  $tokens  Well-formed token list.
     * @param object $config  Configuration object.
     * @param object $context Context object.
     * @return array Token list with nesting fixed.
     */
    public function execute($tokens, $config, $context) {
        //####################################################################//
        // Pre-processing

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();

        // insert implicit "parent" node, will be removed at end.
        // DEFINITION CALL
        $parent_name = $definition->info_parent;
        array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name));
        $tokens[] = new HTMLPurifier_Token_End($parent_name);

        // setup the context variable 'IsInline', for chameleon processing
        // is 'false' when we are not inline, 'true' when it must always
        // be inline, and an integer when it is inline for a certain
        // branch of the document tree
        $is_inline = $definition->info_parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // stack that contains the indexes of all parents,
        // $stack[count($stack)-1] being the current parent
        $stack = array();

        // stack that contains all elements that are excluded
        // it is organized by parent elements, similar to $stack,
        // but it is only populated when an element with exclusions is
        // processed, i.e. there won't be empty exclusions.
        $exclude_stack = array();

        // variable that contains the start token while we are processing
        // nodes. This enables error reporting to do its job
        $start_token = false;
        $context->register('CurrentToken', $start_token);

        //####################################################################//
        // Loop

        // iterate through all start nodes. Determining the start node
        // is complicated so it has been omitted from the loop construct
        for ($i = 0, $size = count($tokens) ; $i < $size; ) {

            //################################################################//
            // Gather information on children

            // child token accumulator
            $child_tokens = array();

            // scroll to the end of this node, report number, and collect
            // all children
            for ($j = $i, $depth = 0; ; $j++) {
                if ($tokens[$j] instanceof HTMLPurifier_Token_Start) {
                    $depth++;
                    // skip token assignment on first iteration, this is the
                    // token we currently are on
                    if ($depth == 1) continue;
                } elseif ($tokens[$j] instanceof HTMLPurifier_Token_End) {
                    $depth--;
                    // skip token assignment on last iteration, this is the
                    // end token of the token we're currently on
                    if ($depth == 0) break;
                }
                $child_tokens[] = $tokens[$j];
            }

            // $i is index of start token
            // $j is index of end token

            $start_token = $tokens[$i]; // to make token available via CurrentToken

            //################################################################//
            // Gather information on parent

            // calculate parent information
            if ($count = count($stack)) {
                $parent_index = $stack[$count-1];
                $parent_name  = $tokens[$parent_index]->name;
                if ($parent_index == 0) {
                    $parent_def   = $definition->info_parent_def;
                } else {
                    $parent_def   = $definition->info[$parent_name];
                }
            } else {
                // processing as if the parent were the "root" node
                // unknown info, it won't be used anyway, in the future,
                // we may want to enforce one element only (this is
                // necessary for HTML Purifier to clean entire documents
                $parent_index = $parent_name = $parent_def = null;
            }

            // calculate context
            if ($is_inline === false) {
                // check if conditions make it inline
                if (!empty($parent_def) && $parent_def->descendants_are_inline) {
                    $is_inline = $count - 1;
                }
            } else {
                // check if we're out of inline
                if ($count === $is_inline) {
                    $is_inline = false;
                }
            }

            //################################################################//
            // Determine whether element is explicitly excluded SGML-style

            // determine whether or not element is excluded by checking all
            // parent exclusions. The array should not be very large, two
            // elements at most.
            $excluded = false;
            if (!empty($exclude_stack)) {
                foreach ($exclude_stack as $lookup) {
                    if (isset($lookup[$tokens[$i]->name])) {
                        $excluded = true;
                        // no need to continue processing
                        break;
                    }
                }
            }

            //################################################################//
            // Perform child validation

            if ($excluded) {
                // there is an exclusion, remove the entire node
                $result = false;
                $excludes = array(); // not used, but good to initialize anyway
            } else {
                // DEFINITION CALL
                if ($i === 0) {
                    // special processing for the first node
                    $def = $definition->info_parent_def;
                } else {
                    $def = $definition->info[$tokens[$i]->name];

                }

                if (!empty($def->child)) {
                    // have DTD child def validate children
                    $result = $def->child->validateChildren(
                        $child_tokens, $config, $context);
                } else {
                    // weird, no child definition, get rid of everything
                    $result = false;
                }

                // determine whether or not this element has any exclusions
                $excludes = $def->excludes;
            }

            // $result is now a bool or array

            //################################################################//
            // Process result by interpreting $result

            if ($result === true || $child_tokens === $result) {
                // leave the node as is

                // register start token as a parental node start
                $stack[] = $i;

                // register exclusions if there are any
                if (!empty($excludes)) $exclude_stack[] = $excludes;

                // move cursor to next possible start node
                $i++;

            } elseif($result === false) {
                // remove entire node

                if ($e) {
                    if ($excluded) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                    } else {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                    }
                }

                // calculate length of inner tokens and current tokens
                $length = $j - $i + 1;

                // perform removal
                array_splice($tokens, $i, $length);

                // update size
                $size -= $length;

                // there is no start token to register,
                // current node is now the next possible start node
                // unless it turns out that we need to do a double-check

                // this is a rough heuristic that covers 100% of HTML's
                // cases and 99% of all other cases. A child definition
                // that would be tricked by this would be something like:
                // ( | a b c) where it's all or nothing. Fortunately,
                // our current implementation claims that that case would
                // not allow empty, even if it did
                //
                // $parent_def is null when the removed node was the implicit
                // root (the stack was empty): there is no parent to rescan,
                // and dereferencing null must be avoided.
                if ($parent_def !== null && !$parent_def->child->allow_empty) {
                    // we need to do a double-check
                    $i = $parent_index;
                    array_pop($stack);
                }

                // PROJECTED OPTIMIZATION: Process all children elements before
                // reprocessing parent node.

            } else {
                // replace node with $result

                // calculate length of inner tokens
                $length = $j - $i - 1;

                if ($e) {
                    if (empty($result) && $length) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                    } else {
                        $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                    }
                }

                // perform replacement
                array_splice($tokens, $i + 1, $length, $result);

                // update size
                $size -= $length;
                $size += count($result);

                // register start token as a parental node start
                $stack[] = $i;

                // register exclusions if there are any
                if (!empty($excludes)) $exclude_stack[] = $excludes;

                // move cursor to next possible start node
                $i++;

            }

            //################################################################//
            // Scroll to next start node

            // We assume, at this point, that $i is the index of the token
            // that is the first possible new start point for a node.

            // Test if the token indeed is a start tag, if not, move forward
            // and test again.
            $size = count($tokens);
            while ($i < $size and !$tokens[$i] instanceof HTMLPurifier_Token_Start) {
                if ($tokens[$i] instanceof HTMLPurifier_Token_End) {
                    // pop a token index off the stack if we ended a node
                    array_pop($stack);
                    // pop an exclusion lookup off exclusion stack if
                    // we ended node and that node had exclusions
                    if ($i == 0 || $i == $size - 1) {
                        // use specialized var if it's the super-parent
                        $s_excludes = $definition->info_parent_def->excludes;
                    } else {
                        $s_excludes = $definition->info[$tokens[$i]->name]->excludes;
                    }
                    if ($s_excludes) {
                        array_pop($exclude_stack);
                    }
                }
                $i++;
            }

        }

        //####################################################################//
        // Post-processing

        // remove implicit parent tokens at the beginning and end
        array_shift($tokens);
        array_pop($tokens);

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        return $tokens;

    }

}
13938
* Takes tokens makes them well-formed (balance end tags, etc.)
13940
class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
13944
* Array stream of tokens being processed.
13949
* Current index in $tokens.
13954
* Current nesting of elements.
13959
* Injectors active in this stream processing.
13961
protected $injectors;
13964
* Current instance of HTMLPurifier_Config.
13969
* Current instance of HTMLPurifier_Context.
13971
protected $context;
13973
/**
 * Makes the token stream well-formed: balances start and end tags, converts
 * between Start/Empty tokens according to the element definition, closes
 * tags left open at the end of the document and runs the active injectors
 * over text, element and end tokens.
 *
 * While running, the context variables 'CurrentNesting', 'InputIndex',
 * 'InputTokens' and 'CurrentToken' are registered; all four are destroyed
 * before returning.
 *
 * @param array  $tokens  Token list to process.
 * @param object $config  Configuration object.
 * @param object $context Context object.
 * @return array Well-formed token list.
 */
public function execute($tokens, $config, $context) {

    $definition = $config->getHTMLDefinition();

    // local variables
    $generator = new HTMLPurifier_Generator($config, $context);
    $escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags');
    $e = $context->get('ErrorCollector', true);
    $t = false; // token index
    $i = false; // injector index
    $token      = false; // the current token
    $reprocess  = false; // whether or not to reprocess the same token
    $stack = array();

    // member variables
    $this->stack   =& $stack;
    $this->t       =& $t;
    $this->tokens  =& $tokens;
    $this->config  = $config;
    $this->context = $context;

    // context variables
    $context->register('CurrentNesting', $stack);
    $context->register('InputIndex',     $t);
    $context->register('InputTokens',    $tokens);
    $context->register('CurrentToken',   $token);

    // -- begin INJECTOR --

    $this->injectors = array();

    $injectors = $config->getBatch('AutoFormat');
    $def_injectors = $definition->info_injector;
    $custom_injectors = $injectors['Custom'];
    unset($injectors['Custom']); // special case
    foreach ($injectors as $injector => $b) {
        $injector = "HTMLPurifier_Injector_$injector";
        if (!$b) continue;
        $this->injectors[] = new $injector;
    }
    foreach ($def_injectors as $injector) {
        // assumed to be objects
        $this->injectors[] = $injector;
    }
    foreach ($custom_injectors as $injector) {
        if (is_string($injector)) {
            $injector = "HTMLPurifier_Injector_$injector";
            $injector = new $injector;
        }
        $this->injectors[] = $injector;
    }

    // give the injectors references to the definition and context
    // variables for performance reasons
    //
    // Injectors whose prepare() reports an error are dropped. The list is
    // rebuilt rather than spliced in place: splicing renumbers the array,
    // so the indices of the loop would be stale after the first removal
    // and a second failing injector would stay enabled (or the wrong one
    // would be removed).
    $usable_injectors = array();
    foreach ($this->injectors as $injector) {
        $error = $injector->prepare($config, $context);
        if (!$error) {
            $usable_injectors[] = $injector;
            continue;
        }
        trigger_error("Cannot enable {$injector->name} injector because $error is not allowed", E_USER_WARNING);
    }
    $this->injectors = $usable_injectors;

    // -- end INJECTOR --

    // a note on punting:
    //      In order to reduce code duplication, whenever some code needs
    //      to make HTML changes in order to make things "correct", the
    //      new HTML gets sent through the purifier, regardless of its
    //      status. This means that if we add a start token, because it
    //      was totally necessary, we don't have to update nesting; we just
    //      punt ($reprocess = true; continue;) and it does that for us.

    // isset is in loop because $tokens size changes during loop exec
    for (
        $t = 0;
        $t == 0 || isset($tokens[$t - 1]);
        // only increment if we don't need to reprocess
        $reprocess ? $reprocess = false : $t++
    ) {

        // check for a rewind
        if (is_int($i) && $i >= 0) {
            // possibility: disable rewinding if the current token has a
            // rewind set on it already. This would offer protection from
            // infinite loop, but might hinder some advanced rewinding.
            $rewind_to = $this->injectors[$i]->getRewind();
            if (is_int($rewind_to) && $rewind_to < $t) {
                if ($rewind_to < 0) $rewind_to = 0;
                while ($t > $rewind_to) {
                    $t--;
                    $prev = $tokens[$t];
                    // indicate that other injectors should not process this token,
                    // but we need to reprocess it
                    unset($prev->skip[$i]);
                    $prev->rewind = $i;
                    if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->stack);
                    elseif ($prev instanceof HTMLPurifier_Token_End) $this->stack[] = $prev->start;
                }
            }
            $i = false;
        }

        // handle case of document end
        if (!isset($tokens[$t])) {
            // kill processing if stack is empty
            if (empty($this->stack)) break;

            // peek
            $top_nesting = array_pop($this->stack);
            $this->stack[] = $top_nesting;

            // send error
            if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
                $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
            }

            // append, don't splice, since this is the end
            $tokens[] = new HTMLPurifier_Token_End($top_nesting->name);

            // punt!
            $reprocess = true;
            continue;
        }

        // if all goes well, this token will be passed through unharmed
        $token = $tokens[$t];

        //echo '<hr>';
        //printTokens($tokens, $t);
        //var_dump($this->stack);

        // quick-check: if it's not a tag, no need to process
        if (empty($token->is_tag)) {
            if ($token instanceof HTMLPurifier_Token_Text) {
                foreach ($this->injectors as $i => $injector) {
                    if (isset($token->skip[$i])) continue;
                    if ($token->rewind !== null && $token->rewind !== $i) continue;
                    $injector->handleText($token);
                    $this->processToken($token, $i);
                    $reprocess = true;
                    break;
                }
            }
            // another possibility is a comment
            continue;
        }

        if (isset($definition->info[$token->name])) {
            $type = $definition->info[$token->name]->child->type;
        } else {
            $type = false; // Type is unknown, treat accordingly
        }

        // quick tag checks: anything that's *not* an end tag
        $ok = false;
        if ($type === 'empty' && $token instanceof HTMLPurifier_Token_Start) {
            // claims to be a start tag but is empty
            $token = new HTMLPurifier_Token_Empty($token->name, $token->attr);
            $ok = true;
        } elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
            // claims to be empty but really is a start tag
            $this->swap(new HTMLPurifier_Token_End($token->name));
            $this->insertBefore(new HTMLPurifier_Token_Start($token->name, $token->attr));
            // punt (since we had to modify the input stream in a non-trivial way)
            $reprocess = true;
            continue;
        } elseif ($token instanceof HTMLPurifier_Token_Empty) {
            // real empty token
            $ok = true;
        } elseif ($token instanceof HTMLPurifier_Token_Start) {
            // start tag

            // ...unless they also have to close their parent
            if (!empty($this->stack)) {

                $parent = array_pop($this->stack);
                $this->stack[] = $parent;

                if (isset($definition->info[$parent->name])) {
                    $elements = $definition->info[$parent->name]->child->getNonAutoCloseElements($config);
                    $autoclose = !isset($elements[$token->name]);
                } else {
                    $autoclose = false;
                }

                if ($autoclose) {
                    if ($e) $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
                    // insert parent end tag before this tag
                    $new_token = new HTMLPurifier_Token_End($parent->name);
                    $new_token->start = $parent;
                    $this->insertBefore($new_token);
                    $reprocess = true;
                    continue;
                }

            }
            $ok = true;
        }

        if ($ok) {
            foreach ($this->injectors as $i => $injector) {
                if (isset($token->skip[$i])) continue;
                if ($token->rewind !== null && $token->rewind !== $i) continue;
                $injector->handleElement($token);
                $this->processToken($token, $i);
                $reprocess = true;
                break;
            }
            if (!$reprocess) {
                // ah, nothing interesting happened; do normal processing
                $this->swap($token);
                if ($token instanceof HTMLPurifier_Token_Start) {
                    $this->stack[] = $token;
                } elseif ($token instanceof HTMLPurifier_Token_End) {
                    throw new HTMLPurifier_Exception('Improper handling of end tag in start code; possible error in MakeWellFormed');
                }
            }
            continue;
        }

        // sanity check: we should be dealing with a closing tag
        if (!$token instanceof HTMLPurifier_Token_End) {
            throw new HTMLPurifier_Exception('Unaccounted for tag token in input stream, bug in HTML Purifier');
        }

        // make sure that we have something open
        if (empty($this->stack)) {
            if ($escape_invalid_tags) {
                if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
                $this->swap(new HTMLPurifier_Token_Text(
                    $generator->generateFromToken($token)
                ));
            } else {
                $this->remove();
                if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
            }
            $reprocess = true;
            continue;
        }

        // first, check for the simplest case: everything closes neatly.
        // Eventually, everything passes through here; if there are problems
        // we modify the input stream accordingly and then punt, so that
        // the tokens get processed again.
        $current_parent = array_pop($this->stack);
        if ($current_parent->name == $token->name) {
            $token->start = $current_parent;
            foreach ($this->injectors as $i => $injector) {
                if (isset($token->skip[$i])) continue;
                if ($token->rewind !== null && $token->rewind !== $i) continue;
                $injector->handleEnd($token);
                $this->processToken($token, $i);
                $this->stack[] = $current_parent;
                $reprocess = true;
                break;
            }
            continue;
        }

        // okay, so we're trying to close the wrong tag

        // undo the pop previous pop
        $this->stack[] = $current_parent;

        // scroll back the entire nest, trying to find our tag.
        // (feature could be to specify how far you'd like to go)
        $size = count($this->stack);
        // -2 because -1 is the last element, but we already checked that
        $skipped_tags = false;
        for ($j = $size - 2; $j >= 0; $j--) {
            if ($this->stack[$j]->name == $token->name) {
                $skipped_tags = array_slice($this->stack, $j);
                break;
            }
        }

        // we didn't find the tag, so remove
        if ($skipped_tags === false) {
            if ($escape_invalid_tags) {
                $this->swap(new HTMLPurifier_Token_Text(
                    $generator->generateFromToken($token)
                ));
                if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
            } else {
                $this->remove();
                if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
            }
            $reprocess = true;
            continue;
        }

        // do errors, in REVERSE $j order: a,b,c with </a></b></c>
        $c = count($skipped_tags);
        if ($e) {
            for ($j = $c - 1; $j > 0; $j--) {
                // notice we exclude $j == 0, i.e. the current ending tag, from
                // the errors...
                if (!isset($skipped_tags[$j]->armor['MakeWellFormed_TagClosedError'])) {
                    $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$j]);
                }
            }
        }

        // insert tags, in FORWARD $j order: c,b,a with </a></b></c>
        for ($j = 1; $j < $c; $j++) {
            // ...as well as from the insertions
            $new_token = new HTMLPurifier_Token_End($skipped_tags[$j]->name);
            $new_token->start = $skipped_tags[$j];
            $this->insertBefore($new_token);
        }
        $reprocess = true;
        continue;
    }

    $context->destroy('CurrentNesting');
    $context->destroy('InputTokens');
    $context->destroy('InputIndex');
    $context->destroy('CurrentToken');

    unset($this->injectors, $this->stack, $this->tokens, $this->t);
    return $tokens;
}
/**
 * Processes arbitrary token values for complicated substitution patterns.
 * In general:
 *
 * If $token is an array, it is a list of tokens to substitute for the
 * current token. These tokens then get individually processed. If there
 * is a leading integer in the list, that integer determines how many
 * tokens from the stream should be removed.
 *
 * If $token is a regular token, it is swapped with the current token.
 *
 * If $token is false, the current token is deleted.
 *
 * If $token is an integer, that number of tokens (with the first token
 * being the current one) will be deleted.
 *
 * @param $token Token substitution value
 * @param $injector Injector that performed the substitution; default is if
 *        this is not an injector related operation.
 * @throws HTMLPurifier_Exception if $token has an unsupported type or asks
 *         for zero tokens to be deleted
 */
protected function processToken($token, $injector = -1) {

    // normalize forms of token
    if (is_object($token)) $token = array(1, $token);
    if (is_int($token))    $token = array($token);
    if ($token === false)  $token = array(1);
    if (!is_array($token)) throw new HTMLPurifier_Exception('Invalid token type from injector');
    // isset() keeps an empty replacement list from raising an undefined
    // offset notice; it is treated as "delete one token", as before
    if (!isset($token[0]) || !is_int($token[0])) array_unshift($token, 1);
    if ($token[0] === 0) throw new HTMLPurifier_Exception('Deleting zero tokens is not valid');

    // $token is now an array with the following form:
    // array(number nodes to delete, new node 1, new node 2, ...)

    $delete = array_shift($token);
    $old = array_splice($this->tokens, $this->t, $delete, $token);

    if ($injector > -1) {
        // determine appropriate skips: every inserted token inherits the
        // skip list of the first replaced token and additionally skips
        // the injector that produced it
        $oldskip = isset($old[0]) ? $old[0]->skip : array();
        foreach ($token as $object) {
            $object->skip = $oldskip;
            $object->skip[$injector] = true;
        }
    }

}
/**
 * Inserts a token before the current token. Cursor now points to this token
 *
 * @param $token Token to splice into the stream at the cursor position
 */
private function insertBefore($token) {
    array_splice($this->tokens, $this->t, 0, array($token));
}
/**
 * Removes current token. Cursor now points to new token occupying previously
 * occupied space.
 */
private function remove() {
    array_splice($this->tokens, $this->t, 1);
}
/**
 * Swap current token with new token. Cursor points to new token (no change).
 *
 * @param $token Token that overwrites the one at the cursor position
 */
private function swap($token) {
    $this->tokens[$this->t] = $token;
}
/**
 * Removes all unrecognized tags from the list of tokens.
 *
 * This strategy iterates through all the tokens and removes unrecognized
 * tokens. If a token is not recognized but a TagTransform is defined for
 * that element, the element will be transformed accordingly.
 */
class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
{

    /**
     * @param $tokens Array of tokens to filter
     * @param $config Configuration object (read for the Core and HTML directives below)
     * @param $context Context object; 'CurrentToken' is registered for the
     *        duration of the call and destroyed before returning
     * @return Array of the tokens that were kept (possibly transformed)
     */
    public function execute($tokens, $config, $context) {
        $definition = $config->getHTMLDefinition();
        $generator = new HTMLPurifier_Generator($config, $context);
        $result = array();

        $escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags');
        $remove_invalid_img  = $config->get('Core', 'RemoveInvalidImg');

        // currently only used to determine if comments should be kept
        $trusted = $config->get('HTML', 'Trusted');

        $remove_script_contents = $config->get('Core', 'RemoveScriptContents');
        $hidden_elements     = $config->get('Core', 'HiddenElements');

        // remove script contents compatibility
        if ($remove_script_contents === true) {
            $hidden_elements['script'] = true;
        } elseif ($remove_script_contents === false && isset($hidden_elements['script'])) {
            unset($hidden_elements['script']);
        }

        $attr_validator = new HTMLPurifier_AttrValidator();

        // removes tokens until it reaches a closing tag with its value
        $remove_until = false;

        // converts comments into text tokens when this is equal to a tag name
        $textify_comments = false;

        $token = false;
        $context->register('CurrentToken', $token);

        $e = false;
        if ($config->get('Core', 'CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        foreach($tokens as $token) {
            if ($remove_until) {
                // inside a hidden element: drop everything that is not a
                // tag carrying the awaited name
                if (empty($token->is_tag) || $token->name !== $remove_until) {
                    continue;
                }
            }
            if (!empty( $token->is_tag )) {
                // DEFINITION CALL

                // before any processing, try to transform the element
                if (
                    isset($definition->info_tag_transform[$token->name])
                ) {
                    $original_name = $token->name;
                    // there is a transformation for this tag
                    // DEFINITION CALL
                    $token = $definition->
                                info_tag_transform[$token->name]->
                                    transform($token, $config, $context);
                    if ($e) $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Tag transform', $original_name);
                }

                if (isset($definition->info[$token->name])) {

                    // mostly everything's good, but
                    // we need to make sure required attributes are in order
                    if (
                        ($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) &&
                        $definition->info[$token->name]->required_attr &&
                        ($token->name != 'img' || $remove_invalid_img) // ensure config option still works
                    ) {
                        $attr_validator->validateToken($token, $config, $context);
                        $ok = true;
                        foreach ($definition->info[$token->name]->required_attr as $name) {
                            if (!isset($token->attr[$name])) {
                                $ok = false;
                                break;
                            }
                        }
                        if (!$ok) {
                            if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Missing required attribute', $name);
                            continue;
                        }
                        // attributes were validated here already, so mark
                        // the token for ValidateAttributes to skip
                        $token->armor['ValidateAttributes'] = true;
                    }

                    if (isset($hidden_elements[$token->name]) && $token instanceof HTMLPurifier_Token_Start) {
                        $textify_comments = $token->name;
                    } elseif ($token->name === $textify_comments && $token instanceof HTMLPurifier_Token_End) {
                        $textify_comments = false;
                    }

                } elseif ($escape_invalid_tags) {
                    // invalid tag, generate HTML representation and insert in
                    if ($e) $e->send(E_WARNING, 'Strategy_RemoveForeignElements: Foreign element to text');
                    $token = new HTMLPurifier_Token_Text(
                        $generator->generateFromToken($token)
                    );
                } else {
                    // check if we need to destroy all of the tag's children
                    // CAN BE GENERICIZED
                    if (isset($hidden_elements[$token->name])) {
                        if ($token instanceof HTMLPurifier_Token_Start) {
                            $remove_until = $token->name;
                        } elseif ($token instanceof HTMLPurifier_Token_Empty) {
                            // do nothing: we're still looking
                        } else {
                            $remove_until = false;
                        }
                        if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign meta element removed');
                    } else {
                        if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed');
                    }
                    continue;
                }
            } elseif ($token instanceof HTMLPurifier_Token_Comment) {
                // textify comments in script tags when they are allowed
                if ($textify_comments !== false) {
                    $data = $token->data;
                    $token = new HTMLPurifier_Token_Text($data);
                } elseif ($trusted) {
                    // keep, but perform comment cleaning
                    if ($e) {
                        // perform check whether or not there's a trailing hyphen
                        if (substr($token->data, -1) == '-') {
                            $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Trailing hyphen in comment removed');
                        }
                    }
                    $token->data = rtrim($token->data, '-');
                    $found_double_hyphen = false;
                    while (strpos($token->data, '--') !== false) {
                        if ($e && !$found_double_hyphen) {
                            $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed');
                        }
                        $found_double_hyphen = true; // prevent double-erroring
                        $token->data = str_replace('--', '-', $token->data);
                    }
                } else {
                    // strip comments
                    if ($e) $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed');
                    continue;
                }
            } elseif ($token instanceof HTMLPurifier_Token_Text) {
                // text passes through unchanged
            } else {
                continue;
            }
            $result[] = $token;
        }
        if ($remove_until && $e) {
            // we removed tokens until the end, throw error
            $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Token removed to end', $remove_until);
        }

        $context->destroy('CurrentToken');

        return $result;
    }

}
/**
 * Validate all attributes in the tokens.
 */
class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
{

    /**
     * @param $tokens Array of tokens whose start/empty tags get validated
     * @param $config Configuration object handed to the attribute validator
     * @param $context Context object; 'CurrentToken' is registered for the
     *        duration of the call and destroyed before returning
     * @return Array of tokens, same keys as the input
     */
    public function execute($tokens, $config, $context) {

        // setup validator
        $validator = new HTMLPurifier_AttrValidator();

        $token = false;
        $context->register('CurrentToken', $token);

        foreach ($tokens as $key => $token) {

            // only process tokens that have attributes,
            //   namely start and empty tags
            if (!$token instanceof HTMLPurifier_Token_Start && !$token instanceof HTMLPurifier_Token_Empty) continue;

            // skip tokens that are armored
            if (!empty($token->armor['ValidateAttributes'])) continue;

            // note that we have no facilities here for removing tokens
            $validator->validateToken($token, $config, $context);

            $tokens[$key] = $token; // for PHP 4
        }
        $context->destroy('CurrentToken');

        return $tokens;
    }

}
/**
 * Transforms FONT tags to the proper form (SPAN with CSS styling)
 *
 * This transformation takes the three proprietary attributes of FONT and
 * transforms them into their corresponding CSS attributes. These are color,
 * face, and size.
 *
 * @note Size is an interesting case because it doesn't map cleanly to CSS.
 *       Thanks to
 *       http://style.cleverchimp.com/font_size_intervals/altintervals.html
 *       for reasonable mappings.
 */
class HTMLPurifier_TagTransform_Font extends HTMLPurifier_TagTransform
{

    public $transform_to = 'span';

    /** Lookup from (normalized) FONT size attribute value to CSS font-size */
    protected $_size_lookup = array(
        '0' => 'xx-small',
        '1' => 'xx-small',
        '2' => 'small',
        '3' => 'medium',
        '4' => 'large',
        '5' => 'x-large',
        '6' => 'xx-large',
        '7' => '300%',
        '-1' => 'smaller',
        '-2' => '60%',
        '+1' => 'larger',
        '+2' => '150%',
        '+3' => '200%',
        '+4' => '300%'
    );

    /**
     * @param $tag Tag token to transform; it is cloned, never modified
     * @param $config Configuration object (unused here)
     * @param $context Context object (unused here)
     * @return Clone of $tag renamed to $transform_to; for non-end tags the
     *         color, face and size attributes are folded into style
     */
    public function transform($tag, $config, $context) {

        if ($tag instanceof HTMLPurifier_Token_End) {
            $new_tag = clone $tag;
            $new_tag->name = $this->transform_to;
            return $new_tag;
        }

        $attr = $tag->attr;
        $prepend_style = '';

        // handle color transform
        if (isset($attr['color'])) {
            $prepend_style .= 'color:' . $attr['color'] . ';';
            unset($attr['color']);
        }

        // handle face transform
        if (isset($attr['face'])) {
            $prepend_style .= 'font-family:' . $attr['face'] . ';';
            unset($attr['face']);
        }

        // handle size transform
        if (isset($attr['size'])) {
            // normalize large numbers; an empty value has no first
            // character to inspect and simply misses the lookup below
            if ($attr['size'] !== '') {
                // square-bracket offsets: the {0} form is gone in PHP 8
                if ($attr['size'][0] == '+' || $attr['size'][0] == '-') {
                    $size = (int) $attr['size'];
                    if ($size < -2) $attr['size'] = '-2';
                    if ($size > 4)  $attr['size'] = '+4';
                } else {
                    $size = (int) $attr['size'];
                    if ($size > 7) $attr['size'] = '7';
                }
            }
            if (isset($this->_size_lookup[$attr['size']])) {
                $prepend_style .= 'font-size:' .
                  $this->_size_lookup[$attr['size']] . ';';
            }
            unset($attr['size']);
        }

        if ($prepend_style) {
            $attr['style'] = isset($attr['style']) ?
                $prepend_style . $attr['style'] :
                $prepend_style;
        }

        $new_tag = clone $tag;
        $new_tag->name = $this->transform_to;
        $new_tag->attr = $attr;

        return $new_tag;

    }
}
/**
 * Simple transformation, just change tag name to something else,
 * and possibly add some styling. This will cover most of the deprecated
 * tag cases.
 */
class HTMLPurifier_TagTransform_Simple extends HTMLPurifier_TagTransform
{

    protected $style;

    /**
     * @param $transform_to Tag name to transform to.
     * @param $style CSS style to add to the tag
     */
    public function __construct($transform_to, $style = null) {
        $this->transform_to = $transform_to;
        $this->style = $style;
    }

    /**
     * @param $tag Tag token to transform; it is cloned, never modified
     * @param $config Configuration object (unused here)
     * @param $context Context object (unused here)
     * @return Renamed clone of $tag; start and empty tags additionally get
     *         the configured style prepended
     */
    public function transform($tag, $config, $context) {
        $new_tag = clone $tag;
        $new_tag->name = $this->transform_to;
        if (!is_null($this->style) &&
            ($new_tag instanceof HTMLPurifier_Token_Start || $new_tag instanceof HTMLPurifier_Token_Empty)
        ) {
            $this->prependCSS($new_tag->attr, $this->style);
        }
        return $new_tag;
    }

}
/**
 * Concrete comment token class. Generally will be ignored.
 */
class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
{
    public $data; /**< Character data within comment. */
    /**
     * Transparent constructor.
     *
     * @param $data String comment data.
     * @param $line Line number stored on the token, may be null
     * @param $col Column number stored on the token, may be null
     */
    public function __construct($data, $line = null, $col = null) {
        $this->data = $data;
        $this->line = $line;
        $this->col  = $col;
    }
}
/**
 * Abstract class of a tag token (start, end or empty), and its behavior.
 */
class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
{
    /**
     * Static bool marker that indicates the class is a tag.
     *
     * This allows us to check objects with <tt>!empty($obj->is_tag)</tt>
     * without having to use a function call <tt>is_a()</tt>.
     */
    public $is_tag = true;

    /**
     * The lower-case name of the tag, like 'a', 'b' or 'blockquote'.
     *
     * @note Strictly speaking, XML tags are case sensitive, so we shouldn't
     * be lower-casing them, but these tokens cater to HTML tags, which are
     * insensitive.
     */
    public $name;

    /**
     * Associative array of the tag's attributes.
     */
    public $attr = array();

    /**
     * Non-overloaded constructor, which lower-cases passed tag name.
     *
     * @param $name String name.
     * @param $attr Associative array of attributes.
     * @param $line Line number stored on the token, may be null
     * @param $col Column number stored on the token, may be null
     */
    public function __construct($name, $attr = array(), $line = null, $col = null) {
        $this->name = ctype_lower($name) ? $name : strtolower($name);
        foreach ($attr as $key => $value) {
            // normalization only necessary when key is not lowercase
            if (!ctype_lower($key)) {
                $new_key = strtolower($key);
                // an already-present lowercase key wins over the mixed-case one
                if (!isset($attr[$new_key])) {
                    $attr[$new_key] = $attr[$key];
                }
                if ($new_key !== $key) {
                    unset($attr[$key]);
                }
            }
        }
        $this->attr = $attr;
        $this->line = $line;
        $this->col  = $col;
    }
}
/**
 * Concrete empty token class.
 */
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
{

}
/**
 * Concrete end token class.
 *
 * @warning This class accepts attributes even though end tags cannot. This
 * is for optimization reasons, as under normal circumstances, the Lexers
 * do not pass attributes.
 */
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
    /**
     * Token that started this node. Added by MakeWellFormed. Please
     * do not edit this!
     */
    public $start;
}
/**
 * Concrete start token class.
 */
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
{

}
/**
 * Concrete text token class.
 *
 * Text tokens comprise of regular parsed character data (PCDATA) and raw
 * character data (from the CDATA sections). Internally, their
 * data is parsed with all entities expanded. Surprisingly, the text token
 * does have a "tag name" called #PCDATA, which is how the DTD represents it
 * in permissible child nodes.
 */
class HTMLPurifier_Token_Text extends HTMLPurifier_Token
{

    public $name = '#PCDATA'; /**< PCDATA tag name compatible with DTD. */
    public $data; /**< Parsed character data of text. */
    public $is_whitespace; /**< Bool indicating if node is whitespace. */

    /**
     * Constructor, accepts data and determines if it is whitespace.
     *
     * @param $data String parsed character data.
     * @param $line Line number stored on the token, may be null
     * @param $col Column number stored on the token, may be null
     */
    public function __construct($data, $line = null, $col = null) {
        $this->data = $data;
        $this->is_whitespace = ctype_space($data);
        $this->line = $line;
        $this->col  = $col;
    }

}
/**
 * URI filter that rejects URIs whose host is not our own host or one of
 * its subdomains; URIs without a host are always accepted.
 */
class HTMLPurifier_URIFilter_DisableExternal extends HTMLPurifier_URIFilter
{
    public $name = 'DisableExternal';

    /** Our host split on dots and reversed, or false when no host is configured */
    protected $ourHostParts = false;

    /**
     * Caches the reversed labels of the host from the URI definition.
     * @param $config Configuration object
     */
    public function prepare($config) {
        $our_host = $config->getDefinition('URI')->host;
        if ($our_host !== null) $this->ourHostParts = array_reverse(explode('.', $our_host));
    }

    /**
     * @param $uri URI object to check (passed by reference, not modified)
     * @param $config Configuration object (unused here)
     * @param $context Context object (unused here)
     * @return True if the URI may stay, false if it must be removed
     */
    public function filter(&$uri, $config, $context) {
        if (is_null($uri->host)) return true;
        // without a known own host every URI with a host is external
        if ($this->ourHostParts === false) return false;
        $host_parts = array_reverse(explode('.', $uri->host));
        // compare label by label from the top-level domain downwards
        foreach ($this->ourHostParts as $i => $x) {
            if (!isset($host_parts[$i])) return false;
            if ($host_parts[$i] != $this->ourHostParts[$i]) return false;
        }
        return true;
    }
}
/**
 * Variant of the DisableExternal filter that only applies to embedded
 * URIs (as flagged by the 'EmbeddedURI' context variable).
 */
class HTMLPurifier_URIFilter_DisableExternalResources extends HTMLPurifier_URIFilter_DisableExternal
{
    public $name = 'DisableExternalResources';

    /**
     * @param $uri URI object to check (passed by reference)
     * @param $config Configuration object
     * @param $context Context object, read for 'EmbeddedURI'
     * @return True if the URI may stay, false if it must be removed
     */
    public function filter(&$uri, $config, $context) {
        // non-embedded URIs are not this filter's concern
        if (!$context->get('EmbeddedURI', true)) return true;
        return parent::filter($uri, $config, $context);
    }
}
/**
 * URI filter that rejects URIs whose host contains any of the fragments
 * listed in the URI.HostBlacklist directive.
 */
class HTMLPurifier_URIFilter_HostBlacklist extends HTMLPurifier_URIFilter
{
    public $name = 'HostBlacklist';

    /** List of host fragments that cause a URI to be rejected */
    protected $blacklist = array();

    /**
     * @param $config Configuration object, read for URI.HostBlacklist
     * @return Always true
     */
    public function prepare($config) {
        $this->blacklist = $config->get('URI', 'HostBlacklist');
        return true;
    }

    /**
     * @param $uri URI object to check (passed by reference, not modified)
     * @param $config Configuration object (unused here)
     * @param $context Context object (unused here)
     * @return False if the host contains a blacklisted fragment, true otherwise
     */
    public function filter(&$uri, $config, $context) {
        // a URI without a host cannot match; returning early avoids
        // handing null to strpos() (deprecated as of PHP 8.1)
        if (is_null($uri->host)) return true;
        foreach($this->blacklist as $blacklisted_host_fragment) {
            if (strpos($uri->host, $blacklisted_host_fragment) !== false) {
                return false;
            }
        }
        return true;
    }
}
// does not support network paths

/**
 * URI filter that resolves relative URIs against the base URI from the
 * URI definition.
 */
class HTMLPurifier_URIFilter_MakeAbsolute extends HTMLPurifier_URIFilter
{
    public $name = 'MakeAbsolute';

    /** Base URI object, or null when none is configured */
    protected $base;

    /** Collapsed path segments of the base URI, minus its last segment */
    protected $basePathStack = array();

    /**
     * Reads the base URI and pre-computes its path stack; emits a warning
     * and leaves the filter inert when no base is configured.
     * @param $config Configuration object
     */
    public function prepare($config) {
        $def = $config->getDefinition('URI');
        $this->base = $def->base;
        if (is_null($this->base)) {
            trigger_error('URI.MakeAbsolute is being ignored due to lack of value for URI.Base configuration', E_USER_WARNING);
            return;
        }
        $this->base->fragment = null; // fragment is invalid for base URI
        $stack = explode('/', $this->base->path);
        array_pop($stack); // discard last segment
        $stack = $this->_collapseStack($stack); // do pre-parsing
        $this->basePathStack = $stack;
    }

    /**
     * @param $uri URI object, rewritten in place when it is relative
     * @param $config Configuration object
     * @param $context Context object
     * @return True if the URI may stay, false if its scheme is unrecognized
     */
    public function filter(&$uri, $config, $context) {
        if (is_null($this->base)) return true; // abort early
        if (
            $uri->path === '' && is_null($uri->scheme) &&
            is_null($uri->host) && is_null($uri->query) && is_null($uri->fragment)
        ) {
            // reference to current document
            $uri = clone $this->base;
            return true;
        }
        if (!is_null($uri->scheme)) {
            // absolute URI already: don't change
            if (!is_null($uri->host)) return true;
            $scheme_obj = $uri->getSchemeObj($config, $context);
            if (!$scheme_obj) {
                // scheme not recognized
                return false;
            }
            if (!$scheme_obj->hierarchical) {
                // non-hierarchal URI with explicit scheme, don't change
                return true;
            }
            // special case: had a scheme but always is hierarchical and had no authority
        }
        if (!is_null($uri->host)) {
            // network path, don't bother
            return true;
        }
        if ($uri->path === '') {
            $uri->path = $this->base->path;
        } elseif ($uri->path[0] !== '/') {
            // relative path, needs more complicated processing
            $stack = explode('/', $uri->path);
            $new_stack = array_merge($this->basePathStack, $stack);
            if ($new_stack[0] !== '' && !is_null($this->base->host)) {
                array_unshift($new_stack, '');
            }
            $new_stack = $this->_collapseStack($new_stack);
            $uri->path = implode('/', $new_stack);
        } else {
            // absolute path, but still we should collapse
            $uri->path = implode('/', $this->_collapseStack(explode('/', $uri->path)));
        }
        // re-combine
        $uri->scheme = $this->base->scheme;
        if (is_null($uri->userinfo)) $uri->userinfo = $this->base->userinfo;
        if (is_null($uri->host))     $uri->host     = $this->base->host;
        if (is_null($uri->port))     $uri->port     = $this->base->port;
        return true;
    }

    /**
     * Resolve dots and double-dots in a path stack
     *
     * @param $stack Array of path segments (a path exploded on '/')
     * @return Array of segments with '.' and resolvable '..' removed; ends
     *         in an empty segment when the last input segment was a dot form
     */
    private function _collapseStack($stack) {
        $result = array();
        $is_folder = false;
        for ($i = 0; isset($stack[$i]); $i++) {
            $is_folder = false;
            // absorb an internally duplicated slash
            if ($stack[$i] == '' && $i && isset($stack[$i+1])) continue;
            if ($stack[$i] == '..') {
                if (!empty($result)) {
                    $segment = array_pop($result);
                    if ($segment === '' && empty($result)) {
                        // error case: attempted to back out too far:
                        // restore the leading slash
                        $result[] = '';
                    } elseif ($segment === '..') {
                        $result[] = '..'; // cannot remove .. with ..
                    }
                } else {
                    // relative path, preserve the double-dots
                    $result[] = '..';
                }
                $is_folder = true;
                continue;
            }
            if ($stack[$i] == '.') {
                // silently absorb
                $is_folder = true;
                continue;
            }
            $result[] = $stack[$i];
        }
        if ($is_folder) $result[] = '';
        return $result;
    }
}
/**
 * Post-filter that replaces browsable URIs having a host with a URI built
 * from the URI.Munge template.
 */
class HTMLPurifier_URIFilter_Munge extends HTMLPurifier_URIFilter
{
    public $name = 'Munge';
    public $post = true;
    private $target, $parser, $doEmbed, $secretKey;

    /** Placeholder => value map substituted into the target template */
    protected $replace = array();

    /**
     * @param $config Configuration object, read for the URI munge directives
     * @return Always true
     */
    public function prepare($config) {
        $this->target    = $config->get('URI', $this->name);
        $this->parser    = new HTMLPurifier_URIParser();
        $this->doEmbed   = $config->get('URI', 'MungeResources');
        $this->secretKey = $config->get('URI', 'MungeSecretKey');
        return true;
    }

    /**
     * @param $uri URI object, replaced by the munged URI when applicable
     * @param $config Configuration object
     * @param $context Context object
     * @return Always true
     */
    public function filter(&$uri, $config, $context) {
        if ($context->get('EmbeddedURI', true) && !$this->doEmbed) return true;

        $scheme_obj = $uri->getSchemeObj($config, $context);
        if (!$scheme_obj) return true; // ignore unknown schemes, maybe another postfilter did it
        if (is_null($uri->host) || empty($scheme_obj->browsable)) {
            return true;
        }

        $this->makeReplace($uri, $config, $context);
        $this->replace = array_map('rawurlencode', $this->replace);

        $new_uri = strtr($this->target, $this->replace);
        $new_uri = $this->parser->parse($new_uri);
        // don't redirect if the target host is the same as the
        // starting host
        if ($uri->host === $new_uri->host) return true;
        $uri = $new_uri; // overwrite
        return true;
    }

    /**
     * Fills $this->replace with the placeholder values for one URI.
     *
     * @param $uri URI object being munged
     * @param $config Configuration object (unused here)
     * @param $context Context object, read for the current token/attribute info
     */
    protected function makeReplace($uri, $config, $context) {
        $string = $uri->toString();
        // always available
        $this->replace['%s'] = $string;
        $this->replace['%r'] = $context->get('EmbeddedURI', true);
        $token = $context->get('CurrentToken', true);
        $this->replace['%n'] = $token ? $token->name : null;
        $this->replace['%m'] = $context->get('CurrentAttr', true);
        $this->replace['%p'] = $context->get('CurrentCSSProperty', true);
        // not always available
        if ($this->secretKey) $this->replace['%t'] = sha1($this->secretKey . ':' . $string);
    }

}
/**
 * Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738.
 */
class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {

    public $default_port = 21;
    public $browsable = true; // usually
    public $hierarchical = true;

    /**
     * @param $uri URI object, modified in place (query dropped, typecode normalized)
     * @param $config Configuration object
     * @param $context Context object
     * @return Always true
     */
    public function validate(&$uri, $config, $context) {
        parent::validate($uri, $config, $context);
        $uri->query    = null;

        // typecode check
        $semicolon_pos = strrpos($uri->path, ';'); // reverse
        if ($semicolon_pos !== false) {
            $type = substr($uri->path, $semicolon_pos + 1); // no semicolon
            $uri->path = substr($uri->path, 0, $semicolon_pos);
            $type_ret = '';
            if (strpos($type, '=') !== false) {
                // figure out whether or not the declaration is correct
                list($key, $typecode) = explode('=', $type, 2);
                if ($key !== 'type') {
                    // invalid key, tack it back on encoded
                    $uri->path .= '%3B' . $type;
                } elseif ($typecode === 'a' || $typecode === 'i' || $typecode === 'd') {
                    $type_ret = ";type=$typecode";
                }
            } else {
                $uri->path .= '%3B' . $type;
            }
            // any semicolon still left in the path is data, not a typecode
            $uri->path = str_replace(';', '%3B', $uri->path);
            $uri->path .= $type_ret;
        }

        return true;
    }

}
/**
 * Validates http (HyperText Transfer Protocol) as defined by RFC 2616
 */
class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme {

    public $default_port = 80;
    public $browsable = true;
    public $hierarchical = true;

    /**
     * @param $uri URI object, modified in place (userinfo dropped)
     * @param $config Configuration object
     * @param $context Context object
     * @return Always true
     */
    public function validate(&$uri, $config, $context) {
        parent::validate($uri, $config, $context);
        $uri->userinfo = null;
        return true;
    }

}
/**
 * Validates https (Secure HTTP) according to http scheme.
 */
class HTMLPurifier_URIScheme_https extends HTMLPurifier_URIScheme_http {

    public $default_port = 443;

}
// VERY RELAXED! Shouldn't cause problems, not even Firefox checks if the
// email is valid, but be careful!

/**
 * Validates mailto (for E-mail) according to RFC 2368
 * @todo Validate the email address
 * @todo Filter allowed query parameters
 */

class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme {

    public $browsable = false;

    /**
     * @param $uri URI object, modified in place (authority parts dropped)
     * @param $config Configuration object
     * @param $context Context object
     * @return Always true
     */
    public function validate(&$uri, $config, $context) {
        parent::validate($uri, $config, $context);
        $uri->userinfo = null;
        $uri->host     = null;
        $uri->port     = null;
        // we need to validate path against RFC 2368's addr-spec
        return true;
    }

}
/**
 * Validates news (Usenet) as defined by generic RFC 1738
 */
class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme {

    public $browsable = false;

    /**
     * @param $uri URI object, modified in place (authority parts and query dropped)
     * @param $config Configuration object
     * @param $context Context object
     * @return Always true
     */
    public function validate(&$uri, $config, $context) {
        parent::validate($uri, $config, $context);
        $uri->userinfo = null;
        $uri->host     = null;
        $uri->port     = null;
        $uri->query    = null;
        // typecode check needed on path
        return true;
    }

}
/**
 * Validates nntp (Network News Transfer Protocol) as defined by generic RFC 1738
 */
class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme {

    public $default_port = 119;
    public $browsable = false;

    /**
     * @param $uri URI object, modified in place (userinfo and query dropped)
     * @param $config Configuration object
     * @param $context Context object
     * @return Always true
     */
    public function validate(&$uri, $config, $context) {
        parent::validate($uri, $config, $context);
        $uri->userinfo = null;
        $uri->query    = null;
        return true;
    }

}
/**
 * Performs safe variable parsing based on types which can be used by
 * users. This may not be able to represent all possible data inputs,
 * however.
 */
class HTMLPurifier_VarParser_Flexible extends HTMLPurifier_VarParser
{

    /**
     * @param $var Value to coerce
     * @param $type One of the type constants of this class (self::STRING etc.)
     * @param $allow_null Whether a null $var is passed through as null
     * @return The coerced value
     * @throws HTMLPurifier_VarParserException for an unrecognized boolean string
     */
    protected function parseImplementation($var, $type, $allow_null) {
        if ($allow_null && $var === null) return null;
        switch ($type) {
            // Note: if code "breaks" from the switch, it triggers a generic
            // exception to be thrown. Specific errors can be specifically
            // done here.
            case self::MIXED :
            case self::ISTRING :
            case self::STRING :
            case self::TEXT :
            case self::ITEXT :
                return $var;
            case self::INT :
                if (is_string($var) && ctype_digit($var)) $var = (int) $var;
                return $var;
            case self::FLOAT :
                if ((is_string($var) && is_numeric($var)) || is_int($var)) $var = (float) $var;
                return $var;
            case self::BOOL :
                if (is_int($var) && ($var === 0 || $var === 1)) {
                    $var = (bool) $var;
                } elseif (is_string($var)) {
                    if ($var == 'on' || $var == 'true' || $var == '1') {
                        $var = true;
                    } elseif ($var == 'off' || $var == 'false' || $var == '0') {
                        $var = false;
                    } else {
                        throw new HTMLPurifier_VarParserException("Unrecognized value '$var' for $type");
                    }
                }
                return $var;
            case self::ALIST :
            case self::HASH :
            case self::LOOKUP :
                if (is_string($var)) {
                    // special case: technically, this is an array with
                    // a single empty string item, but having an empty
                    // array is more intuitive
                    if ($var == '') return array();
                    if (strpos($var, "\n") === false && strpos($var, "\r") === false) {
                        // simplistic string to array method that only works
                        // for simple lists of tag names or alphanumeric characters
                        $var = explode(',',$var);
                    } else {
                        $var = preg_split('/(,|[\n\r]+)/', $var);
                    }
                    // remove spaces
                    foreach ($var as $i => $j) $var[$i] = trim($j);
                    if ($type === self::HASH) {
                        // key:value,key2:value2
                        $nvar = array();
                        foreach ($var as $keypair) {
                            $c = explode(':', $keypair, 2);
                            if (!isset($c[1])) continue;
                            $nvar[$c[0]] = $c[1];
                        }
                        $var = $nvar;
                    }
                }
                if (!is_array($var)) break;
                $keys = array_keys($var);
                // keys 0..n-1 in order means $var is a plain list
                if ($keys === array_keys($keys)) {
                    if ($type == self::ALIST) return $var;
                    elseif ($type == self::LOOKUP) {
                        $new = array();
                        foreach ($var as $key) {
                            $new[$key] = true;
                        }
                        return $new;
                    } else break;
                }
                if ($type === self::LOOKUP) {
                    foreach ($var as $key => $value) {
                        $var[$key] = true;
                    }
                }
                return $var;
            default:
                $this->errorInconsistent(__CLASS__, $type);
        }
        $this->errorGeneric($var, $type);
    }

}
/**
 * This variable parser uses PHP's internal code engine. Because it does
 * this, it can represent all inputs; however, it is dangerous and cannot
 * be used by users.
 */
class HTMLPurifier_VarParser_Native extends HTMLPurifier_VarParser
{

    /**
     * @param $var String containing a PHP expression
     * @param $type Ignored by this parser
     * @param $allow_null Ignored by this parser
     * @return Value the expression evaluates to
     */
    protected function parseImplementation($var, $type, $allow_null) {
        return $this->evalExpression($var);
    }

    /**
     * @param $expr String containing a PHP expression
     * @return Value the expression evaluates to
     * @throws HTMLPurifier_VarParserException if the expression cannot be parsed
     */
    protected function evalExpression($expr) {
        $var = null;
        // SECURITY: eval() executes $expr as PHP code; only ever feed this
        // trusted, developer-authored input.
        try {
            $result = eval("\$var = $expr;");
        } catch (ParseError $e) {
            // PHP 7+ reports a syntax error in the evaluated code by
            // throwing instead of returning false
            throw new HTMLPurifier_VarParserException("Fatal error in evaluated code");
        }
        if ($result === false) {
            throw new HTMLPurifier_VarParserException("Fatal error in evaluated code");
        }
        return $var;
    }

}
5
* This file was auto-generated by generate-includes.php and includes all of
6
* the core files required by HTML Purifier. Use this if performance is a
7
* primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS
8
* FILE, changes will be overwritten the next time the script is run.
13
* You must *not* include any other HTML Purifier files before this file,
14
* because 'require' not 'require_once' is used.
17
* This file requires that the include path contains the HTML Purifier
18
* library directory; this is not auto-set.
25
* HTML Purifier is an HTML filter that will take an arbitrary snippet of
26
* HTML and rigorously test, validate and filter it into a version that
27
* is safe for output onto webpages. It achieves this by:
29
* -# Lexing (parsing into tokens) the document,
30
* -# Executing various strategies on the tokens:
31
* -# Removing all elements not in the whitelist,
32
* -# Making the tokens well-formed,
33
* -# Fixing the nesting of the nodes, and
34
* -# Validating attributes of the nodes; and
35
* -# Generating HTML from the purified tokens.
37
* However, most users will only need to interface with the HTMLPurifier
38
* and HTMLPurifier_Config.
42
HTML Purifier 4.0.0 - Standards Compliant HTML Filtering
43
Copyright (C) 2006-2008 Edward Z. Yang
45
This library is free software; you can redistribute it and/or
46
modify it under the terms of the GNU Lesser General Public
47
License as published by the Free Software Foundation; either
48
version 2.1 of the License, or (at your option) any later version.
50
This library is distributed in the hope that it will be useful,
51
but WITHOUT ANY WARRANTY; without even the implied warranty of
52
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
53
Lesser General Public License for more details.
55
You should have received a copy of the GNU Lesser General Public
56
License along with this library; if not, write to the Free Software
57
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
61
* Facade that coordinates HTML Purifier's subsystems in order to purify HTML.
63
* @note There are several points in which configuration can be specified
64
* for HTML Purifier. The precedence of these (from lowest to
65
* highest) is as follows:
66
* -# Instance: new HTMLPurifier($config)
67
* -# Invocation: purify($html, $config)
68
* These configurations are entirely independent of each other and
69
* are *not* merged (this behavior may change in the future).
71
* @todo We need an easier way to inject strategies using the configuration
77
/** Version of HTML Purifier */
78
public $version = '4.0.0';
80
/** Constant with version of HTML Purifier */
81
const VERSION = '4.0.0';
83
/** Global configuration object */
86
/** Array of extra HTMLPurifier_Filter objects to run on HTML, for backwards compatibility */
87
private $filters = array();
89
/** Single instance of HTML Purifier */
90
private static $instance;
92
protected $strategy, $generator;
95
* Resultant HTMLPurifier_Context of last run purification. Is an array
96
* of contexts if the last called method was purifyArray().
/**
 * Initializes the purifier.
 * @param $config Optional HTMLPurifier_Config object for all instances of
 *                the purifier, if omitted, a default configuration is
 *                supplied (which can be overridden on a per-use basis).
 *                The parameter can also be any type that
 *                HTMLPurifier_Config::create() supports.
 */
public function __construct($config = null) {

    $this->config = HTMLPurifier_Config::create($config);

    $this->strategy     = new HTMLPurifier_Strategy_Core();

}
/**
 * Adds a filter to process the output. First come first serve
 * @param $filter HTMLPurifier_Filter object
 * @note Deprecated: calling this raises an E_USER_WARNING.
 */
public function addFilter($filter) {
    trigger_error('HTMLPurifier->addFilter() is deprecated, use configuration directives in the Filter namespace or Filter.Custom', E_USER_WARNING);
    $this->filters[] = $filter;
}
126
* Filters an HTML snippet/document to be XSS-free and standards-compliant.
128
* @param $html String of HTML to purify
129
* @param $config HTMLPurifier_Config object for this operation, if omitted,
130
* defaults to the config object specified during this
131
* object's construction. The parameter can also be any type
132
* that HTMLPurifier_Config::create() supports.
133
* @return Purified HTML
135
public function purify($html, $config = null) {
137
// :TODO: make the config merge in, instead of replace
138
$config = $config ? HTMLPurifier_Config::create($config) : $this->config;
140
// implementation is partially environment dependent, partially
141
// configuration dependent
142
$lexer = HTMLPurifier_Lexer::create($config);
144
$context = new HTMLPurifier_Context();
146
// setup HTML generator
147
$this->generator = new HTMLPurifier_Generator($config, $context);
148
$context->register('Generator', $this->generator);
150
// set up global context variables
151
if ($config->get('Core.CollectErrors')) {
152
// may get moved out if other facilities use it
153
$language_factory = HTMLPurifier_LanguageFactory::instance();
154
$language = $language_factory->create($config, $context);
155
$context->register('Locale', $language);
157
$error_collector = new HTMLPurifier_ErrorCollector($context);
158
$context->register('ErrorCollector', $error_collector);
161
// setup id_accumulator context, necessary due to the fact that
162
// AttrValidator can be called from many places
163
$id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
164
$context->register('IDAccumulator', $id_accumulator);
166
$html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);
169
$filter_flags = $config->getBatch('Filter');
170
$custom_filters = $filter_flags['Custom'];
171
unset($filter_flags['Custom']);
173
foreach ($filter_flags as $filter => $flag) {
174
if (!$flag) continue;
175
if (strpos($filter, '.') !== false) continue;
176
$class = "HTMLPurifier_Filter_$filter";
177
$filters[] = new $class;
179
foreach ($custom_filters as $filter) {
180
// maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat
181
$filters[] = $filter;
183
$filters = array_merge($filters, $this->filters);
184
// maybe prepare(), but later
186
for ($i = 0, $filter_size = count($filters); $i < $filter_size; $i++) {
187
$html = $filters[$i]->preFilter($html, $config, $context);
192
$this->generator->generateFromTokens(
194
$this->strategy->execute(
195
// list of un-purified tokens
196
$lexer->tokenizeHTML(
198
$html, $config, $context
204
for ($i = $filter_size - 1; $i >= 0; $i--) {
205
$html = $filters[$i]->postFilter($html, $config, $context);
208
$html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
209
$this->context =& $context;
/**
 * Filters an array of HTML snippets.
 *
 * Each element is passed through purify() and written back under its
 * original key. After the call, $this->context holds an array of the
 * per-element contexts, indexed by the same keys.
 *
 * @param $array_of_html Array of HTML strings to purify
 * @param $config Optional HTMLPurifier_Config object for this operation.
 *                See HTMLPurifier::purify() for more details.
 * @return Array of purified HTML
 */
public function purifyArray($array_of_html, $config = null) {
    $context_array = array();
    foreach ($array_of_html as $key => $html) {
        $array_of_html[$key] = $this->purify($html, $config);
        // purify() leaves its context in $this->context; collect it per key
        $context_array[$key] = $this->context;
    }
    $this->context = $context_array;
    return $array_of_html;
}
/**
 * Singleton for enforcing just one HTML Purifier in your system.
 *
 * The stored instance is (re)built when none exists yet or when a truthy
 * $prototype is supplied.
 *
 * @param $prototype Optional prototype HTMLPurifier instance to
 *                   overload singleton with, or HTMLPurifier_Config
 *                   instance to configure the generated version with.
 * @return The shared HTMLPurifier instance
 */
public static function instance($prototype = null) {
    if (!self::$instance || $prototype) {
        if ($prototype instanceof HTMLPurifier) {
            // caller handed us a ready-made purifier: adopt it
            self::$instance = $prototype;
        } elseif ($prototype) {
            // anything else truthy is treated as configuration
            self::$instance = new HTMLPurifier($prototype);
        } else {
            self::$instance = new HTMLPurifier();
        }
    }
    return self::$instance;
}
/**
 * Alias of instance(), forwarding $prototype unchanged.
 *
 * @note Backwards compatibility, see instance()
 */
public static function getInstance($prototype = null) {
    return HTMLPurifier::instance($prototype);
}
262
* Defines common attribute collections that modules reference
265
class HTMLPurifier_AttrCollections
269
* Associative array of attribute collections, indexed by name
271
public $info = array();
274
* Performs all expansions on internal data for use by other inclusions
275
* It also collects all attribute collection extensions from
277
* @param $attr_types HTMLPurifier_AttrTypes instance
278
* @param $modules Hash array of HTMLPurifier_HTMLModule members
280
public function __construct($attr_types, $modules) {
281
// load extensions from the modules
282
foreach ($modules as $module) {
283
foreach ($module->attr_collections as $coll_i => $coll) {
284
if (!isset($this->info[$coll_i])) {
285
$this->info[$coll_i] = array();
287
foreach ($coll as $attr_i => $attr) {
288
if ($attr_i === 0 && isset($this->info[$coll_i][$attr_i])) {
290
$this->info[$coll_i][$attr_i] = array_merge(
291
$this->info[$coll_i][$attr_i], $attr);
294
$this->info[$coll_i][$attr_i] = $attr;
298
// perform internal expansions and inclusions
299
foreach ($this->info as $name => $attr) {
300
// merge attribute collections that include others
301
$this->performInclusions($this->info[$name]);
302
// replace string identifiers with actual attribute objects
303
$this->expandIdentifiers($this->info[$name], $attr_types);
308
* Takes a reference to an attribute associative array and performs
309
* all inclusions specified by the zero index.
310
* @param &$attr Reference to attribute array
312
public function performInclusions(&$attr) {
313
if (!isset($attr[0])) return;
315
$seen = array(); // recursion guard
316
// loop through all the inclusions
317
for ($i = 0; isset($merge[$i]); $i++) {
318
if (isset($seen[$merge[$i]])) continue;
319
$seen[$merge[$i]] = true;
320
// foreach attribute of the inclusion, copy it over
321
if (!isset($this->info[$merge[$i]])) continue;
322
foreach ($this->info[$merge[$i]] as $key => $value) {
323
if (isset($attr[$key])) continue; // also catches more inclusions
324
$attr[$key] = $value;
326
if (isset($this->info[$merge[$i]][0])) {
328
$merge = array_merge($merge, $this->info[$merge[$i]][0]);
335
* Expands all string identifiers in an attribute array by replacing
336
* them with the appropriate values inside HTMLPurifier_AttrTypes
337
* @param &$attr Reference to attribute array
338
* @param $attr_types HTMLPurifier_AttrTypes instance
340
public function expandIdentifiers(&$attr, $attr_types) {
342
// because foreach will process new elements we add, make sure we
344
$processed = array();
346
foreach ($attr as $def_i => $def) {
348
if ($def_i === 0) continue;
350
if (isset($processed[$def_i])) continue;
352
// determine whether or not attribute is required
353
if ($required = (strpos($def_i, '*') !== false)) {
354
// rename the definition
355
unset($attr[$def_i]);
356
$def_i = trim($def_i, '*');
357
$attr[$def_i] = $def;
360
$processed[$def_i] = true;
362
// if we've already got a literal object, move on
363
if (is_object($def)) {
364
// preserve previous required
365
$attr[$def_i]->required = ($required || $attr[$def_i]->required);
369
if ($def === false) {
370
unset($attr[$def_i]);
374
if ($t = $attr_types->get($def)) {
376
$attr[$def_i]->required = $required;
378
unset($attr[$def_i]);
/**
 * Base class for all validating attribute definitions.
 *
 * This family of classes forms the core for not only HTML attribute validation,
 * but also any sort of string that needs to be validated or cleaned (which
 * means CSS properties and composite definitions are defined here too).
 * Besides defining (through code) what precisely makes the string valid,
 * subclasses are also responsible for cleaning the code if possible.
 */
abstract class HTMLPurifier_AttrDef
{

    /**
     * Tells us whether or not an HTML attribute is minimized. Has no
     * meaning in other contexts.
     */
    public $minimized = false;

    /**
     * Tells us whether or not an HTML attribute is required. Has no
     * meaning in other contexts.
     */
    public $required = false;

    /**
     * Validates and cleans passed string according to a definition.
     *
     * @param $string String to be validated and cleaned.
     * @param $config Mandatory HTMLPurifier_Config object.
     * @param $context Mandatory HTMLPurifier_AttrContext object.
     */
    abstract public function validate($string, $config, $context);

    /**
     * Convenience method that parses a string as if it were CDATA.
     *
     * Leading and trailing whitespace is trimmed first; every remaining
     * line feed, tab and carriage return is then replaced by a single
     * space. The processing is modelled on
     * <http://www.w3.org/TR/html4/types.html#h-6.2>.
     *
     * @note This method is not entirely standards compliant, as trim() removes
     *       more types of whitespace than specified in the spec. In practice,
     *       this is rarely a problem, as those extra characters usually have
     *       already been removed by HTMLPurifier_Encoder.
     *
     * @warning This processing is inconsistent with XML's whitespace handling
     *          as specified by section 3.3.3 and referenced XHTML 1.0 section
     *          4.7. However, note that we are NOT necessarily
     *          parsing XML, thus, this behavior may still be correct. We
     *          assume that newlines have been normalized.
     *
     * @param $string String to process
     * @return Processed string
     */
    public function parseCDATA($string) {
        $string = trim($string);
        $string = str_replace(array("\n", "\t", "\r"), ' ', $string);
        return $string;
    }

    /**
     * Factory method for creating this class from a string.
     *
     * @param $string String construction info
     * @return Created AttrDef object corresponding to $string
     */
    public function make($string) {
        // default implementation, return a flyweight of this object.
        // If $string has an effect on the returned object (i.e. you
        // need to overload this method), it is best
        // to clone or instantiate new copies. (Instantiation is safer.)
        return $this;
    }

    /**
     * Removes spaces from rgb(0, 0, 0) so that shorthand CSS properties work
     * properly. THIS IS A HACK!
     *
     * Each of the three components may be an integer or decimal number,
     * optionally followed by '%', and may be surrounded by whitespace
     * (including directly inside the parentheses). Text that does not
     * match is returned untouched.
     *
     * @param $string CSS value that may contain rgb() triplets
     * @return String with every matched rgb() triplet stripped of whitespace
     */
    protected function mungeRgb($string) {
        // one colour component, captured without its surrounding whitespace
        $component = '\s*(\d+(?:\.\d+)?%?)\s*';
        return preg_replace(
            '/rgb\(' . $component . ',' . $component . ',' . $component . '\)/',
            'rgb(\1,\2,\3)',
            $string
        );
    }

}
/**
 * Processes an entire attribute array for corrections needing multiple values.
 *
 * Occasionally, a certain attribute will need to be removed and popped onto
 * another value. Instead of creating a complex return syntax for
 * HTMLPurifier_AttrDef, we just pass the whole attribute array to a
 * specialized object and have that do the special work. That is the
 * family of HTMLPurifier_AttrTransform.
 *
 * An attribute transformation can be assigned to run before or after
 * HTMLPurifier_AttrDef validation; see HTMLPurifier_HTMLDefinition.
 */
abstract class HTMLPurifier_AttrTransform
{

    /**
     * Abstract: makes changes to the attributes dependent on multiple values.
     *
     * @param $attr Assoc array of attributes, usually from
     *              HTMLPurifier_Token_Tag::$attr
     * @param $config Mandatory HTMLPurifier_Config object.
     * @param $context Mandatory HTMLPurifier_Context object
     * @returns Processed attribute array.
     */
    abstract public function transform($attr, $config, $context);

    /**
     * Prepends CSS properties to the style attribute, creating the
     * attribute if it doesn't exist.
     *
     * @param $attr Attribute array to process (passed by reference)
     * @param $css CSS to prepend
     */
    public function prependCSS(&$attr, $css) {
        $attr['style'] = isset($attr['style']) ? $attr['style'] : '';
        $attr['style'] = $css . $attr['style'];
    }

    /**
     * Retrieves and removes an attribute.
     *
     * @param $attr Attribute array to process (passed by reference)
     * @param $key Key of attribute to confiscate
     * @return The attribute's value, or null when $attr[$key] is not set
     */
    public function confiscateAttr(&$attr, $key) {
        if (!isset($attr[$key])) return null;
        $value = $attr[$key];
        unset($attr[$key]);
        return $value;
    }

}
/**
 * Provides lookup array of attribute types to HTMLPurifier_AttrDef objects
 */
class HTMLPurifier_AttrTypes
{
    /**
     * Lookup array of attribute string identifiers to concrete implementations
     */
    protected $info = array();

    /**
     * Constructs the info array, supplying default implementations for
     * attribute types.
     */
    public function __construct() {
        // pseudo-types, must be instantiated via shorthand
        $this->info['Enum']    = new HTMLPurifier_AttrDef_Enum();
        $this->info['Bool']    = new HTMLPurifier_AttrDef_HTML_Bool();

        $this->info['CDATA']    = new HTMLPurifier_AttrDef_Text();
        $this->info['ID']       = new HTMLPurifier_AttrDef_HTML_ID();
        $this->info['Length']   = new HTMLPurifier_AttrDef_HTML_Length();
        $this->info['MultiLength'] = new HTMLPurifier_AttrDef_HTML_MultiLength();
        $this->info['NMTOKENS'] = new HTMLPurifier_AttrDef_HTML_Nmtokens();
        $this->info['Pixels']   = new HTMLPurifier_AttrDef_HTML_Pixels();
        $this->info['Text']     = new HTMLPurifier_AttrDef_Text();
        $this->info['URI']      = new HTMLPurifier_AttrDef_URI();
        $this->info['LanguageCode'] = new HTMLPurifier_AttrDef_Lang();
        $this->info['Color']    = new HTMLPurifier_AttrDef_HTML_Color();

        // unimplemented aliases
        $this->info['ContentType'] = new HTMLPurifier_AttrDef_Text();
        $this->info['ContentTypes'] = new HTMLPurifier_AttrDef_Text();
        $this->info['Charsets'] = new HTMLPurifier_AttrDef_Text();
        $this->info['Character'] = new HTMLPurifier_AttrDef_Text();

        // "proprietary" types
        $this->info['Class'] = new HTMLPurifier_AttrDef_HTML_Class();

        // number is really a positive integer (one or more digits)
        // FIXME: ^^ not always, see start and value of list items
        $this->info['Number']   = new HTMLPurifier_AttrDef_Integer(false, false, true);
    }

    /**
     * Retrieves the implementation registered for a type.
     *
     * The identifier may carry extra construction info after a '#'
     * (only the first '#' splits); that info is passed to the
     * implementation's make() method, or an empty string when absent.
     * An unknown type triggers an E_USER_ERROR; if execution continues
     * past that, nothing is returned.
     *
     * @param $type String type name
     * @return Object AttrDef for type
     */
    public function get($type) {

        // determine if there is any extra info tacked on
        $string = '';
        if (strpos($type, '#') !== false) list($type, $string) = explode('#', $type, 2);

        if (!isset($this->info[$type])) {
            trigger_error('Cannot retrieve undefined attribute type ' . $type, E_USER_ERROR);
            return;
        }

        return $this->info[$type]->make($string);

    }

    /**
     * Sets a new implementation for a type, replacing any existing one.
     *
     * @param $type String type name
     * @param $impl Object AttrDef for type
     */
    public function set($type, $impl) {
        $this->info[$type] = $impl;
    }
}
614
* Validates the attributes of a token. Doesn't manage required attributes
615
* very well. The only reason we factored this out was because RemoveForeignElements
616
* also needed it besides ValidateAttributes.
618
class HTMLPurifier_AttrValidator
622
* Validates the attributes of a token, returning a modified token
623
* that has valid tokens
624
* @param $token Reference to token to validate. We require a reference
625
* because the operation this class performs on the token are
626
* not atomic, so the context CurrentToken to be updated
628
* @param $config Instance of HTMLPurifier_Config
629
* @param $context Instance of HTMLPurifier_Context
631
public function validateToken(&$token, &$config, $context) {
633
$definition = $config->getHTMLDefinition();
634
$e =& $context->get('ErrorCollector', true);
636
// initialize IDAccumulator if necessary
637
$ok =& $context->get('IDAccumulator', true);
639
$id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
640
$context->register('IDAccumulator', $id_accumulator);
643
// initialize CurrentToken if necessary
644
$current_token =& $context->get('CurrentToken', true);
645
if (!$current_token) $context->register('CurrentToken', $token);
648
!$token instanceof HTMLPurifier_Token_Start &&
649
!$token instanceof HTMLPurifier_Token_Empty
652
// create alias to global definition array, see also $defs
654
$d_defs = $definition->info_global_attr;
656
// don't update token until the very end, to ensure an atomic update
657
$attr = $token->attr;
659
// do global transformations (pre)
660
// nothing currently utilizes this
661
foreach ($definition->info_attr_transform_pre as $transform) {
662
$attr = $transform->transform($o = $attr, $config, $context);
664
if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
668
// do local transformations only applicable to this element (pre)
669
// ex. <p align="right"> to <p style="text-align:right;">
670
foreach ($definition->info[$token->name]->attr_transform_pre as $transform) {
671
$attr = $transform->transform($o = $attr, $config, $context);
673
if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
677
// create alias to this element's attribute definition array, see
678
// also $d_defs (global attribute definition array)
680
$defs = $definition->info[$token->name]->attr;
683
$context->register('CurrentAttr', $attr_key);
685
// iterate through all the attribute keypairs
686
// Watch out for name collisions: $key has previously been used
687
foreach ($attr as $attr_key => $value) {
689
// call the definition
690
if ( isset($defs[$attr_key]) ) {
691
// there is a local definition defined
692
if ($defs[$attr_key] === false) {
693
// We've explicitly been told not to allow this element.
694
// This is usually when there's a global definition
695
// that must be overridden.
696
// Theoretically speaking, we could have a
697
// AttrDef_DenyAll, but this is faster!
700
// validate according to the element's definition
701
$result = $defs[$attr_key]->validate(
702
$value, $config, $context
705
} elseif ( isset($d_defs[$attr_key]) ) {
706
// there is a global definition defined, validate according
707
// to the global definition
708
$result = $d_defs[$attr_key]->validate(
709
$value, $config, $context
712
// system never heard of the attribute? DELETE!
716
// put the results into effect
717
if ($result === false || $result === null) {
718
// this is a generic error message that should replaced
719
// with more specific ones when possible
720
if ($e) $e->send(E_ERROR, 'AttrValidator: Attribute removed');
722
// remove the attribute
723
unset($attr[$attr_key]);
724
} elseif (is_string($result)) {
725
// generally, if a substitution is happening, there
726
// was some sort of implicit correction going on. We'll
727
// delegate it to the attribute classes to say exactly what.
729
// simple substitution
730
$attr[$attr_key] = $result;
735
// we'd also want slightly more complicated substitution
736
// involving an array as the return value,
737
// although we're not sure how colliding attributes would
738
// resolve (certain ones would be completely overridden,
739
// others would prepend themselves).
742
$context->destroy('CurrentAttr');
746
// global (error reporting untested)
747
foreach ($definition->info_attr_transform_post as $transform) {
748
$attr = $transform->transform($o = $attr, $config, $context);
750
if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
754
// local (error reporting untested)
755
foreach ($definition->info[$token->name]->attr_transform_post as $transform) {
756
$attr = $transform->transform($o = $attr, $config, $context);
758
if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
762
$token->attr = $attr;
764
// destroy CurrentToken if we made it ourselves
765
if (!$current_token) $context->destroy('CurrentToken');
776
// constants are slow, so we use as few as possible
777
if (!defined('HTMLPURIFIER_PREFIX')) {
778
define('HTMLPURIFIER_PREFIX', dirname(__FILE__) . '/standalone');
779
set_include_path(HTMLPURIFIER_PREFIX . PATH_SEPARATOR . get_include_path());
782
// accommodations for versions earlier than 5.0.2
783
// borrowed from PHP_Compat, LGPL licensed, by Aidan Lister <aidan@php.net>
784
if (!defined('PHP_EOL')) {
785
switch (strtoupper(substr(PHP_OS, 0, 3))) {
787
define('PHP_EOL', "\r\n");
790
define('PHP_EOL', "\r");
793
define('PHP_EOL', "\n");
798
* Bootstrap class that contains meta-functionality for HTML Purifier such as
799
* the autoload function.
802
* This class may be used without any other files from HTML Purifier.
804
class HTMLPurifier_Bootstrap
/**
 * Autoload function for HTML Purifier.
 *
 * Resolves the class to a file through getPath() and requires it
 * relative to HTMLPURIFIER_PREFIX.
 *
 * @param $class Class to load
 * @return False when getPath() yields no file for the class, true once
 *         the file has been required
 */
public static function autoload($class) {
    $file = HTMLPurifier_Bootstrap::getPath($class);
    if (!$file) return false;
    require HTMLPURIFIER_PREFIX . '/' . $file;
    return true;
}
/**
 * Returns the path for a specific class.
 *
 * Only names starting with 'HTMLPurifier' are considered. Classes named
 * HTMLPurifier_Language_* map into HTMLPurifier/Language/classes/ with
 * underscores in the language code turned into dashes; every other class
 * maps underscores to directory separators.
 *
 * @param $class Class name to resolve
 * @return Path relative to HTMLPURIFIER_PREFIX, or false when the class
 *         is not ours or the file does not exist
 */
public static function getPath($class) {
    if (strncmp('HTMLPurifier', $class, 12) !== 0) return false;
    // Custom implementations
    if (strncmp('HTMLPurifier_Language_', $class, 22) === 0) {
        $code = str_replace('_', '-', substr($class, 22));
        $file = 'HTMLPurifier/Language/classes/' . $code . '.php';
    } else {
        $file = str_replace('_', '/', $class) . '.php';
    }
    if (!file_exists(HTMLPURIFIER_PREFIX . '/' . $file)) return false;
    return $file;
}
835
* "Pre-registers" our autoloader on the SPL stack.
837
public static function registerAutoload() {
838
$autoload = array('HTMLPurifier_Bootstrap', 'autoload');
839
if ( ($funcs = spl_autoload_functions()) === false ) {
840
spl_autoload_register($autoload);
841
} elseif (function_exists('spl_autoload_unregister')) {
842
$compat = version_compare(PHP_VERSION, '5.1.2', '<=') &&
843
version_compare(PHP_VERSION, '5.1.0', '>=');
844
foreach ($funcs as $func) {
845
if (is_array($func)) {
846
// :TRICKY: There are some compatibility issues and some
847
// places where we need to error out
848
$reflector = new ReflectionMethod($func[0], $func[1]);
849
if (!$reflector->isStatic()) {
850
throw new Exception('
851
HTML Purifier autoloader registrar is not compatible
852
with non-static object methods due to PHP Bug #44144;
853
Please do not use HTMLPurifier.autoload.php (or any
854
file that includes this file); instead, place the code:
855
spl_autoload_register(array(\'HTMLPurifier_Bootstrap\', \'autoload\'))
856
after your own autoloaders.
859
// Surprisingly, spl_autoload_register supports the
860
// Class::staticMethod callback format, although call_user_func doesn't
861
if ($compat) $func = implode('::', $func);
863
spl_autoload_unregister($func);
865
spl_autoload_register($autoload);
866
foreach ($funcs as $func) spl_autoload_register($func);
/**
 * Super-class for definition datatype objects, implements serialization
 * functions for the class.
 */
abstract class HTMLPurifier_Definition
{

    /**
     * Has setup() been called yet?
     */
    public $setup = false;

    /**
     * What type of definition is it?
     */
    public $type;

    /**
     * Sets up the definition object into the final form, something
     * not done by the constructor.
     *
     * @param $config HTMLPurifier_Config instance
     */
    abstract protected function doSetup($config);

    /**
     * Setup function that aborts if already setup.
     *
     * @param $config HTMLPurifier_Config instance
     */
    public function setup($config) {
        if ($this->setup) return;
        // flag is raised before doSetup() runs, so repeated calls are no-ops
        $this->setup = true;
        $this->doSetup($config);
    }

}
917
* Defines allowed CSS attributes and what their values are.
918
* @see HTMLPurifier_HTMLDefinition
920
class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
923
public $type = 'CSS';
926
* Assoc array of attribute name to definition object.
928
public $info = array();
931
* Constructs the info array. The meat of this class.
933
protected function doSetup($config) {
935
$this->info['text-align'] = new HTMLPurifier_AttrDef_Enum(
936
array('left', 'right', 'center', 'justify'), false);
939
$this->info['border-bottom-style'] =
940
$this->info['border-right-style'] =
941
$this->info['border-left-style'] =
942
$this->info['border-top-style'] = new HTMLPurifier_AttrDef_Enum(
943
array('none', 'hidden', 'dotted', 'dashed', 'solid', 'double',
944
'groove', 'ridge', 'inset', 'outset'), false);
946
$this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_style);
948
$this->info['clear'] = new HTMLPurifier_AttrDef_Enum(
949
array('none', 'left', 'right', 'both'), false);
950
$this->info['float'] = new HTMLPurifier_AttrDef_Enum(
951
array('none', 'left', 'right'), false);
952
$this->info['font-style'] = new HTMLPurifier_AttrDef_Enum(
953
array('normal', 'italic', 'oblique'), false);
954
$this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum(
955
array('normal', 'small-caps'), false);
957
$uri_or_none = new HTMLPurifier_AttrDef_CSS_Composite(
959
new HTMLPurifier_AttrDef_Enum(array('none')),
960
new HTMLPurifier_AttrDef_CSS_URI()
964
$this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum(
965
array('inside', 'outside'), false);
966
$this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum(
967
array('disc', 'circle', 'square', 'decimal', 'lower-roman',
968
'upper-roman', 'lower-alpha', 'upper-alpha', 'none'), false);
969
$this->info['list-style-image'] = $uri_or_none;
971
$this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config);
973
$this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum(
974
array('capitalize', 'uppercase', 'lowercase', 'none'), false);
975
$this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color();
977
$this->info['background-image'] = $uri_or_none;
978
$this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum(
979
array('repeat', 'repeat-x', 'repeat-y', 'no-repeat')
981
$this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum(
982
array('scroll', 'fixed')
984
$this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition();
987
$this->info['border-top-color'] =
988
$this->info['border-bottom-color'] =
989
$this->info['border-left-color'] =
990
$this->info['border-right-color'] =
991
$this->info['background-color'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
992
new HTMLPurifier_AttrDef_Enum(array('transparent')),
993
new HTMLPurifier_AttrDef_CSS_Color()
996
$this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config);
998
$this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_color);
1001
$this->info['border-top-width'] =
1002
$this->info['border-bottom-width'] =
1003
$this->info['border-left-width'] =
1004
$this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1005
new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')),
1006
new HTMLPurifier_AttrDef_CSS_Length('0') //disallow negative
1009
$this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_width);
1011
$this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1012
new HTMLPurifier_AttrDef_Enum(array('normal')),
1013
new HTMLPurifier_AttrDef_CSS_Length()
1016
$this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1017
new HTMLPurifier_AttrDef_Enum(array('normal')),
1018
new HTMLPurifier_AttrDef_CSS_Length()
1021
$this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1022
new HTMLPurifier_AttrDef_Enum(array('xx-small', 'x-small',
1023
'small', 'medium', 'large', 'x-large', 'xx-large',
1024
'larger', 'smaller')),
1025
new HTMLPurifier_AttrDef_CSS_Percentage(),
1026
new HTMLPurifier_AttrDef_CSS_Length()
1029
$this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1030
new HTMLPurifier_AttrDef_Enum(array('normal')),
1031
new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives
1032
new HTMLPurifier_AttrDef_CSS_Length('0'),
1033
new HTMLPurifier_AttrDef_CSS_Percentage(true)
1037
$this->info['margin-top'] =
1038
$this->info['margin-bottom'] =
1039
$this->info['margin-left'] =
1040
$this->info['margin-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1041
new HTMLPurifier_AttrDef_CSS_Length(),
1042
new HTMLPurifier_AttrDef_CSS_Percentage(),
1043
new HTMLPurifier_AttrDef_Enum(array('auto'))
1046
$this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($margin);
1050
$this->info['padding-top'] =
1051
$this->info['padding-bottom'] =
1052
$this->info['padding-left'] =
1053
$this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1054
new HTMLPurifier_AttrDef_CSS_Length('0'),
1055
new HTMLPurifier_AttrDef_CSS_Percentage(true)
1058
$this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($padding);
1060
$this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1061
new HTMLPurifier_AttrDef_CSS_Length(),
1062
new HTMLPurifier_AttrDef_CSS_Percentage()
1065
$trusted_wh = new HTMLPurifier_AttrDef_CSS_Composite(array(
1066
new HTMLPurifier_AttrDef_CSS_Length('0'),
1067
new HTMLPurifier_AttrDef_CSS_Percentage(true),
1068
new HTMLPurifier_AttrDef_Enum(array('auto'))
1070
$max = $config->get('CSS.MaxImgLength');
1072
$this->info['width'] =
1073
$this->info['height'] =
1076
new HTMLPurifier_AttrDef_Switch('img',
1078
new HTMLPurifier_AttrDef_CSS_Composite(array(
1079
new HTMLPurifier_AttrDef_CSS_Length('0', $max),
1080
new HTMLPurifier_AttrDef_Enum(array('auto'))
1082
// For everyone else:
1086
$this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();
1088
$this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily();
1090
// this could use specialized code
1091
$this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum(
1092
array('normal', 'bold', 'bolder', 'lighter', '100', '200', '300',
1093
'400', '500', '600', '700', '800', '900'), false);
1095
// MUST be called after other font properties, as it references
1096
// a CSSDefinition object
1097
$this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config);
1100
$this->info['border'] =
1101
$this->info['border-bottom'] =
1102
$this->info['border-top'] =
1103
$this->info['border-left'] =
1104
$this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config);
1106
$this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(array(
1107
'collapse', 'separate'));
1109
$this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(array(
1112
$this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(array(
1115
$this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1116
new HTMLPurifier_AttrDef_Enum(array('baseline', 'sub', 'super',
1117
'top', 'text-top', 'middle', 'bottom', 'text-bottom')),
1118
new HTMLPurifier_AttrDef_CSS_Length(),
1119
new HTMLPurifier_AttrDef_CSS_Percentage()
1122
$this->info['border-spacing'] = new HTMLPurifier_AttrDef_CSS_Multiple(new HTMLPurifier_AttrDef_CSS_Length(), 2);
1125
$this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(array('nowrap'));
1127
if ($config->get('CSS.Proprietary')) {
1128
$this->doSetupProprietary($config);
1131
if ($config->get('CSS.AllowTricky')) {
1132
$this->doSetupTricky($config);
1135
$allow_important = $config->get('CSS.AllowImportant');
1136
// wrap all attr-defs with decorator that handles !important
1137
foreach ($this->info as $k => $v) {
1138
$this->info[$k] = new HTMLPurifier_AttrDef_CSS_ImportantDecorator($v, $allow_important);
1141
$this->setupConfigStuff($config);
/**
 * Registers the proprietary CSS properties in $this->info: six scrollbar
 * colour properties, three opacity variants and 'filter'.
 *
 * @param $config HTMLPurifier_Config instance (not read by this method)
 */
protected function doSetupProprietary($config) {
    // Internet Explorer only scrollbar colors
    $scrollbar_colors = array(
        'scrollbar-arrow-color',
        'scrollbar-base-color',
        'scrollbar-darkshadow-color',
        'scrollbar-face-color',
        'scrollbar-highlight-color',
        'scrollbar-shadow-color',
    );
    foreach ($scrollbar_colors as $property) {
        // each property gets its own definition object
        $this->info[$property] = new HTMLPurifier_AttrDef_CSS_Color();
    }

    // technically not proprietary, but CSS3, and no one supports it
    foreach (array('opacity', '-moz-opacity', '-khtml-opacity') as $property) {
        $this->info[$property] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
    }

    // only opacity, for now
    $this->info['filter'] = new HTMLPurifier_AttrDef_CSS_Filter();
}
/**
 * Registers the 'display', 'visibility' and 'overflow' properties in
 * $this->info, each as an enumeration of keywords.
 *
 * @param $config HTMLPurifier_Config instance (not read by this method)
 */
protected function doSetupTricky($config) {
    $this->info['display'] = new HTMLPurifier_AttrDef_Enum(array(
        'inline', 'block', 'list-item', 'run-in', 'compact',
        'marker', 'table', 'inline-table', 'table-row-group',
        'table-header-group', 'table-footer-group', 'table-row',
        'table-column-group', 'table-column', 'table-cell', 'table-caption', 'none'
    ));
    $this->info['visibility'] = new HTMLPurifier_AttrDef_Enum(array(
        'visible', 'hidden', 'collapse'
    ));
    $this->info['overflow'] = new HTMLPurifier_AttrDef_Enum(array('visible', 'hidden', 'auto', 'scroll'));
}
/**
 * Performs extra config-based processing. Based off of
 * HTMLPurifier_HTMLDefinition.
 *
 * When %CSS.AllowedProperties is set (non-null), every property in
 * $this->info that is not a key of that lookup is removed, and a warning
 * is raised for every requested property this definition does not know.
 *
 * @param $config HTMLPurifier_Config object
 * @todo Refactor duplicate elements into common class (probably using
 *       composition, not inheritance).
 */
protected function setupConfigStuff($config) {

    // setup allowed elements
    $support = "(for information on implementing this, see the ".
               "support forums) ";
    $allowed_attributes = $config->get('CSS.AllowedProperties');
    if ($allowed_attributes !== null) {
        foreach ($this->info as $name => $d) {
            if(!isset($allowed_attributes[$name])) unset($this->info[$name]);
            // whatever is left in the lookup afterwards was requested
            // but is not supported
            unset($allowed_attributes[$name]);
        }
        // emit errors
        foreach ($allowed_attributes as $name => $d) {
            // :TODO: Is this htmlspecialchars() call really necessary?
            $name = htmlspecialchars($name);
            trigger_error("Style attribute '$name' is not supported $support", E_USER_WARNING);
        }
    }
}
/**
 * Defines allowed child nodes and validates tokens against it.
 */
abstract class HTMLPurifier_ChildDef
{
    /**
     * Type of child definition, usually right-most part of class name lowercase.
     * Used occasionally in terms of context.
     */
    public $type;

    /**
     * Bool that indicates whether or not an empty array of children is okay
     *
     * This is necessary for redundant checking when changes affecting
     * a child node may cause a parent node to now be disallowed.
     */
    public $allow_empty;

    /**
     * Lookup array of all elements that this definition could possibly allow
     */
    public $elements = array();

    /**
     * Get lookup of tag names that should not close this element automatically.
     * All other elements will do so.
     *
     * @param $config HTMLPurifier_Config object (not consulted by this base
     *                implementation)
     * @return Lookup array, i.e. the $elements member
     */
    public function getAllowedElements($config) {
        return $this->elements;
    }

    /**
     * Validates nodes according to definition and returns modification.
     *
     * @param $tokens_of_children Array of HTMLPurifier_Token
     * @param $config HTMLPurifier_Config object
     * @param $context HTMLPurifier_Context object
     * @return bool true to leave nodes as is
     * @return bool false to remove parent node
     * @return array of replacement child tokens
     */
    abstract public function validateChildren($tokens_of_children, $config, $context);
}
/**
 * Configuration object that triggers customizable behavior.
 *
 * @warning This class is strongly defined: that means that the class
 *          will fail if an undefined directive is retrieved or set.
 *
 * @note Many classes that could (although many times don't) use the
 *       configuration object make it a mandatory parameter.  This is
 *       because a configuration object should always be forwarded,
 *       otherwise, you run the risk of missing a parameter and then
 *       being stumped when a configuration directive doesn't work.
 *
 * @todo Reconsider some of the public member variables
 */
class HTMLPurifier_Config
{

    /**
     * HTML Purifier's version
     */
    public $version = '4.0.0';

    /**
     * Bool indicator whether or not to automatically finalize
     * the object if a read operation is done
     */
    public $autoFinalize = true;

    // protected member variables

    /**
     * Namespace indexed array of serials for specific namespaces (see
     * getSerial() for more info).
     */
    protected $serials = array();

    /**
     * Serial for entire configuration object
     */
    protected $serial;

    /**
     * Parser for variables
     */
    protected $parser;

    /**
     * Reference HTMLPurifier_ConfigSchema for value checking
     * @note This is public for introspective purposes. Please don't
     *       abuse!
     */
    public $def;

    /**
     * Indexed array of definitions
     */
    protected $definitions;

    /**
     * Bool indicator whether or not config is finalized
     */
    protected $finalized = false;

    /**
     * Property list containing configuration directives.
     */
    protected $plist;

    /**
     * Whether or not a set is taking place due to an
     * alias lookup.
     */
    private $aliasMode;

    /**
     * Set to false if you do not want line and file numbers in errors
     * (useful when unit testing)
     */
    public $chatty = true;

    /**
     * Current lock; only gets to this namespace are allowed.
     */
    private $lock;

    /**
     * @param $definition HTMLPurifier_ConfigSchema that defines what directives
     *                    are allowed.
     * @param $parent Optional parent property list; defaults to the schema's
     *                default property list.
     */
    public function __construct($definition, $parent = null) {
        $parent = $parent ? $parent : $definition->defaultPlist;
        $this->plist = new HTMLPurifier_PropertyList($parent);
        $this->def = $definition; // keep a copy around for checking
        $this->parser = new HTMLPurifier_VarParser_Flexible();
    }

    /**
     * Convenience constructor that creates a config object based on a mixed var
     * @param mixed $config Variable that defines the state of the config
     *                      object. Can be: a HTMLPurifier_Config() object,
     *                      an array of directives based on loadArray(),
     *                      or a string filename of an ini file.
     * @param HTMLPurifier_ConfigSchema Schema object
     * @return Configured HTMLPurifier_Config object
     */
    public static function create($config, $schema = null) {
        if ($config instanceof HTMLPurifier_Config) {
            // pass-through
            return $config;
        }
        if (!$schema) {
            $ret = HTMLPurifier_Config::createDefault();
        } else {
            $ret = new HTMLPurifier_Config($schema);
        }
        if (is_string($config)) $ret->loadIni($config);
        elseif (is_array($config)) $ret->loadArray($config);
        return $ret;
    }

    /**
     * Creates a new config object that inherits from a previous one.
     * @param HTMLPurifier_Config $config Configuration object to inherit
     *        from.
     * @return HTMLPurifier_Config object with $config as its parent.
     */
    public static function inherit(HTMLPurifier_Config $config) {
        return new HTMLPurifier_Config($config->def, $config->plist);
    }

    /**
     * Convenience constructor that creates a default configuration object.
     * @return Default HTMLPurifier_Config object.
     */
    public static function createDefault() {
        $definition = HTMLPurifier_ConfigSchema::instance();
        $config = new HTMLPurifier_Config($definition);
        return $config;
    }

    /**
     * Retrieves a value from the configuration.
     * @param $key String key
     * @param $a Deprecated: directive name when $key holds only the
     *           namespace (old two-argument API); triggers a warning.
     * @return Directive value, or null (after raising an error) when the
     *         directive is undefined, aliased or blocked by the active lock.
     */
    public function get($key, $a = null) {
        if ($a !== null) {
            $this->triggerError("Using deprecated API: use \$config->get('$key.$a') instead", E_USER_WARNING);
            $key = "$key.$a";
        }
        if (!$this->finalized) $this->autoFinalize();
        if (!isset($this->def->info[$key])) {
            // can't add % due to SimpleTest bug
            $this->triggerError('Cannot retrieve value of undefined directive ' . htmlspecialchars($key),
                E_USER_WARNING);
            return;
        }
        if (isset($this->def->info[$key]->isAlias)) {
            $d = $this->def->info[$key];
            $this->triggerError('Cannot get value from aliased directive, use real name ' . $d->key,
                E_USER_ERROR);
            return;
        }
        if ($this->lock) {
            list($ns) = explode('.', $key);
            if ($ns !== $this->lock) {
                $this->triggerError('Cannot get value of namespace ' . $ns . ' when lock for ' . $this->lock . ' is active, this probably indicates a Definition setup method is accessing directives that are not within its namespace', E_USER_ERROR);
                return;
            }
        }
        return $this->plist->get($key);
    }

    /**
     * Retrieves an array of directives to values from a given namespace
     * @param $namespace String namespace
     */
    public function getBatch($namespace) {
        if (!$this->finalized) $this->autoFinalize();
        $full = $this->getAll();
        if (!isset($full[$namespace])) {
            $this->triggerError('Cannot retrieve undefined namespace ' . htmlspecialchars($namespace),
                E_USER_WARNING);
            return;
        }
        return $full[$namespace];
    }

    /**
     * Returns a md5 signature of a segment of the configuration object
     * that uniquely identifies that particular configuration
     * @note Revision is handled specially and is removed from the batch
     *       before processing!
     * @param $namespace Namespace to get serial for
     */
    public function getBatchSerial($namespace) {
        if (empty($this->serials[$namespace])) {
            $batch = $this->getBatch($namespace);
            unset($batch['DefinitionRev']);
            $this->serials[$namespace] = md5(serialize($batch));
        }
        return $this->serials[$namespace];
    }

    /**
     * Returns a md5 signature for the entire configuration object
     * that uniquely identifies that particular configuration
     */
    public function getSerial() {
        if (empty($this->serial)) {
            $this->serial = md5(serialize($this->getAll()));
        }
        return $this->serial;
    }

    /**
     * Retrieves all directives, organized by namespace
     * @warning This is a pretty inefficient function, avoid if you can
     */
    public function getAll() {
        if (!$this->finalized) $this->autoFinalize();
        $ret = array();
        foreach ($this->plist->squash() as $name => $value) {
            list($ns, $key) = explode('.', $name, 2);
            $ret[$ns][$key] = $value;
        }
        return $ret;
    }

    /**
     * Sets a value to configuration.
     * @param $key String key
     * @param $value Mixed value
     * @param $a Deprecated: value when the old three-argument
     *           (namespace, directive, value) API is used.
     */
    public function set($key, $value, $a = null) {
        if (strpos($key, '.') === false) {
            // legacy call style: set($namespace, $directive, $value)
            $namespace = $key;
            $directive = $value;
            $value = $a;
            $key = "$key.$directive";
            $this->triggerError("Using deprecated API: use \$config->set('$key', ...) instead", E_USER_NOTICE);
        } else {
            list($namespace) = explode('.', $key);
        }
        if ($this->isFinalized('Cannot set directive after finalization')) return;
        if (!isset($this->def->info[$key])) {
            $this->triggerError('Cannot set undefined directive ' . htmlspecialchars($key) . ' to value',
                E_USER_WARNING);
            return;
        }
        $def = $this->def->info[$key];

        if (isset($def->isAlias)) {
            if ($this->aliasMode) {
                $this->triggerError('Double-aliases not allowed, please fix '.
                    'ConfigSchema bug with ' . $key, E_USER_ERROR);
                return;
            }
            $this->aliasMode = true;
            $this->set($def->key, $value);
            $this->aliasMode = false;
            $this->triggerError("$key is an alias, preferred directive name is {$def->key}", E_USER_NOTICE);
            return;
        }

        // Raw type might be negative when using the fully optimized form
        // of stdclass, which indicates allow_null == true
        $rtype = is_int($def) ? $def : $def->type;
        if ($rtype < 0) {
            $type = -$rtype;
            $allow_null = true;
        } else {
            $type = $rtype;
            $allow_null = isset($def->allow_null);
        }

        try {
            $value = $this->parser->parse($value, $type, $allow_null);
        } catch (HTMLPurifier_VarParserException $e) {
            $this->triggerError('Value for ' . $key . ' is of invalid type, should be ' . HTMLPurifier_VarParser::getTypeName($type), E_USER_WARNING);
            return;
        }
        if (is_string($value) && is_object($def)) {
            // resolve value alias if defined
            if (isset($def->aliases[$value])) {
                $value = $def->aliases[$value];
            }
            // check to see if the value is allowed
            if (isset($def->allowed) && !isset($def->allowed[$value])) {
                $this->triggerError('Value not supported, valid values are: ' .
                    $this->_listify($def->allowed), E_USER_WARNING);
                return;
            }
        }
        $this->plist->set($key, $value);

        // reset definitions if the directives they depend on changed
        // this is a very costly process, so it's discouraged
        // with finalization
        if ($namespace == 'HTML' || $namespace == 'CSS' || $namespace == 'URI') {
            $this->definitions[$namespace] = null;
        }

        // cached signatures no longer describe the stored values
        $this->serials[$namespace] = false;
        $this->serial = null;
    }

    /**
     * Convenience function for error reporting
     * @param $lookup Lookup array whose keys are to be listed
     * @return Comma separated string of the lookup's keys
     */
    private function _listify($lookup) {
        $list = array();
        foreach ($lookup as $name => $b) $list[] = $name;
        return implode(', ', $list);
    }

    /**
     * Retrieves object reference to the HTML definition.
     * @param $raw Return a copy that has not been setup yet. Must be
     *             called before it's been setup, otherwise won't work.
     */
    public function getHTMLDefinition($raw = false) {
        return $this->getDefinition('HTML', $raw);
    }

    /**
     * Retrieves object reference to the CSS definition
     * @param $raw Return a copy that has not been setup yet. Must be
     *             called before it's been setup, otherwise won't work.
     */
    public function getCSSDefinition($raw = false) {
        return $this->getDefinition('CSS', $raw);
    }

    /**
     * Retrieves a definition
     * @param $type Type of definition: HTML, CSS, etc
     * @param $raw  Whether or not definition should be returned raw
     * @throws HTMLPurifier_Exception for an unsupported $type, or when a raw
     *         definition is requested without %$type.DefinitionID being set
     */
    public function getDefinition($type, $raw = false) {
        if (!$this->finalized) $this->autoFinalize();
        // temporarily suspend locks, so we can handle recursive definition calls
        $lock = $this->lock;
        $this->lock = null;
        $factory = HTMLPurifier_DefinitionCacheFactory::instance();
        $cache = $factory->create($type, $this);
        $this->lock = $lock;
        if (!$raw) {
            // see if we can quickly supply a definition
            if (!empty($this->definitions[$type])) {
                if (!$this->definitions[$type]->setup) {
                    $this->definitions[$type]->setup($this);
                    $cache->set($this->definitions[$type], $this);
                }
                return $this->definitions[$type];
            }
            // memory check missed, try cache
            $this->definitions[$type] = $cache->get($this);
            if ($this->definitions[$type]) {
                // definition in cache, return it
                return $this->definitions[$type];
            }
        } elseif (
            !empty($this->definitions[$type]) &&
            !$this->definitions[$type]->setup
        ) {
            // raw requested, raw in memory, quick return
            return $this->definitions[$type];
        }
        // quick checks failed, let's create the object
        if ($type == 'HTML') {
            $this->definitions[$type] = new HTMLPurifier_HTMLDefinition();
        } elseif ($type == 'CSS') {
            $this->definitions[$type] = new HTMLPurifier_CSSDefinition();
        } elseif ($type == 'URI') {
            $this->definitions[$type] = new HTMLPurifier_URIDefinition();
        } else {
            throw new HTMLPurifier_Exception("Definition of $type type not supported");
        }
        // quick abort if raw
        if ($raw) {
            if (is_null($this->get($type . '.DefinitionID'))) {
                // fatally error out if definition ID not set
                throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
            }
            return $this->definitions[$type];
        }
        // set it up; the lock restricts get() to this namespace meanwhile
        $this->lock = $type;
        try {
            $this->definitions[$type]->setup($this);
        } catch (Exception $e) {
            // never leave the namespace lock behind on failure
            $this->lock = null;
            throw $e;
        }
        $this->lock = null;
        // save in cache
        $cache->set($this->definitions[$type], $this);
        return $this->definitions[$type];
    }

    /**
     * Loads configuration values from an array with the following structure:
     * Namespace.Directive => Value
     * @param $config_array Configuration associative array
     */
    public function loadArray($config_array) {
        if ($this->isFinalized('Cannot load directives after finalization')) return;
        foreach ($config_array as $key => $value) {
            $key = str_replace('_', '.', $key);
            if (strpos($key, '.') !== false) {
                $this->set($key, $value);
            } else {
                // nested form: Namespace => array(Directive => Value)
                $namespace = $key;
                $namespace_values = $value;
                foreach ($namespace_values as $directive => $value) {
                    $this->set($namespace .'.'. $directive, $value);
                }
            }
        }
    }

    /**
     * Returns a list of array(namespace, directive) for all directives
     * that are allowed in a web-form context as per an allowed
     * namespaces/directives list.
     * @param $allowed List of allowed namespaces/directives; true allows
     *                 everything, a leading '-' on a directive blacklists it
     * @param $schema Object exposing the directive lookup as ->info; the
     *                global HTMLPurifier_ConfigSchema is used when omitted
     */
    public static function getAllowedDirectivesForForm($allowed, $schema = null) {
        if (!$schema) {
            $schema = HTMLPurifier_ConfigSchema::instance();
        }
        if ($allowed !== true) {
             if (is_string($allowed)) $allowed = array($allowed);
             $allowed_ns = array();
             $allowed_directives = array();
             $blacklisted_directives = array();
             foreach ($allowed as $ns_or_directive) {
                 if (strpos($ns_or_directive, '.') !== false) {
                     // directive
                     if ($ns_or_directive[0] == '-') {
                         $blacklisted_directives[substr($ns_or_directive, 1)] = true;
                     } else {
                         $allowed_directives[$ns_or_directive] = true;
                     }
                 } else {
                     // namespace
                     $allowed_ns[$ns_or_directive] = true;
                 }
             }
        }
        $ret = array();
        foreach ($schema->info as $key => $def) {
            list($ns, $directive) = explode('.', $key, 2);
            if ($allowed !== true) {
                if (isset($blacklisted_directives["$ns.$directive"])) continue;
                if (!isset($allowed_directives["$ns.$directive"]) && !isset($allowed_ns[$ns])) continue;
            }
            if (isset($def->isAlias)) continue;
            if ($directive == 'DefinitionID' || $directive == 'DefinitionRev') continue;
            $ret[] = array($ns, $directive);
        }
        return $ret;
    }

    /**
     * Loads configuration values from $_GET/$_POST that were posted
     * via ConfigForm
     * @param $array $_GET or $_POST array to import
     * @param $index Index/name that the config variables are in
     * @param $allowed List of allowed namespaces/directives
     * @param $mq_fix Boolean whether or not to enable magic quotes fix
     * @param $schema Instance of HTMLPurifier_ConfigSchema to use, if not global copy
     */
    public static function loadArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
        $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $schema);
        $config = HTMLPurifier_Config::create($ret, $schema);
        return $config;
    }

    /**
     * Merges in configuration values from $_GET/$_POST to object. NOT STATIC.
     * @note Same parameters as loadArrayFromForm
     */
    public function mergeArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true) {
         $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $this->def);
         $this->loadArray($ret);
    }

    /**
     * Prepares an array from a form into something usable for the more
     * strict parts of HTMLPurifier_Config
     * @return Nested array of Namespace => array(Directive => value); a
     *         non-empty "Null_Namespace.Directive" entry forces null
     */
    public static function prepareArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
        if ($index !== false) $array = (isset($array[$index]) && is_array($array[$index])) ? $array[$index] : array();
        $mq = $mq_fix && function_exists('get_magic_quotes_gpc') && get_magic_quotes_gpc();

        $allowed = HTMLPurifier_Config::getAllowedDirectivesForForm($allowed, $schema);
        $ret = array();
        foreach ($allowed as $key) {
            list($ns, $directive) = $key;
            $skey = "$ns.$directive";
            if (!empty($array["Null_$skey"])) {
                $ret[$ns][$directive] = null;
                continue;
            }
            if (!isset($array[$skey])) continue;
            $value = $mq ? stripslashes($array[$skey]) : $array[$skey];
            $ret[$ns][$directive] = $value;
        }
        return $ret;
    }

    /**
     * Loads configuration values from an ini file
     * @param $filename Name of ini file
     */
    public function loadIni($filename) {
        if ($this->isFinalized('Cannot load directives after finalization')) return;
        $array = parse_ini_file($filename, true);
        if (!is_array($array)) {
            // parse_ini_file() returns false on failure; don't iterate over it
            $this->triggerError('Could not parse ini file ' . htmlspecialchars($filename), E_USER_WARNING);
            return;
        }
        $this->loadArray($array);
    }

    /**
     * Checks whether or not the configuration object is finalized.
     * @param $error String error message, or false for no error
     */
    public function isFinalized($error = false) {
        if ($this->finalized && $error) {
            $this->triggerError($error, E_USER_ERROR);
        }
        return $this->finalized;
    }

    /**
     * Finalizes configuration only if auto finalize is on and not
     * already finalized
     */
    public function autoFinalize() {
        if ($this->autoFinalize) {
            $this->finalize();
        } else {
            $this->plist->squash(true);
        }
    }

    /**
     * Finalizes a configuration object, prohibiting further change
     */
    public function finalize() {
        $this->finalized = true;
        unset($this->parser);
    }

    /**
     * Produces a nicely formatted error message by supplying the
     * stack frame information from two levels up and OUTSIDE of
     * HTMLPurifier_Config.
     * @param $msg Error message
     * @param $no  Error level, one of the E_USER_* constants
     */
    protected function triggerError($msg, $no) {
        // determine previous stack frame
        $backtrace = debug_backtrace();
        if ($this->chatty && isset($backtrace[1])) {
            $frame = $backtrace[1];
            $extra = " on line {$frame['line']} in file {$frame['file']}";
        } else {
            $extra = '';
        }
        trigger_error($msg . $extra, $no);
    }

    /**
     * Returns a serialized form of the configuration object that can
     * be reconstituted.
     */
    public function serialize() {
        // make sure all three definitions are built before serializing
        $this->getDefinition('HTML');
        $this->getDefinition('CSS');
        $this->getDefinition('URI');
        return serialize($this);
    }

}
/**
 * Configuration definition, defines directives and their defaults.
 */
class HTMLPurifier_ConfigSchema {

    /**
     * Defaults of the directives, indexed by directive key
     * ("Namespace.Directive").
     */
    public $defaults = array();

    /**
     * The default property list. Do not edit this property list.
     */
    public $defaultPlist;

    /**
     * Definition of the directives. The structure of this is:
     *
     *  array(
     *      'Namespace.Directive' => new stdclass(),
     *  )
     *
     * The stdclass may have the following properties:
     *
     *  - If isAlias isn't set:
     *      - type: Integer type of directive, see HTMLPurifier_VarParser for definitions
     *      - allow_null: If set, this directive allows null values
     *      - aliases: If set, an associative array of value aliases to real values
     *      - allowed: If set, a lookup array of allowed (string) values
     *  - If isAlias is set:
     *      - key: Key ("Namespace.Directive") this directive aliases to
     *
     * In certain degenerate cases, stdclass will actually be an integer. In
     * that case, the value is equivalent to an stdclass with the type
     * property set to the integer. If the integer is negative, type is
     * equal to the absolute value of integer, and allow_null is true.
     *
     * This class is friendly with HTMLPurifier_Config. If you need introspection
     * about the schema, you're better off using the ConfigSchema_Interchange,
     * which uses more memory but has much richer information.
     */
    public $info = array();

    /**
     * Application-wide singleton
     */
    static protected $singleton;

    public function __construct() {
        $this->defaultPlist = new HTMLPurifier_PropertyList();
    }

    /**
     * Unserializes the default ConfigSchema.
     */
    public static function makeFromSerial() {
        return unserialize(file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser'));
    }

    /**
     * Retrieves an instance of the application-wide configuration definition.
     * @param $prototype Schema object to install as the singleton, true to
     *                   force a reload from the serialized schema, or null
     *                   to return the current singleton (loading it from the
     *                   serialized schema on first use).
     */
    public static function instance($prototype = null) {
        if ($prototype === true || ($prototype === null && HTMLPurifier_ConfigSchema::$singleton === null)) {
            // the "true" case has to be tested before the generic prototype
            // branch, otherwise the boolean itself becomes the singleton
            HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial();
        } elseif ($prototype !== null) {
            HTMLPurifier_ConfigSchema::$singleton = $prototype;
        }
        return HTMLPurifier_ConfigSchema::$singleton;
    }

    /**
     * Defines a directive for configuration
     * @warning This method's signature is slightly different from the legacy
     *          define() static method! Beware!
     * @param $key Key of directive ("Namespace.Directive")
     * @param $default Default value of directive
     * @param $type Allowed type of the directive: either an integer type or
     *              a name looked up in HTMLPurifier_VarParser::$types
     * @param $allow_null Whether or not to allow null values
     */
    public function add($key, $default, $type, $allow_null) {
        $obj = new stdclass();
        $obj->type = is_int($type) ? $type : HTMLPurifier_VarParser::$types[$type];
        if ($allow_null) $obj->allow_null = true;
        $this->info[$key] = $obj;
        $this->defaults[$key] = $default;
        $this->defaultPlist->set($key, $default);
    }

    /**
     * Defines a directive value alias.
     *
     * Directive value aliases are convenient for developers because it lets
     * them set a directive to several values and get the same result.
     * @param $key Key of directive
     * @param $aliases Hash of aliased values to the real alias
     */
    public function addValueAliases($key, $aliases) {
        if (!isset($this->info[$key]->aliases)) {
            $this->info[$key]->aliases = array();
        }
        foreach ($aliases as $alias => $real) {
            $this->info[$key]->aliases[$alias] = $real;
        }
    }

    /**
     * Defines a set of allowed values for a directive.
     * @warning This is slightly different from the corresponding static
     *          method definition.
     * @param $key Key of directive
     * @param $allowed Lookup array of allowed values
     */
    public function addAllowedValues($key, $allowed) {
        $this->info[$key]->allowed = $allowed;
    }

    /**
     * Defines a directive alias for backwards compatibility
     * @param $key Directive that will be aliased
     * @param $new_key Directive that the alias will be to
     */
    public function addAlias($key, $new_key) {
        $obj = new stdclass;
        $obj->key = $new_key;
        $obj->isAlias = true;
        $this->info[$key] = $obj;
    }

    /**
     * Replaces any stdclass that only has the type property with type integer.
     * A stdclass holding exactly type and allow_null becomes the negated
     * type integer.
     */
    public function postProcess() {
        foreach ($this->info as $key => $v) {
            if (count((array) $v) == 1) {
                $this->info[$key] = $v->type;
            } elseif (count((array) $v) == 2 && isset($v->allow_null)) {
                $this->info[$key] = -$v->type;
            }
        }
    }

}
/**
 * Collects the content sets declared by HTML modules and expands them
 * into concrete element lists for use in content models.
 * @todo Unit test
 */
class HTMLPurifier_ContentSets
{

    /**
     * List of content set strings (pipe separators) indexed by name.
     */
    public $info = array();

    /**
     * List of content set lookups (element => true) indexed by name.
     * @note This is in HTMLPurifier_HTMLDefinition->info_content_sets
     */
    public $lookup = array();

    /**
     * Synchronized list of defined content sets (keys of info)
     */
    protected $keys = array();
    /**
     * Synchronized list of defined content values (values of info)
     */
    protected $values = array();

    /**
     * Merges in module's content sets, expands identifiers in the content
     * sets and populates the keys, values and lookup member variables.
     * @param $modules List of HTMLPurifier_HTMLModule
     */
    public function __construct($modules) {
        if (!is_array($modules)) $modules = array($modules);
        // populate content_sets based on module hints
        // sorry, no way of overloading
        foreach ($modules as $module_i => $module) {
            foreach ($module->content_sets as $key => $value) {
                $temp = $this->convertToLookup($value);
                if (isset($this->lookup[$key])) {
                    // add it into the existing content set
                    $this->lookup[$key] = array_merge($this->lookup[$key], $temp);
                } else {
                    $this->lookup[$key] = $temp;
                }
            }
        }
        // repeatedly replace references to other content sets with their
        // members until a full pass changes nothing
        $old_lookup = false;
        while ($old_lookup !== $this->lookup) {
            $old_lookup = $this->lookup;
            foreach ($this->lookup as $i => $set) {
                $add = array();
                foreach ($set as $element => $x) {
                    if (isset($this->lookup[$element])) {
                        $add += $this->lookup[$element];
                        unset($this->lookup[$i][$element]);
                    }
                }
                $this->lookup[$i] += $add;
            }
        }

        foreach ($this->lookup as $key => $lookup) {
            $this->info[$key] = implode(' | ', array_keys($lookup));
        }
        $this->keys   = array_keys($this->info);
        $this->values = array_values($this->info);
    }

    /**
     * Accepts a definition; generates and assigns a ChildDef for it
     * @param $def HTMLPurifier_ElementDef reference
     * @param $module Module that defined the ElementDef
     */
    public function generateChildDef(&$def, $module) {
        if (!empty($def->child)) return; // already done!
        $content_model = $def->content_model;
        // With no content sets the alternation would be empty and match at
        // every word boundary, so only substitute when there are keys.
        if (is_string($content_model) && $this->keys) {
            // Keys are expected to be alphanumeric; quoting keeps the
            // pattern valid even if one is not.
            $quoted = array();
            foreach ($this->keys as $key) {
                $quoted[] = preg_quote($key, '/');
            }
            $def->content_model = preg_replace_callback(
                '/\b(' . implode('|', $quoted) . ')\b/',
                array($this, 'generateChildDefCallback'),
                $content_model
            );
        }
        $def->child = $this->getChildDef($def, $module);
    }

    /**
     * preg_replace_callback() handler for generateChildDef(): maps a matched
     * content set name to its expanded element list.
     * @param $matches Match array; index 0 is the content set name
     */
    public function generateChildDefCallback($matches) {
        return $this->info[$matches[0]];
    }

    /**
     * Instantiates a ChildDef based on content_model and content_model_type
     * member variables in HTMLPurifier_ElementDef
     * @note This will also defer to modules for custom HTMLPurifier_ChildDef
     *       subclasses that need content set expansion
     * @param $def HTMLPurifier_ElementDef to have ChildDef extracted
     * @param $module Module consulted for unrecognized content model types
     * @return HTMLPurifier_ChildDef corresponding to ElementDef
     */
    public function getChildDef($def, $module) {
        $value = $def->content_model;
        if (is_object($value)) {
            trigger_error(
                'Literal object child definitions should be stored in '.
                'ElementDef->child not ElementDef->content_model',
                E_USER_NOTICE
            );
            return $value;
        }
        switch ($def->content_model_type) {
            case 'required':
                return new HTMLPurifier_ChildDef_Required($value);
            case 'optional':
                return new HTMLPurifier_ChildDef_Optional($value);
            case 'empty':
                return new HTMLPurifier_ChildDef_Empty();
            case 'custom':
                return new HTMLPurifier_ChildDef_Custom($value);
        }
        // defer to its module
        $return = false;
        if ($module->defines_child_def) { // save a func call
            $return = $module->getChildDef($def);
        }
        if ($return !== false) return $return;
        // error-out
        trigger_error(
            'Could not determine which ChildDef class to instantiate',
            E_USER_ERROR
        );
        return false;
    }

    /**
     * Converts a string list of elements separated by pipes into
     * a lookup array.
     * @param $string List of elements
     * @return Lookup array of elements
     */
    protected function convertToLookup($string) {
        $array = explode('|', str_replace(' ', '', $string));
        $ret = array();
        foreach ($array as $i => $k) {
            $ret[$k] = true;
        }
        return $ret;
    }

}
/**
 * Registry object that contains information about the current context.
 * @note Existence checks use array_key_exists(), so a variable whose
 *       current value is null still counts as registered.
 * @note Since the variables Context deals with may not be objects,
 *       references are very important here! Do not remove!
 */
class HTMLPurifier_Context
{

    /**
     * Private array that stores the references.
     */
    private $_storage = array();

    /**
     * Registers a variable into the context.
     * @param $name String name
     * @param $ref Reference to variable to be registered
     */
    public function register($name, &$ref) {
        if (array_key_exists($name, $this->_storage)) {
            trigger_error("Name $name produces collision, cannot re-register",
                          E_USER_ERROR);
            return;
        }
        $this->_storage[$name] =& $ref;
    }

    /**
     * Retrieves a variable reference from the context.
     * @param $name String name
     * @param $ignore_error Boolean whether or not to ignore error
     * @return Reference to the registered variable, or a reference to null
     *         when nothing is registered under $name
     */
    public function &get($name, $ignore_error = false) {
        if (!array_key_exists($name, $this->_storage)) {
            if (!$ignore_error) {
                trigger_error("Attempted to retrieve non-existent variable $name",
                              E_USER_ERROR);
            }
            $var = null; // so we can return by reference
            return $var;
        }
        return $this->_storage[$name];
    }

    /**
     * Destroys a variable in the context.
     * @param $name String name
     */
    public function destroy($name) {
        if (!array_key_exists($name, $this->_storage)) {
            trigger_error("Attempted to destroy non-existent variable $name",
                          E_USER_ERROR);
            return;
        }
        unset($this->_storage[$name]);
    }

    /**
     * Checks whether or not the variable exists.
     * @param $name String name
     */
    public function exists($name) {
        return array_key_exists($name, $this->_storage);
    }

    /**
     * Loads a series of variables from an associative array
     * @param $context_array Assoc array of variables to load
     */
    public function loadArray($context_array) {
        foreach ($context_array as $key => $discard) {
            // index into the array so each element is bound by reference
            $this->register($key, $context_array[$key]);
        }
    }

}
2238
* Abstract class representing Definition cache managers that implements
2239
* useful common methods and is a factory.
2240
* @todo Create a separate maintenance file advanced users can use to
2241
* cache their custom HTMLDefinition, which can be loaded
2242
* via a configuration directive
2243
* @todo Implement memcached
2245
abstract class HTMLPurifier_DefinitionCache
2251
* @param $name Type of definition objects this instance of the
2252
* cache will handle.
2254
public function __construct($type) {
2255
$this->type = $type;
2259
* Generates a unique identifier for a particular configuration
2260
* @param Instance of HTMLPurifier_Config
2262
public function generateKey($config) {
2263
return $config->version . ',' . // possibly replace with function calls
2264
$config->getBatchSerial($this->type) . ',' .
2265
$config->get($this->type . '.DefinitionRev');
2269
* Tests whether or not a key is old with respect to the configuration's
2270
* version and revision number.
2271
* @param $key Key to test
2272
* @param $config Instance of HTMLPurifier_Config to test against
2274
public function isOld($key, $config) {
2275
if (substr_count($key, ',') < 2) return true;
2276
list($version, $hash, $revision) = explode(',', $key, 3);
2277
$compare = version_compare($version, $config->version);
2278
// version mismatch, is always old
2279
if ($compare != 0) return true;
2280
// versions match, ids match, check revision number
2282
$hash == $config->getBatchSerial($this->type) &&
2283
$revision < $config->get($this->type . '.DefinitionRev')
2289
* Checks if a definition's type jives with the cache's type
2290
* @note Throws an error on failure
2291
* @param $def Definition object to check
2292
* @return Boolean true if good, false if not
2294
public function checkDefType($def) {
2295
if ($def->type !== $this->type) {
2296
trigger_error("Cannot use definition of type {$def->type} in cache for {$this->type}");
2303
* Adds a definition object to the cache
2305
abstract public function add($def, $config);
2308
* Unconditionally saves a definition object to the cache
2310
abstract public function set($def, $config);
2313
* Replace an object in the cache
2315
abstract public function replace($def, $config);
2318
* Retrieves a definition object from the cache
2320
abstract public function get($config);
2323
* Removes a definition object from the cache
2325
abstract public function remove($config);
2328
* Clears all objects from cache
2330
abstract public function flush($config);
2333
* Clears all expired (older version or revision) objects from cache
2334
* @note Be careful implementing this method as flush. Flush must
2335
* not interfere with other Definition types, and cleanup()
2336
* should not be repeatedly called by userland code.
2338
abstract public function cleanup($config);
2347
* Responsible for creating definition caches.
2349
class HTMLPurifier_DefinitionCacheFactory
2352
protected $caches = array('Serializer' => array());
2353
protected $implementations = array();
2354
protected $decorators = array();
2357
* Initialize default decorators
2359
public function setup() {
2360
$this->addDecorator('Cleanup');
2364
* Retrieves an instance of global definition cache factory.
2366
public static function instance($prototype = null) {
2368
if ($prototype !== null) {
2369
$instance = $prototype;
2370
} elseif ($instance === null || $prototype === true) {
2371
$instance = new HTMLPurifier_DefinitionCacheFactory();
2378
* Registers a new definition cache object
2379
* @param $short Short name of cache object, for reference
2380
* @param $long Full class name of cache object, for construction
2382
public function register($short, $long) {
2383
$this->implementations[$short] = $long;
2387
* Factory method that creates a cache object based on configuration
2388
* @param $type Name of definitions handled by cache
2389
* @param $config Instance of HTMLPurifier_Config
2391
public function create($type, $config) {
2392
$method = $config->get('Cache.DefinitionImpl');
2393
if ($method === null) {
2394
return new HTMLPurifier_DefinitionCache_Null($type);
2396
if (!empty($this->caches[$method][$type])) {
2397
return $this->caches[$method][$type];
2400
isset($this->implementations[$method]) &&
2401
class_exists($class = $this->implementations[$method], false)
2403
$cache = new $class($type);
2405
if ($method != 'Serializer') {
2406
trigger_error("Unrecognized DefinitionCache $method, using Serializer instead", E_USER_WARNING);
2408
$cache = new HTMLPurifier_DefinitionCache_Serializer($type);
2410
foreach ($this->decorators as $decorator) {
2411
$new_cache = $decorator->decorate($cache);
2412
// prevent infinite recursion in PHP 4
2414
$cache = $new_cache;
2416
$this->caches[$method][$type] = $cache;
2417
return $this->caches[$method][$type];
2421
* Registers a decorator to add to all new cache objects
2424
public function addDecorator($decorator)
{
    if (is_string($decorator)) {
        // A short name is expanded to the decorator class name and instantiated.
        $className = 'HTMLPurifier_DefinitionCache_Decorator_' . $decorator;
        $decorator = new $className();
    }
    // Stored under the decorator's own name, so a second decorator with the
    // same name replaces the first.
    $this->decorators[$decorator->name] = $decorator;
}
2439
* Represents a document type, contains information on which modules
2440
* need to be loaded.
2441
* @note This class is inspected by Printer_HTMLDefinition->renderDoctype.
2442
* If structure changes, please update that function.
2444
class HTMLPurifier_Doctype
2447
* Full name of doctype
2452
* List of standard modules (string identifiers or literal objects)
2453
* that this doctype uses
2455
public $modules = array();
2458
* List of modules to use for tidying up code
2460
public $tidyModules = array();
2463
* Is the language derived from XML (i.e. XHTML)?
2468
* List of aliases for this doctype
2470
public $aliases = array();
2473
* Public DTD identifier
2478
* System DTD identifier
2482
public function __construct($name = null, $xml = true, $modules = array(),
2483
$tidyModules = array(), $aliases = array(), $dtd_public = null, $dtd_system = null
2485
$this->name = $name;
2487
$this->modules = $modules;
2488
$this->tidyModules = $tidyModules;
2489
$this->aliases = $aliases;
2490
$this->dtdPublic = $dtd_public;
2491
$this->dtdSystem = $dtd_system;
2499
class HTMLPurifier_DoctypeRegistry
2503
* Hash of doctype names to doctype objects
2505
protected $doctypes;
2508
* Lookup table of aliases to real doctype names
2513
* Registers a doctype to the registry
2514
* @note Accepts a fully-formed doctype object, or the
2515
* parameters for constructing a doctype object
2516
* @param $doctype Name of doctype or literal doctype object
2517
* @param $modules Modules doctype will load
2518
* @param $tidy_modules Modules doctype will load for certain modes
2519
* @param $aliases Alias names for doctype
2520
* @return Editable registered doctype
2522
public function register($doctype, $xml = true, $modules = array(),
2523
$tidy_modules = array(), $aliases = array(), $dtd_public = null, $dtd_system = null
2525
if (!is_array($modules)) $modules = array($modules);
2526
if (!is_array($tidy_modules)) $tidy_modules = array($tidy_modules);
2527
if (!is_array($aliases)) $aliases = array($aliases);
2528
if (!is_object($doctype)) {
2529
$doctype = new HTMLPurifier_Doctype(
2530
$doctype, $xml, $modules, $tidy_modules, $aliases, $dtd_public, $dtd_system
2533
$this->doctypes[$doctype->name] = $doctype;
2534
$name = $doctype->name;
2536
foreach ($doctype->aliases as $alias) {
2537
if (isset($this->doctypes[$alias])) continue;
2538
$this->aliases[$alias] = $name;
2540
// remove old aliases
2541
if (isset($this->aliases[$name])) unset($this->aliases[$name]);
2546
* Retrieves reference to a doctype of a certain name
2547
* @note This function resolves aliases
2548
* @note When possible, use the more fully-featured make()
2549
* @param $doctype Name of doctype
2550
* @return Editable doctype object
2552
public function get($doctype) {
2553
if (isset($this->aliases[$doctype])) $doctype = $this->aliases[$doctype];
2554
if (!isset($this->doctypes[$doctype])) {
2555
trigger_error('Doctype ' . htmlspecialchars($doctype) . ' does not exist', E_USER_ERROR);
2556
$anon = new HTMLPurifier_Doctype($doctype);
2559
return $this->doctypes[$doctype];
2563
* Creates a doctype based on a configuration object,
2564
* will perform initialization on the doctype
2565
* @note Use this function to get a copy of doctype that config
2566
* can hold on to (this is necessary in order to tell
2567
* Generator whether or not the current document is XML
2570
public function make($config)
{
    $doctypeName = $this->getDoctypeFromConfig($config);
    // Return a clone so the caller holds its own copy rather than the
    // instance kept in the registry.
    return clone $this->get($doctypeName);
}
2575
* Retrieves the doctype from the configuration object
2577
public function getDoctypeFromConfig($config) {
2579
$doctype = $config->get('HTML.Doctype');
2580
if (!empty($doctype)) return $doctype;
2581
$doctype = $config->get('HTML.CustomDoctype');
2582
if (!empty($doctype)) return $doctype;
2583
// backwards-compatibility
2584
if ($config->get('HTML.XHTML')) {
2585
$doctype = 'XHTML 1.0';
2587
$doctype = 'HTML 4.01';
2589
if ($config->get('HTML.Strict')) {
2590
$doctype .= ' Strict';
2592
$doctype .= ' Transitional';
2604
* Structure that stores an HTML element definition. Used by
2605
* HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
2606
* @note This class is inspected by HTMLPurifier_Printer_HTMLDefinition.
2607
* Please update that class too.
2608
* @warning If you add new properties to this class, you MUST update
2609
* the mergeIn() method.
2611
class HTMLPurifier_ElementDef
2615
* Does the definition work by itself, or is it created solely
2616
* for the purpose of merging into another definition?
2618
public $standalone = true;
2621
* Associative array of attribute name to HTMLPurifier_AttrDef
2622
* @note Before being processed by HTMLPurifier_AttrCollections
2623
* when modules are finalized during
2624
* HTMLPurifier_HTMLDefinition->setup(), this array may also
2625
* contain an array at index 0 that indicates which attribute
2626
* collections to load into the full array. It may also
2627
* contain string identifiers in lieu of HTMLPurifier_AttrDef,
2628
* see HTMLPurifier_AttrTypes on how they are expanded during
2629
* HTMLPurifier_HTMLDefinition->setup() processing.
2631
public $attr = array();
2634
* Indexed list of tag's HTMLPurifier_AttrTransform to be done before validation
2636
public $attr_transform_pre = array();
2639
* Indexed list of tag's HTMLPurifier_AttrTransform to be done after validation
2641
public $attr_transform_post = array();
2644
* HTMLPurifier_ChildDef of this tag.
2649
* Abstract string representation of internal ChildDef rules. See
2650
* HTMLPurifier_ContentSets for how this is parsed and then transformed
2651
* into an HTMLPurifier_ChildDef.
2652
* @warning This is a temporary variable that is not available after
2653
* being processed by HTMLDefinition
2655
public $content_model;
2658
* Value of $child->type, used to determine which ChildDef to use,
2659
* used in combination with $content_model.
2660
* @warning This must be lowercase
2661
* @warning This is a temporary variable that is not available after
2662
* being processed by HTMLDefinition
2664
public $content_model_type;
2669
* Does the element have a content model (#PCDATA | Inline)*? This
2670
* is important for chameleon ins and del processing in
2671
* HTMLPurifier_ChildDef_Chameleon. Dynamically set: modules don't
2672
* have to worry about this one.
2674
public $descendants_are_inline = false;
2677
* List of the names of required attributes this element has. Dynamically
2678
* populated by HTMLPurifier_HTMLDefinition::getElement
2680
public $required_attr = array();
2683
* Lookup table of tags excluded from all descendants of this tag.
2684
* @note SGML permits exclusions for all descendants, but this is
2685
* not possible with DTDs or XML Schemas. W3C has elected to
2686
* use complicated compositions of content_models to simulate
2687
* exclusion for children, but we go the simpler, SGML-style
2688
* route of flat-out exclusions, which correctly apply to
2689
* all descendants and not just children. Note that the XHTML
2690
* Modularization Abstract Modules are blithely unaware of such
2693
public $excludes = array();
2696
* This tag is explicitly auto-closed by the following tags.
2698
public $autoclose = array();
2701
* Whether or not this is a formatting element affected by the
2702
* "Active Formatting Elements" algorithm.
2707
* Low-level factory constructor for creating new standalone element defs
2709
public static function create($content_model, $content_model_type, $attr) {
2710
$def = new HTMLPurifier_ElementDef();
2711
$def->content_model = $content_model;
2712
$def->content_model_type = $content_model_type;
2718
* Merges the values of another element definition into this one.
2719
* Values from the new element def take precedence if a value is
2722
public function mergeIn($def) {
2724
// later keys takes precedence
2725
foreach($def->attr as $k => $v) {
2727
// merge in the includes
2728
// sorry, no way to override an include
2729
foreach ($v as $v2) {
2730
$this->attr[0][] = $v2;
2735
if (isset($this->attr[$k])) unset($this->attr[$k]);
2738
$this->attr[$k] = $v;
2740
$this->_mergeAssocArray($this->attr_transform_pre, $def->attr_transform_pre);
2741
$this->_mergeAssocArray($this->attr_transform_post, $def->attr_transform_post);
2742
$this->_mergeAssocArray($this->excludes, $def->excludes);
2744
if(!empty($def->content_model)) {
2745
$this->content_model =
2746
str_replace("#SUPER", $this->content_model, $def->content_model);
2747
$this->child = false;
2749
if(!empty($def->content_model_type)) {
2750
$this->content_model_type = $def->content_model_type;
2751
$this->child = false;
2753
if(!is_null($def->child)) $this->child = $def->child;
2754
if(!is_null($def->formatting)) $this->formatting = $def->formatting;
2755
if($def->descendants_are_inline) $this->descendants_are_inline = $def->descendants_are_inline;
2760
* Merges one array into another, removes values which equal false
2761
* @param $a1 Array by reference that is merged into
2762
* @param $a2 Array that merges into $a1
2764
private function _mergeAssocArray(&$a1, $a2) {
2765
foreach ($a2 as $k => $v) {
2767
if (isset($a1[$k])) unset($a1[$k]);
2781
* A UTF-8 specific character encoder that handles cleaning and transforming.
2782
* @note All functions in this class should be static.
2784
class HTMLPurifier_Encoder
2788
* Constructor throws fatal error if you attempt to instantiate class
2790
private function __construct() {
2791
trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
2795
* Error-handler that mutes errors, alternative to shut-up operator.
2797
public static function muteErrorHandler() {}
2800
* Cleans a UTF-8 string for well-formedness and SGML validity
2802
* It will parse according to UTF-8 and return a valid UTF8 string, with
2803
* non-SGML codepoints excluded.
2805
* @note Just for reference, the non-SGML code points are 0 to 31 and
2806
* 127 to 159, inclusive. However, we allow code points 9, 10
2807
* and 13, which are the tab, line feed and carriage return
2808
* respectively. 128 and above the code points map to multibyte
2809
* UTF-8 representations.
2811
* @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
2812
* hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
2813
* LGPL license. Notes on what changed are inside, but in general,
2814
* the original code transformed UTF-8 text into an array of integer
2815
* Unicode codepoints. Understandably, transforming that back to
2816
* a string would be somewhat expensive, so the function was modded to
2817
* directly operate on the string. However, this discourages code
2818
* reuse, and the logic enumerated here would be useful for any
2819
* function that needs to be able to understand UTF-8 characters.
2820
* As of right now, only smart lossless character encoding converters
2821
* would need that, and I'm probably not going to implement them.
2822
* Once again, PHP 6 should solve all our problems.
2824
public static function cleanUTF8($str, $force_php = false) {
2826
// UTF-8 validity is checked since PHP 4.3.5
2827
// This is an optimization: if the string is already valid UTF-8, no
2828
// need to do PHP stuff. 99% of the time, this will be the case.
2829
// The regexp matches the XML char production, as well as excluding
2830
// non-SGML codepoints U+007F to U+009F
2831
if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
2835
$mState = 0; // cached expected number of octets after the current octet
2836
// until the beginning of the next UTF8 character sequence
2837
$mUcs4 = 0; // cached Unicode character
2838
$mBytes = 1; // cached expected number of octets in the current sequence
2840
// original code involved an $out that was an array of Unicode
2841
// codepoints. Instead of having to convert back into UTF-8, we've
2842
// decided to directly append valid UTF-8 characters onto a string
2843
// $out once they're done. $char accumulates raw bytes, while $mUcs4
2844
// turns into the Unicode code point, so there's some redundancy.
2849
$len = strlen($str);
2850
for($i = 0; $i < $len; $i++) {
2851
// Read the current byte with bracket syntax: the curly-brace string offset
// ($str{$i}) is deprecated in PHP 7.4 and a parse error in PHP 8.
$in = ord($str[$i]);
2852
$char .= $str[$i]; // append byte to char
2854
// When mState is zero we expect either a US-ASCII character
2855
// or a multi-octet sequence.
2856
if (0 == (0x80 & ($in))) {
2857
// US-ASCII, pass straight through.
2858
if (($in <= 31 || $in == 127) &&
2859
!($in == 9 || $in == 13 || $in == 10) // save \r\t\n
2861
// control characters, remove
2868
} elseif (0xC0 == (0xE0 & ($in))) {
2869
// First octet of 2 octet sequence
2871
$mUcs4 = ($mUcs4 & 0x1F) << 6;
2874
} elseif (0xE0 == (0xF0 & ($in))) {
2875
// First octet of 3 octet sequence
2877
$mUcs4 = ($mUcs4 & 0x0F) << 12;
2880
} elseif (0xF0 == (0xF8 & ($in))) {
2881
// First octet of 4 octet sequence
2883
$mUcs4 = ($mUcs4 & 0x07) << 18;
2886
} elseif (0xF8 == (0xFC & ($in))) {
2887
// First octet of 5 octet sequence.
2889
// This is illegal because the encoded codepoint must be
2891
// (a) not the shortest form or
2892
// (b) outside the Unicode range of 0-0x10FFFF.
2893
// Rather than trying to resynchronize, we will carry on
2894
// until the end of the sequence and let the later error
2895
// handling code catch it.
2897
$mUcs4 = ($mUcs4 & 0x03) << 24;
2900
} elseif (0xFC == (0xFE & ($in))) {
2901
// First octet of 6 octet sequence, see comments for 5
2904
$mUcs4 = ($mUcs4 & 1) << 30;
2908
// Current octet is neither in the US-ASCII range nor a
2909
// legal first octet of a multi-octet sequence.
2916
// When mState is non-zero, we expect a continuation of the
2917
// multi-octet sequence
2918
if (0x80 == (0xC0 & ($in))) {
2919
// Legal continuation.
2920
$shift = ($mState - 1) * 6;
2922
$tmp = ($tmp & 0x0000003F) << $shift;
2925
if (0 == --$mState) {
2926
// End of the multi-octet sequence. mUcs4 now contains
2927
// the final Unicode codepoint to be output
2929
// Check for illegal sequences and codepoints.
2931
// From Unicode 3.1, non-shortest form is illegal
2932
if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
2933
((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
2934
((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
2936
// From Unicode 3.2, surrogate characters = illegal
2937
(($mUcs4 & 0xFFFFF800) == 0xD800) ||
2938
// Codepoints outside the Unicode range are illegal
2942
} elseif (0xFEFF != $mUcs4 && // omit BOM
2943
// check for valid Char unicode codepoints
2948
(0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
2949
// 7F-9F is not strictly prohibited by XML,
2950
// but it is non-SGML, and thus we don't allow it
2951
(0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
2952
(0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
2957
// initialize UTF8 cache (reset)
2964
// ((0xC0 & (*in) != 0x80) && (mState != 0))
2965
// Incomplete multi-octet sequence.
2966
// used to result in complete fail, but we'll reset
2978
* Translates a Unicode codepoint into its corresponding UTF-8 character.
2979
* @note Based on Feyd's function at
2980
* <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
2981
* which is in public domain.
2982
* @note While we're going to do code point parsing anyway, a good
2983
* optimization would be to refuse to translate code points that
2984
* are non-SGML characters. However, this could lead to duplication.
2985
* @note This is very similar to the unichr function in
2986
* maintenance/generate-entity-file.php (although this is superior,
2987
* due to its sanity checks).
2990
// +----------+----------+----------+----------+
2991
// | 33222222 | 22221111 | 111111 | |
2992
// | 10987654 | 32109876 | 54321098 | 76543210 | bit
2993
// +----------+----------+----------+----------+
2994
// | | | | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
2995
// | | | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
2996
// | | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
2997
// | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
2998
// +----------+----------+----------+----------+
2999
// | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
3000
// | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
3001
// +----------+----------+----------+----------+
3003
public static function unichr($code) {
3004
if($code > 1114111 or $code < 0 or
3005
($code >= 55296 and $code <= 57343) ) {
3006
// bits are set outside the "valid" range as defined
3011
$x = $y = $z = $w = 0;
3013
// regular ASCII character
3016
// set up bits for UTF-8
3017
$x = ($code & 63) | 128;
3019
$y = (($code & 2047) >> 6) | 192;
3021
$y = (($code & 4032) >> 6) | 128;
3023
$z = (($code >> 12) & 15) | 224;
3025
$z = (($code >> 12) & 63) | 128;
3026
$w = (($code >> 18) & 7) | 240;
3030
// set up the actual character
3032
if($w) $ret .= chr($w);
3033
if($z) $ret .= chr($z);
3034
if($y) $ret .= chr($y);
3041
* Converts a string to UTF-8 based on configuration.
3043
public static function convertToUTF8($str, $config, $context) {
3044
$encoding = $config->get('Core.Encoding');
3045
if ($encoding === 'utf-8') return $str;
3046
static $iconv = null;
3047
if ($iconv === null) $iconv = function_exists('iconv');
3048
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
3049
if ($iconv && !$config->get('Test.ForceNoIconv')) {
3050
$str = iconv($encoding, 'utf-8//IGNORE', $str);
3051
if ($str === false) {
3052
// $encoding is not a valid encoding
3053
restore_error_handler();
3054
trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
3057
// If the string is bjorked by Shift_JIS or a similar encoding
3058
// that doesn't support all of ASCII, convert the naughty
3059
// characters to their true byte-wise ASCII/UTF-8 equivalents.
3060
$str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
3061
restore_error_handler();
3063
} elseif ($encoding === 'iso-8859-1') {
3064
$str = utf8_encode($str);
3065
restore_error_handler();
3068
// The muting error handler installed earlier in this function is still
// active here. Remove it first, as the invalid-encoding path does, so this
// error is actually reported and the handler does not leak to the caller.
restore_error_handler();
trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
3072
* Converts a string from UTF-8 based on configuration.
3073
* @note Currently, this is a lossy conversion, with unexpressable
3074
* characters being omitted.
3076
public static function convertFromUTF8($str, $config, $context) {
3077
$encoding = $config->get('Core.Encoding');
3078
if ($encoding === 'utf-8') return $str;
3079
static $iconv = null;
3080
if ($iconv === null) $iconv = function_exists('iconv');
3081
if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
3082
$str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
3084
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
3085
if ($iconv && !$config->get('Test.ForceNoIconv')) {
3086
// Undo our previous fix in convertToUTF8, otherwise iconv will barf
3087
$ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
3088
if (!$escape && !empty($ascii_fix)) {
3089
$clear_fix = array();
3090
foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
3091
$str = strtr($str, $clear_fix);
3093
$str = strtr($str, array_flip($ascii_fix));
3095
$str = iconv('utf-8', $encoding . '//IGNORE', $str);
3096
restore_error_handler();
3098
} elseif ($encoding === 'iso-8859-1') {
3099
$str = utf8_decode($str);
3100
restore_error_handler();
3103
// The muting error handler installed earlier in this function is still
// active here. Remove it first so this error is actually reported and the
// handler does not leak to the caller.
restore_error_handler();
trigger_error('Encoding not supported', E_USER_ERROR);
3107
* Lossless (character-wise) conversion of HTML to ASCII
3108
* @param $str UTF-8 string to be converted to ASCII
3109
* @returns ASCII encoded string with non-ASCII character entity-ized
3110
* @warning Adapted from MediaWiki, claiming fair use: this is a common
3111
* algorithm. If you disagree with this license fudgery,
3112
* implement it yourself.
3113
* @note Uses decimal numeric entities since they are best supported.
3114
* @note This is a DUMB function: it has no concept of keeping
3115
* character entities that the projected character encoding
3116
* can allow. We could possibly implement a smart version
3117
* but that would require it to also know which Unicode
3118
* codepoints the charset supported (not an easy task).
3119
* @note Sort of with cleanUTF8() but it assumes that $str is
3122
public static function convertToASCIIDumbLossless($str) {
3126
$len = strlen($str);
3127
for( $i = 0; $i < $len; $i++ ) {
3128
$bytevalue = ord( $str[$i] );
3129
if( $bytevalue <= 0x7F ) { //0xxx xxxx
3130
$result .= chr( $bytevalue );
3132
} elseif( $bytevalue <= 0xBF ) { //10xx xxxx
3133
$working = $working << 6;
3134
$working += ($bytevalue & 0x3F);
3136
if( $bytesleft <= 0 ) {
3137
$result .= "&#" . $working . ";";
3139
} elseif( $bytevalue <= 0xDF ) { //110x xxxx
3140
$working = $bytevalue & 0x1F;
3142
} elseif( $bytevalue <= 0xEF ) { //1110 xxxx
3143
$working = $bytevalue & 0x0F;
3145
} else { //1111 0xxx
3146
$working = $bytevalue & 0x07;
3154
* This expensive function tests whether or not a given character
3155
* encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
3156
* fail this test, and require special processing. Variable width
3157
* encodings shouldn't ever fail.
3159
* @param string $encoding Encoding name to test, as per iconv format
3160
* @param bool $bypass Whether or not to bypass the precompiled arrays.
3161
* @return Array of UTF-8 characters to their corresponding ASCII,
3162
* which can be used to "undo" any overzealous iconv action.
3164
public static function testEncodingSupportsASCII($encoding, $bypass = false) {
3165
static $encodings = array();
3167
if (isset($encodings[$encoding])) return $encodings[$encoding];
3168
$lenc = strtolower($encoding);
3171
return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
3173
return array("\xE2\x82\xA9" => '\\');
3175
if (strpos($lenc, 'iso-8859-') === 0) return array();
3178
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
3179
if (iconv('UTF-8', $encoding, 'a') === false) {
    // Unknown encoding: undo the set_error_handler() call just above before
    // bailing out, otherwise the muting handler stays installed.
    restore_error_handler();
    return false;
}
3180
for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
3181
$c = chr($i); // UTF-8 char
3182
$r = iconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion
3185
// This line is needed for iconv implementations that do not
3186
// omit characters that do not exist in the target character set
3187
($r === $c && iconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
3189
// Reverse engineer: what's the UTF-8 equiv of this byte
3190
// sequence? This assumes that there's no variable width
3191
// encoding that doesn't support ASCII.
3192
$ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
3195
restore_error_handler();
3196
$encodings[$encoding] = $ret;
3208
* Object that provides entity lookup table from entity name to character
3210
class HTMLPurifier_EntityLookup {
3213
* Assoc array of entity name to character represented.
3218
* Sets up the entity lookup table from the serialized file contents.
3219
* @note The serialized contents are versioned, but were generated
3220
* using the maintenance script generate-entity-file.php
3221
* @warning This is not in constructor to help enforce the Singleton
3223
public function setup($file = false) {
3225
$file = HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser';
3227
$this->table = unserialize(file_get_contents($file));
3231
* Retrieves sole instance of the object.
3232
* @param Optional prototype of custom lookup table to overload with.
3234
public static function instance($prototype = false) {
3235
// no references, since PHP doesn't copy unless modified
3236
static $instance = null;
3238
$instance = $prototype;
3239
} elseif (!$instance) {
3240
$instance = new HTMLPurifier_EntityLookup();
3252
// if want to implement error collecting here, we'll need to use some sort
3253
// of global data (probably trigger_error) because it's impossible to pass
3254
// $config or $context to the callback functions.
3257
* Handles referencing and dereferencing character entities
3259
class HTMLPurifier_EntityParser
3263
* Reference to entity lookup table.
3265
protected $_entity_lookup;
3268
* Callback regex string for parsing entities.
3270
protected $_substituteEntitiesRegex =
3271
'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
3272
// 1. hex 2. dec 3. string (XML style)
3276
* Decimal to parsed string conversion table for special entities.
3278
protected $_special_dec2str =
3288
* Stripped entity names to decimal conversion table for special entities.
3290
protected $_special_ent2dec =
3299
* Substitutes non-special entities with their parsed equivalents. Since
3300
* running this whenever you have parsed character is t3h 5uck, we run
3301
* it before everything else.
3303
* @param $string String to have non-special entities parsed.
3304
* @returns Parsed string.
3306
public function substituteNonSpecialEntities($string) {
3307
// it will try to detect missing semicolons, but don't rely on it
3308
return preg_replace_callback(
3309
$this->_substituteEntitiesRegex,
3310
array($this, 'nonSpecialEntityCallback'),
3316
* Callback function for substituteNonSpecialEntities() that does the work.
3318
* @param $matches PCRE matches array, with 0 the entire match, and
3319
* either index 1, 2 or 3 set with a hex value, dec value,
3320
* or string (respectively).
3321
* @returns Replacement string.
3324
protected function nonSpecialEntityCallback($matches) {
3325
// replaces all but big five
3326
$entity = $matches[0];
3327
$is_num = (@$matches[0][1] === '#');
3329
$is_hex = (@$entity[2] === 'x');
3330
$code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
3332
// abort for special characters
3333
if (isset($this->_special_dec2str[$code])) return $entity;
3335
return HTMLPurifier_Encoder::unichr($code);
3337
if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
3338
if (!$this->_entity_lookup) {
3339
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
3341
if (isset($this->_entity_lookup->table[$matches[3]])) {
3342
return $this->_entity_lookup->table[$matches[3]];
3350
* Substitutes only special entities with their parsed equivalents.
3352
* @notice We try to avoid calling this function because otherwise, it
3353
* would have to be called a lot (for every parsed section).
3355
* @param $string String to have special entities parsed.
3356
* @returns Parsed string.
3358
public function substituteSpecialEntities($string) {
3359
return preg_replace_callback(
3360
$this->_substituteEntitiesRegex,
3361
array($this, 'specialEntityCallback'),
3366
* Callback function for substituteSpecialEntities() that does the work.
3368
* This callback has same syntax as nonSpecialEntityCallback().
3370
* @param $matches PCRE-style matches array, with 0 the entire match, and
3371
* either index 1, 2 or 3 set with a hex value, dec value,
3372
* or string (respectively).
3373
* @returns Replacement string.
3375
protected function specialEntityCallback($matches) {
3376
$entity = $matches[0];
3377
$is_num = (@$matches[0][1] === '#');
3379
$is_hex = (@$entity[2] === 'x');
3380
$int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
3381
return isset($this->_special_dec2str[$int]) ?
3382
$this->_special_dec2str[$int] :
3385
return isset($this->_special_ent2dec[$matches[3]]) ?
3386
$this->_special_ent2dec[$matches[3]] :
3398
* Error collection class that enables HTML Purifier to report HTML
3399
* problems back to the user
3401
class HTMLPurifier_ErrorCollector
3405
* Identifiers for the returned error array. These are purposely numeric
3406
* so list() can be used.
3414
protected $_current;
3415
protected $_stacks = array(array());
3417
protected $generator;
3420
protected $lines = array();
3422
public function __construct($context) {
3423
$this->locale =& $context->get('Locale');
3424
$this->context = $context;
3425
$this->_current =& $this->_stacks[0];
3426
$this->errors =& $this->_stacks[0];
3430
* Sends an error message to the collector for later use
3431
* @param $severity int Error severity, PHP error style (don't use E_USER_)
3432
* @param $msg string Error message text
3433
* @param $subst1 string First substitution for $msg
3434
* @param $subst2 string ...
3436
public function send($severity, $msg) {
3439
if (func_num_args() > 2) {
3440
$args = func_get_args();
3445
$token = $this->context->get('CurrentToken', true);
3446
$line = $token ? $token->line : $this->context->get('CurrentLine', true);
3447
$col = $token ? $token->col : $this->context->get('CurrentCol', true);
3448
$attr = $this->context->get('CurrentAttr', true);
3450
// perform special substitutions, also add custom parameters
3452
if (!is_null($token)) {
3453
$args['CurrentToken'] = $token;
3455
if (!is_null($attr)) {
3456
$subst['$CurrentAttr.Name'] = $attr;
3457
if (isset($token->attr[$attr])) $subst['$CurrentAttr.Value'] = $token->attr[$attr];
3461
$msg = $this->locale->getMessage($msg);
3463
$msg = $this->locale->formatMessage($msg, $args);
3466
if (!empty($subst)) $msg = strtr($msg, $subst);
3468
// (numerically indexed)
3470
self::LINENO => $line,
3471
self::SEVERITY => $severity,
3472
self::MESSAGE => $msg,
3473
self::CHILDREN => array()
3475
$this->_current[] = $error;
3478
// NEW CODE BELOW ...
3481
// Top-level errors are either:
3482
// TOKEN type, if $value is set appropriately, or
3483
// "syntax" type, if $value is null
3484
$new_struct = new HTMLPurifier_ErrorStruct();
3485
$new_struct->type = HTMLPurifier_ErrorStruct::TOKEN;
3486
if ($token) $new_struct->value = clone $token;
3487
if (is_int($line) && is_int($col)) {
3488
if (isset($this->lines[$line][$col])) {
3489
$struct = $this->lines[$line][$col];
3491
$struct = $this->lines[$line][$col] = $new_struct;
3493
// These ksorts may present a performance problem
3494
ksort($this->lines[$line], SORT_NUMERIC);
3496
if (isset($this->lines[-1])) {
3497
$struct = $this->lines[-1];
3499
$struct = $this->lines[-1] = $new_struct;
3502
ksort($this->lines, SORT_NUMERIC);
3504
// Now, check if we need to operate on a lower structure
3505
if (!empty($attr)) {
3506
$struct = $struct->getChild(HTMLPurifier_ErrorStruct::ATTR, $attr);
3507
if (!$struct->value) {
3508
$struct->value = array($attr, 'PUT VALUE HERE');
3511
if (!empty($cssprop)) {
3512
$struct = $struct->getChild(HTMLPurifier_ErrorStruct::CSSPROP, $cssprop);
3513
if (!$struct->value) {
3514
// if we tokenize CSS this might be a little more difficult to do
3515
$struct->value = array($cssprop, 'PUT VALUE HERE');
3519
// Ok, structs are all setup, now time to register the error
3520
$struct->addError($severity, $msg);
3524
* Retrieves raw error data for custom formatter to use
3525
* @return List of arrays in format of array(line of error,
3526
* error severity, error message,
3527
* recursive sub-errors array)
3529
public function getRaw() {
3530
return $this->errors;
3534
* Default HTML formatting implementation for error messages
3535
* @param $config Configuration array, vital for HTML output nature
3536
* @param $errors Errors array to display; used for recursion.
3538
public function getHTMLFormatted($config, $errors = null) {
3541
$this->generator = new HTMLPurifier_Generator($config, $this->context);
3542
if ($errors === null) $errors = $this->errors;
3544
// 'At line' message needs to be removed
3546
// generation code for new structure goes here. It needs to be recursive.
3547
foreach ($this->lines as $line => $col_array) {
3548
if ($line == -1) continue;
3549
foreach ($col_array as $col => $struct) {
3550
$this->_renderStruct($ret, $struct, $line, $col);
3553
if (isset($this->lines[-1])) {
3554
$this->_renderStruct($ret, $this->lines[-1]);
3557
if (empty($errors)) {
3558
return '<p>' . $this->locale->getMessage('ErrorCollector: No errors') . '</p>';
3560
return '<ul><li>' . implode('</li><li>', $ret) . '</li></ul>';
3565
private function _renderStruct(&$ret, $struct, $line = null, $col = null) {
3566
$stack = array($struct);
3567
$context_stack = array(array());
3568
while ($current = array_pop($stack)) {
3569
$context = array_pop($context_stack);
3570
foreach ($current->errors as $error) {
3571
list($severity, $msg) = $error;
3574
// W3C uses an icon to indicate the severity of the error.
3575
$error = $this->locale->getErrorName($severity);
3576
$string .= "<span class=\"error e$severity\"><strong>$error</strong></span> ";
3577
if (!is_null($line) && !is_null($col)) {
3578
$string .= "<em class=\"location\">Line $line, Column $col: </em> ";
3580
$string .= '<em class="location">End of Document: </em> ';
3582
$string .= '<strong class="description">' . $this->generator->escape($msg) . '</strong> ';
3583
$string .= '</div>';
3584
// Here, have a marker for the character on the column appropriate.
3585
// Be sure to clip extremely long lines.
3586
//$string .= '<pre>';
3588
//$string .= '</pre>';
3591
foreach ($current->children as $type => $array) {
3592
$context[] = $current;
3593
$stack = array_merge($stack, array_reverse($array, true));
3594
for ($i = count($array); $i > 0; $i--) {
3595
$context_stack[] = $context;
3608
* Records errors for particular segments of an HTML document such as tokens,
3609
* attributes or CSS properties. They can contain error structs (which apply
3610
* to components of what they represent), but their main purpose is to hold
3611
* errors applying to whatever struct is being used.
3613
class HTMLPurifier_ErrorStruct
3617
* Possible values for $children first-key. Note that top-level structures
3618
* are automatically token-level.
3625
* Type of this struct.
3630
* Value of the struct we are recording errors for. There are various
3632
* - TOKEN: Instance of HTMLPurifier_Token
3633
* - ATTR: array('attr-name', 'value')
3634
* - CSSPROP: array('prop-name', 'value')
3639
* Errors registered for this structure.
3641
public $errors = array();
3644
* Child ErrorStructs that are from this structure. For example, a TOKEN
3645
* ErrorStruct would contain ATTR ErrorStructs. This is a multi-dimensional
3646
* array in structure: [TYPE]['identifier']
3648
public $children = array();
3650
public function getChild($type, $id) {
3651
if (!isset($this->children[$type][$id])) {
3652
$this->children[$type][$id] = new HTMLPurifier_ErrorStruct();
3653
$this->children[$type][$id]->type = $type;
3655
return $this->children[$type][$id];
3658
public function addError($severity, $message) {
3659
$this->errors[] = array($severity, $message);
3669
* Global exception class for HTML Purifier; any exceptions we throw
3672
class HTMLPurifier_Exception extends Exception
3682
* Represents a pre or post processing filter on HTML Purifier's output
3684
* Sometimes, a little ad-hoc fixing of HTML has to be done before
3685
* it gets sent through HTML Purifier: you can use filters to achieve
3686
* this effect. For instance, YouTube videos can be preserved using
3687
* this manner. You could have used a decorator for this task, but
3688
* PHP's support for them is not terribly robust, so we're going
3689
* to just loop through the filters.
3691
* Filters should be exited first in, last out. If there are three filters,
3692
* named 1, 2 and 3, the order of execution should go 1->preFilter,
3693
* 2->preFilter, 3->preFilter, purify, 3->postFilter, 2->postFilter,
3696
* @note Methods are not declared abstract as it is perfectly legitimate
3697
* for an implementation not to want anything to happen on a step
3700
class HTMLPurifier_Filter
3704
* Name of the filter for identification purposes
3709
* Pre-processor function, handles HTML before HTML Purifier
3711
public function preFilter($html, $config, $context) {
3716
* Post-processor function, handles HTML after HTML Purifier
3718
public function postFilter($html, $config, $context) {
3729
* Generates HTML from tokens.
3730
* @todo Refactor interface so that configuration/context is determined
3731
* upon instantiation, no need for messy generateFromTokens() calls
3732
* @todo Make some of the more internal functions protected, and have
3733
* unit tests work around that
3735
class HTMLPurifier_Generator
3739
* Whether or not generator should produce XML output
3741
private $_xhtml = true;
3744
* :HACK: Whether or not generator should comment the insides of <script> tags
3746
private $_scriptFix = false;
3749
* Cache of HTMLDefinition during HTML output to determine whether or
3750
* not attributes should be minimized.
3755
* Cache of %Output.SortAttr
3760
* Configuration for the generator
3765
* @param $config Instance of HTMLPurifier_Config
3766
* @param $context Instance of HTMLPurifier_Context
3768
public function __construct($config, $context) {
3769
$this->config = $config;
3770
$this->_scriptFix = $config->get('Output.CommentScriptContents');
3771
$this->_sortAttr = $config->get('Output.SortAttr');
3772
$this->_def = $config->getHTMLDefinition();
3773
$this->_xhtml = $this->_def->doctype->xml;
3777
* Generates HTML from an array of tokens.
3778
* @param $tokens Array of HTMLPurifier_Token
3779
* @param $config HTMLPurifier_Config object
3780
* @return Generated HTML
3782
public function generateFromTokens($tokens) {
3783
if (!$tokens) return '';
3787
for ($i = 0, $size = count($tokens); $i < $size; $i++) {
3788
if ($this->_scriptFix && $tokens[$i]->name === 'script'
3789
&& $i + 2 < $size && $tokens[$i+2] instanceof HTMLPurifier_Token_End) {
3790
// script special case
3791
// the contents of the script block must be ONE token
3792
// for this to work.
3793
$html .= $this->generateFromToken($tokens[$i++]);
3794
$html .= $this->generateScriptFromToken($tokens[$i++]);
3796
$html .= $this->generateFromToken($tokens[$i]);
3800
if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
3802
$tidy->parseString($html, array(
3804
'output-xhtml' => $this->_xhtml,
3805
'show-body-only' => true,
3806
'indent-spaces' => 2,
3809
$tidy->cleanRepair();
3810
$html = (string) $tidy; // explicit cast necessary
3813
// Normalize newlines to system defined value
3814
$nl = $this->config->get('Output.Newline');
3815
if ($nl === null) $nl = PHP_EOL;
3816
if ($nl !== "\n") $html = str_replace("\n", $nl, $html);
3821
* Generates HTML from a single token.
3822
* @param $token HTMLPurifier_Token object.
3823
* @return Generated HTML
3825
public function generateFromToken($token) {
3826
if (!$token instanceof HTMLPurifier_Token) {
3827
trigger_error('Cannot generate HTML from non-HTMLPurifier_Token object', E_USER_WARNING);
3830
} elseif ($token instanceof HTMLPurifier_Token_Start) {
3831
$attr = $this->generateAttributes($token->attr, $token->name);
3832
return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
3834
} elseif ($token instanceof HTMLPurifier_Token_End) {
3835
return '</' . $token->name . '>';
3837
} elseif ($token instanceof HTMLPurifier_Token_Empty) {
3838
$attr = $this->generateAttributes($token->attr, $token->name);
3839
return '<' . $token->name . ($attr ? ' ' : '') . $attr .
3840
( $this->_xhtml ? ' /': '' ) // <br /> v. <br>
3843
} elseif ($token instanceof HTMLPurifier_Token_Text) {
3844
return $this->escape($token->data, ENT_NOQUOTES);
3846
} elseif ($token instanceof HTMLPurifier_Token_Comment) {
3847
return '<!--' . $token->data . '-->';
3855
* Special case processor for the contents of script tags
3856
* @warning This runs into problems if there's already a literal
3857
* --> somewhere inside the script contents.
3859
public function generateScriptFromToken($token) {
3860
if (!$token instanceof HTMLPurifier_Token_Text) return $this->generateFromToken($token);
3861
// Thanks <http://lachy.id.au/log/2005/05/script-comments>
3862
$data = preg_replace('#//\s*$#', '', $token->data);
3863
return '<!--//--><![CDATA[//><!--' . "\n" . trim($data) . "\n" . '//--><!]]>';
3867
* Generates attribute declarations from attribute array.
3868
* @note This does not include the leading or trailing space.
3869
* @param $assoc_array_of_attributes Attribute array
3870
* @param $element Name of element attributes are for, used to check
3871
* attribute minimization.
3872
* @return Generated HTML fragment for insertion.
3874
public function generateAttributes($assoc_array_of_attributes, $element = false) {
3876
if ($this->_sortAttr) ksort($assoc_array_of_attributes);
3877
foreach ($assoc_array_of_attributes as $key => $value) {
3878
if (!$this->_xhtml) {
3879
// Remove namespaced attributes
3880
if (strpos($key, ':') !== false) continue;
3881
// Check if we should minimize the attribute: val="val" -> val
3882
if ($element && !empty($this->_def->info[$element]->attr[$key]->minimized)) {
3883
$html .= $key . ' ';
3887
$html .= $key.'="'.$this->escape($value).'" ';
3889
return rtrim($html);
3893
* Escapes raw text data.
3894
* @todo This really ought to be protected, but until we have a facility
3895
* for properly generating HTML here w/o using tokens, it stays
3897
* @param $string String data to escape for HTML.
3898
* @param $quote Quoting style, like htmlspecialchars. ENT_NOQUOTES is
3899
* permissible for non-attribute output.
3900
* @return String escaped data.
3902
public function escape($string, $quote = ENT_COMPAT) {
3903
return htmlspecialchars($string, $quote, 'UTF-8');
3913
* Definition of the purified HTML that describes allowed children,
3914
* attributes, and many other things.
3918
* All member variables that are prefixed with info
3919
* (including the main $info array) are used by HTML Purifier internals
3920
* and should not be directly edited when customizing the HTMLDefinition.
3921
* They can usually be set via configuration directives or custom
3924
* On the other hand, member variables without the info prefix are used
3925
* internally by the HTMLDefinition and MUST NOT be used by other HTML
3926
* Purifier internals. Many of them, however, are public, and may be
3927
* edited by userspace code to tweak the behavior of HTMLDefinition.
3929
* @note This class is inspected by Printer_HTMLDefinition; please
3930
* update that class if things here change.
3932
* @warning Directives that change this object's structure must be in
3933
* the HTML or Attr namespace!
3935
class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
3938
// FULLY-PUBLIC VARIABLES ---------------------------------------------
3941
* Associative array of element names to HTMLPurifier_ElementDef
3943
public $info = array();
3946
* Associative array of global attribute name to attribute definition.
3948
public $info_global_attr = array();
3951
* String name of parent element HTML will be going into.
3953
public $info_parent = 'div';
3956
* Definition for parent element, allows parent element to be a
3957
* tag that's not allowed inside the HTML fragment.
3959
public $info_parent_def;
3962
* String name of element used to wrap inline elements in block context
3963
* @note This is rarely used except for BLOCKQUOTEs in strict mode
3965
public $info_block_wrapper = 'p';
3968
* Associative array of deprecated tag name to HTMLPurifier_TagTransform
3970
public $info_tag_transform = array();
3973
* Indexed list of HTMLPurifier_AttrTransform to be performed before validation.
3975
public $info_attr_transform_pre = array();
3978
* Indexed list of HTMLPurifier_AttrTransform to be performed after validation.
3980
public $info_attr_transform_post = array();
3983
* Nested lookup array of content set name (Block, Inline) to
3984
* element name to whether or not it belongs in that content set.
3986
public $info_content_sets = array();
3989
* Indexed list of HTMLPurifier_Injector to be used.
3991
public $info_injector = array();
4000
// RAW CUSTOMIZATION STUFF --------------------------------------------
4003
* Adds a custom attribute to a pre-existing element
4004
* @note This is strictly convenience, and does not have a corresponding
4005
* method in HTMLPurifier_HTMLModule
4006
* @param $element_name String element name to add attribute to
4007
* @param $attr_name String name of attribute
4008
* @param $def Attribute definition, can be string or object, see
4009
* HTMLPurifier_AttrTypes for details
4011
public function addAttribute($element_name, $attr_name, $def) {
4012
$module = $this->getAnonymousModule();
4013
if (!isset($module->info[$element_name])) {
4014
$element = $module->addBlankElement($element_name);
4016
$element = $module->info[$element_name];
4018
$element->attr[$attr_name] = $def;
4022
* Adds a custom element to your HTML definition
4023
* @note See HTMLPurifier_HTMLModule::addElement for detailed
4024
* parameter and return value descriptions.
4026
public function addElement($element_name, $type, $contents, $attr_collections, $attributes = array()) {
4027
$module = $this->getAnonymousModule();
4028
// assume that if the user is calling this, the element
4029
// is safe. This may not be a good idea
4030
$element = $module->addElement($element_name, $type, $contents, $attr_collections, $attributes);
4035
* Adds a blank element to your HTML definition, for overriding
4037
* @note See HTMLPurifier_HTMLModule::addBlankElement for detailed
4038
* parameter and return value descriptions.
4040
public function addBlankElement($element_name) {
4041
$module = $this->getAnonymousModule();
4042
$element = $module->addBlankElement($element_name);
4047
* Retrieves a reference to the anonymous module, so you can
4048
* bust out advanced features without having to make your own
4051
public function getAnonymousModule() {
4052
if (!$this->_anonModule) {
4053
$this->_anonModule = new HTMLPurifier_HTMLModule();
4054
$this->_anonModule->name = 'Anonymous';
4056
return $this->_anonModule;
4059
private $_anonModule;
4062
// PUBLIC BUT INTERNAL VARIABLES --------------------------------------
4064
public $type = 'HTML';
4065
public $manager; /**< Instance of HTMLPurifier_HTMLModuleManager */
4068
* Performs low-cost, preliminary initialization.
4070
public function __construct() {
4071
$this->manager = new HTMLPurifier_HTMLModuleManager();
4074
protected function doSetup($config) {
4075
$this->processModules($config);
4076
$this->setupConfigStuff($config);
4077
unset($this->manager);
4079
// cleanup some of the element definitions
4080
foreach ($this->info as $k => $v) {
4081
unset($this->info[$k]->content_model);
4082
unset($this->info[$k]->content_model_type);
4087
* Extract out the information from the manager
4089
protected function processModules($config) {
4091
if ($this->_anonModule) {
4092
// for user specific changes
4093
// this is late-loaded so we don't have to deal with PHP4
4094
// reference wonky-ness
4095
$this->manager->addModule($this->_anonModule);
4096
unset($this->_anonModule);
4099
$this->manager->setup($config);
4100
$this->doctype = $this->manager->doctype;
4102
foreach ($this->manager->modules as $module) {
4103
foreach($module->info_tag_transform as $k => $v) {
4104
if ($v === false) unset($this->info_tag_transform[$k]);
4105
else $this->info_tag_transform[$k] = $v;
4107
foreach($module->info_attr_transform_pre as $k => $v) {
4108
if ($v === false) unset($this->info_attr_transform_pre[$k]);
4109
else $this->info_attr_transform_pre[$k] = $v;
4111
foreach($module->info_attr_transform_post as $k => $v) {
4112
if ($v === false) unset($this->info_attr_transform_post[$k]);
4113
else $this->info_attr_transform_post[$k] = $v;
4115
foreach ($module->info_injector as $k => $v) {
4116
if ($v === false) unset($this->info_injector[$k]);
4117
else $this->info_injector[$k] = $v;
4121
$this->info = $this->manager->getElements();
4122
$this->info_content_sets = $this->manager->contentSets->lookup;
4127
* Sets up stuff based on config. We need a better way of doing this.
4129
protected function setupConfigStuff($config) {
4131
$block_wrapper = $config->get('HTML.BlockWrapper');
4132
if (isset($this->info_content_sets['Block'][$block_wrapper])) {
4133
$this->info_block_wrapper = $block_wrapper;
4135
trigger_error('Cannot use non-block element as block wrapper',
4139
$parent = $config->get('HTML.Parent');
4140
$def = $this->manager->getElement($parent, true);
4142
$this->info_parent = $parent;
4143
$this->info_parent_def = $def;
4145
trigger_error('Cannot use unrecognized element as parent',
4147
$this->info_parent_def = $this->manager->getElement($this->info_parent, true);
4150
// support template text
4151
$support = "(for information on implementing this, see the ".
4154
// setup allowed elements -----------------------------------------
4156
$allowed_elements = $config->get('HTML.AllowedElements');
4157
$allowed_attributes = $config->get('HTML.AllowedAttributes'); // retrieve early
4159
if (!is_array($allowed_elements) && !is_array($allowed_attributes)) {
4160
$allowed = $config->get('HTML.Allowed');
4161
if (is_string($allowed)) {
4162
list($allowed_elements, $allowed_attributes) = $this->parseTinyMCEAllowedList($allowed);
4166
if (is_array($allowed_elements)) {
4167
foreach ($this->info as $name => $d) {
4168
if(!isset($allowed_elements[$name])) unset($this->info[$name]);
4169
unset($allowed_elements[$name]);
4172
foreach ($allowed_elements as $element => $d) {
4173
$element = htmlspecialchars($element); // PHP doesn't escape errors, be careful!
4174
trigger_error("Element '$element' is not supported $support", E_USER_WARNING);
4178
// setup allowed attributes ---------------------------------------
4180
$allowed_attributes_mutable = $allowed_attributes; // by copy!
4181
if (is_array($allowed_attributes)) {
4183
// This actually doesn't do anything, since we went away from
4184
// global attributes. It's possible that userland code uses
4185
// it, but HTMLModuleManager doesn't!
4186
foreach ($this->info_global_attr as $attr => $x) {
4187
$keys = array($attr, "*@$attr", "*.$attr");
4189
foreach ($keys as $key) {
4190
if ($delete && isset($allowed_attributes[$key])) {
4193
if (isset($allowed_attributes_mutable[$key])) {
4194
unset($allowed_attributes_mutable[$key]);
4197
if ($delete) unset($this->info_global_attr[$attr]);
4200
foreach ($this->info as $tag => $info) {
4201
foreach ($info->attr as $attr => $x) {
4202
$keys = array("$tag@$attr", $attr, "*@$attr", "$tag.$attr", "*.$attr");
4204
foreach ($keys as $key) {
4205
if ($delete && isset($allowed_attributes[$key])) {
4208
if (isset($allowed_attributes_mutable[$key])) {
4209
unset($allowed_attributes_mutable[$key]);
4212
if ($delete) unset($this->info[$tag]->attr[$attr]);
4216
foreach ($allowed_attributes_mutable as $elattr => $d) {
4217
$bits = preg_split('/[.@]/', $elattr, 2);
4221
if ($bits[0] !== '*') {
4222
$element = htmlspecialchars($bits[0]);
4223
$attribute = htmlspecialchars($bits[1]);
4224
if (!isset($this->info[$element])) {
4225
trigger_error("Cannot allow attribute '$attribute' if element '$element' is not allowed/supported $support");
4227
trigger_error("Attribute '$attribute' in element '$element' not supported $support",
4232
// otherwise fall through
4234
$attribute = htmlspecialchars($bits[0]);
4235
trigger_error("Global attribute '$attribute' is not ".
4236
"supported in any elements $support",
4244
// setup forbidden elements ---------------------------------------
4246
$forbidden_elements = $config->get('HTML.ForbiddenElements');
4247
$forbidden_attributes = $config->get('HTML.ForbiddenAttributes');
4249
foreach ($this->info as $tag => $info) {
4250
if (isset($forbidden_elements[$tag])) {
4251
unset($this->info[$tag]);
4254
foreach ($info->attr as $attr => $x) {
4256
isset($forbidden_attributes["$tag@$attr"]) ||
4257
isset($forbidden_attributes["*@$attr"]) ||
4258
isset($forbidden_attributes[$attr])
4260
unset($this->info[$tag]->attr[$attr]);
4262
} // this segment might get removed eventually
4263
elseif (isset($forbidden_attributes["$tag.$attr"])) {
4264
// $tag.$attr are not user supplied, so no worries!
4265
trigger_error("Error with $tag.$attr: tag.attr syntax not supported for HTML.ForbiddenAttributes; use tag@attr instead", E_USER_WARNING);
4269
foreach ($forbidden_attributes as $key => $v) {
4270
if (strlen($key) < 2) continue;
4271
if ($key[0] != '*') continue;
4272
if ($key[1] == '.') {
4273
trigger_error("Error with $key: *.attr syntax not supported for HTML.ForbiddenAttributes; use attr instead", E_USER_WARNING);
4277
// setup injectors -----------------------------------------------------
4278
foreach ($this->info_injector as $i => $injector) {
4279
if ($injector->checkNeeded($config) !== false) {
4280
// remove injector that does not have its required
4281
// elements/attributes present, and is thus not needed.
4282
unset($this->info_injector[$i]);
4288
* Parses a TinyMCE-flavored Allowed Elements and Attributes list into
4289
* separate lists for processing. Format is element[attr1|attr2],element2...
4290
* @warning Although it's largely drawn from TinyMCE's implementation,
4291
* it is different, and you'll probably have to modify your lists
4292
* @param $list String list to parse
4293
* @return array($allowed_elements, $allowed_attributes)
4294
* @todo Give this its own class, probably static interface
4296
public function parseTinyMCEAllowedList($list) {
4298
$list = str_replace(array(' ', "\t"), '', $list);
4300
$elements = array();
4301
$attributes = array();
4303
$chunks = preg_split('/(,|[\n\r]+)/', $list);
4304
foreach ($chunks as $chunk) {
4305
if (empty($chunk)) continue;
4306
// remove TinyMCE element control characters
4307
if (!strpos($chunk, '[')) {
4311
list($element, $attr) = explode('[', $chunk);
4313
if ($element !== '*') $elements[$element] = true;
4314
if (!$attr) continue;
4315
$attr = substr($attr, 0, strlen($attr) - 1); // remove trailing ]
4316
$attr = explode('|', $attr);
4317
foreach ($attr as $key) {
4318
$attributes["$element.$key"] = true;
4322
return array($elements, $attributes);
4334
* Represents an XHTML 1.1 module, with information on elements, tags
4336
* @note Even though this is technically XHTML 1.1, it is also used for
4337
* regular HTML parsing. We are using modularization as a convenient
4338
* way to represent the internals of HTMLDefinition, and our
4339
* implementation is by no means conforming and does not directly
4340
* use the normative DTDs or XML schemas.
4341
* @note The public variables in a module should almost directly
4342
* correspond to the variables in HTMLPurifier_HTMLDefinition.
4343
* However, the prefix info carries no special meaning in these
4344
* objects (include it anyway if that's the correspondence though).
4345
* @todo Consider making some member functions protected
4348
class HTMLPurifier_HTMLModule
4351
// -- Overloadable ----------------------------------------------------
4354
* Short unique string identifier of the module
4359
* Informally, a list of elements this module changes. Not used in
4360
* any significant way.
4362
public $elements = array();
4365
* Associative array of element names to element definitions.
4366
* Some definitions may be incomplete, to be merged in later
4367
* with the full definition.
4369
public $info = array();
4372
* Associative array of content set names to content set additions.
4373
* This is commonly used to, say, add an A element to the Inline
4374
* content set. This corresponds to an internal variable $content_sets
4375
* and NOT info_content_sets member variable of HTMLDefinition.
4377
public $content_sets = array();
4380
* Associative array of attribute collection names to attribute
4381
* collection additions. More rarely used for adding attributes to
4382
* the global collections. Example is the StyleAttribute module adding
4383
* the style attribute to the Core. Corresponds to HTMLDefinition's
4384
* attr_collections->info, since the object's data is only info,
4385
* with extra behavior associated with it.
4387
public $attr_collections = array();
4390
* Associative array of deprecated tag name to HTMLPurifier_TagTransform
4392
public $info_tag_transform = array();
4395
* List of HTMLPurifier_AttrTransform to be performed before validation.
4397
public $info_attr_transform_pre = array();
4400
* List of HTMLPurifier_AttrTransform to be performed after validation.
4402
public $info_attr_transform_post = array();
4405
* List of HTMLPurifier_Injector to be performed during well-formedness fixing.
4406
* An injector will only be invoked if all of its prerequisites are met;
4407
* if an injector fails setup, there will be no error; it will simply be
4408
* silently disabled.
4410
public $info_injector = array();
4413
* Boolean flag that indicates whether or not getChildDef is implemented.
4414
* For optimization reasons: may save a call to a function. Be sure
4415
* to set it if you do implement getChildDef(), otherwise it will have
4418
public $defines_child_def = false;
4421
* Boolean flag whether or not this module is safe. If it is not safe, all
4422
* of its members are unsafe. Modules are safe by default (this might be
4423
* slightly dangerous, but it doesn't make much sense to force HTML Purifier,
4424
* which is based off of safe HTML, to explicitly say, "This is safe," even
4425
* though there are modules which are "unsafe")
4427
* @note Previously, safety could be applied at an element level granularity.
4428
* We've removed this ability, so in order to add "unsafe" elements
4429
* or attributes, a dedicated module with this property set to false
4432
public $safe = true;
4435
* Retrieves a proper HTMLPurifier_ChildDef subclass based on
4436
* content_model and content_model_type member variables of
4437
* the HTMLPurifier_ElementDef class. There is a similar function
4438
* in HTMLPurifier_HTMLDefinition.
4439
* @param $def HTMLPurifier_ElementDef instance
4440
* @return HTMLPurifier_ChildDef subclass
4442
public function getChildDef($def) {
    // The base module supplies no custom child definition.
    return false;
}
4444
// -- Convenience -----------------------------------------------------
4447
* Convenience function that sets up a new element
4448
* @param $element Name of element to add
4449
* @param $type What content set should element be registered to?
4450
* Set as false to skip this step.
4451
* @param $contents Allowed children in form of:
4452
* "$content_model_type: $content_model"
4453
* @param $attr_includes What attribute collections to register to
4455
* @param $attr What unique attributes does the element define?
4456
* @note See ElementDef for in-depth descriptions of these parameters.
4457
* @return Created element definition object, so you
4458
* can set advanced parameters
4460
public function addElement($element, $type, $contents, $attr_includes = array(), $attr = array()) {
4461
$this->elements[] = $element;
4462
// parse content_model
4463
list($content_model_type, $content_model) = $this->parseContents($contents);
4464
// merge in attribute inclusions
4465
$this->mergeInAttrIncludes($attr, $attr_includes);
4466
// add element to content sets
4467
if ($type) $this->addElementToContentSet($element, $type);
4469
$this->info[$element] = HTMLPurifier_ElementDef::create(
4470
$content_model, $content_model_type, $attr
4472
// literal object $contents means direct child manipulation
4473
if (!is_string($contents)) $this->info[$element]->child = $contents;
4474
return $this->info[$element];
4478
* Convenience function that creates a totally blank, non-standalone
4480
* @param $element Name of element to create
4481
* @return Created element
4483
public function addBlankElement($element) {
4484
if (!isset($this->info[$element])) {
4485
$this->elements[] = $element;
4486
$this->info[$element] = new HTMLPurifier_ElementDef();
4487
$this->info[$element]->standalone = false;
4489
trigger_error("Definition for $element already exists in module, cannot redefine");
4491
return $this->info[$element];
4495
* Convenience function that registers an element to a content set
4496
* @param $element Element to register
4497
* @param $type Name of content set (warning: case sensitive, usually upper-case
4500
public function addElementToContentSet($element, $type) {
4501
if (!isset($this->content_sets[$type])) $this->content_sets[$type] = '';
4502
else $this->content_sets[$type] .= ' | ';
4503
$this->content_sets[$type] .= $element;
4507
* Convenience function that transforms single-string contents
4508
* into separate content model and content model type
4509
* @param $contents Allowed children in form of:
4510
* "$content_model_type: $content_model"
4511
* @note If contents is an object, an array of two nulls will be
4512
* returned, and the caller needs to take the original $contents
4513
* and use it directly.
4515
public function parseContents($contents) {
4516
if (!is_string($contents)) return array(null, null); // defer
4517
switch ($contents) {
4518
// check for shorthand content model forms
4520
return array('empty', '');
4522
return array('optional', 'Inline | #PCDATA');
4524
return array('optional', 'Flow | #PCDATA');
4526
list($content_model_type, $content_model) = explode(':', $contents);
4527
$content_model_type = strtolower(trim($content_model_type));
4528
$content_model = trim($content_model);
4529
return array($content_model_type, $content_model);
4533
* Convenience function that merges a list of attribute includes into
4534
* an attribute array.
4535
* @param $attr Reference to attr array to modify
4536
* @param $attr_includes Array of includes / string include to merge in
4538
public function mergeInAttrIncludes(&$attr, $attr_includes) {
    // Normalise the includes to an array: arrays pass through, empty
    // values become an empty list, anything else becomes a one-item list.
    if (is_array($attr_includes)) {
        $includes = $attr_includes;
    } elseif (empty($attr_includes)) {
        $includes = array();
    } else {
        $includes = array($attr_includes);
    }
    // The includes always live under index 0 of the attribute array.
    $attr[0] = $includes;
}
4547
* Convenience function that generates a lookup table with boolean
4549
* @param $list List of values to turn into a lookup
4550
* @note You can also pass an arbitrary number of arguments in
4551
* place of the regular argument
4552
* @return Lookup array equivalent of list
4554
public function makeLookup($list) {
    // A string first argument means the values were passed variadically.
    if (is_string($list)) {
        $values = func_get_args();
    } else {
        $values = $list;
    }

    $lookup = array();
    foreach ($values as $entry) {
        // null entries are skipped rather than turned into a key
        if (!is_null($entry)) {
            $lookup[$entry] = true;
        }
    }
    return $lookup;
}
4565
* Lazy load construction of the module after determining whether
4566
* or not it's needed, and also when a finalized configuration object
4568
* @param $config Instance of HTMLPurifier_Config
4570
// Intentionally empty hook taking the configuration object; the base
// implementation does nothing.
public function setup($config) {}
4578
// Keeps the registries of doctypes and HTML modules, activates the modules
// for the configured doctype, and hands out merged element definitions.
class HTMLPurifier_HTMLModuleManager
4582
// NOTE(review): the declarations belonging to the next three descriptions
// are not visible in this chunk; the methods use $this->doctypes,
// $this->doctype and $this->attrTypes.
* Instance of HTMLPurifier_DoctypeRegistry
4587
* Instance of current doctype
4592
* Instance of HTMLPurifier_AttrTypes
4597
* Active instances of modules for the specified doctype are
4598
* indexed, by name, in this array.
4600
public $modules = array();
4603
* Array of recognized HTMLPurifier_Module instances, indexed by
4604
* module's class name. This array is usually lazy loaded, but a
4605
* user can overload a module by pre-emptively registering it.
4607
// Note: registerModule() stores each entry under $module->name.
public $registeredModules = array();
4610
* List of extra modules that were added by the user using addModule().
4611
* These get unconditionally merged into the current doctype, whatever
4614
public $userModules = array();
4617
* Associative array of element name to list of modules that have
4618
* definitions for the element; this array is dynamically filled.
4620
// Filled by setup(); read by getElement().
public $elementLookup = array();
4622
/** List of prefixes we should use for registering small names */
4623
public $prefixes = array('HTMLPurifier_HTMLModule_');
4625
public $contentSets; /**< Instance of HTMLPurifier_ContentSets */
4626
public $attrCollections; /**< Instance of HTMLPurifier_AttrCollections */
4628
/** If set to true, unsafe elements and attributes will be allowed */
4629
// Overwritten in setup() from the 'HTML.Trusted' configuration value.
public $trusted = false;
4631
// Creates the attribute-type and doctype registries, then registers the
// built-in doctypes together with their module lists, tidy-module lists and
// DTD identifiers.
public function __construct() {
4633
// editable internal objects
4634
$this->attrTypes = new HTMLPurifier_AttrTypes();
4635
$this->doctypes = new HTMLPurifier_DoctypeRegistry();
4637
// setup basic modules
// NOTE(review): the line opening the $common array and some of its shorter
// entries are not visible in this chunk.
4639
'CommonAttributes', 'Text', 'Hypertext', 'List',
4640
'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
4643
'Scripting', 'Object', 'Forms',
4644
// Sorta legacy, but present in strict:
4647
$transitional = array('Legacy', 'Target');
4648
$xml = array('XMLCommonAttributes');
4649
$non_xml = array('NonXMLCommonAttributes');
4651
// setup basic doctypes
// Each register() call below passes: doctype name, a boolean flag, the module
// list, the tidy-module list, and the DTD public/system identifiers.
// NOTE(review): one short argument line per call is missing from this chunk.
4652
$this->doctypes->register(
4653
'HTML 4.01 Transitional', false,
4654
array_merge($common, $transitional, $non_xml),
4655
array('Tidy_Transitional', 'Tidy_Proprietary'),
4657
'-//W3C//DTD HTML 4.01 Transitional//EN',
4658
'http://www.w3.org/TR/html4/loose.dtd'
4661
$this->doctypes->register(
4662
'HTML 4.01 Strict', false,
4663
array_merge($common, $non_xml),
4664
array('Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
4666
'-//W3C//DTD HTML 4.01//EN',
4667
'http://www.w3.org/TR/html4/strict.dtd'
4670
$this->doctypes->register(
4671
'XHTML 1.0 Transitional', true,
4672
array_merge($common, $transitional, $xml, $non_xml),
4673
array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Name'),
4675
'-//W3C//DTD XHTML 1.0 Transitional//EN',
4676
'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
4679
$this->doctypes->register(
4680
'XHTML 1.0 Strict', true,
4681
array_merge($common, $xml, $non_xml),
// 'Tidy_Strict' appears twice in the tidy-module list below.
4682
array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
4684
'-//W3C//DTD XHTML 1.0 Strict//EN',
4685
'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
4688
// NOTE(review): the name/flag line of this last registration is not visible;
// the DTD identifiers below are those of XHTML 1.1. This doctype merges in
// 'Ruby' and omits $non_xml. 'Tidy_Strict' is again listed twice.
$this->doctypes->register(
4690
array_merge($common, $xml, array('Ruby')),
4691
array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict', 'Tidy_Name'), // Tidy_XHTML1_1
4693
'-//W3C//DTD XHTML 1.1//EN',
4694
'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
4700
* Registers a module to the recognized module list, useful for
4701
* overloading pre-existing modules.
4702
* @param $module Mixed: string module name, with or without
4703
* HTMLPurifier_HTMLModule prefix, or instance of
4704
* subclass of HTMLPurifier_HTMLModule.
4705
* @param $overload Boolean whether or not to overload previous modules.
4706
* If this is not set, and you do overload a module,
4707
* HTML Purifier will complain with a warning.
4708
* @note This function will not call autoload, you must instantiate
4709
* (and thus invoke) autoload outside the method.
4710
* @note If a string is passed as a module name, different variants
4711
* will be tested in this order:
4712
* - Check for HTMLPurifier_HTMLModule_$name
4713
* - Check all prefixes with $name in order they were added
4714
* - Check for literal object name
4715
* - Throw fatal error
4716
* If your object name collides with an internal class, specify
4717
* your module manually. All modules must have been included
4718
* externally: registerModule will not perform inclusions for you!
4720
public function registerModule($module, $overload = false) {
    if (is_string($module)) {
        // Resolve the given name to a class: every registered prefix is
        // tried in order, then the literal name itself.
        $requested = $module;
        $class = null;
        foreach ($this->prefixes as $prefix) {
            if (class_exists($prefix . $requested)) {
                $class = $prefix . $requested;
                break;
            }
        }
        if ($class === null) {
            if (!class_exists($requested)) {
                trigger_error($requested . ' module does not exist',
                    E_USER_ERROR);
                return;
            }
            $class = $requested;
        }
        $module = new $class();
    }

    // Modules are indexed by name, so a nameless one cannot be stored.
    if (empty($module->name)) {
        trigger_error('Module instance of ' . get_class($module) . ' must have name');
        return;
    }

    // Replacing an existing registration is allowed, but only silently
    // when the caller asked for it.
    $replacing = isset($this->registeredModules[$module->name]);
    if ($replacing && !$overload) {
        trigger_error('Overloading ' . $module->name . ' without explicit overload parameter', E_USER_WARNING);
    }
    $this->registeredModules[$module->name] = $module;
}
4753
* Adds a module to the current doctype by first registering it,
4754
* and then tacking it on to the active doctype
4756
public function addModule($module) {
    $this->registerModule($module);
    // The user list stores names only, so an instance is reduced to its name.
    $this->userModules[] = is_object($module) ? $module->name : $module;
}
4763
* Adds a class prefix that registerModule() will use to resolve a
4764
* string name to a concrete class
4766
public function addPrefix($prefix) {
    // Appended last, so earlier prefixes keep priority in registerModule().
    array_push($this->prefixes, $prefix);
}
4771
* Performs processing on modules, after being called you may
4772
* use getElement() and getElements()
4773
* @param $config Instance of HTMLPurifier_Config
4775
// Builds the active module set for the configured doctype, instantiates the
// modules' injectors, fills the element lookup table and creates the content
// set and attribute collection helpers.
public function setup($config) {
4777
$this->trusted = $config->get('HTML.Trusted');
4780
// The doctype object supplies the default module list.
$this->doctype = $this->doctypes->make($config);
4781
$modules = $this->doctype->modules;
4783
// take out the default modules that aren't allowed
4784
$lookup = $config->get('HTML.AllowedModules');
4785
$special_cases = $config->get('HTML.CoreModules');
4787
// Filtering only happens when an allow-list array is configured; modules
// named in HTML.CoreModules are never removed.
if (is_array($lookup)) {
4788
foreach ($modules as $k => $m) {
4789
if (isset($special_cases[$m])) continue;
4790
if (!isset($lookup[$m])) unset($modules[$k]);
4794
// add proprietary module (this gets special treatment because
4795
// it is completely removed from doctypes, etc.)
4796
if ($config->get('HTML.Proprietary')) {
4797
$modules[] = 'Proprietary';
4800
// add SafeObject/Safeembed modules
4801
if ($config->get('HTML.SafeObject')) {
4802
$modules[] = 'SafeObject';
4804
if ($config->get('HTML.SafeEmbed')) {
4805
$modules[] = 'SafeEmbed';
4808
// merge in custom modules
4809
$modules = array_merge($modules, $this->userModules);
4811
// Activate every selected module and run its own setup hook.
foreach ($modules as $module) {
4812
$this->processModule($module);
4813
$this->modules[$module]->setup($config);
4816
// The doctype's tidy modules are activated the same way and are not subject
// to the allow-list filtering above.
foreach ($this->doctype->tidyModules as $module) {
4817
$this->processModule($module);
4818
$this->modules[$module]->setup($config);
4821
// prepare any injectors
4822
foreach ($this->modules as $module) {
4824
// NOTE(review): the initialisation of $n is not visible in this chunk.
foreach ($module->info_injector as $i => $injector) {
4825
// A non-object entry is treated as a short name and instantiated.
if (!is_object($injector)) {
4826
$class = "HTMLPurifier_Injector_$injector";
4827
$injector = new $class;
4829
// Injectors end up keyed by their own name.
$n[$injector->name] = $injector;
4831
$module->info_injector = $n;
4834
// setup lookup table based on all valid modules
4835
foreach ($this->modules as $module) {
4836
foreach ($module->info as $name => $def) {
4837
if (!isset($this->elementLookup[$name])) {
4838
$this->elementLookup[$name] = array();
4840
$this->elementLookup[$name][] = $module->name;
4844
// note the different choice
// NOTE(review): the constructor arguments of the two objects created below
// are not visible in this chunk.
4845
$this->contentSets = new HTMLPurifier_ContentSets(
4846
// content set assembly deals with all possible modules,
4847
// not just ones deemed to be "safe"
4850
$this->attrCollections = new HTMLPurifier_AttrCollections(
4852
// there is no way to directly disable a global attribute,
4853
// but using AllowedAttributes or simply not including
4854
// the module in your custom doctype should be sufficient
4860
* Takes a module and adds it to the active module collection,
4861
* registering it if necessary.
4863
public function processModule($module) {
    // Registration is skipped only for a name that is already on file;
    // unknown names and module instances are registered first.
    $already_registered = isset($this->registeredModules[$module]) && !is_object($module);
    if (!$already_registered) {
        $this->registerModule($module);
    }
    $this->modules[$module] = $this->registeredModules[$module];
}
4871
* Retrieves merged element definitions.
4872
* @return Array of HTMLPurifier_ElementDef
4874
public function getElements() {
    // First pass: ask getElement() once for every element name defined by
    // a module that may be used in the current trust mode.
    $candidates = array();
    foreach ($this->modules as $module) {
        if ($this->trusted || $module->safe) {
            foreach ($module->info as $name => $unused) {
                if (!isset($candidates[$name])) {
                    $candidates[$name] = $this->getElement($name);
                }
            }
        }
    }

    // Second pass: leave out the duds, i.e. names for which getElement()
    // answered false.
    $elements = array();
    foreach ($candidates as $name => $definition) {
        if ($definition !== false) {
            $elements[$name] = $definition;
        }
    }
    return $elements;
}
4896
* Retrieves a single merged element definition
4897
* @param $name Name of element
4898
* @param $trusted Boolean trusted overriding parameter: set to true
4899
* if you want the full version of an element
4900
* @return Merged HTMLPurifier_ElementDef
4901
* @note You may notice that modules are getting iterated over twice (once
4902
* in getElements() and once here). This
4905
// Merges the definitions that the active modules hold for $name into one
// element definition. NOTE(review): several short lines of this method
// (early returns, the initial value of $def, the branch bodies and the final
// return) are not visible in this chunk; the notes below describe only what
// is shown.
public function getElement($name, $trusted = null) {
4907
// Names that no active module defines have no entry in the lookup table.
if (!isset($this->elementLookup[$name])) {
4911
// setup global state variables
4913
// A null $trusted means "use the manager-wide setting".
if ($trusted === null) $trusted = $this->trusted;
4915
// iterate through each module that has registered itself to this
4917
foreach($this->elementLookup[$name] as $module_name) {
4919
$module = $this->modules[$module_name];
4921
// refuse to create/merge from a module that is deemed unsafe--
4922
// pretend the module doesn't exist--when trusted mode is not on.
4923
if (!$trusted && !$module->safe) {
4927
// clone is used because, ideally speaking, the original
4928
// definition should not be modified. Usually, this will
4929
// make no difference, but for consistency's sake
4930
$new_def = clone $module->info[$name];
4932
// Case: nothing accumulated yet and this definition is standalone.
if (!$def && $new_def->standalone) {
4935
// This will occur even if $new_def is standalone. In practice,
4936
// this will usually result in a full replacement.
4937
$def->mergeIn($new_def);
4940
// non-standalone definitions that don't have a standalone
4941
// to merge into could be deferred to the end
4945
// attribute value expansions
4946
$this->attrCollections->performInclusions($def->attr);
4947
$this->attrCollections->expandIdentifiers($def->attr, $this->attrTypes);
4949
// descendants_are_inline, for ChildDef_Chameleon
4950
// Applies when the content model string mentions 'Inline', except for the
// del and ins elements.
if (is_string($def->content_model) &&
4951
strpos($def->content_model, 'Inline') !== false) {
4952
if ($name != 'del' && $name != 'ins') {
4953
// this is for you, ins/del
4954
$def->descendants_are_inline = true;
4958
$this->contentSets->generateChildDef($def, $module);
4961
// This can occur if there is a blank definition, but no base to
4963
if (!$def) return false;
4965
// add information on required attributes
4966
foreach ($def->attr as $attr_name => $attr_def) {
4967
if ($attr_def->required) {
4968
$def->required_attr[] = $attr_name;
4983
* Component of HTMLPurifier_AttrContext that accumulates IDs to prevent dupes
4984
* @note In Slashdot-speak, dupe means duplicate.
4985
* @note The default constructor does not accept $config or $context objects:
4986
* you must use the static build() factory method to perform initialization.
4988
class HTMLPurifier_IDAccumulator
{

    /**
     * Lookup table of the IDs seen so far: ID => true.
     */
    public $ids = array();

    /**
     * Factory: creates an accumulator pre-loaded with the configured
     * ID blacklist.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return Fully initialized HTMLPurifier_IDAccumulator
     */
    public static function build($config, $context) {
        $blacklist = $config->get('Attr.IDBlacklist');
        $accumulator = new HTMLPurifier_IDAccumulator();
        $accumulator->load($blacklist);
        return $accumulator;
    }

    /**
     * Records an ID unless it has been seen before.
     * @param $id ID to be added.
     * @return Bool status, true if success, false if there's a dupe
     */
    public function add($id) {
        if (isset($this->ids[$id])) {
            return false;
        }
        $this->ids[$id] = true;
        return true;
    }

    /**
     * Marks every ID of a list as seen.
     * @param $array_of_ids Array of IDs to load
     * @note Duplicates are not reported, they are simply overwritten
     */
    public function load($array_of_ids) {
        foreach ($array_of_ids as $seen) {
            $this->ids[$seen] = true;
        }
    }

}
5037
* Injects tokens into the document while parsing for well-formedness.
5038
* This enables "formatter-like" functionality such as auto-paragraphing,
5039
* smiley-ification and linkification to take place.
5041
* A note on how handlers create changes; this is done by assigning a new
5042
* value to the $token reference. These values can take a variety of forms and
5043
* are best described in HTMLPurifier_Strategy_MakeWellFormed->processToken()
5046
* @todo Allow injectors to request a re-run on their output. This
5047
* would help if an operation is recursive.
5049
abstract class HTMLPurifier_Injector
{

    /**
     * Advisory name of injector, this is for friendly error messages
     */
    public $name;

    /**
     * Instance of HTMLPurifier_HTMLDefinition
     */
    protected $htmlDefinition;

    /**
     * Reference to CurrentNesting variable in Context. This is an array
     * list of tokens that we are currently "inside"
     */
    protected $currentNesting;

    /**
     * Reference to InputTokens variable in Context. This is an array
     * list of the input tokens that are being processed.
     */
    protected $inputTokens;

    /**
     * Reference to InputIndex variable in Context. This is an integer
     * array index for $this->inputTokens that indicates what token
     * is currently being processed.
     */
    protected $inputIndex;

    /**
     * Array of elements and attributes this injector creates and therefore
     * need to be allowed by the definition. Takes form of
     * array('element' => array('attr', 'attr2'), 'element2')
     */
    public $needed = array();

    /**
     * Index of inputTokens to rewind to, or false when no rewind is pending.
     */
    protected $rewind = false;

    /**
     * Requests a rewind to an earlier input position so that processing is
     * re-performed from there. Use with care: a careless rewind can loop
     * forever.
     * @param $index Index of inputTokens to rewind to
     */
    public function rewind($index) {
        $this->rewind = $index;
    }

    /**
     * Hands out the pending rewind index and clears it.
     * @return Pending index, or false if none was requested
     */
    public function getRewind() {
        $pending = $this->rewind;
        $this->rewind = false;
        return $pending;
    }

    /**
     * Gives the injector its working environment: the HTML definition and
     * references to the context's nesting, token and index variables. The
     * references are only bound when checkNeeded() reports no problem.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return Boolean false if success, string of missing needed element/attribute if failure
     */
    public function prepare($config, $context) {
        $this->htmlDefinition = $config->getHTMLDefinition();
        // The definition is stored before the check because some unit tests
        // ignore a failure here and go on using the injector.
        $missing = $this->checkNeeded($config);
        if ($missing !== false) {
            return $missing;
        }
        $this->currentNesting =& $context->get('CurrentNesting');
        $this->inputTokens    =& $context->get('InputTokens');
        $this->inputIndex     =& $context->get('InputIndex');
        return false;
    }

    /**
     * Verifies that everything listed in $needed is allowed by the HTML
     * definition.
     * @param $config Instance of HTMLPurifier_Config
     * @return Boolean false if success, string of missing needed element/attribute if failure
     */
    public function checkNeeded($config) {
        $definition = $config->getHTMLDefinition();
        foreach ($this->needed as $key => $attributes) {
            // list-style entries carry the element name as their value
            $element = is_int($key) ? $attributes : $key;
            if (!isset($definition->info[$element])) {
                return $element;
            }
            if (is_array($attributes)) {
                foreach ($attributes as $attribute) {
                    if (!isset($definition->info[$element]->attr[$attribute])) {
                        return "$element.$attribute";
                    }
                }
            }
        }
        return false;
    }

    /**
     * Tests if the context node allows a certain element
     * @param $name Name of element to test for
     * @return True if element is allowed, false if it is not
     */
    public function allowsElement($name) {
        if (empty($this->currentNesting)) {
            // not inside anything: the document's parent definition decides
            $parent = $this->htmlDefinition->info_parent_def;
        } else {
            // peek at the innermost open token by popping and re-pushing it
            $innermost = array_pop($this->currentNesting);
            array_push($this->currentNesting, $innermost);
            $parent = $this->htmlDefinition->info[$innermost->name];
        }

        $accepted = isset($parent->child->elements[$name]) && !isset($parent->excludes[$name]);
        if (!$accepted) {
            return false;
        }

        // any outer ancestor may still exclude the element
        $level = count($this->currentNesting) - 2;
        while ($level >= 0) {
            $ancestor = $this->currentNesting[$level];
            $ancestor_def = $this->htmlDefinition->info[$ancestor->name];
            if (isset($ancestor_def->excludes[$name])) {
                return false;
            }
            $level--;
        }
        return true;
    }

    /**
     * Iterator function, which starts with the next token and continues until
     * you reach the end of the input tokens.
     * @warning Set $i = null before the first call so that a stale value
     *          does not interfere with the iteration.
     * @param &$i Current integer index variable for inputTokens
     * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
     * @return True while a token was delivered, false at the end of input
     */
    protected function forward(&$i, &$current) {
        if ($i === null) {
            $i = $this->inputIndex + 1;
        } else {
            $i++;
        }
        if (isset($this->inputTokens[$i])) {
            $current = $this->inputTokens[$i];
            return true;
        }
        return false;
    }

    /**
     * Like forward(), but also tracks nesting in $nesting (start it at
     * null or 0) and stops at the end tag of the node that
     * $this->inputIndex starts in.
     */
    protected function forwardUntilEndToken(&$i, &$current, &$nesting) {
        if (!$this->forward($i, $current)) {
            return false;
        }
        if ($nesting === null) {
            $nesting = 0;
        }
        if ($current instanceof HTMLPurifier_Token_Start) {
            $nesting++;
        } elseif ($current instanceof HTMLPurifier_Token_End) {
            // an end tag at depth zero closes the starting node
            if ($nesting <= 0) {
                return false;
            }
            $nesting--;
        }
        return true;
    }

    /**
     * Iterator function, starts with the previous token and continues until
     * you reach the beginning of input tokens.
     * @warning Set $i = null before the first call so that a stale value
     *          does not interfere with the iteration.
     * @param &$i Current integer index variable for inputTokens
     * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
     * @return True while a token was delivered, false before the first token
     */
    protected function backward(&$i, &$current) {
        if ($i === null) {
            $i = $this->inputIndex - 1;
        } else {
            $i--;
        }
        if ($i < 0) {
            return false;
        }
        $current = $this->inputTokens[$i];
        return true;
    }

    /**
     * Initializes the iterator at the current position. Use in a do {} while;
     * loop to make forward() and backward() start at the current token.
     * @warning Set $i = null before the call so that a stale value does not
     *          interfere with the iteration.
     * @param &$i Current integer index variable for inputTokens
     * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
     */
    protected function current(&$i, &$current) {
        if ($i === null) {
            $i = $this->inputIndex;
        }
        $current = $this->inputTokens[$i];
    }

    /**
     * Handler that is called when a text token is processed
     */
    public function handleText(&$token) {}

    /**
     * Handler that is called when a start or empty token is processed
     */
    public function handleElement(&$token) {}

    /**
     * Handler that is called when an end token is processed; by default it
     * only forwards the token to notifyEnd().
     */
    public function handleEnd(&$token) {
        $this->notifyEnd($token);
    }

    /**
     * Notifier that is called when an end token is processed
     * @note This differs from handlers in that the token is read-only
     */
    public function notifyEnd($token) {}

}
5277
* Represents a language and defines localizable string formatting and
5278
* other functions, as well as the localized messages for HTML Purifier.
5280
class HTMLPurifier_Language
{

    /**
     * ISO 639 language code of language. Prefers shortest possible version
     */
    public $code = 'en';

    /**
     * Fallback language code
     */
    public $fallback = false;

    /**
     * Array of localizable messages
     */
    public $messages = array();

    /**
     * Array of localizable error codes
     */
    public $errorNames = array();

    /**
     * True if no message file was found for this language, so English
     * is being used instead. Check this if you'd like to notify the
     * user that they've used a non-supported language.
     */
    public $error = false;

    /**
     * Has the language object been loaded yet?
     * @todo Make it private, fix usage in HTMLPurifier_LanguageTest
     */
    public $_loaded = false;

    /**
     * Instances of HTMLPurifier_Config and HTMLPurifier_Context
     */
    protected $config, $context;

    public function __construct($config, $context) {
        $this->context = $context;
        $this->config  = $config;
    }

    /**
     * Copies this language's data out of the language factory cache.
     * @note This is a lazy loader: it does nothing once the data is loaded
     */
    public function load() {
        if (!$this->_loaded) {
            $factory = HTMLPurifier_LanguageFactory::instance();
            $factory->loadLanguage($this->code);
            // every key the factory knows about becomes a property here
            foreach ($factory->keys as $property) {
                $this->$property = $factory->cache[$this->code][$property];
            }
            $this->_loaded = true;
        }
    }

    /**
     * Retrieves a localised message.
     * @param $key string identifier of message
     * @return string localised message, or "[$key]" when there is none
     */
    public function getMessage($key) {
        if (!$this->_loaded) {
            $this->load();
        }
        return isset($this->messages[$key]) ? $this->messages[$key] : "[$key]";
    }

    /**
     * Retrieves a localised error name.
     * @param $int integer error number, corresponding to PHP's error
     *             reporting levels
     * @return string localised message, or "[Error: $int]" when there is none
     */
    public function getErrorName($int) {
        if (!$this->_loaded) {
            $this->load();
        }
        return isset($this->errorNames[$int]) ? $this->errorNames[$int] : "[Error: $int]";
    }

    /**
     * Converts an array list into a string readable representation
     * @param $array List (keys 0..n-1) of items
     * @return string items joined with the localised separators
     */
    public function listify($array) {
        $separator      = $this->getMessage('Item separator');
        $last_separator = $this->getMessage('Item separator last');
        $total = count($array);
        $joined = '';
        for ($pos = 0; $pos < $total; $pos++) {
            if ($pos != 0) {
                // the final item is introduced by its own separator
                $joined .= ($pos + 1 < $total) ? $separator : $last_separator;
            }
            $joined .= $array[$pos];
        }
        return $joined;
    }

    /**
     * Formats a localised message with passed parameters
     * @param $key string identifier of message
     * @param $args Parameters to substitute in
     * @return string localised message
     * @todo Implement conditionals? Right now, some messages make
     *     reference to line numbers, but those aren't always available
     */
    public function formatMessage($key, $args = array()) {
        if (!$this->_loaded) {
            $this->load();
        }
        if (!isset($this->messages[$key])) {
            return "[$key]";
        }
        $template = $this->messages[$key];

        $subst = array();
        $generator = false;
        foreach ($args as $i => $value) {
            if (is_object($value)) {
                // only tokens are understood; other objects add nothing
                if ($value instanceof HTMLPurifier_Token) {
                    // the generator is fetched once, on first need
                    if (!$generator) {
                        $generator = $this->context->get('Generator');
                    }
                    if (isset($value->name)) {
                        $subst['$'.$i.'.Name'] = $value->name;
                    }
                    if (isset($value->data)) {
                        $subst['$'.$i.'.Data'] = $value->data;
                    }
                    $serialized = $generator->generateFromToken($value);
                    $subst['$'.$i.'.Serialized'] = $serialized;
                    $subst['$'.$i.'.Compact'] = $serialized;
                    // the compact form is the token without its attributes
                    if (!empty($value->attr)) {
                        $bare = clone $value;
                        $bare->attr = array();
                        $subst['$'.$i.'.Compact'] = $generator->generateFromToken($bare);
                    }
                    $subst['$'.$i.'.Line'] = $value->line ? $value->line : 'unknown';
                }
            } elseif (is_array($value)) {
                $keys = array_keys($value);
                if (array_keys($keys) === $keys) {
                    // list
                    $subst['$'.$i] = $this->listify($value);
                } else {
                    // associative array: only .Keys and .Values exist,
                    // there is no plain $i substitution
                    $subst['$'.$i.'.Keys'] = $this->listify($keys);
                    $subst['$'.$i.'.Values'] = $this->listify(array_values($value));
                }
            } else {
                $subst['$' . $i] = $value;
            }
        }
        return strtr($template, $subst);
    }

}
5441
* Class responsible for generating HTMLPurifier_Language objects, managing
5442
* caching and fallbacks.
5443
* @note Thanks to MediaWiki for the general logic, although this version
5444
* has been entirely rewritten
5445
* @todo Serialized cache for languages
5447
class HTMLPurifier_LanguageFactory
{

    /**
     * Cache of language code information used to load HTMLPurifier_Language objects
     * Structure is: $factory->cache[$language_code][$key] = $value
     */
    public $cache;

    /**
     * Valid keys in the HTMLPurifier_Language object. Designates which
     * variables to slurp out of a message file.
     */
    public $keys = array('fallback', 'messages', 'errorNames');

    /**
     * Instance of HTMLPurifier_AttrDef_Lang to validate language codes
     */
    protected $validator;

    /**
     * Directory holding the Language/ subtree, without trailing slash
     */
    protected $dir;

    /**
     * Keys whose contents are a hash map and can be merged
     */
    protected $mergeable_keys_map = array('messages' => true, 'errorNames' => true);

    /**
     * Keys whose contents are a list and can be merged
     */
    protected $mergeable_keys_list = array();

    /**
     * Retrieve sole instance of the factory.
     * @param $prototype Optional prototype to overload sole instance with,
     *                   or bool true to reset to default factory.
     * @return The shared factory instance
     */
    public static function instance($prototype = null) {
        static $instance = null;
        if ($prototype === true) {
            // Explicit reset. This has to be tested before the generic
            // prototype case, otherwise the boolean itself would be stored
            // as the "instance".
            $instance = null;
        } elseif ($prototype !== null) {
            $instance = $prototype;
        }
        if ($instance === null) {
            $instance = new HTMLPurifier_LanguageFactory();
            $instance->setup();
        }
        return $instance;
    }

    /**
     * Sets up the singleton, much like a constructor
     * @note Prevents people from getting this outside of the singleton
     */
    public function setup() {
        $this->validator = new HTMLPurifier_AttrDef_Lang();
        $this->dir = HTMLPURIFIER_PREFIX . '/HTMLPurifier';
    }

    /**
     * Creates a language object, handles class fallbacks
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @param $code Code to override configuration with. Private parameter.
     * @return Language object whose $code is the validated language code
     */
    public function create($config, $context, $code = false) {

        // validate language code
        if ($code === false) {
            $code = $this->validator->validate(
                $config->get('Core.Language'), $config, $context
            );
        } else {
            $code = $this->validator->validate($code, $config, $context);
        }
        if ($code === false) $code = 'en'; // malformed code becomes English

        if ($code == 'en') {
            $lang = new HTMLPurifier_Language($config, $context);
        } else {
            $pcode = str_replace('-', '_', $code); // make valid PHP classname
            $class = 'HTMLPurifier_Language_' . $pcode;
            $file  = $this->dir . '/Language/classes/' . $code . '.php';
            if (file_exists($file) || class_exists($class, false)) {
                $lang = new $class($config, $context);
            } else {
                // No dedicated class: build the fallback language instead.
                // A language without any declared fallback ends up as
                // English and is flagged through $error.
                $raw_fallback = $this->getFallbackFor($code);
                $fallback = $raw_fallback ? $raw_fallback : 'en';
                $lang = $this->create($config, $context, $fallback);
                if (!$raw_fallback) {
                    $lang->error = true;
                }
            }
        }

        // the object always reports the code that was asked for
        $lang->code = $code;

        return $lang;

    }

    /**
     * Returns the fallback language for language
     * @note Loads the original language into cache
     * @param $code string language code
     */
    public function getFallbackFor($code) {
        $this->loadLanguage($code);
        return $this->cache[$code]['fallback'];
    }

    /**
     * Loads language into the cache, handles message file and fallbacks
     * @param $code string language code
     */
    public function loadLanguage($code) {
        // Codes whose fallback chain is currently being resolved. Entries
        // are removed once their chain is done, so the guard only reports
        // genuine cycles and not a later, unrelated load of the same code.
        static $languages_seen = array();

        // abort if we've already loaded it
        if (isset($this->cache[$code])) return;

        // generate filename
        $filename = $this->dir . '/Language/messages/' . $code . '.php';

        // default fallback : may be overwritten by the ensuing include
        $fallback = ($code != 'en') ? 'en' : false;

        // load primary localisation
        if (!file_exists($filename)) {
            // skip the include: will rely solely on fallback
            $filename = $this->dir . '/Language/messages/en.php';
            $cache = array();
        } else {
            // the message file defines the variables named in $this->keys
            include $filename;
            $cache = compact($this->keys);
        }

        // load fallback localisation
        if (!empty($fallback)) {

            // infinite recursion guard
            $reentered = isset($languages_seen[$code]);
            if ($reentered) {
                trigger_error('Circular fallback reference in language ' .
                    $code, E_USER_ERROR);
                // only reached when an error handler lets execution go on
                $fallback = 'en';
            }
            $languages_seen[$code] = true;

            // load the fallback recursively
            $this->loadLanguage($fallback);
            if (!$reentered) {
                unset($languages_seen[$code]);
            }
            $fallback_cache = $this->cache[$fallback];

            // merge fallback with current language
            foreach ($this->keys as $key) {
                if (isset($cache[$key]) && isset($fallback_cache[$key])) {
                    if (isset($this->mergeable_keys_map[$key])) {
                        // own entries win over the fallback's
                        $cache[$key] = $cache[$key] + $fallback_cache[$key];
                    } elseif (isset($this->mergeable_keys_list[$key])) {
                        $cache[$key] = array_merge($fallback_cache[$key], $cache[$key]);
                    }
                } else {
                    $cache[$key] = $fallback_cache[$key];
                }
            }

        }

        // save to cache for later retrieval
        $this->cache[$code] = $cache;
    }

}
5640
* Represents a measurable length, with a string numeric magnitude
5641
* and a unit. This object is immutable.
5643
class HTMLPurifier_Length
{

    /**
     * String numeric magnitude.
     */
    protected $n;

    /**
     * String unit. False is permitted if $n = 0.
     */
    protected $unit;

    /**
     * Whether or not this length is valid. Null if not calculated yet.
     */
    protected $isValid;

    /**
     * Lookup array of units recognized by CSS 2.1
     */
    protected static $allowedUnits = array(
        'em' => true, 'ex' => true, 'px' => true, 'in' => true,
        'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true
    );

    /**
     * @param number $n Magnitude
     * @param string $u Unit
     */
    public function __construct($n = '0', $u = false) {
        $this->n = (string) $n;
        $this->unit = $u !== false ? (string) $u : false;
    }

    /**
     * @param string $s Unit string, like '2em' or '3.4in'
     * @warning Does not perform validation.
     * @return HTMLPurifier_Length; an existing length object is returned as is
     */
    static public function make($s) {
        if ($s instanceof HTMLPurifier_Length) return $s;
        // the leading run of digits, dots and signs is the magnitude,
        // whatever follows is the unit
        $n_length = strspn($s, '1234567890.+-');
        $n = substr($s, 0, $n_length);
        $unit = substr($s, $n_length);
        if ($unit === '') $unit = false;
        return new HTMLPurifier_Length($n, $unit);
    }

    /**
     * Validates the number and unit.
     * @return Bool true if valid; the unit is lower-cased and the number
     *         replaced by its validated form as a side effect
     */
    protected function validate() {
        // Special case: a signed zero is plain zero, and zero needs no unit
        if ($this->n === '+0' || $this->n === '-0') $this->n = '0';
        if ($this->n === '0' && $this->unit === false) return true;
        if (!ctype_lower($this->unit)) $this->unit = strtolower($this->unit);
        if (!isset(HTMLPurifier_Length::$allowedUnits[$this->unit])) return false;
        // the CSS number validator does the numeric check
        $def = new HTMLPurifier_AttrDef_CSS_Number();
        $result = $def->validate($this->n, false, false);
        if ($result === false) return false;
        $this->n = $result;
        return true;
    }

    /**
     * Returns string representation of number.
     * @return string magnitude followed by unit, or false if invalid
     */
    public function toString() {
        if (!$this->isValid()) return false;
        return $this->n . $this->unit;
    }

    /**
     * Retrieves string numeric magnitude.
     */
    public function getN() {return $this->n;}

    /**
     * Retrieves string unit.
     */
    public function getUnit() {return $this->unit;}

    /**
     * Returns true if this length unit is valid.
     * @note The result of the first validation is remembered
     */
    public function isValid() {
        if ($this->isValid === null) $this->isValid = $this->validate();
        return $this->isValid;
    }

    /**
     * Compares two lengths, and returns 1 if greater, -1 if less and 0 if equal.
     * @param $l HTMLPurifier_Length to compare against, or false
     * @return 1, -1 or 0; false if $l is false or cannot be converted to
     *         this length's unit
     * @warning If both values are too large or small, this calculation will
     *          not work properly
     */
    public function compareTo($l) {
        if ($l === false) return false;
        if ($l->unit !== $this->unit) {
            $converter = new HTMLPurifier_UnitConverter();
            $l = $converter->convert($l, $this->unit);
            if ($l === false) return false;
        }
        // Only the sign of the difference is reported, as documented;
        // callers that test "< 0" / "> 0" are unaffected.
        $difference = $this->n - $l->n;
        if ($difference > 0) return 1;
        if ($difference < 0) return -1;
        return 0;
    }

}
5756
* Forgivingly lexes HTML (SGML-style) markup into tokens.
5758
* A lexer parses a string of SGML-style markup and converts them into
5759
* corresponding tokens. It doesn't check for well-formedness, although its
5760
* internal mechanism may make this automatic (such as the case of
5761
* HTMLPurifier_Lexer_DOMLex). There are several implementations to choose
5764
* A lexer is HTML-oriented: it might work with XML, but it's not
5765
* recommended, as we adhere to a subset of the specification for optimization
5766
* reasons. This might change in the future. Also, most tokenizers are not
5767
* expected to handle DTDs or PIs.
5769
* This class should not be directly instantiated, but you may use create() to
5770
* retrieve a default copy of the lexer. Being a supertype, this class
5771
* does not actually define any implementation, but offers commonly used
5772
* convenience functions for subclasses.
5774
* @note The unit tests will instantiate this class for testing purposes, as
5775
* many of the utility functions require a class to be instantiated.
5776
* This means that, even though this class is not runnable, it will
5777
* not be declared abstract.
5782
* We use tokens rather than create a DOM representation because DOM would:
5785
* -# Require more processing and memory to create,
5786
* -# Is not streamable, and
5787
* -# Has the entire document structure (html and body not needed).
5790
* However, DOM is helpful in that it makes it easy to move around nodes
5791
* without a lot of lookaheads to see when a tag is closed. This is a
5792
* limitation of the token system and some workarounds would be nice.
5794
class HTMLPurifier_Lexer
5798
* Whether or not this lexer implements line-number/column-number tracking.
5799
* If it does, set to true.
5801
public $tracksLineNumbers = false;
5803
// -- STATIC ----------------------------------------------------------
5806
* Retrieves or sets the default Lexer as a Prototype Factory.
5808
* By default HTMLPurifier_Lexer_DOMLex will be returned. There are
5809
* a few exceptions involving special features that only DirectLex
5812
* @note The behavior of this class has changed, rather than accepting
5813
* a prototype object, it now accepts a configuration object.
5814
* To specify your own prototype, set %Core.LexerImpl to it.
5815
* This change in behavior de-singletonizes the lexer object.
5817
* @param $config Instance of HTMLPurifier_Config
5818
* @return Concrete lexer.
5820
public static function create($config) {
5822
if (!($config instanceof HTMLPurifier_Config)) {
5824
trigger_error("Passing a prototype to
5825
HTMLPurifier_Lexer::create() is deprecated, please instead
5826
use %Core.LexerImpl", E_USER_WARNING);
5828
$lexer = $config->get('Core.LexerImpl');
5832
$config->get('Core.MaintainLineNumbers') ||
5833
$config->get('Core.CollectErrors');
5836
if (is_object($lexer)) {
5840
if (is_null($lexer)) { do {
5841
// auto-detection algorithm
5843
if ($needs_tracking) {
5844
$lexer = 'DirectLex';
5849
class_exists('DOMDocument') &&
5850
method_exists('DOMDocument', 'loadHTML') &&
5851
!extension_loaded('domxml')
5853
// check for DOM support, because while it's part of the
5854
// core, it can be disabled compile time. Also, the PECL
5855
// domxml extension overrides the default DOM, and is evil
5856
// and nasty and we shan't bother to support it
5859
$lexer = 'DirectLex';
5862
} while(0); } // do..while so we can break
5864
// instantiate recognized string names
5867
$inst = new HTMLPurifier_Lexer_DOMLex();
5870
$inst = new HTMLPurifier_Lexer_DirectLex();
5873
$inst = new HTMLPurifier_Lexer_PH5P();
5876
throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
5880
if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated');
5882
// once PHP DOM implements native line numbers, or we
5883
// hack out something using XSLT, remove this stipulation
5884
if ($needs_tracking && !$inst->tracksLineNumbers) {
5885
throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
5892
// -- CONVENIENCE MEMBERS ---------------------------------------------
5894
public function __construct() {
5895
$this->_entity_parser = new HTMLPurifier_EntityParser();
5899
* Most common entity to raw value conversion table for special entities.
5901
protected $_special_entity2str =
5913
* Parses special entities into the proper characters.
5915
* This string will translate escaped versions of the special characters
5916
* into the correct ones.
5919
* You should be able to treat the output of this function as
5920
* completely parsed, but that's only because all other entities should
5921
* have been handled previously in substituteNonSpecialEntities()
5923
* @param $string String character data to be parsed.
5924
* @returns Parsed character data.
5926
/**
 * Parses special entities into the proper characters.
 *
 * Escaped versions of the special characters are translated with the
 * lookup table first; the entity parser is only consulted when ampersands
 * remain that the table could not account for.
 *
 * @param $string String character data to be parsed.
 * @returns Parsed character data.
 */
public function parseData($string) {

    // following functions require at least one character
    if ($string === '') return '';

    // subtracts amps that cannot possibly be escaped: those followed by a
    // space, and one sitting at the very end of the string
    $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
        ($string[strlen($string)-1] === '&' ? 1 : 0);

    if (!$num_amp) return $string; // abort if no entities

    // Escaped ampersands legitimately leave a bare '&' behind after the
    // translation below, so they must be counted as the '&amp;' entity and
    // not as every raw ampersand; otherwise the comparison further down is
    // always true and the fallback parser is never reached.
    $num_esc_amp = substr_count($string, '&amp;');
    $string = strtr($string, $this->_special_entity2str);

    // code duplication for sake of optimization, see above
    $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
        ($string[strlen($string)-1] === '&' ? 1 : 0);

    // every remaining candidate ampersand is explained by an escaped one
    if ($num_amp_2 <= $num_esc_amp) return $string;

    // hmm... now we have some uncommon entities. Use the callback.
    $string = $this->_entity_parser->substituteSpecialEntities($string);
    return $string;
}
5951
* Lexes an HTML string into tokens.
5953
* @param $string String HTML.
5954
* @return HTMLPurifier_Token array representation of HTML.
5956
/**
 * Lexes an HTML string into tokens.
 *
 * This base implementation only raises an error; it does no lexing itself.
 *
 * @param $string String HTML.
 * @param $config Configuration object (unused here).
 * @param $context Context object (unused here).
 * @return HTMLPurifier_Token array representation of HTML.
 */
public function tokenizeHTML($string, $config, $context) {
    trigger_error(
        'Call to abstract class',
        E_USER_ERROR
    );
}
5961
* Translates CDATA sections into regular sections (through escaping).
5963
* @param $string HTML string to process.
5964
* @returns HTML with CDATA sections escaped.
5966
/**
 * Translates CDATA sections into regular sections (through escaping).
 *
 * @param $string HTML string to process.
 * @returns HTML with CDATA sections escaped.
 */
protected static function escapeCDATA($string) {
    // Non-greedy and dot-matches-newline, so each section is handled on
    // its own even when it spans several lines.
    $cdata_pattern = '/<!\[CDATA\[(.+?)\]\]>/s';
    $escaper = array('HTMLPurifier_Lexer', 'CDATACallback');
    return preg_replace_callback($cdata_pattern, $escaper, $string);
}
5975
* Special CDATA case that is especially convoluted for <script>
5977
/**
 * Special CDATA case that is especially convoluted for <script>:
 * a CDATA section whose markers are themselves hidden inside comments.
 *
 * @param $string HTML string to process.
 * @returns HTML with the commented CDATA sections escaped.
 */
protected static function escapeCommentedCDATA($string) {
    $commented_pattern = '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s';
    $escaper = array('HTMLPurifier_Lexer', 'CDATACallback');
    return preg_replace_callback($commented_pattern, $escaper, $string);
}
5986
* Callback function for escapeCDATA() that does the work.
5988
* @warning Though this is public in order to let the callback happen,
5989
* calling it directly is not recommended.
5990
* @params $matches PCRE matches array, with index 0 the entire match
5991
* and 1 the inside of the CDATA section.
5992
* @returns Escaped internals of the CDATA section.
5994
/**
 * Callback function for escapeCDATA() and escapeCommentedCDATA() that
 * does the work.
 *
 * @warning Calling this directly is not recommended; it exists to be
 *          handed to preg_replace_callback().
 * @params $matches PCRE matches array, with index 0 the entire match
 *                  and 1 the inside of the CDATA section.
 * @returns Escaped internals of the CDATA section.
 */
protected static function CDATACallback($matches) {
    $section_body = $matches[1];
    // the character set is passed explicitly rather than left to the default
    return htmlspecialchars($section_body, ENT_COMPAT, 'UTF-8');
}
6000
* Takes a piece of HTML and normalizes it by converting entities, fixing
6001
* encoding, extracting bits, and other good stuff.
6002
* @todo Consider making protected
6004
/**
 * Takes a piece of HTML and normalizes it by converting entities, fixing
 * encoding, extracting bits, and other good stuff.
 *
 * @param $html String HTML to normalize.
 * @param $config Configuration object; HTML.Trusted and
 *                Core.ConvertDocumentToFragment are consulted.
 * @param $context Context object (unused here).
 * @returns Normalized HTML string.
 * @todo Consider making protected
 */
public function normalize($html, $config, $context) {

    // normalize newlines to \n (CRLF is replaced before lone CR)
    $html = str_replace(array("\r\n", "\r"), "\n", $html);

    // escape convoluted CDATA, but only for trusted input
    $is_trusted = $config->get('HTML.Trusted');
    if ($is_trusted) {
        $html = $this->escapeCommentedCDATA($html);
    }

    // escape CDATA
    $html = $this->escapeCDATA($html);

    // extract body from document if applicable
    $wants_fragment = $config->get('Core.ConvertDocumentToFragment');
    if ($wants_fragment) {
        $html = $this->extractBody($html);
    }

    // expand entities that aren't the big five
    $html = $this->_entity_parser->substituteNonSpecialEntities($html);

    // clean into wellformed UTF-8 string for an SGML context: this has
    // to be done after entity expansion because the entities sometimes
    // represent non-SGML characters
    return HTMLPurifier_Encoder::cleanUTF8($html);
}
6035
* Takes a string of HTML (fragment or document) and returns the content
6036
* @todo Consider making protected
6038
public function extractBody($html) {
6040
$result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
6055
* Class that handles operations involving percent-encoding in URIs.
6058
* Be careful when reusing instances of PercentEncoder. The object
6059
* you use for normalize() SHOULD NOT be used for encode(), or
6062
class HTMLPurifier_PercentEncoder
{

    /**
     * Reserved characters to preserve when using encode().
     * Lookup array keyed by byte value.
     */
    protected $preserve = array();

    /**
     * @param $preserve String of extra characters that should be preserved
     *                  while using encode(), or false for none.
     */
    public function __construct($preserve = false) {
        // unreserved letters, ought to const-ify
        $unreserved = array_merge(
            range(48, 57),           // digits
            range(65, 90),           // upper-case
            range(97, 122),          // lower-case
            array(45, 46, 95, 126)   // Dash -, Period ., Underscore _, Tilde ~
        );
        foreach ($unreserved as $byte) {
            $this->preserve[$byte] = true;
        }

        // extra letters not to escape
        if ($preserve !== false) {
            $total = strlen($preserve);
            for ($pos = 0; $pos < $total; $pos++) {
                $this->preserve[ord($preserve[$pos])] = true;
            }
        }
    }

    /**
     * Our replacement for urlencode, it encodes all non-reserved characters,
     * as well as any extra characters that were instructed to be preserved.
     *
     * Assumes that the string has already been normalized, making any
     * and all percent escape sequences valid. Percents will not be
     * re-escaped, regardless of their status in $preserve
     * @param $string String to be encoded
     * @return Encoded string.
     */
    public function encode($string) {
        $encoded = '';
        $total = strlen($string);
        for ($pos = 0; $pos < $total; $pos++) {
            $char = $string[$pos];
            $byte = ord($char);
            if ($char === '%' || isset($this->preserve[$byte])) {
                // percent signs and preserved bytes pass through untouched
                $encoded .= $char;
            } else {
                $encoded .= sprintf('%%%02X', $byte);
            }
        }
        return $encoded;
    }

    /**
     * Fix up percent-encoding by decoding unreserved characters and normalizing.
     * @warning This function is affected by $preserve, even though the
     *          usual desired behavior is for this not to preserve those
     *          characters. Be careful when reusing instances of PercentEncoder!
     * @param $string String to normalize
     * @return Normalized string.
     */
    public function normalize($string) {
        if ($string == '') return '';
        $chunks = explode('%', $string);
        // everything before the first percent sign is copied verbatim
        $normalized = array_shift($chunks);
        foreach ($chunks as $chunk) {
            // a percent sign not followed by two hex digits is itself escaped
            if (strlen($chunk) < 2 || !ctype_xdigit(substr($chunk, 0, 2))) {
                $normalized .= '%25' . $chunk;
                continue;
            }
            $hex = substr($chunk, 0, 2);
            $tail = substr($chunk, 2);
            $byte = hexdec($hex);
            if (isset($this->preserve[$byte])) {
                // needlessly escaped character: decode it
                $normalized .= chr($byte) . $tail;
            } else {
                // keep the escape, with upper-case hex digits
                $normalized .= '%' . strtoupper($hex) . $tail;
            }
        }
        return $normalized;
    }

}
6154
* Generic property list implementation
6156
class HTMLPurifier_PropertyList
{
    /**
     * Internal data-structure for properties
     */
    protected $data = array();

    /**
     * Parent plist, consulted when a key is not set locally. May be null.
     */
    protected $parent;

    /**
     * Result of the last squash(), or null if nothing has been cached.
     */
    protected $cache;

    /**
     * @param $parent Optional parent plist to fall back on.
     */
    public function __construct($parent = null) {
        $this->parent = $parent;
    }

    /**
     * Recursively retrieves the value for a key
     * @throws HTMLPurifier_Exception if neither this plist nor any
     *         ancestor defines the key.
     */
    public function get($name) {
        if ($this->has($name)) return $this->data[$name];
        // possible performance bottleneck, convert to iterative if necessary
        if ($this->parent) return $this->parent->get($name);
        throw new HTMLPurifier_Exception("Key '$name' not found");
    }

    /**
     * Sets the value of a key, for this plist
     * @note The squash() cache is not refreshed by this call.
     */
    public function set($name, $value) {
        $this->data[$name] = $value;
    }

    /**
     * Returns true if a given key exists in this plist; parents are not
     * consulted. A key whose value is null still counts as existing.
     */
    public function has($name) {
        return array_key_exists($name, $this->data);
    }

    /**
     * Resets a value to the value of its parent, usually the default. If
     * no value is specified, the entire plist is reset.
     * @note The squash() cache is not refreshed by this call.
     */
    public function reset($name = null) {
        // Strict comparison: keys such as 0 or '' must only drop that one
        // key rather than being mistaken for "no key given".
        if ($name === null) $this->data = array();
        else unset($this->data[$name]);
    }

    /**
     * Squashes this property list and all of its property lists into a single
     * array, and returns the array. This value is cached by default.
     * @param $force If true, ignores the cache and regenerates the array.
     */
    public function squash($force = false) {
        if ($this->cache !== null && !$force) return $this->cache;
        if ($this->parent) {
            // local values win over inherited ones
            return $this->cache = array_merge($this->parent->squash($force), $this->data);
        } else {
            return $this->cache = $this->data;
        }
    }

    /**
     * Returns the parent plist.
     */
    public function getParent() {
        return $this->parent;
    }

    /**
     * Sets the parent plist.
     */
    public function setParent($plist) {
        $this->parent = $plist;
    }
}
6241
* Property list iterator. Do not instantiate this class directly.
6243
class HTMLPurifier_PropertyListIterator extends FilterIterator
{

    /**
     * Length of the filter prefix, in bytes. Zero when no filter is set.
     */
    protected $l;

    /**
     * Prefix that accepted keys must start with, or null for no filtering.
     */
    protected $filter;

    /**
     * @param $iterator Iterator over the data to filter
     * @param $filter Optional prefix to only allow values of
     */
    public function __construct(Iterator $iterator, $filter = null) {
        parent::__construct($iterator);
        // A missing filter has length zero; it is never handed to strlen().
        $this->l = ($filter === null) ? 0 : strlen($filter);
        $this->filter = $filter;
    }

    /**
     * Accepts the current element when its key starts with the filter prefix.
     * @return bool
     */
    #[\ReturnTypeWillChange]
    public function accept() {
        // An empty prefix matches every key, so skip the comparison (and
        // avoid passing a null filter to strncmp()).
        if ($this->l === 0) {
            return true;
        }
        $key = $this->getInnerIterator()->key();
        if( strncmp($key, $this->filter, $this->l) !== 0 ) {
            return false;
        }
        return true;
    }

}
6274
* Supertype for classes that define a strategy for modifying/purifying tokens.
6276
* While HTMLPurifier's core purpose is fixing HTML into something proper,
6277
* strategies provide plug points for extra configuration or even extra
6278
* features, such as custom tags, custom parsing of text, etc.
6282
abstract class HTMLPurifier_Strategy
{

    /**
     * Runs this strategy over a list of tokens.
     *
     * @param $tokens  Array of HTMLPurifier_Token objects to be operated on.
     * @param $config  Configuration options
     * @param $context Context object
     * @returns Processed array of token objects.
     */
    abstract public function execute($tokens, $config, $context);

}
6301
* This is in almost every respect equivalent to an array except
6302
* that it keeps track of which keys were accessed.
6304
* @warning For the sake of backwards compatibility with early versions
6305
* of PHP 5, you must not use the $hash[$key] syntax; if you do
6306
* our version of offsetGet is never called.
6308
class HTMLPurifier_StringHash extends ArrayObject
{
    /**
     * Lookup array of the indexes read so far, in form array($index => true).
     */
    protected $accessed = array();

    /**
     * Retrieves a value, and logs the access.
     * @param $index Index to read
     * @return Value stored at $index.
     */
    #[\ReturnTypeWillChange]
    public function offsetGet($index) {
        // The attribute above keeps this untyped override compatible with
        // the parent's declared return type on newer PHP versions; older
        // versions read that line as a comment.
        $this->accessed[$index] = true;
        return parent::offsetGet($index);
    }

    /**
     * Returns a lookup array of all array indexes that have been accessed.
     * @return Array in form array($index => true).
     */
    public function getAccessed() {
        return $this->accessed;
    }

    /**
     * Resets the access array.
     */
    public function resetAccessed() {
        $this->accessed = array();
    }
}
6341
* Parses string hash files. File format is as such:
6350
* Which would output something similar to:
6353
* 'ID' => 'DefaultKeyValue',
6355
* 'KEY2' => 'Value2',
6356
* 'MULTILINE-KEY' => "Multiline\nvalue.\n",
6359
* We use this as an easy to use file-format for configuration schema
6360
* files, but the class itself is usage agnostic.
6362
* You can use ---- to forcibly terminate parsing of a single string-hash;
6363
* this marker is used in multi string-hashes to delimit boundaries.
6365
class HTMLPurifier_StringHashParser
6368
public $default = 'ID';
6371
* Parses a file that contains a single string-hash.
6373
/**
 * Parses a file that contains a single string-hash.
 *
 * @param $file Path of the file to read
 * @return Parsed string-hash, or false if the file is missing or cannot
 *         be opened.
 */
public function parseFile($file) {
    if (!file_exists($file)) {
        return false;
    }
    $handle = fopen($file, 'r');
    if (!$handle) {
        return false;
    }
    $parsed = $this->parseHandle($handle);
    fclose($handle);
    return $parsed;
}
6383
* Parses a file that contains multiple string-hashes delimited by '----'
6385
/**
 * Parses a file that contains multiple string-hashes delimited by '----'
 *
 * @param $file Path of the file to read
 * @return Array of parsed string-hashes in file order, or false if the
 *         file is missing or cannot be opened.
 */
public function parseMultiFile($file) {
    if (!file_exists($file)) {
        return false;
    }
    $hashes = array();
    $handle = fopen($file, 'r');
    if (!$handle) {
        return false;
    }
    // each parseHandle() call consumes one string-hash from the stream
    while (!feof($handle)) {
        $hashes[] = $this->parseHandle($handle);
    }
    fclose($handle);
    return $hashes;
}
6398
* Internal parser that accepts a file handle.
6399
* @note While it's possible to simulate in-memory parsing by using
6400
* custom stream wrappers, if such a use-case arises we should
6401
* factor out the file handle into its own class.
6402
* @param $fh File handle with pointer at start of valid string-hash
6405
protected function parseHandle($fh) {
6411
if ($line === false) break;
6412
$line = rtrim($line, "\n\r");
6413
if (!$state && $line === '') continue;
6414
if ($line === '----') break;
6415
if (strncmp('--#', $line, 3) === 0) {
6418
} elseif (strncmp('--', $line, 2) === 0) {
6419
// Multiline declaration
6420
$state = trim($line, '- ');
6421
if (!isset($ret[$state])) $ret[$state] = '';
6423
} elseif (!$state) {
6425
if (strpos($line, ':') !== false) {
6426
// Single-line declaration
6427
list($state, $line) = explode(':', $line, 2);
6428
$line = trim($line);
6430
// Use default declaration
6431
$state = $this->default;
6435
$ret[$state] = $line;
6439
$ret[$state] .= "$line\n";
6441
} while (!feof($fh));
6452
* Defines a mutation of an obsolete tag into a valid tag.
6454
abstract class HTMLPurifier_TagTransform
{

    /**
     * Tag name to transform the tag to.
     */
    public $transform_to;

    /**
     * Transforms the obsolete tag into the valid tag.
     * @param $tag Tag to be transformed.
     * @param $config Mandatory HTMLPurifier_Config object
     * @param $context Mandatory HTMLPurifier_Context object
     */
    abstract public function transform($tag, $config, $context);

    /**
     * Prepends CSS properties to the style attribute, creating the
     * attribute if it doesn't exist.
     * @warning Copied over from AttrTransform, be sure to keep in sync
     * @param $attr Attribute array to process (passed by reference)
     * @param $css CSS to prepend
     */
    protected function prependCSS(&$attr, $css) {
        // a missing style attribute is treated as an empty declaration list
        $existing = isset($attr['style']) ? $attr['style'] : '';
        $attr['style'] = $css . $existing;
    }

}
6489
* Abstract base token class that all others inherit from.
6491
class HTMLPurifier_Token {
6492
public $line; /**< Line number node was on in source document. Null if unknown. */
6493
public $col; /**< Column of line node was on in source document. Null if unknown. */
6496
* Lookup array of processing that this token is exempt from.
6497
* Currently, valid values are "ValidateAttributes" and
6498
* "MakeWellFormed_TagClosedError"
6500
public $armor = array();
6503
* Used during MakeWellFormed.
6509
/**
 * Backwards-compatibility accessor for the deprecated "type" property.
 *
 * Reading "type" raises a notice and yields a short name derived from the
 * token's class; any other undefined property yields null.
 *
 * @param $n Name of the property being read
 * @return string|null
 */
public function __get($n) {
    if ($n !== 'type') {
        return null;
    }
    trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE);
    $type_names = array(
        'HTMLPurifier_Token_Start'   => 'start',
        'HTMLPurifier_Token_Empty'   => 'empty',
        'HTMLPurifier_Token_End'     => 'end',
        'HTMLPurifier_Token_Text'    => 'text',
        'HTMLPurifier_Token_Comment' => 'comment',
    );
    $class = get_class($this);
    return isset($type_names[$class]) ? $type_names[$class] : null;
}
6524
* Sets the position of the token in the source document.
6526
public function position($l = null, $c = null) {
6532
* Convenience function for DirectLex settings line/col position.
6534
public function rawPosition($l, $c) {
6535
if ($c === -1) $l++;
6547
* Factory for token generation.
6549
* @note Doing some benchmarking indicates that the new operator is much
6550
* slower than the clone operator (even discounting the cost of the
6551
* constructor). This class is for that optimization.
6552
* Other than that, there's not much point as we don't
6553
* maintain parallel HTMLPurifier_Token hierarchies (the main reason why
6554
* you'd want to use an abstract factory).
6555
* @todo Port DirectLex to use this
6557
class HTMLPurifier_TokenFactory
{

    /**
     * Prototypes that will be cloned.
     */
    // p stands for prototype
    private $p_start, $p_end, $p_empty, $p_text, $p_comment;

    /**
     * Generates blank prototypes for cloning.
     */
    public function __construct() {
        $no_attr = array();
        $this->p_start   = new HTMLPurifier_Token_Start('', $no_attr);
        $this->p_end     = new HTMLPurifier_Token_End('');
        $this->p_empty   = new HTMLPurifier_Token_Empty('', $no_attr);
        $this->p_text    = new HTMLPurifier_Token_Text('');
        $this->p_comment = new HTMLPurifier_Token_Comment('');
    }

    /**
     * Creates a HTMLPurifier_Token_Start.
     * @param $name Tag name
     * @param $attr Associative array of attributes
     * @return Generated HTMLPurifier_Token_Start
     */
    public function createStart($name, $attr = array()) {
        // clone the blank prototype, then re-run its constructor on the copy
        $token = clone $this->p_start;
        $token->__construct($name, $attr);
        return $token;
    }

    /**
     * Creates a HTMLPurifier_Token_End.
     * @param $name Tag name
     * @return Generated HTMLPurifier_Token_End
     */
    public function createEnd($name) {
        $token = clone $this->p_end;
        $token->__construct($name);
        return $token;
    }

    /**
     * Creates a HTMLPurifier_Token_Empty.
     * @param $name Tag name
     * @param $attr Associative array of attributes
     * @return Generated HTMLPurifier_Token_Empty
     */
    public function createEmpty($name, $attr = array()) {
        $token = clone $this->p_empty;
        $token->__construct($name, $attr);
        return $token;
    }

    /**
     * Creates a HTMLPurifier_Token_Text.
     * @param $data Data of text token
     * @return Generated HTMLPurifier_Token_Text
     */
    public function createText($data) {
        $token = clone $this->p_text;
        $token->__construct($data);
        return $token;
    }

    /**
     * Creates a HTMLPurifier_Token_Comment.
     * @param $data Data of comment token
     * @return Generated HTMLPurifier_Token_Comment
     */
    public function createComment($data) {
        $token = clone $this->p_comment;
        $token->__construct($data);
        return $token;
    }

}
6642
* HTML Purifier's internal representation of a URI.
6644
* Internal data-structures are completely escaped. If the data needs
6645
* to be used in a non-URI context (which is very unlikely), be sure
6646
* to decode it first. The URI may not necessarily be well-formed until
6647
* validate() is called.
6649
class HTMLPurifier_URI
6652
public $scheme, $userinfo, $host, $port, $path, $query, $fragment;
6655
* @note Automatically normalizes scheme and port
6657
/**
 * Stores the URI components as given, except for scheme and port.
 * @note Automatically normalizes scheme and port: a non-null scheme is
 *       lower-cased and a non-null port is cast to an integer.
 */
public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment) {
    if (!is_null($scheme) && !ctype_lower($scheme)) {
        $scheme = strtolower($scheme);
    }
    if (!is_null($port)) {
        $port = (int) $port;
    }
    $this->scheme   = $scheme;
    $this->userinfo = $userinfo;
    $this->host     = $host;
    $this->port     = $port;
    $this->path     = $path;
    $this->query    = $query;
    $this->fragment = $fragment;
}
6668
* Retrieves a scheme object corresponding to the URI's scheme/default
6669
* @param $config Instance of HTMLPurifier_Config
6670
* @param $context Instance of HTMLPurifier_Context
6671
* @return Scheme object appropriate for validating this URI
6673
public function getSchemeObj($config, $context) {
6674
$registry = HTMLPurifier_URISchemeRegistry::instance();
6675
if ($this->scheme !== null) {
6676
$scheme_obj = $registry->getScheme($this->scheme, $config, $context);
6677
if (!$scheme_obj) return false; // invalid scheme, clean it out
6679
// no scheme: retrieve the default one
6680
$def = $config->getDefinition('URI');
6681
$scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context);
6683
// something funky happened to the default scheme object
6685
'Default scheme object "' . $def->defaultScheme . '" was not readable',
6695
* Generic validation method applicable for all schemes. May modify
6696
* this URI in order to get it into a compliant form.
6697
* @param $config Instance of HTMLPurifier_Config
6698
* @param $context Instance of HTMLPurifier_Context
6699
* @return True if validation/filtering succeeds, false if failure
6701
public function validate($config, $context) {
6703
// ABNF definitions from RFC 3986
6704
$chars_sub_delims = '!$&\'()*+,;=';
6705
$chars_gen_delims = ':/?#[]@';
6706
$chars_pchar = $chars_sub_delims . ':@';
6708
// validate scheme (MUST BE FIRST!)
6709
if (!is_null($this->scheme) && is_null($this->host)) {
6710
$def = $config->getDefinition('URI');
6711
if ($def->defaultScheme === $this->scheme) {
6712
$this->scheme = null;
6717
if (!is_null($this->host)) {
6718
$host_def = new HTMLPurifier_AttrDef_URI_Host();
6719
$this->host = $host_def->validate($this->host, $config, $context);
6720
if ($this->host === false) $this->host = null;
6723
// validate username
6724
if (!is_null($this->userinfo)) {
6725
$encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
6726
$this->userinfo = $encoder->encode($this->userinfo);
6730
if (!is_null($this->port)) {
6731
if ($this->port < 1 || $this->port > 65535) $this->port = null;
6735
$path_parts = array();
6736
$segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
6737
if (!is_null($this->host)) {
6738
// path-abempty (hier and relative)
6739
$this->path = $segments_encoder->encode($this->path);
6740
} elseif ($this->path !== '' && $this->path[0] === '/') {
6741
// path-absolute (hier and relative)
6742
if (strlen($this->path) >= 2 && $this->path[1] === '/') {
6743
// This shouldn't ever happen!
6746
$this->path = $segments_encoder->encode($this->path);
6748
} elseif (!is_null($this->scheme) && $this->path !== '') {
6749
// path-rootless (hier)
6750
// Short circuit evaluation means we don't need to check nz
6751
$this->path = $segments_encoder->encode($this->path);
6752
} elseif (is_null($this->scheme) && $this->path !== '') {
6753
// path-noscheme (relative)
6754
// (once again, not checking nz)
6755
$segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
6756
$c = strpos($this->path, '/');
6759
$segment_nc_encoder->encode(substr($this->path, 0, $c)) .
6760
$segments_encoder->encode(substr($this->path, $c));
6762
$this->path = $segment_nc_encoder->encode($this->path);
6765
// path-empty (hier and relative)
6766
$this->path = ''; // just to be safe
6769
// qf = query and fragment
6770
$qf_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/?');
6772
if (!is_null($this->query)) {
6773
$this->query = $qf_encoder->encode($this->query);
6776
if (!is_null($this->fragment)) {
6777
$this->fragment = $qf_encoder->encode($this->fragment);
6785
* Convert URI back to string
6786
* @return String URI appropriate for output
6788
/**
 * Convert URI back to string
 *
 * Components that are null are left out together with their delimiter;
 * the path is always emitted.
 *
 * @return String URI appropriate for output
 */
public function toString() {
    $uri = '';

    if ($this->scheme !== null) {
        $uri .= $this->scheme . ':';
    }

    // the authority only exists when there is a host (possibly empty)
    if ($this->host !== null) {
        $uri .= '//';
        if ($this->userinfo !== null) {
            $uri .= $this->userinfo . '@';
        }
        $uri .= $this->host;
        if ($this->port !== null) {
            $uri .= ':' . $this->port;
        }
    }

    $uri .= $this->path;

    if ($this->query !== null) {
        $uri .= '?' . $this->query;
    }
    if ($this->fragment !== null) {
        $uri .= '#' . $this->fragment;
    }

    return $uri;
}
6815
class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
{

    public $type = 'URI';

    /** Active filters run by filter(), keyed by filter name. */
    protected $filters = array();

    /** Active filters run by postFilter(), keyed by filter name. */
    protected $postFilters = array();

    /** Filters available for activation during setup, keyed by filter name. */
    protected $registeredFilters = array();

    /**
     * HTMLPurifier_URI object of the base specified at %URI.Base
     */
    public $base;

    /**
     * String host to consider "home" base, derived off of $base
     */
    public $host;

    /**
     * Name of default scheme based on %URI.DefaultScheme and %URI.Base
     */
    public $defaultScheme;

    /**
     * Registers the built-in URI filters.
     */
    public function __construct() {
        $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal());
        $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources());
        $this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist());
        $this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute());
        $this->registerFilter(new HTMLPurifier_URIFilter_Munge());
    }

    /**
     * Makes a filter available; it becomes active during setup only if the
     * matching URI.<name> directive is enabled.
     * @param $filter Filter object exposing a name property
     */
    public function registerFilter($filter) {
        $this->registeredFilters[$filter->name] = $filter;
    }

    /**
     * Prepares a filter and activates it unless preparation fails.
     * @param $filter Filter object to activate
     * @param $config Instance of HTMLPurifier_Config
     */
    public function addFilter($filter, $config) {
        $r = $filter->prepare($config);
        if ($r === false) return; // null is ok, for backwards compat
        if ($filter->post) {
            $this->postFilters[$filter->name] = $filter;
        } else {
            $this->filters[$filter->name] = $filter;
        }
    }

    /**
     * Populates member variables and then activates the configured filters.
     */
    protected function doSetup($config) {
        $this->setupMemberVariables($config);
        $this->setupFilters($config);
    }

    /**
     * Activates every registered filter whose URI.<name> directive is
     * neither false nor null, then discards the registration list.
     */
    protected function setupFilters($config) {
        foreach ($this->registeredFilters as $name => $filter) {
            $conf = $config->get('URI.' . $name);
            if ($conf !== false && $conf !== null) {
                $this->addFilter($filter, $config);
            }
        }
        unset($this->registeredFilters);
    }

    /**
     * Derives $host, $base and $defaultScheme from the configuration.
     * %URI.Base supplies scheme and host when they are not given explicitly;
     * %URI.DefaultScheme is the last fallback for the scheme.
     */
    protected function setupMemberVariables($config) {
        $this->host = $config->get('URI.Host');
        $base_uri = $config->get('URI.Base');
        if (!is_null($base_uri)) {
            $parser = new HTMLPurifier_URIParser();
            $this->base = $parser->parse($base_uri);
            // The parser reports an unparsable URI with false; only read
            // scheme and host off an actual URI object.
            if (is_object($this->base)) {
                $this->defaultScheme = $this->base->scheme;
                if (is_null($this->host)) $this->host = $this->base->host;
            }
        }
        if (is_null($this->defaultScheme)) $this->defaultScheme = $config->get('URI.DefaultScheme');
    }

    /**
     * Runs the active pre-validation filters in order.
     * @param $uri Reference to the URI object being filtered
     * @return bool False as soon as one filter rejects the URI, else true.
     */
    public function filter(&$uri, $config, $context) {
        foreach ($this->filters as $name => $f) {
            $result = $f->filter($uri, $config, $context);
            if (!$result) return false;
        }
        return true;
    }

    /**
     * Runs the active post-validation filters in order.
     * @param $uri Reference to the URI object being filtered
     * @return bool False as soon as one filter rejects the URI, else true.
     */
    public function postFilter(&$uri, $config, $context) {
        foreach ($this->postFilters as $name => $f) {
            $result = $f->filter($uri, $config, $context);
            if (!$result) return false;
        }
        return true;
    }

}
6910
* Chainable filters for custom URI processing.
6912
* These filters can perform custom actions on a URI filter object,
6913
* including transformation or blacklisting.
6915
* @warning This filter is called before scheme object validation occurs.
6916
* Make sure, if you require a specific scheme object, you
6917
* check that it exists. This allows filters to convert
6918
* proprietary URI schemes into regular ones.
6920
abstract class HTMLPurifier_URIFilter
{

    /**
     * Unique identifier of filter
     */
    public $name;

    /**
     * True if this filter should be run after scheme validation.
     */
    public $post = false;

    /**
     * Performs initialization for the filter
     * @param $config Instance of HTMLPurifier_Config
     * @return bool This default implementation always reports success.
     */
    public function prepare($config) {
        return true;
    }

    /**
     * Filter a URI object
     * @param $uri Reference to URI object variable
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return bool Whether or not to continue processing: false indicates
     *         URL is no good, true indicates continue processing. Note that
     *         all changes are committed directly on the URI object
     */
    abstract public function filter(&$uri, $config, $context);

}
6956
* Parses a URI into the components and fragment identifier as specified
6959
class HTMLPurifier_URIParser
{

    /**
     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
     */
    protected $percentEncoder;

    public function __construct() {
        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
    }

    /**
     * Parses a URI.
     * @param $uri string URI to parse
     * @return HTMLPurifier_URI representation of URI. This representation has
     *         not been validated yet and may not conform to RFC. False is
     *         returned if the URI could not be matched at all.
     */
    public function parse($uri) {

        $uri = $this->percentEncoder->normalize($uri);

        // Regexp is as per Appendix B.
        // Note that ["<>] are an addition to the RFC's recommended
        // characters, because they represent external delimiters.
        $r_URI = '!'.
            '(([^:/?#"<>]+):)?'. // 2. Scheme
            '(//([^/?#"<>]*))?'. // 4. Authority
            '([^?#"<>]*)'.       // 5. Path
            '(\?([^#"<>]*))?'.   // 7. Query
            '(#([^"<>]*))?'.     // 8. Fragment
            '!';

        $matches = array();
        $result = preg_match($r_URI, $uri, $matches);

        if (!$result) return false; // *really* invalid URI

        // separate out parts; the outer groups tested here include their
        // delimiter, so they are non-empty whenever the component is present
        $scheme     = !empty($matches[1]) ? $matches[2] : null;
        $authority  = !empty($matches[3]) ? $matches[4] : null;
        $path       = $matches[5]; // always present, can be empty
        $query      = !empty($matches[6]) ? $matches[7] : null;
        $fragment   = !empty($matches[8]) ? $matches[9] : null;

        // further parse authority
        if ($authority !== null) {
            $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
            $matches = array();
            preg_match($r_authority, $authority, $matches);
            $userinfo   = !empty($matches[1]) ? $matches[2] : null;
            // The host group carries no delimiter, so it must not be tested
            // with empty(): a host of "0" would be mistaken for no host.
            $host       = isset($matches[3]) ? $matches[3] : '';
            $port       = !empty($matches[4]) ? (int) $matches[5] : null;
        } else {
            $port = $host = $userinfo = null;
        }

        return new HTMLPurifier_URI(
            $scheme, $userinfo, $host, $port, $path, $query, $fragment);
    }

}
7027
* Validator for the components of a URI for a specific scheme
7029
class HTMLPurifier_URIScheme
{

    /**
     * Scheme's default port (integer)
     */
    public $default_port = null;

    /**
     * Whether or not URIs of this scheme are locatable by a browser
     * http and ftp are accessible, while mailto and news are not.
     */
    public $browsable = false;

    /**
     * Whether or not the URI always uses <hier_part>, resolves edge cases
     * with making relative URIs absolute
     */
    public $hierarchical = false;

    /**
     * Validates the components of a URI
     * @note This implementation should be called by children if they define
     *       a default port, as it does port processing.
     * @param $uri Instance of HTMLPurifier_URI
     * @param $config HTMLPurifier_Config object
     * @param $context HTMLPurifier_Context object
     * @return Bool success or failure
     */
    public function validate(&$uri, $config, $context) {
        // A port equal to the scheme's default is redundant, so clear it.
        // The comparison is deliberately loose, as in the original.
        if ($this->default_port == $uri->port) $uri->port = null;
        return true;
    }

}
7070
* Registry for retrieving specific URI scheme validator objects.
7072
class HTMLPurifier_URISchemeRegistry
{

    /**
     * Retrieve sole instance of the registry.
     * @param $prototype Optional prototype to overload sole instance with,
     *                   or bool true to reset to default registry.
     * @note Pass a registry object $prototype with a compatible interface and
     *       the function will store it and return it all further times.
     */
    public static function instance($prototype = null) {
        static $instance = null;
        if ($prototype === true) {
            // Reset request. This must be tested before the generic
            // "prototype supplied" branch, otherwise the boolean itself
            // would be stored as the instance.
            $instance = new HTMLPurifier_URISchemeRegistry();
        } elseif ($prototype !== null) {
            $instance = $prototype;
        } elseif ($instance === null) {
            $instance = new HTMLPurifier_URISchemeRegistry();
        }
        return $instance;
    }

    /**
     * Cache of retrieved schemes.
     */
    protected $schemes = array();

    /**
     * Retrieves a scheme validator object
     * @param $scheme String scheme name like http or mailto
     * @param $config HTMLPurifier_Config object
     * @param $context HTMLPurifier_Context object
     * @return Scheme object, or null if the scheme is not allowed or no
     *         HTMLPurifier_URIScheme_* class exists for it
     */
    public function getScheme($scheme, $config, $context) {
        if (!$config) $config = HTMLPurifier_Config::createDefault();

        // important, otherwise attacker could include arbitrary file
        $allowed_schemes = $config->get('URI.AllowedSchemes');
        if (!$config->get('URI.OverrideAllowedSchemes') &&
            !isset($allowed_schemes[$scheme])
        ) {
            return null;
        }

        // Schemes that were register()ed or previously built are served
        // from the cache.
        if (isset($this->schemes[$scheme])) return $this->schemes[$scheme];
        if (!isset($allowed_schemes[$scheme])) return null;

        $class = 'HTMLPurifier_URIScheme_' . $scheme;
        if (!class_exists($class)) return null;
        $this->schemes[$scheme] = new $class();
        return $this->schemes[$scheme];
    }

    /**
     * Registers a custom scheme to the cache, bypassing reflection.
     * @param $scheme Scheme name
     * @param $scheme_obj HTMLPurifier_URIScheme object
     */
    public function register($scheme, $scheme_obj) {
        $this->schemes[$scheme] = $scheme_obj;
    }

}
7139
* Class for converting between different unit-lengths as specified by
7142
class HTMLPurifier_UnitConverter
7150
* Units information array. Units are grouped into measuring systems
7151
* (English, Metric), and are assigned an integer representing
7152
* the conversion factor between that unit and the smallest unit in
7153
* the system. Numeric indexes are actually magical constants that
7154
* encode conversion data from one system to the next, with a O(n^2)
7155
* constraint on memory (this is generally not a problem, since
7156
* the number of measuring systems is small.)
7158
protected static $units = array(
7159
self::ENGLISH => array(
7160
'px' => 3, // This is as per CSS 2.1 and Firefox. Your mileage may vary
7164
self::METRIC => array('pt', '0.352777778', 'mm'),
7166
self::METRIC => array(
7169
self::ENGLISH => array('mm', '2.83464567', 'pt'),
7174
* Minimum bcmath precision for output.
7176
protected $outputPrecision;
7179
* Bcmath precision for internal calculations.
7181
protected $internalPrecision;
7184
* Whether or not BCMath is available
7188
/**
 * @param $output_precision Stored as the minimum precision for output
 * @param $internal_precision Stored as the precision for internal calculations
 * @param $force_no_bcmath When true, BCMath is not used even if available
 */
public function __construct($output_precision = 4, $internal_precision = 10, $force_no_bcmath = false) {
    $this->outputPrecision = $output_precision;
    $this->internalPrecision = $internal_precision;
    // BCMath is used only if bcmul() exists and the caller did not opt out.
    $this->bcmath = !$force_no_bcmath && function_exists('bcmul');
}
7195
* Converts a length object of one unit into another unit.
7196
* @param HTMLPurifier_Length $length
7197
* Instance of HTMLPurifier_Length to convert. You must validate()
7198
* it before passing it here!
7199
* @param string $to_unit
7200
* Unit to convert to.
7202
* About precision: This conversion function pays very special
7203
* attention to the incoming precision of values and attempts
7204
* to maintain a number of significant figures. Results are
7205
* fairly accurate up to nine digits. Some caveats:
7206
* - If a number is zero-padded as a result of this significant
7207
* figure tracking, the zeroes will be eliminated.
7208
* - If a number contains less than four sigfigs ($outputPrecision)
7209
* and this causes some decimals to be excluded, those
7210
* decimals will be added on.
7212
public function convert($length, $to_unit) {
7214
if (!$length->isValid()) return false;
7216
$n = $length->getN();
7217
$unit = $length->getUnit();
7219
if ($n === '0' || $unit === false) {
7220
return new HTMLPurifier_Length('0', false);
7223
$state = $dest_state = false;
7224
foreach (self::$units as $k => $x) {
7225
if (isset($x[$unit])) $state = $k;
7226
if (isset($x[$to_unit])) $dest_state = $k;
7228
if (!$state || !$dest_state) return false;
7230
// Some calculations about the initial precision of the number;
7231
// this will be useful when we need to do final rounding.
7232
$sigfigs = $this->getSigFigs($n);
7233
if ($sigfigs < $this->outputPrecision) $sigfigs = $this->outputPrecision;
7235
// BCMath's internal precision deals only with decimals. Use
7236
// our default if the initial number has no decimals, or increase
7237
// it by how ever many decimals, thus, the number of guard digits
7238
// will always be greater than or equal to internalPrecision.
7239
$log = (int) floor(log(abs($n), 10));
7240
$cp = ($log < 0) ? $this->internalPrecision - $log : $this->internalPrecision; // internal precision
7242
for ($i = 0; $i < 2; $i++) {
7244
// Determine what unit IN THIS SYSTEM we need to convert to
7245
if ($dest_state === $state) {
7246
// Simple conversion
7247
$dest_unit = $to_unit;
7249
// Convert to the smallest unit, pending a system shift
7250
$dest_unit = self::$units[$state][$dest_state][0];
7253
// Do the conversion if necessary
7254
if ($dest_unit !== $unit) {
7255
$factor = $this->div(self::$units[$state][$unit], self::$units[$state][$dest_unit], $cp);
7256
$n = $this->mul($n, $factor, $cp);
7260
// Output was zero, so bail out early. Shouldn't ever happen.
7267
// It was a simple conversion, so bail out
7268
if ($dest_state === $state) {
7273
// Conversion failed! Apparently, the system we forwarded
7274
// to didn't have this unit. This should never happen!
7278
// Pre-condition: $i == 0
7280
// Perform conversion to next system of units
7281
$n = $this->mul($n, self::$units[$state][$dest_state][1], $cp);
7282
$unit = self::$units[$state][$dest_state][2];
7283
$state = $dest_state;
7285
// One more loop around to convert the unit in the new system.
7289
// Post-condition: $unit == $to_unit
7290
if ($unit !== $to_unit) return false;
7292
// Useful for debugging:
7294
//echo "$n\nsigfigs = $sigfigs\nnew_log = $new_log\nlog = $log\nrp = $rp\n</pre>\n";
7296
$n = $this->round($n, $sigfigs);
7297
if (strpos($n, '.') !== false) $n = rtrim($n, '0');
7298
$n = rtrim($n, '.');
7300
return new HTMLPurifier_Length($n, $unit);
7304
* Returns the number of significant figures in a string number.
7305
* @param string $n Decimal number
7306
* @return int number of sigfigs
7308
/**
 * Returns the number of significant figures in a string number.
 * @param string $n Decimal number
 * @return int number of sigfigs
 */
public function getSigFigs($n) {
    // Leading zeroes and signs never count.
    $n = ltrim($n, '0+-');
    $dp = strpos($n, '.'); // decimal position
    if ($dp === false) {
        // No decimal point: trailing zeroes are not counted.
        return strlen(rtrim($n, '0'));
    }
    $sigfigs = strlen(ltrim($n, '0.')); // eliminate extra decimal character
    // If digits precede the point, the point itself is still in the
    // string and must not be counted.
    if ($dp !== 0) $sigfigs--;
    return $sigfigs;
}
7321
* Adds two numbers, using arbitrary precision when available.
7323
/**
 * Adds two numbers, using arbitrary precision when available.
 * @param $s1 First operand
 * @param $s2 Second operand
 * @param $scale Number of digits after the decimal point in the result
 */
private function add($s1, $s2, $scale) {
    if ($this->bcmath) {
        return bcadd($s1, $s2, $scale);
    }
    // Native arithmetic, formatted by scale().
    return $this->scale($s1 + $s2, $scale);
}
7329
* Multiplies two numbers, using arbitrary precision when available.
7331
/**
 * Multiplies two numbers, using arbitrary precision when available.
 * @param $s1 First operand
 * @param $s2 Second operand
 * @param $scale Number of digits after the decimal point in the result
 */
private function mul($s1, $s2, $scale) {
    if ($this->bcmath) {
        return bcmul($s1, $s2, $scale);
    }
    // Native arithmetic, formatted by scale().
    return $this->scale($s1 * $s2, $scale);
}
7337
* Divides two numbers, using arbitrary precision when available.
7339
/**
 * Divides two numbers, using arbitrary precision when available.
 * @param $s1 Dividend
 * @param $s2 Divisor
 * @param $scale Number of digits after the decimal point in the result
 */
private function div($s1, $s2, $scale) {
    if ($this->bcmath) {
        return bcdiv($s1, $s2, $scale);
    }
    // Native arithmetic, formatted by scale().
    return $this->scale($s1 / $s2, $scale);
}
7345
* Rounds a number according to the number of sigfigs it should have,
7346
* using arbitrary precision when available.
7348
private function round($n, $sigfigs) {
7349
$new_log = (int) floor(log(abs($n), 10)); // Number of digits left of decimal - 1
7350
$rp = $sigfigs - $new_log - 1; // Number of decimal places needed
7351
$neg = $n < 0 ? '-' : ''; // Negative sign
7352
if ($this->bcmath) {
7354
$n = bcadd($n, $neg . '0.' . str_repeat('0', $rp) . '5', $rp + 1);
7355
$n = bcdiv($n, '1', $rp);
7357
// This algorithm partially depends on the standardized
7358
// form of numbers that comes out of bcmath.
7359
$n = bcadd($n, $neg . '5' . str_repeat('0', $new_log - $sigfigs), 0);
7360
$n = substr($n, 0, $sigfigs + strlen($neg)) . str_repeat('0', $new_log - $sigfigs + 1);
7364
return $this->scale(round($n, $sigfigs - $new_log - 1), $rp + 1);
7369
* Scales a float to $scale digits right of decimal point, like BCMath.
7371
private function scale($r, $scale) {
7373
// The f sprintf type doesn't support negative numbers, so we
7374
// need to kludge things manually. First get the string.
7375
$r = sprintf('%.0f', (float) $r);
7376
// Due to floating point precision loss, $r will more than likely
7377
// look something like 4652999999999.9234. We grab one more digit
7378
// than we need to precise from $r and then use that to round
7380
$precise = (string) round(substr($r, 0, strlen($r) + $scale), -1);
7381
// Now we return it, truncating the zero that was rounded off.
7382
return substr($precise, 0, -1) . str_repeat('0', -$scale + 1);
7384
return sprintf('%.' . $scale . 'f', (float) $r);
7394
* Parses string representations into their corresponding native PHP
7395
* variable type. The base implementation does a simple type-check.
7397
class HTMLPurifier_VarParser
7413
* Lookup table of allowed types. Mainly for backwards compatibility, but
7414
* also convenient for transforming string type names to the integer constants.
7416
static public $types = array(
7417
'string' => self::STRING,
7418
'istring' => self::ISTRING,
7419
'text' => self::TEXT,
7420
'itext' => self::ITEXT,
7422
'float' => self::FLOAT,
7423
'bool' => self::BOOL,
7424
'lookup' => self::LOOKUP,
7425
'list' => self::ALIST,
7426
'hash' => self::HASH,
7427
'mixed' => self::MIXED
7431
* Lookup table of types that are string, and can have aliases or
7432
* allowed value lists.
7434
static public $stringTypes = array(
7435
self::STRING => true,
7436
self::ISTRING => true,
7438
self::ITEXT => true,
7442
* Validate a variable according to type. Throws
7443
* HTMLPurifier_VarParserException if invalid.
7444
* It may return NULL as a valid type if $allow_null is true.
7446
* @param $var Variable to validate
7447
* @param $type Type of variable, see HTMLPurifier_VarParser->types
7448
* @param $allow_null Whether or not to permit null as a value
7449
* @return Validated and type-coerced variable
7451
final public function parse($var, $type, $allow_null = false) {
7452
if (is_string($type)) {
7453
if (!isset(HTMLPurifier_VarParser::$types[$type])) {
7454
throw new HTMLPurifier_VarParserException("Invalid type '$type'");
7456
$type = HTMLPurifier_VarParser::$types[$type];
7459
$var = $this->parseImplementation($var, $type, $allow_null);
7460
if ($allow_null && $var === null) return null;
7461
// These are basic checks, to make sure nothing horribly wrong
7462
// happened in our implementations.
7464
case (self::STRING):
7465
case (self::ISTRING):
7468
if (!is_string($var)) break;
7469
if ($type == self::ISTRING || $type == self::ITEXT) $var = strtolower($var);
7472
if (!is_int($var)) break;
7475
if (!is_float($var)) break;
7478
if (!is_bool($var)) break;
7480
case (self::LOOKUP):
7483
if (!is_array($var)) break;
7484
if ($type === self::LOOKUP) {
7485
foreach ($var as $k) if ($k !== true) $this->error('Lookup table contains value other than true');
7486
} elseif ($type === self::ALIST) {
7487
$keys = array_keys($var);
7488
if (array_keys($keys) !== $keys) $this->error('Indices for list are not uniform');
7494
$this->errorInconsistent(get_class($this), $type);
7496
$this->errorGeneric($var, $type);
7500
* Actually implements the parsing. Base implementation is to not
7501
* do anything to $var. Subclasses should overload this!
7503
/**
 * Actually implements the parsing. Base implementation is to not
 * do anything to $var. Subclasses should overload this!
 * @param $var Variable to parse
 * @param $type Type constant; unused here
 * @param $allow_null Unused here
 * @return $var, unchanged
 */
protected function parseImplementation($var, $type, $allow_null) {
    return $var;
}
7508
* Throws an exception.
7510
/**
 * Throws an exception.
 * @param $msg Message for the exception
 * @throws HTMLPurifier_VarParserException always
 */
protected function error($msg) {
    throw new HTMLPurifier_VarParserException($msg);
}
7515
* Throws an inconsistency exception.
7516
* @note This should not ever be called. It would be called if we
7517
* extend the allowed values of HTMLPurifier_VarParser without
7518
* updating subclasses.
7520
/**
 * Throws an inconsistency exception.
 * @param $class Class name to mention in the message
 * @param $type Type constant, rendered through getTypeName()
 * @throws HTMLPurifier_Exception always
 */
protected function errorInconsistent($class, $type) {
    $type_name = HTMLPurifier_VarParser::getTypeName($type);
    throw new HTMLPurifier_Exception("Inconsistency in $class: ".$type_name." not implemented");
}
7525
* Generic error for if a type didn't work.
7527
/**
 * Generic error for if a type didn't work.
 * @param $var Offending value; only its gettype() is reported
 * @param $type Expected type constant, rendered through getTypeName()
 */
protected function errorGeneric($var, $type) {
    $vtype = gettype($var);
    $expected = HTMLPurifier_VarParser::getTypeName($type);
    $this->error("Expected type ".$expected.", got $vtype");
}
7532
/**
 * Maps a type constant back to its string name.
 * @param $type Type constant, a value of HTMLPurifier_VarParser::$types
 * @return string Name of the type, or 'unknown' if it is not in the table
 */
static public function getTypeName($type) {
    static $lookup;
    if (!$lookup) {
        // Lazy load the alternative lookup table
        $lookup = array_flip(HTMLPurifier_VarParser::$types);
    }
    if (!isset($lookup[$type])) return 'unknown';
    return $lookup[$type];
}
7549
* Exception type for HTMLPurifier_VarParser
7551
// Adds no members of its own; it only gives HTMLPurifier_VarParser
// failures a distinct exception class.
class HTMLPurifier_VarParserException extends HTMLPurifier_Exception
{

}
7561
* Validates the HTML attribute style, otherwise known as CSS.
7562
* @note We don't implement the whole CSS specification, so it might be
7563
* difficult to reuse this component in the context of validating
7564
* actual stylesheet declarations.
7565
* @note If we were really serious about validating the CSS, we would
7566
* tokenize the styles and then parse the tokens. Obviously, we
7567
* are not doing that. Doing that could seriously harm performance,
7568
* but would make these components a lot more viable for a CSS
7569
* filtering solution.
7571
class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
7574
public function validate($css, $config, $context) {
7576
$css = $this->parseCDATA($css);
7578
$definition = $config->getCSSDefinition();
7580
// we're going to break the spec and explode by semicolons.
7581
// This is because semicolon rarely appears in escaped form
7582
// Doing this is generally flaky but fast
7583
// IT MIGHT APPEAR IN URIs, see HTMLPurifier_AttrDef_CSSURI
7586
$declarations = explode(';', $css);
7587
$propvalues = array();
7590
* Name of the current CSS property being validated.
7593
$context->register('CurrentCSSProperty', $property);
7595
foreach ($declarations as $declaration) {
7596
if (!$declaration) continue;
7597
if (!strpos($declaration, ':')) continue;
7598
list($property, $value) = explode(':', $declaration, 2);
7599
$property = trim($property);
7600
$value = trim($value);
7603
if (isset($definition->info[$property])) {
7607
if (ctype_lower($property)) break;
7608
$property = strtolower($property);
7609
if (isset($definition->info[$property])) {
7615
// inefficient call, since the validator will do this again
7616
if (strtolower(trim($value)) !== 'inherit') {
7617
// inherit works for everything (but only on the base property)
7618
$result = $definition->info[$property]->validate(
7619
$value, $config, $context );
7621
$result = 'inherit';
7623
if ($result === false) continue;
7624
$propvalues[$property] = $result;
7627
$context->destroy('CurrentCSSProperty');
7629
// procedure does not write the new CSS simultaneously, so it's
7630
// slightly inefficient, but it's the only way of getting rid of
7631
// duplicates. Perhaps config to optimize it, but not now.
7633
$new_declarations = '';
7634
foreach ($propvalues as $prop => $value) {
7635
$new_declarations .= "$prop:$value;";
7638
return $new_declarations ? $new_declarations : false;
7648
// Enum = Enumerated
7650
* Validates a keyword against a list of valid values.
7651
* @warning The case-insensitive compare of this function uses PHP's
7652
* built-in strtolower and ctype_lower functions, which may
7653
* cause problems with international comparisons
7655
class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
{

    /**
     * Lookup table of valid values.
     * @todo Make protected
     */
    public $valid_values   = array();

    /**
     * Bool indicating whether or not enumeration is case sensitive.
     * @note In general this is always case insensitive.
     */
    protected $case_sensitive = false; // values according to W3C spec

    /**
     * @param $valid_values List of valid values
     * @param $case_sensitive Bool indicating whether or not case sensitive
     */
    public function __construct(
        $valid_values = array(), $case_sensitive = false
    ) {
        // Flipped so that membership can be tested with isset().
        $this->valid_values = array_flip($valid_values);
        $this->case_sensitive = $case_sensitive;
    }

    /**
     * @param $string Candidate keyword
     * @return The trimmed (and, if case insensitive, lowercased) keyword
     *         when it is one of the valid values, otherwise false
     */
    public function validate($string, $config, $context) {
        $string = trim($string);
        if (!$this->case_sensitive) {
            // we may want to do full case-insensitive libraries
            $string = ctype_lower($string) ? $string : strtolower($string);
        }
        $result = isset($this->valid_values[$string]);

        return $result ? $string : false;
    }

    /**
     * @param $string In form of comma-delimited list of case-insensitive
     *      valid values. Example: "foo,bar,baz". Prepend "s:" to make
     *      the list case-sensitive.
     */
    public function make($string) {
        if (strlen($string) > 2 && $string[0] == 's' && $string[1] == ':') {
            $string = substr($string, 2);
            $sensitive = true;
        } else {
            $sensitive = false;
        }
        $values = explode(',', $string);
        return new HTMLPurifier_AttrDef_Enum($values, $sensitive);
    }

}
7715
* Validates an integer.
7716
* @note While this class was modeled off the CSS definition, no currently
7717
* allowed CSS uses this type. The properties that do are: widows,
7718
* orphans, z-index, counter-increment, counter-reset. Some of the
7719
* HTML attributes, however, find use for a non-negative version of this.
7721
class HTMLPurifier_AttrDef_Integer extends HTMLPurifier_AttrDef
{

    /**
     * Bool indicating whether or not negative values are allowed
     */
    protected $negative = true;

    /**
     * Bool indicating whether or not zero is allowed
     */
    protected $zero = true;

    /**
     * Bool indicating whether or not positive values are allowed
     */
    protected $positive = true;

    /**
     * @param $negative Bool indicating whether or not negative values are allowed
     * @param $zero Bool indicating whether or not zero is allowed
     * @param $positive Bool indicating whether or not positive values are allowed
     */
    public function __construct(
        $negative = true, $zero = true, $positive = true
    ) {
        $this->negative = $negative;
        $this->zero     = $zero;
        $this->positive = $positive;
    }

    /**
     * @param $integer String to validate
     * @return The integer as a string (a leading plus removed, "-0"
     *         rewritten as "0"), or false if it is not acceptable
     */
    public function validate($integer, $config, $context) {

        $integer = $this->parseCDATA($integer);
        if ($integer === '') return false;

        // we could possibly simply typecast it to integer, but there are
        // certain fringe cases that must not return an integer.

        // clip leading sign
        if ( $this->negative && $integer[0] === '-' ) {
            $digits = substr($integer, 1);
            if ($digits === '0') $integer = '0'; // rm minus sign for zero
        } elseif( $this->positive && $integer[0] === '+' ) {
            $digits = $integer = substr($integer, 1); // rm unnecessary plus
        } else {
            $digits = $integer;
        }

        // test if it's numeric
        if (!ctype_digit($digits)) return false;

        // perform scope tests
        if (!$this->zero     && $integer == 0) return false;
        if (!$this->positive && $integer > 0) return false;
        if (!$this->negative && $integer < 0) return false;

        return $integer;

    }

}
7789
* Validates the HTML attribute lang, effectively a language code.
7790
* @note Built according to RFC 3066, which obsoleted RFC 1766
7792
class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
7795
public function validate($string, $config, $context) {
7797
$string = trim($string);
7798
if (!$string) return false;
7800
$subtags = explode('-', $string);
7801
$num_subtags = count($subtags);
7803
if ($num_subtags == 0) return false; // sanity check
7805
// process primary subtag : $subtags[0]
7806
$length = strlen($subtags[0]);
7811
if (! ($subtags[0] == 'x' || $subtags[0] == 'i') ) {
7817
if (! ctype_alpha($subtags[0]) ) {
7819
} elseif (! ctype_lower($subtags[0]) ) {
7820
$subtags[0] = strtolower($subtags[0]);
7827
$new_string = $subtags[0];
7828
if ($num_subtags == 1) return $new_string;
7830
// process second subtag : $subtags[1]
7831
$length = strlen($subtags[1]);
7832
if ($length == 0 || ($length == 1 && $subtags[1] != 'x') || $length > 8 || !ctype_alnum($subtags[1])) {
7835
if (!ctype_lower($subtags[1])) $subtags[1] = strtolower($subtags[1]);
7837
$new_string .= '-' . $subtags[1];
7838
if ($num_subtags == 2) return $new_string;
7840
// process all other subtags, index 2 and up
7841
for ($i = 2; $i < $num_subtags; $i++) {
7842
$length = strlen($subtags[$i]);
7843
if ($length == 0 || $length > 8 || !ctype_alnum($subtags[$i])) {
7846
if (!ctype_lower($subtags[$i])) {
7847
$subtags[$i] = strtolower($subtags[$i]);
7849
$new_string .= '-' . $subtags[$i];
7863
* Decorator that, depending on a token, switches between two definitions.
7865
class HTMLPurifier_AttrDef_Switch
{

    protected $tag;
    protected $withTag, $withoutTag;

    /**
     * @param string $tag Tag name to switch upon
     * @param HTMLPurifier_AttrDef $with_tag Call if token matches tag
     * @param HTMLPurifier_AttrDef $without_tag Call if token doesn't match, or there is no token
     */
    public function __construct($tag, $with_tag, $without_tag) {
        $this->tag = $tag;
        $this->withTag = $with_tag;
        $this->withoutTag = $without_tag;
    }

    /**
     * Delegates to one of the two wrapped definitions, chosen by the name
     * of the 'CurrentToken' entry in the context.
     * @return Whatever the selected definition's validate() returns
     */
    public function validate($string, $config, $context) {
        $token = $context->get('CurrentToken', true);
        $matches = $token && $token->name === $this->tag;
        $def = $matches ? $this->withTag : $this->withoutTag;
        return $def->validate($string, $config, $context);
    }

}
7898
* Validates arbitrary text according to the HTML spec.
7900
class HTMLPurifier_AttrDef_Text extends HTMLPurifier_AttrDef
{

    /**
     * @param $string Text to validate
     * @return The result of the inherited parseCDATA() on $string
     */
    public function validate($string, $config, $context) {
        return $this->parseCDATA($string);
    }

}
7914
* Validates a URI as defined by RFC 3986.
7915
* @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
7917
class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
7921
protected $embedsResource;
7924
* @param $embeds_resource Does the URI here result in an extra HTTP request?
7926
/**
 * @param $embeds_resource Does the URI here result in an extra HTTP request?
 */
public function __construct($embeds_resource = false) {
    $this->parser = new HTMLPurifier_URIParser();
    // Cast so later checks always see a real boolean.
    $this->embedsResource = (bool) $embeds_resource;
}
7931
/**
 * Builds a new URI definition from a string argument.
 * @param $string Cast to bool and used as the embeds-resource flag
 * @return HTMLPurifier_AttrDef_URI
 */
public function make($string) {
    return new HTMLPurifier_AttrDef_URI((bool) $string);
}
7936
public function validate($uri, $config, $context) {
7938
if ($config->get('URI.Disable')) return false;
7940
$uri = $this->parseCDATA($uri);
7943
$uri = $this->parser->parse($uri);
7944
if ($uri === false) return false;
7946
// add embedded flag to context for validators
7947
$context->register('EmbeddedURI', $this->embedsResource);
7952
// generic validation
7953
$result = $uri->validate($config, $context);
7954
if (!$result) break;
7956
// chained filtering
7957
$uri_def = $config->getDefinition('URI');
7958
$result = $uri_def->filter($uri, $config, $context);
7959
if (!$result) break;
7961
// scheme-specific validation
7962
$scheme_obj = $uri->getSchemeObj($config, $context);
7963
if (!$scheme_obj) break;
7964
if ($this->embedsResource && !$scheme_obj->browsable) break;
7965
$result = $scheme_obj->validate($uri, $config, $context);
7966
if (!$result) break;
7968
// Post chained filtering
7969
$result = $uri_def->postFilter($uri, $config, $context);
7970
if (!$result) break;
7972
// survived gauntlet
7977
$context->destroy('EmbeddedURI');
7978
if (!$ok) return false;
7981
return $uri->toString();
7992
* Validates a number as defined by the CSS spec.
7994
class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
7998
* Bool indicating whether or not only positive values allowed.
8000
protected $non_negative = false;
8003
* @param $non_negative Bool indicating whether negatives are forbidden
8005
/**
 * @param $non_negative Bool indicating whether negatives are forbidden
 */
public function __construct($non_negative = false) {
    $this->non_negative = $non_negative;
}
8010
* @warning Some contexts do not pass $config, $context. These
8011
* variables should not be used without checking HTMLPurifier_Length
8013
public function validate($number, $config, $context) {
8015
$number = $this->parseCDATA($number);
8017
if ($number === '') return false;
8018
if ($number === '0') return '0';
8021
switch ($number[0]) {
8023
if ($this->non_negative) return false;
8026
$number = substr($number, 1);
8029
if (ctype_digit($number)) {
8030
$number = ltrim($number, '0');
8031
return $number ? $sign . $number : '0';
8034
// Period is the only non-numeric character allowed
8035
if (strpos($number, '.') === false) return false;
8037
list($left, $right) = explode('.', $number, 2);
8039
if ($left === '' && $right === '') return false;
8040
if ($left !== '' && !ctype_digit($left)) return false;
8042
$left = ltrim($left, '0');
8043
$right = rtrim($right, '0');
8045
if ($right === '') {
8046
return $left ? $sign . $left : '0';
8047
} elseif (!ctype_digit($right)) {
8051
return $sign . $left . '.' . $right;
8061
class HTMLPurifier_AttrDef_CSS_AlphaValue extends HTMLPurifier_AttrDef_CSS_Number
{

    public function __construct() {
        parent::__construct(false); // opacity is non-negative, but we will clamp it
    }

    /**
     * Validates the value as a CSS number, then clamps it to [0, 1].
     * @param $number String to validate
     * @return The parent's result, replaced by '0' if below 0 or '1' if
     *         above 1; false if the parent rejects the input
     */
    public function validate($number, $config, $context) {
        $result = parent::validate($number, $config, $context);
        if ($result === false) return $result;
        $float = (float) $result;
        if ($float < 0.0) $result = '0';
        if ($float > 1.0) $result = '1';
        return $result;
    }

}
8084
* Validates shorthand CSS property background.
8085
* @warning Does not support url tokens that have internal spaces.
8087
class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
8091
* Local copy of component validators.
8092
* @note See HTMLPurifier_AttrDef_Font::$info for a similar impl.
8096
/**
 * Copies the validators for the individual background-* properties
 * out of the CSS definition.
 * @param $config Object providing getCSSDefinition()
 */
public function __construct($config) {
    $def = $config->getCSSDefinition();
    $properties = array(
        'background-color',
        'background-image',
        'background-repeat',
        'background-attachment',
        'background-position',
    );
    foreach ($properties as $property) {
        $this->info[$property] = $def->info[$property];
    }
}
8105
public function validate($string, $config, $context) {
8107
// regular pre-processing
8108
$string = $this->parseCDATA($string);
8109
if ($string === '') return false;
8111
// munge rgb() decl if necessary
8112
$string = $this->mungeRgb($string);
8114
// assumes URI doesn't have spaces in it
8115
$bits = explode(' ', strtolower($string)); // bits to process
8118
$caught['color'] = false;
8119
$caught['image'] = false;
8120
$caught['repeat'] = false;
8121
$caught['attachment'] = false;
8122
$caught['position'] = false;
8124
$i = 0; // number of catches
8127
foreach ($bits as $bit) {
8128
if ($bit === '') continue;
8129
foreach ($caught as $key => $status) {
8130
if ($key != 'position') {
8131
if ($status !== false) continue;
8132
$r = $this->info['background-' . $key]->validate($bit, $config, $context);
8136
if ($r === false) continue;
8137
if ($key == 'position') {
8138
if ($caught[$key] === false) $caught[$key] = '';
8139
$caught[$key] .= $r . ' ';
8148
if (!$i) return false;
8149
if ($caught['position'] !== false) {
8150
$caught['position'] = $this->info['background-position']->
8151
validate($caught['position'], $config, $context);
8155
foreach ($caught as $value) {
8156
if ($value === false) continue;
8160
if (empty($ret)) return false;
8161
return implode(' ', $ret);
8172
[ // adjective and number must be in correct order, even if
8173
// you could switch them without introducing ambiguity.
8174
// some browsers support that syntax
8176
<percentage> | <length> | left | center | right
8179
<percentage> | <length> | top | center | bottom
8182
[ // this signifies that the vertical and horizontal adjectives
8183
// can be arbitrarily ordered, however, there can only be two,
8184
// one of each, or none at all
8186
left | center | right
8189
top | center | bottom
8193
center, (none) = 50%
8194
bottom, right = 100%
8198
keyword + length/percentage must be ordered correctly, as per W3C
8200
Internet Explorer and Opera, however, support arbitrary ordering. We
8203
Minor issue though, not strictly necessary.
8206
// control freaks may appreciate the ability to convert these to
8207
// percentages or something, but it's not necessary
8210
* Validates the value of background-position.
8212
class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef
8216
protected $percentage;
8218
/**
 * Creates the length and percentage validators that validate()
 * applies to each non-keyword bit.
 */
public function __construct() {
    $this->length     = new HTMLPurifier_AttrDef_CSS_Length();
    $this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage();
}
8223
public function validate($string, $config, $context) {
8224
$string = $this->parseCDATA($string);
8225
$bits = explode(' ', $string);
8227
$keywords = array();
8228
$keywords['h'] = false; // left, right
8229
$keywords['v'] = false; // top, bottom
8230
$keywords['c'] = false; // center
8231
$measures = array();
8243
foreach ($bits as $bit) {
8244
if ($bit === '') continue;
8247
$lbit = ctype_lower($bit) ? $bit : strtolower($bit);
8248
if (isset($lookup[$lbit])) {
8249
$status = $lookup[$lbit];
8250
$keywords[$status] = $lbit;
8255
$r = $this->length->validate($bit, $config, $context);
8261
// test for percentage
8262
$r = $this->percentage->validate($bit, $config, $context);
8270
if (!$i) return false; // no valid values were caught
8276
if ($keywords['h']) $ret[] = $keywords['h'];
8277
elseif (count($measures)) $ret[] = array_shift($measures);
8278
elseif ($keywords['c']) {
8279
$ret[] = $keywords['c'];
8280
$keywords['c'] = false; // prevent re-use: center = center center
8283
if ($keywords['v']) $ret[] = $keywords['v'];
8284
elseif (count($measures)) $ret[] = array_shift($measures);
8285
elseif ($keywords['c']) $ret[] = $keywords['c'];
8287
if (empty($ret)) return false;
8288
return implode(' ', $ret);
8299
* Validates the border property as defined by CSS.
8301
class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
8305
* Local copy of properties this property is shorthand for.
8307
protected $info = array();
8309
public function __construct($config) {
8310
$def = $config->getCSSDefinition();
8311
$this->info['border-width'] = $def->info['border-width'];
8312
$this->info['border-style'] = $def->info['border-style'];
8313
$this->info['border-top-color'] = $def->info['border-top-color'];
8316
public function validate($string, $config, $context) {
8317
$string = $this->parseCDATA($string);
8318
$string = $this->mungeRgb($string);
8319
$bits = explode(' ', $string);
8320
$done = array(); // segments we've finished
8321
$ret = ''; // return value
8322
foreach ($bits as $bit) {
8323
foreach ($this->info as $propname => $validator) {
8324
if (isset($done[$propname])) continue;
8325
$r = $validator->validate($bit, $config, $context);
8328
$done[$propname] = true;
8343
* Validates Color as defined by CSS.
8345
class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
8348
public function validate($color, $config, $context) {
8350
static $colors = null;
8351
if ($colors === null) $colors = $config->get('Core.ColorKeywords');
8353
$color = trim($color);
8354
if ($color === '') return false;
8356
$lower = strtolower($color);
8357
if (isset($colors[$lower])) return $colors[$lower];
8359
if (strpos($color, 'rgb(') !== false) {
8360
// rgb literal handling
8361
$length = strlen($color);
8362
if (strpos($color, ')') !== $length - 1) return false;
8363
$triad = substr($color, 4, $length - 4 - 1);
8364
$parts = explode(',', $triad);
8365
if (count($parts) !== 3) return false;
8366
$type = false; // to ensure that they're all the same type
8367
$new_parts = array();
8368
foreach ($parts as $part) {
8369
$part = trim($part);
8370
if ($part === '') return false;
8371
$length = strlen($part);
8372
if ($part[$length - 1] === '%') {
8375
$type = 'percentage';
8376
} elseif ($type !== 'percentage') {
8379
$num = (float) substr($part, 0, $length - 1);
8380
if ($num < 0) $num = 0;
8381
if ($num > 100) $num = 100;
8382
$new_parts[] = "$num%";
8387
} elseif ($type !== 'integer') {
8391
if ($num < 0) $num = 0;
8392
if ($num > 255) $num = 255;
8393
$new_parts[] = (string) $num;
8396
$new_triad = implode(',', $new_parts);
8397
$color = "rgb($new_triad)";
8399
// hexadecimal handling
8400
if ($color[0] === '#') {
8401
$hex = substr($color, 1);
8404
$color = '#' . $color;
8406
$length = strlen($hex);
8407
if ($length !== 3 && $length !== 6) return false;
8408
if (!ctype_xdigit($hex)) return false;
8422
* Allows multiple validators to attempt to validate attribute.
8424
* Composite is just what it sounds like: a composite of many validators.
8425
* This means that multiple HTMLPurifier_AttrDef objects will have a whack
8426
* at the string. If one of them passes, that's what is returned. This is
8427
* especially useful for CSS values, which often are a choice between
8428
* an enumerated set of predefined values or a flexible data type.
8430
class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef
8434
* List of HTMLPurifier_AttrDef objects that may process strings
8435
* @todo Make protected
8440
* @param $defs List of HTMLPurifier_AttrDef objects
8442
public function __construct($defs) {
8443
$this->defs = $defs;
8446
public function validate($string, $config, $context) {
8447
foreach ($this->defs as $i => $def) {
8448
$result = $this->defs[$i]->validate($string, $config, $context);
8449
if ($result !== false) return $result;
8461
* Decorator which enables CSS properties to be disabled for specific elements.
8463
class HTMLPurifier_AttrDef_CSS_DenyElementDecorator extends HTMLPurifier_AttrDef
8465
public $def, $element;
8468
* @param $def Definition to wrap
8469
* @param $element Element to deny
8471
public function __construct($def, $element) {
8473
$this->element = $element;
8476
* Checks if CurrentToken is set and equal to $this->element
8478
public function validate($string, $config, $context) {
8479
$token = $context->get('CurrentToken', true);
8480
if ($token && $token->name == $this->element) return false;
8481
return $this->def->validate($string, $config, $context);
8490
* Microsoft's proprietary filter: CSS property
8491
* @note Currently supports the alpha filter. In the future, this will
8492
* probably need an extensible framework
8494
class HTMLPurifier_AttrDef_CSS_Filter extends HTMLPurifier_AttrDef
8497
protected $intValidator;
8499
public function __construct() {
8500
$this->intValidator = new HTMLPurifier_AttrDef_Integer();
8503
public function validate($value, $config, $context) {
8504
$value = $this->parseCDATA($value);
8505
if ($value === 'none') return $value;
8506
// if we looped this we could support multiple filters
8507
$function_length = strcspn($value, '(');
8508
$function = trim(substr($value, 0, $function_length));
8509
if ($function !== 'alpha' &&
8510
$function !== 'Alpha' &&
8511
$function !== 'progid:DXImageTransform.Microsoft.Alpha'
8513
$cursor = $function_length + 1;
8514
$parameters_length = strcspn($value, ')', $cursor);
8515
$parameters = substr($value, $cursor, $parameters_length);
8516
$params = explode(',', $parameters);
8517
$ret_params = array();
8519
foreach ($params as $param) {
8520
list($key, $value) = explode('=', $param);
8522
$value = trim($value);
8523
if (isset($lookup[$key])) continue;
8524
if ($key !== 'opacity') continue;
8525
$value = $this->intValidator->validate($value, $config, $context);
8526
if ($value === false) continue;
8527
$int = (int) $value;
8528
if ($int > 100) $value = '100';
8529
if ($int < 0) $value = '0';
8530
$ret_params[] = "$key=$value";
8531
$lookup[$key] = true;
8533
$ret_parameters = implode(',', $ret_params);
8534
$ret_function = "$function($ret_parameters)";
8535
return $ret_function;
8545
* Validates shorthand CSS property font.
8547
class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef
8551
* Local copy of component validators.
8553
* @note If we moved specific CSS property definitions to their own
8554
* classes instead of having them be assembled at run time by
8555
* CSSDefinition, this wouldn't be necessary. We'd instantiate
8558
protected $info = array();
8560
public function __construct($config) {
8561
$def = $config->getCSSDefinition();
8562
$this->info['font-style'] = $def->info['font-style'];
8563
$this->info['font-variant'] = $def->info['font-variant'];
8564
$this->info['font-weight'] = $def->info['font-weight'];
8565
$this->info['font-size'] = $def->info['font-size'];
8566
$this->info['line-height'] = $def->info['line-height'];
8567
$this->info['font-family'] = $def->info['font-family'];
8570
public function validate($string, $config, $context) {
8572
static $system_fonts = array(
8576
'message-box' => true,
8577
'small-caption' => true,
8578
'status-bar' => true
8581
// regular pre-processing
8582
$string = $this->parseCDATA($string);
8583
if ($string === '') return false;
8585
// check if it's one of the keywords
8586
$lowercase_string = strtolower($string);
8587
if (isset($system_fonts[$lowercase_string])) {
8588
return $lowercase_string;
8591
$bits = explode(' ', $string); // bits to process
8592
$stage = 0; // this indicates what we're looking for
8593
$caught = array(); // which stage 0 properties have we caught?
8594
$stage_1 = array('font-style', 'font-variant', 'font-weight');
8595
$final = ''; // output
8597
for ($i = 0, $size = count($bits); $i < $size; $i++) {
8598
if ($bits[$i] === '') continue;
8601
// attempting to catch font-style, font-variant or font-weight
8603
foreach ($stage_1 as $validator_name) {
8604
if (isset($caught[$validator_name])) continue;
8605
$r = $this->info[$validator_name]->validate(
8606
$bits[$i], $config, $context);
8609
$caught[$validator_name] = true;
8613
// all three caught, continue on
8614
if (count($caught) >= 3) $stage = 1;
8615
if ($r !== false) break;
8617
// attempting to catch font-size and perhaps line-height
8619
$found_slash = false;
8620
if (strpos($bits[$i], '/') !== false) {
8621
list($font_size, $line_height) =
8622
explode('/', $bits[$i]);
8623
if ($line_height === '') {
8624
// ooh, there's a space after the slash!
8625
$line_height = false;
8626
$found_slash = true;
8629
$font_size = $bits[$i];
8630
$line_height = false;
8632
$r = $this->info['font-size']->validate(
8633
$font_size, $config, $context);
8636
// attempt to catch line-height
8637
if ($line_height === false) {
8638
// we need to scroll forward
8639
for ($j = $i + 1; $j < $size; $j++) {
8640
if ($bits[$j] === '') continue;
8641
if ($bits[$j] === '/') {
8645
$found_slash = true;
8649
$line_height = $bits[$j];
8653
// slash already found
8654
$found_slash = true;
8659
$r = $this->info['line-height']->validate(
8660
$line_height, $config, $context);
8671
// attempting to catch font-family
8674
implode(' ', array_slice($bits, $i, $size - $i));
8675
$r = $this->info['font-family']->validate(
8676
$font_family, $config, $context);
8679
// processing completed successfully
8680
return rtrim($final);
8695
* Validates a font family list according to CSS spec
8696
* @todo whitelisting allowed fonts would be nice
8698
class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
8701
public function validate($string, $config, $context) {
8702
static $generic_names = array(
8704
'sans-serif' => true,
8705
'monospace' => true,
8710
// assume that no font names contain commas in them
8711
$fonts = explode(',', $string);
8713
foreach($fonts as $font) {
8714
$font = trim($font);
8715
if ($font === '') continue;
8716
// match a generic name
8717
if (isset($generic_names[$font])) {
8718
$final .= $font . ', ';
8721
// match a quoted name
8722
if ($font[0] === '"' || $font[0] === "'") {
8723
$length = strlen($font);
8724
if ($length <= 2) continue;
8726
if ($font[$length - 1] !== $quote) continue;
8727
$font = substr($font, 1, $length - 2);
8730
for ($i = 0, $c = strlen($font); $i < $c; $i++) {
8731
if ($font[$i] === '\\') {
8737
if (ctype_xdigit($font[$i])) {
8739
for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
8740
if (!ctype_xdigit($font[$i])) break;
8743
// We have to be extremely careful when adding
8744
// new characters, to make sure we're not breaking
8746
$char = HTMLPurifier_Encoder::unichr(hexdec($code));
8747
if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
8749
if ($i < $c && trim($font[$i]) !== '') $i--;
8752
if ($font[$i] === "\n") continue;
8754
$new_font .= $font[$i];
8759
// $font is a pure representation of the font name
8761
if (ctype_alnum($font) && $font !== '') {
8762
// very simple font, allow it in unharmed
8763
$final .= $font . ', ';
8767
// complicated font, requires quoting
8769
// armor backslashes and single quotes
8770
$font = str_replace("\\", "\\\\", $font);
8771
$font = str_replace("'", "\\'", $font);
8772
$final .= "'$font', ";
8774
$final = rtrim($final, ', ');
8775
if ($final === '') return false;
8786
* Decorator which enables !important to be used in CSS values.
8788
class HTMLPurifier_AttrDef_CSS_ImportantDecorator extends HTMLPurifier_AttrDef
8790
public $def, $allow;
8793
* @param $def Definition to wrap
8794
* @param $allow Whether or not to allow !important
8796
public function __construct($def, $allow = false) {
8798
$this->allow = $allow;
8801
* Intercepts and removes !important if necessary
8803
public function validate($string, $config, $context) {
8804
// test for ! and important tokens
8805
$string = trim($string);
8806
$is_important = false;
8807
// :TODO: optimization: test directly for !important and ! important
8808
if (strlen($string) >= 9 && substr($string, -9) === 'important') {
8809
$temp = rtrim(substr($string, 0, -9));
8810
// use a temp, because we might want to restore important
8811
if (strlen($temp) >= 1 && substr($temp, -1) === '!') {
8812
$string = rtrim(substr($temp, 0, -1));
8813
$is_important = true;
8816
$string = $this->def->validate($string, $config, $context);
8817
if ($this->allow && $is_important) $string .= ' !important';
8827
* Represents a Length as defined by CSS.
8829
class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
8832
protected $min, $max;
8835
* @param HTMLPurifier_Length $min Minimum length, or null for no bound. String is also acceptable.
8836
* @param HTMLPurifier_Length $max Maximum length, or null for no bound. String is also acceptable.
8838
public function __construct($min = null, $max = null) {
8839
$this->min = $min !== null ? HTMLPurifier_Length::make($min) : null;
8840
$this->max = $max !== null ? HTMLPurifier_Length::make($max) : null;
8843
public function validate($string, $config, $context) {
8844
$string = $this->parseCDATA($string);
8847
if ($string === '') return false;
8848
if ($string === '0') return '0';
8849
if (strlen($string) === 1) return false;
8851
$length = HTMLPurifier_Length::make($string);
8852
if (!$length->isValid()) return false;
8855
$c = $length->compareTo($this->min);
8856
if ($c === false) return false;
8857
if ($c < 0) return false;
8860
$c = $length->compareTo($this->max);
8861
if ($c === false) return false;
8862
if ($c > 0) return false;
8865
return $length->toString();
8875
* Validates shorthand CSS property list-style.
8876
* @warning Does not support url tokens that have internal spaces.
8878
class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef
8882
* Local copy of component validators.
8883
* @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
8887
public function __construct($config) {
8888
$def = $config->getCSSDefinition();
8889
$this->info['list-style-type'] = $def->info['list-style-type'];
8890
$this->info['list-style-position'] = $def->info['list-style-position'];
8891
$this->info['list-style-image'] = $def->info['list-style-image'];
8894
public function validate($string, $config, $context) {
8896
// regular pre-processing
8897
$string = $this->parseCDATA($string);
8898
if ($string === '') return false;
8900
// assumes URI doesn't have spaces in it
8901
$bits = explode(' ', strtolower($string)); // bits to process
8904
$caught['type'] = false;
8905
$caught['position'] = false;
8906
$caught['image'] = false;
8908
$i = 0; // number of catches
8911
foreach ($bits as $bit) {
8912
if ($i >= 3) return; // optimization bit
8913
if ($bit === '') continue;
8914
foreach ($caught as $key => $status) {
8915
if ($status !== false) continue;
8916
$r = $this->info['list-style-' . $key]->validate($bit, $config, $context);
8917
if ($r === false) continue;
8918
if ($r === 'none') {
8919
if ($none) continue;
8921
if ($key == 'image') continue;
8929
if (!$i) return false;
8934
if ($caught['type']) $ret[] = $caught['type'];
8937
if ($caught['image']) $ret[] = $caught['image'];
8939
// construct position
8940
if ($caught['position']) $ret[] = $caught['position'];
8942
if (empty($ret)) return false;
8943
return implode(' ', $ret);
8954
* Framework class for strings that involve multiple values.
8956
* Certain CSS properties such as border-width and margin allow multiple
8957
* lengths to be specified. This class can take a vanilla border-width
8958
* definition and multiply it, usually into a max of four.
8960
* @note Even though the CSS specification isn't clear about it, inherit
8961
* can only be used alone: it will never manifest as part of a multi
8962
* shorthand declaration. Thus, this class does not allow inherit.
8964
class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
8968
* Instance of component definition to defer validation to.
8969
* @todo Make protected
8974
* Max number of values allowed.
8975
* @todo Make protected
8980
* @param $single HTMLPurifier_AttrDef to multiply
8981
* @param $max Max number of values allowed (usually four)
8983
public function __construct($single, $max = 4) {
8984
$this->single = $single;
8988
public function validate($string, $config, $context) {
8989
$string = $this->parseCDATA($string);
8990
if ($string === '') return false;
8991
$parts = explode(' ', $string); // parseCDATA replaced \r, \t and \n
8992
$length = count($parts);
8994
for ($i = 0, $num = 0; $i < $length && $num < $this->max; $i++) {
8995
if (ctype_space($parts[$i])) continue;
8996
$result = $this->single->validate($parts[$i], $config, $context);
8997
if ($result !== false) {
8998
$final .= $result . ' ';
9002
if ($final === '') return false;
9003
return rtrim($final);
9013
* Validates a Percentage as defined by the CSS spec.
9015
class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
9019
* Instance of HTMLPurifier_AttrDef_CSS_Number to defer number validation
9021
protected $number_def;
9024
* @param $non_negative Bool indicating whether to forbid negative values
9026
public function __construct($non_negative = false) {
9027
$this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
9030
public function validate($string, $config, $context) {
9032
$string = $this->parseCDATA($string);
9034
if ($string === '') return false;
9035
$length = strlen($string);
9036
if ($length === 1) return false;
9037
if ($string[$length - 1] !== '%') return false;
9039
$number = substr($string, 0, $length - 1);
9040
$number = $this->number_def->validate($number, $config, $context);
9042
if ($number === false) return false;
9054
* Validates the value for the CSS property text-decoration
9055
* @note This class could be generalized into a version that acts sort of
9056
* like Enum except you can compound the allowed values.
9058
class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
9061
public function validate($string, $config, $context) {
9063
static $allowed_values = array(
9064
'line-through' => true,
9066
'underline' => true,
9069
$string = strtolower($this->parseCDATA($string));
9071
if ($string === 'none') return $string;
9073
$parts = explode(' ', $string);
9075
foreach ($parts as $part) {
9076
if (isset($allowed_values[$part])) {
9077
$final .= $part . ' ';
9080
$final = rtrim($final);
9081
if ($final === '') return false;
9093
* Validates a URI in CSS syntax, which uses url('http://example.com')
9094
* @note While theoretically speaking a URI in a CSS document could
9095
* be non-embedded, as of CSS2 there is no such usage so we're
9096
* generalizing it. This may need to be changed in the future.
9097
* @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
9098
* the separator, you cannot put a literal semicolon in
9099
* the URI. Try percent-encoding it in that case.
9101
class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
9104
public function __construct() {
9105
parent::__construct(true); // always embedded
9108
public function validate($uri_string, $config, $context) {
9109
// parse the URI out of the string and then pass it onto
9110
// the parent object
9112
$uri_string = $this->parseCDATA($uri_string);
9113
if (strpos($uri_string, 'url(') !== 0) return false;
9114
$uri_string = substr($uri_string, 4);
9115
$new_length = strlen($uri_string) - 1;
9116
if ($uri_string[$new_length] != ')') return false;
9117
$uri = trim(substr($uri_string, 0, $new_length));
9119
if (!empty($uri) && ($uri[0] == "'" || $uri[0] == '"')) {
9121
$new_length = strlen($uri) - 1;
9122
if ($uri[$new_length] !== $quote) return false;
9123
$uri = substr($uri, 1, $new_length - 1);
9126
$keys = array( '(', ')', ',', ' ', '"', "'");
9127
$values = array('\\(', '\\)', '\\,', '\\ ', '\\"', "\\'");
9128
$uri = str_replace($values, $keys, $uri);
9130
$result = parent::validate($uri, $config, $context);
9132
if ($result === false) return false;
9134
// escape necessary characters according to CSS spec
9135
// except for the comma, none of these should appear in the
9137
$result = str_replace($keys, $values, $result);
9139
return "url($result)";
9150
* Validates a boolean attribute
9152
class HTMLPurifier_AttrDef_HTML_Bool extends HTMLPurifier_AttrDef
9156
public $minimized = true;
9158
/**
 * @param $name Name of the attribute; stored on the instance as $this->name.
 *              Defaults to false when no name is supplied.
 */
public function __construct($name = false) {
    $this->name = $name;
}
9160
public function validate($string, $config, $context) {
9161
if (empty($string)) return false;
9166
* @param $string Name of attribute
9168
public function make($string) {
9169
return new HTMLPurifier_AttrDef_HTML_Bool($string);
9179
* Validates contents based on NMTOKENS attribute type.
9181
class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
9184
public function validate($string, $config, $context) {
9186
$string = trim($string);
9188
// early abort: '' and '0' (strings that convert to false) are invalid
9189
if (!$string) return false;
9191
$tokens = $this->split($string, $config, $context);
9192
$tokens = $this->filter($tokens, $config, $context);
9193
if (empty($tokens)) return false;
9194
return implode(' ', $tokens);
9199
* Splits a space separated list of tokens into its constituent parts.
9201
protected function split($string, $config, $context) {
9203
// do the preg_match, capture all subpatterns for reformulation
9205
// we don't support U+00A1 and up codepoints or
9206
// escaping because I don't know how to do that with regexps
9207
// and plus it would complicate optimization efforts (you never
9208
// see that anyway).
9209
$pattern = '/(?:(?<=\s)|\A)'. // look behind for space or string start
9210
'((?:--|-?[A-Za-z_])[A-Za-z_\-0-9]*)'.
9211
'(?:(?=\s)|\z)/'; // look ahead for space or string end
9212
preg_match_all($pattern, $string, $matches);
9217
* Template method for removing certain tokens based on arbitrary criteria.
9218
* @note If we wanted to be really functional, we'd do an array_filter
9219
* with a callback. But... we're not.
9221
protected function filter($tokens, $config, $context) {
9232
* Implements special behavior for class attribute (normally NMTOKENS)
9234
class HTMLPurifier_AttrDef_HTML_Class extends HTMLPurifier_AttrDef_HTML_Nmtokens
9236
protected function split($string, $config, $context) {
9237
// really, this twiddle should be lazy loaded
9238
$name = $config->getDefinition('HTML')->doctype->name;
9239
if ($name == "XHTML 1.1" || $name == "XHTML 2.0") {
9240
return parent::split($string, $config, $context);
9242
return preg_split('/\s+/', $string);
9245
protected function filter($tokens, $config, $context) {
9246
$allowed = $config->get('Attr.AllowedClasses');
9247
$forbidden = $config->get('Attr.ForbiddenClasses');
9249
foreach ($tokens as $token) {
9251
($allowed === null || isset($allowed[$token])) &&
9252
!isset($forbidden[$token]) &&
9253
// We need this O(n) check because of PHP's array
9254
// implementation that casts -0 to 0.
9255
!in_array($token, $ret, true)
9267
* Validates a color according to the HTML spec.
9269
class HTMLPurifier_AttrDef_HTML_Color extends HTMLPurifier_AttrDef
9272
public function validate($string, $config, $context) {
9274
static $colors = null;
9275
if ($colors === null) $colors = $config->get('Core.ColorKeywords');
9277
$string = trim($string);
9279
if (empty($string)) return false;
9280
if (isset($colors[$string])) return $colors[$string];
9281
if ($string[0] === '#') $hex = substr($string, 1);
9282
else $hex = $string;
9284
$length = strlen($hex);
9285
if ($length !== 3 && $length !== 6) return false;
9286
if (!ctype_xdigit($hex)) return false;
9287
if ($length === 3) $hex = $hex[0].$hex[0].$hex[1].$hex[1].$hex[2].$hex[2];
9300
* Special-case enum attribute definition that lazy loads allowed frame targets
9302
class HTMLPurifier_AttrDef_HTML_FrameTarget extends HTMLPurifier_AttrDef_Enum
9305
public $valid_values = false; // uninitialized value
9306
protected $case_sensitive = false;
9308
/**
 * Intentionally empty: takes no arguments and calls no parent constructor.
 * $valid_values stays false ("uninitialized") until validate() fills it
 * from the Attr.AllowedFrameTargets configuration directive on first use.
 */
public function __construct() {}
9310
public function validate($string, $config, $context) {
9311
if ($this->valid_values === false) $this->valid_values = $config->get('Attr.AllowedFrameTargets');
9312
return parent::validate($string, $config, $context);
9322
* Validates the HTML attribute ID.
9323
* @warning Even though this is the id processor, it
9324
* will ignore the directive Attr:IDBlacklist, since it will only
9325
* go according to the ID accumulator. Since the accumulator is
9326
* automatically generated, it will have already absorbed the
9327
* blacklist. If you're hacking around, make sure you use load()!
9330
class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
9333
// ref functionality disabled, since we also have to verify
9334
// whether or not the ID it refers to exists
9336
public function validate($id, $config, $context) {
9338
if (!$config->get('Attr.EnableID')) return false;
9340
$id = trim($id); // trim it first
9342
if ($id === '') return false;
9344
$prefix = $config->get('Attr.IDPrefix');
9345
if ($prefix !== '') {
9346
$prefix .= $config->get('Attr.IDPrefixLocal');
9347
// prevent re-appending the prefix
9348
if (strpos($id, $prefix) !== 0) $id = $prefix . $id;
9349
} elseif ($config->get('Attr.IDPrefixLocal') !== '') {
9350
trigger_error('%Attr.IDPrefixLocal cannot be used unless '.
9351
'%Attr.IDPrefix is set', E_USER_WARNING);
9354
//if (!$this->ref) {
9355
$id_accumulator =& $context->get('IDAccumulator');
9356
if (isset($id_accumulator->ids[$id])) return false;
9359
// we purposely avoid using regex, hopefully this is faster
9361
if (ctype_alpha($id)) {
9364
if (!ctype_alpha(@$id[0])) return false;
9365
$trim = trim( // primitive style of regexps, I suppose
9369
$result = ($trim === '');
9372
$regexp = $config->get('Attr.IDBlacklistRegexp');
9373
if ($regexp && preg_match($regexp, $id)) {
9377
if (/*!$this->ref && */$result) $id_accumulator->add($id);
9379
// if no change was made to the ID, return the result
9380
// else, return the new id if stripping whitespace made it
9381
// valid, or return false.
9382
return $result ? $id : false;
9393
* Validates an integer representation of pixels according to the HTML spec.
9395
class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
9400
public function __construct($max = null) {
9404
public function validate($string, $config, $context) {
9406
$string = trim($string);
9407
if ($string === '0') return $string;
9408
if ($string === '') return false;
9409
$length = strlen($string);
9410
if (substr($string, $length - 2) == 'px') {
9411
$string = substr($string, 0, $length - 2);
9413
if (!is_numeric($string)) return false;
9414
$int = (int) $string;
9416
if ($int < 0) return '0';
9418
// upper-bound value, extremely high values can
9419
// crash operating systems, see <http://ha.ckers.org/imagecrash.html>
9420
// WARNING, above link WILL crash you if you're using Windows
9422
if ($this->max !== null && $int > $this->max) return (string) $this->max;
9424
return (string) $int;
9428
public function make($string) {
9429
if ($string === '') $max = null;
9430
else $max = (int) $string;
9431
$class = get_class($this);
9432
return new $class($max);
9442
* Validates the HTML type length (not to be confused with CSS's length).
9444
* This accepts integer pixels or percentages as lengths for certain
9448
class HTMLPurifier_AttrDef_HTML_Length extends HTMLPurifier_AttrDef_HTML_Pixels
9451
public function validate($string, $config, $context) {
9453
$string = trim($string);
9454
if ($string === '') return false;
9456
$parent_result = parent::validate($string, $config, $context);
9457
if ($parent_result !== false) return $parent_result;
9459
$length = strlen($string);
9460
$last_char = $string[$length - 1];
9462
if ($last_char !== '%') return false;
9464
$points = substr($string, 0, $length - 1);
9466
if (!is_numeric($points)) return false;
9468
$points = (int) $points;
9470
if ($points < 0) return '0%';
9471
if ($points > 100) return '100%';
9473
return ((string) $points) . '%';
9484
* Validates a rel/rev link attribute against a directive of allowed values
9485
* @note We cannot use Enum because link types allow multiple
9487
* @note Assumes link types are ASCII text
9489
class HTMLPurifier_AttrDef_HTML_LinkTypes extends HTMLPurifier_AttrDef
9492
/** Name of the config attribute to pull. */
9495
public function __construct($name) {
9496
$configLookup = array(
9497
'rel' => 'AllowedRel',
9498
'rev' => 'AllowedRev'
9500
if (!isset($configLookup[$name])) {
9501
trigger_error('Unrecognized attribute name for link '.
9502
'relationship.', E_USER_ERROR);
9505
$this->name = $configLookup[$name];
9508
public function validate($string, $config, $context) {
9510
$allowed = $config->get('Attr.' . $this->name);
9511
if (empty($allowed)) return false;
9513
$string = $this->parseCDATA($string);
9514
$parts = explode(' ', $string);
9516
// lookup to prevent duplicates
9517
$ret_lookup = array();
9518
foreach ($parts as $part) {
9519
$part = strtolower(trim($part));
9520
if (!isset($allowed[$part])) continue;
9521
$ret_lookup[$part] = true;
9524
if (empty($ret_lookup)) return false;
9525
$string = implode(' ', array_keys($ret_lookup));
9538
* Validates a MultiLength as defined by the HTML spec.
9540
* A multilength is either an integer (pixel count), a percentage, or
9541
* a relative number.
9543
class HTMLPurifier_AttrDef_HTML_MultiLength extends HTMLPurifier_AttrDef_HTML_Length
9546
public function validate($string, $config, $context) {
9548
$string = trim($string);
9549
if ($string === '') return false;
9551
$parent_result = parent::validate($string, $config, $context);
9552
if ($parent_result !== false) return $parent_result;
9554
$length = strlen($string);
9555
$last_char = $string[$length - 1];
9557
if ($last_char !== '*') return false;
9559
$int = substr($string, 0, $length - 1);
9561
if ($int == '') return '*';
9562
if (!is_numeric($int)) return false;
9566
if ($int < 0) return false;
9567
if ($int == 0) return '0';
9568
if ($int == 1) return '*';
9569
return ((string) $int) . '*';
9579
abstract class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
9583
* Unpacks a mailbox into its display-name and address
9585
function unpack($string) {
9586
// needs to be implemented
9591
// sub-implementations
9598
* Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
9600
class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
9604
* Instance of HTMLPurifier_AttrDef_URI_IPv4 sub-validator
9609
* Instance of HTMLPurifier_AttrDef_URI_IPv6 sub-validator
9613
public function __construct() {
9614
$this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
9615
$this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
9618
public function validate($string, $config, $context) {
9619
$length = strlen($string);
9620
if ($string === '') return '';
9621
if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') {
9623
$ip = substr($string, 1, $length - 2);
9624
$valid = $this->ipv6->validate($ip, $config, $context);
9625
if ($valid === false) return false;
9626
return '['. $valid . ']';
9629
// need to do checks on unusual encodings too
9630
$ipv4 = $this->ipv4->validate($string, $config, $context);
9631
if ($ipv4 !== false) return $ipv4;
9633
// A regular domain name.
9635
// This breaks I18N domain names, but we don't have proper IRI support,
9636
// so force users to insert Punycode. If there's complaining we'll
9637
// try to fix things into an international friendly form.
9639
// The productions describing this are:
9640
$a = '[a-z]'; // alpha
9641
$an = '[a-z0-9]'; // alphanum
9642
$and = '[a-z0-9-]'; // alphanum | "-"
9643
// domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
9644
$domainlabel = "$an($and*$an)?";
9645
// toplabel = alpha | alpha *( alphanum | "-" ) alphanum
9646
$toplabel = "$a($and*$an)?";
9647
// hostname = *( domainlabel "." ) toplabel [ "." ]
9648
$match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string);
9649
if (!$match) return false;
9661
* Validates an IPv4 address
9662
* @author Feyd @ forums.devnetwork.net (public domain)
9664
class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef
9668
* IPv4 regex, protected so that IPv6 can reuse it
9672
public function validate($aIP, $config, $context) {
9674
if (!$this->ip4) $this->_loadRegex();
9676
if (preg_match('#^' . $this->ip4 . '$#s', $aIP))
9686
* Lazy load function to prevent regex from being stuffed in
9689
protected function _loadRegex() {
9690
$oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255
9691
$this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})";
9701
* Validates an IPv6 address.
9702
* @author Feyd @ forums.devnetwork.net (public domain)
9703
* @note This function requires brackets to have been removed from address
9706
class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4
9709
public function validate($aIP, $config, $context) {
9711
if (!$this->ip4) $this->_loadRegex();
9715
$hex = '[0-9a-fA-F]';
9716
$blk = '(?:' . $hex . '{1,4})';
9717
$pre = '(?:/(?:12[0-8]|1[0-1][0-9]|[1-9][0-9]|[0-9]))'; // /0 - /128
9720
if (strpos($aIP, '/') !== false)
9722
if (preg_match('#' . $pre . '$#s', $aIP, $find))
9724
$aIP = substr($aIP, 0, 0-strlen($find[0]));
9733
// IPv4-compatibility check
9734
if (preg_match('#(?<=:'.')' . $this->ip4 . '$#s', $aIP, $find))
9736
$aIP = substr($aIP, 0, 0-strlen($find[0]));
9737
$ip = explode('.', $find[0]);
9738
$ip = array_map('dechex', $ip);
9739
$aIP .= $ip[0] . $ip[1] . ':' . $ip[2] . $ip[3];
9743
// compression check
9744
$aIP = explode('::', $aIP);
9752
list($first, $second) = $aIP;
9753
$first = explode(':', $first);
9754
$second = explode(':', $second);
9756
if (count($first) + count($second) > 8)
9761
while(count($first) < 8)
9763
array_push($first, '0');
9766
array_splice($first, 8 - count($second), 8, $second);
9768
unset($first,$second);
9772
$aIP = explode(':', $aIP[0]);
9781
// All the pieces should be 16-bit hex strings. Are they?
9782
foreach ($aIP as $piece)
9784
if (!preg_match('#^[0-9a-fA-F]{4}$#s', sprintf('%04s', $piece)))
9801
* Primitive email validation class based on the regexp found at
9802
* http://www.regular-expressions.info/email.html
9804
class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_URI_Email
9807
/**
 * Checks that a string looks like a bare e-mail address (local@domain.tld).
 *
 * @param $string  Candidate address; surrounding whitespace is ignored.
 * @param $config  Not used by this check.
 * @param $context Not used by this check.
 * @return The trimmed address when it matches the pattern, false otherwise.
 */
public function validate($string, $config, $context) {
    // no support for named mailboxes i.e. "Bob <bob@example.com>"
    // that needs more percent encoding to be done

    // Trim before the emptiness test so that a whitespace-only value takes
    // the same early exit as an empty one.
    $string = trim($string);
    if ($string === '') return false;

    // A DNS label may be up to 63 octets long; the former {2,4} bound
    // rejected valid top-level domains such as .museum or .travel.
    $result = preg_match('/^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,63}$/i', $string);
    return $result ? $string : false;
}
9823
* Pre-transform that changes proprietary background attribute to CSS.
9825
class HTMLPurifier_AttrTransform_Background extends HTMLPurifier_AttrTransform {
9827
public function transform($attr, $config, $context) {
9829
if (!isset($attr['background'])) return $attr;
9831
$background = $this->confiscateAttr($attr, 'background');
9832
// some validation should happen here
9834
$this->prependCSS($attr, "background-image:url($background);");
9846
// this MUST be placed in post, as it assumes that any value in dir is valid
9849
* Post-transform that ensures that bdo tags have the dir attribute set.
9851
class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform
9854
public function transform($attr, $config, $context) {
9855
if (isset($attr['dir'])) return $attr;
9856
$attr['dir'] = $config->get('Attr.DefaultTextDir');
9867
* Pre-transform that changes deprecated bgcolor attribute to CSS.
9869
class HTMLPurifier_AttrTransform_BgColor extends HTMLPurifier_AttrTransform {
9871
public function transform($attr, $config, $context) {
9873
if (!isset($attr['bgcolor'])) return $attr;
9875
$bgcolor = $this->confiscateAttr($attr, 'bgcolor');
9876
// some validation should happen here
9878
$this->prependCSS($attr, "background-color:$bgcolor;");
9891
* Pre-transform that converts a boolean attribute to fixed CSS
9893
class HTMLPurifier_AttrTransform_BoolToCSS extends HTMLPurifier_AttrTransform {
9896
* Name of the boolean attribute that acts as the trigger
9901
* CSS declarations to add to style, needs trailing semicolon
9906
* @param $attr string attribute name to convert from
9907
* @param $css string CSS declarations to add to style (needs semicolon)
9909
public function __construct($attr, $css) {
9910
$this->attr = $attr;
9914
public function transform($attr, $config, $context) {
9915
if (!isset($attr[$this->attr])) return $attr;
9916
unset($attr[$this->attr]);
9917
$this->prependCSS($attr, $this->css);
9928
* Pre-transform that changes deprecated border attribute to CSS.
9930
class HTMLPurifier_AttrTransform_Border extends HTMLPurifier_AttrTransform {
9932
public function transform($attr, $config, $context) {
9933
if (!isset($attr['border'])) return $attr;
9934
$border_width = $this->confiscateAttr($attr, 'border');
9935
// some validation should happen here
9936
$this->prependCSS($attr, "border:{$border_width}px solid;");
9947
* Generic pre-transform that converts an attribute with a fixed number of
9948
* values (enumerated) to CSS.
9950
class HTMLPurifier_AttrTransform_EnumToCSS extends HTMLPurifier_AttrTransform {
9953
* Name of attribute to transform from
9958
* Lookup array of attribute values to CSS
9960
protected $enumToCSS = array();
9963
* Case sensitivity of the matching
9964
* @warning Currently can only be guaranteed to work with ASCII
9967
protected $caseSensitive = false;
9970
* @param $attr String attribute name to transform from
9971
* @param $enumToCSS Lookup array of attribute values to CSS
9972
* @param $case_sensitive Boolean case sensitivity indicator, default false
9974
public function __construct($attr, $enum_to_css, $case_sensitive = false) {
9975
$this->attr = $attr;
9976
$this->enumToCSS = $enum_to_css;
9977
$this->caseSensitive = (bool) $case_sensitive;
9980
public function transform($attr, $config, $context) {
9982
if (!isset($attr[$this->attr])) return $attr;
9984
$value = trim($attr[$this->attr]);
9985
unset($attr[$this->attr]);
9987
if (!$this->caseSensitive) $value = strtolower($value);
9989
if (!isset($this->enumToCSS[$value])) {
9993
$this->prependCSS($attr, $this->enumToCSS[$value]);
10005
// must be called POST validation
10008
* Transform that supplies default values for the src and alt attributes
10009
* in img tags, as well as prevents the img tag from being removed
10010
* because of a missing alt tag. This needs to be registered as both
10011
* a pre and post attribute transform.
10013
class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
10016
public function transform($attr, $config, $context) {
10019
if (!isset($attr['src'])) {
10020
if ($config->get('Core.RemoveInvalidImg')) return $attr;
10021
$attr['src'] = $config->get('Attr.DefaultInvalidImage');
10025
if (!isset($attr['alt'])) {
10027
$alt = $config->get('Attr.DefaultImageAlt');
10028
if ($alt === null) {
10029
$attr['alt'] = basename($attr['src']);
10031
$attr['alt'] = $alt;
10034
$attr['alt'] = $config->get('Attr.DefaultInvalidImageAlt');
10049
* Pre-transform that changes deprecated hspace and vspace attributes to CSS
10051
class HTMLPurifier_AttrTransform_ImgSpace extends HTMLPurifier_AttrTransform {
10054
protected $css = array(
10055
'hspace' => array('left', 'right'),
10056
'vspace' => array('top', 'bottom')
10059
public function __construct($attr) {
10060
$this->attr = $attr;
10061
if (!isset($this->css[$attr])) {
10062
trigger_error(htmlspecialchars($attr) . ' is not valid space attribute');
10066
public function transform($attr, $config, $context) {
10068
if (!isset($attr[$this->attr])) return $attr;
10070
$width = $this->confiscateAttr($attr, $this->attr);
10071
// some validation could happen here
10073
if (!isset($this->css[$this->attr])) return $attr;
10076
foreach ($this->css[$this->attr] as $suffix) {
10077
$property = "margin-$suffix";
10078
$style .= "$property:{$width}px;";
10081
$this->prependCSS($attr, $style);
10094
* Performs miscellaneous cross attribute validation and filtering for
10095
* input elements. This is meant to be a post-transform.
10097
class HTMLPurifier_AttrTransform_Input extends HTMLPurifier_AttrTransform {
10101
public function __construct() {
10102
$this->pixels = new HTMLPurifier_AttrDef_HTML_Pixels();
10105
/**
 * Drops or normalises input attributes that do not apply to the
 * element's type.
 *
 * @param $attr    Assoc array of the input element's attributes.
 * @param $config  Passed through to the pixel validator.
 * @param $context Passed through to the pixel validator.
 * @return The filtered attribute array.
 */
public function transform($attr, $config, $context) {
    // A missing type attribute is treated as "text".
    $type = isset($attr['type']) ? strtolower($attr['type']) : 'text';

    $is_toggle  = ($type === 'radio' || $type === 'checkbox');
    $is_textual = ($type === 'text'  || $type === 'password');

    // checked is only kept on radio buttons and checkboxes
    if (!$is_toggle && isset($attr['checked'])) {
        unset($attr['checked']);
    }

    // maxlength is only kept on text and password fields
    if (!$is_textual && isset($attr['maxlength'])) {
        unset($attr['maxlength']);
    }

    // for every other type, size is run through the pixel validator
    if (!$is_textual && isset($attr['size'])) {
        $pixels = $this->pixels->validate($attr['size'], $config, $context);
        if ($pixels === false) {
            unset($attr['size']);
        } else {
            $attr['size'] = $pixels;
        }
    }

    // src is only kept on image inputs
    if ($type !== 'image' && isset($attr['src'])) {
        unset($attr['src']);
    }

    // radio buttons and checkboxes always get a value attribute
    if ($is_toggle && !isset($attr['value'])) {
        $attr['value'] = '';
    }

    return $attr;
}
10135
* Post-transform that copies lang's value to xml:lang (and vice-versa)
10136
* @note Theoretically speaking, this could be a pre-transform, but putting
10137
* post is more efficient.
10139
class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform
10142
/**
 * Keeps lang and xml:lang in step: when xml:lang is present its value is
 * copied to lang, otherwise a present lang is copied to xml:lang.
 *
 * @param $attr    Assoc array of attributes.
 * @param $config  Unused.
 * @param $context Unused.
 * @return The attribute array with both language attributes aligned.
 */
public function transform($attr, $config, $context) {

    $xml_lang = isset($attr['xml:lang']) ? $attr['xml:lang'] : false;

    // xml:lang takes precedence whenever it is present
    if ($xml_lang !== false) {
        $attr['lang'] = $xml_lang;
        return $attr;
    }

    $lang = isset($attr['lang']) ? $attr['lang'] : false;
    if ($lang !== false) {
        $attr['xml:lang'] = $lang;
    }

    return $attr;

}
10164
* Class for handling width/height length attribute transformations to CSS
10166
class HTMLPurifier_AttrTransform_Length extends HTMLPurifier_AttrTransform
10170
protected $cssName;
10172
public function __construct($name, $css_name = null) {
10173
$this->name = $name;
10174
$this->cssName = $css_name ? $css_name : $name;
10177
/**
 * Moves the configured length attribute into an inline CSS declaration,
 * appending "px" when the value consists only of digits.
 *
 * @param $attr    Assoc array of attributes.
 * @param $config  Unused.
 * @param $context Unused.
 * @return The attribute array, with the length expressed as CSS.
 */
public function transform($attr, $config, $context) {
    if (isset($attr[$this->name])) {
        $value = $this->confiscateAttr($attr, $this->name);
        // all-digit lengths get an explicit pixel unit
        $unit = ctype_digit($value) ? 'px' : '';
        $this->prependCSS($attr, $this->cssName . ':' . $value . $unit . ';');
    }
    return $attr;
}
10192
* Pre-transform that changes deprecated name attribute to ID if necessary
10194
class HTMLPurifier_AttrTransform_Name extends HTMLPurifier_AttrTransform
10197
public function transform($attr, $config, $context) {
10198
// Abort early if we're using relaxed definition of name
10199
if ($config->get('HTML.Attr.Name.UseCDATA')) return $attr;
10200
if (!isset($attr['name'])) return $attr;
10201
$id = $this->confiscateAttr($attr, 'name');
10202
if ( isset($attr['id'])) return $attr;
10214
* Post-transform that performs validation to the name attribute; if
10215
* it is present with an equivalent id attribute, it is passed through;
10216
* otherwise validation is performed.
10218
class HTMLPurifier_AttrTransform_NameSync extends HTMLPurifier_AttrTransform
10221
public function __construct() {
10222
$this->idDef = new HTMLPurifier_AttrDef_HTML_ID();
10225
public function transform($attr, $config, $context) {
10226
if (!isset($attr['name'])) return $attr;
10227
$name = $attr['name'];
10228
if (isset($attr['id']) && $attr['id'] === $name) return $attr;
10229
$result = $this->idDef->validate($name, $config, $context);
10230
if ($result === false) unset($attr['name']);
10231
else $attr['name'] = $result;
10241
class HTMLPurifier_AttrTransform_SafeEmbed extends HTMLPurifier_AttrTransform
10243
public $name = "SafeEmbed";
10245
public function transform($attr, $config, $context) {
10246
$attr['allowscriptaccess'] = 'never';
10247
$attr['allownetworking'] = 'internal';
10248
$attr['type'] = 'application/x-shockwave-flash';
10258
* Writes default type for all objects. Currently only supports flash.
10260
class HTMLPurifier_AttrTransform_SafeObject extends HTMLPurifier_AttrTransform
10262
public $name = "SafeObject";
10264
/**
 * Supplies the Flash MIME type when an object carries no type attribute.
 *
 * Declared public explicitly, matching every other attribute transform
 * in this file (the implicit default was already public).
 *
 * @param $attr    Assoc array of attributes.
 * @param $config  Unused.
 * @param $context Unused.
 * @return The attribute array, guaranteed to contain a type.
 */
public function transform($attr, $config, $context) {
    if (!isset($attr['type'])) $attr['type'] = 'application/x-shockwave-flash';
    return $attr;
}
10275
* Validates name/value pairs in param tags to be used in safe objects. This
10276
* will only allow name values it recognizes, and pre-fill certain attributes
10277
* with required values.
10280
* This class only supports Flash. In the future, Quicktime support
10284
* This class expects an injector to add the necessary parameters tags.
10286
class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform
10288
public $name = "SafeParam";
10291
public function __construct() {
10292
$this->uri = new HTMLPurifier_AttrDef_URI(true); // embedded
10295
public function transform($attr, $config, $context) {
10296
// If we add support for other objects, we'll need to alter the
10298
switch ($attr['name']) {
10299
// application/x-shockwave-flash
10300
// Keep this synchronized with Injector/SafeObject.php
10301
case 'allowScriptAccess':
10302
$attr['value'] = 'never';
10304
case 'allowNetworking':
10305
$attr['value'] = 'internal';
10308
$attr['value'] = 'window';
10311
$attr['value'] = $this->uri->validate($attr['value'], $config, $context);
10313
// add other cases to support other param name/value pairs
10315
$attr['name'] = $attr['value'] = null;
10326
* Implements required attribute stipulation for <script>
10328
class HTMLPurifier_AttrTransform_ScriptRequired extends HTMLPurifier_AttrTransform
10330
public function transform($attr, $config, $context) {
10331
if (!isset($attr['type'])) {
10332
$attr['type'] = 'text/javascript';
10343
* Sets height/width defaults for <textarea>
10345
class HTMLPurifier_AttrTransform_Textarea extends HTMLPurifier_AttrTransform
10348
public function transform($attr, $config, $context) {
10349
// Calculated from Firefox
10350
if (!isset($attr['cols'])) $attr['cols'] = '22';
10351
if (!isset($attr['rows'])) $attr['rows'] = '3';
10362
* Definition that uses different definitions depending on context.
10364
* The del and ins tags are notable because they allow different types of
10365
* elements depending on whether or not they're in a block or inline context.
10366
* Chameleon allows this behavior to happen by using two different
10367
* definitions depending on context. While this is somewhat generalized,
10368
* it is specifically intended for those two tags.
10370
class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
10374
* Instance of the definition object to use when inline. Usually stricter.
10379
* Instance of the definition object to use when block.
10383
public $type = 'chameleon';
10386
* @param $inline List of elements to allow when inline.
10387
* @param $block List of elements to allow when block.
10389
public function __construct($inline, $block) {
10390
$this->inline = new HTMLPurifier_ChildDef_Optional($inline);
10391
$this->block = new HTMLPurifier_ChildDef_Optional($block);
10392
$this->elements = $this->block->elements;
10395
/**
 * Delegates validation to the block definition when the context reports
 * IsInline as exactly false, and to the inline definition otherwise.
 *
 * @param $tokens_of_children Child tokens to validate.
 * @param $config             Passed through to the chosen definition.
 * @param $context            Consulted for IsInline, then passed through.
 * @return Whatever the chosen definition's validateChildren() returns.
 */
public function validateChildren($tokens_of_children, $config, $context) {
    $delegate = ($context->get('IsInline') === false)
        ? $this->block
        : $this->inline;
    return $delegate->validateChildren($tokens_of_children, $config, $context);
}
10411
* Custom validation class, accepts DTD child definitions
10413
* @warning Currently this class is an all or nothing proposition, that is,
10414
* it will only give a bool return value.
10416
class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
10418
public $type = 'custom';
10419
public $allow_empty = false;
10421
* Allowed child pattern as defined by the DTD
10425
* PCRE regex derived from $dtd_regex
10428
private $_pcre_regex;
10430
* @param $dtd_regex Allowed child pattern from the DTD
10432
public function __construct($dtd_regex) {
10433
$this->dtd_regex = $dtd_regex;
10434
$this->_compileRegex();
10437
* Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
10439
/**
 * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
 * and records every element name found in the pattern in $elements.
 */
protected function _compileRegex() {
    $raw = str_replace(' ', '', $this->dtd_regex);
    // substr() replaces the former $raw{0} offset: curly-brace string
    // offsets are a parse error on PHP 8, and substr() does not raise a
    // notice when the pattern is empty.
    if (substr($raw, 0, 1) !== '(') {
        $raw = "($raw)";
    }
    $el = '[#a-zA-Z0-9_.-]+';
    $reg = $raw;

    // COMPLICATED! AND MIGHT BE BUGGY! I HAVE NO CLUE WHAT I'M
    // DOING! Seriously: if there's problems, please report them.

    // collect all elements into the $elements array
    preg_match_all("/$el/", $reg, $matches);
    foreach ($matches[0] as $match) {
        $this->elements[$match] = true;
    }

    // setup all elements as parentheticals with leading commas
    $reg = preg_replace("/$el/", '(,\\0)', $reg);

    // remove commas when they were not solicited
    $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg);

    // remove all non-parenthetical commas: they are handled by first regex
    $reg = preg_replace("/,\(/", '(', $reg);

    $this->_pcre_regex = $reg;
}
10467
public function validateChildren($tokens_of_children, $config, $context) {
10468
$list_of_children = '';
10469
$nesting = 0; // depth into the nest
10470
foreach ($tokens_of_children as $token) {
10471
if (!empty($token->is_whitespace)) continue;
10473
$is_child = ($nesting == 0); // direct
10475
if ($token instanceof HTMLPurifier_Token_Start) {
10477
} elseif ($token instanceof HTMLPurifier_Token_End) {
10482
$list_of_children .= $token->name . ',';
10485
// add leading comma to deal with stray comma declarations
10486
$list_of_children = ',' . rtrim($list_of_children, ',');
10489
'/^,?'.$this->_pcre_regex.'$/',
10493
return (bool) $okay;
10502
* Definition that disallows all elements.
10503
* @warning validateChildren() in this class is actually never called, because
10504
* empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed
10505
* before child definitions are parsed in earnest by
10506
* HTMLPurifier_Strategy_FixNesting.
10508
class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
10510
public $allow_empty = true;
10511
public $type = 'empty';
10512
public function __construct() {}
10513
public function validateChildren($tokens_of_children, $config, $context) {
10523
* Definition that allows a set of elements, but disallows empty children.
10525
class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
10528
* Lookup table of allowed elements.
10531
public $elements = array();
10533
* Whether or not the last passed node was all whitespace.
10535
protected $whitespace = false;
10537
* @param $elements List of allowed element names (lowercase).
10539
public function __construct($elements) {
10540
if (is_string($elements)) {
10541
$elements = str_replace(' ', '', $elements);
10542
$elements = explode('|', $elements);
10544
$keys = array_keys($elements);
10545
if ($keys == array_keys($keys)) {
10546
$elements = array_flip($elements);
10547
foreach ($elements as $i => $x) {
10548
$elements[$i] = true;
10549
if (empty($i)) unset($elements[$i]); // remove blank
10552
$this->elements = $elements;
10554
public $allow_empty = false;
10555
public $type = 'required';
10556
public function validateChildren($tokens_of_children, $config, $context) {
10557
// Flag for subclasses
10558
$this->whitespace = false;
10560
// if there are no tokens, delete parent node
10561
if (empty($tokens_of_children)) return false;
10563
// the new set of children
10566
// current depth into the nest
10569
// whether or not we're deleting a node
10570
$is_deleting = false;
10572
// whether or not parsed character data is allowed
10573
// this controls whether or not we silently drop a tag
10574
// or generate escaped HTML from it
10575
$pcdata_allowed = isset($this->elements['#PCDATA']);
10577
// a little sanity check to make sure it's not ALL whitespace
10578
$all_whitespace = true;
10580
// some configuration
10581
$escape_invalid_children = $config->get('Core.EscapeInvalidChildren');
10584
$gen = new HTMLPurifier_Generator($config, $context);
10586
foreach ($tokens_of_children as $token) {
10587
if (!empty($token->is_whitespace)) {
10588
$result[] = $token;
10591
$all_whitespace = false; // phew, we're not talking about whitespace
10593
$is_child = ($nesting == 0);
10595
if ($token instanceof HTMLPurifier_Token_Start) {
10597
} elseif ($token instanceof HTMLPurifier_Token_End) {
10602
$is_deleting = false;
10603
if (!isset($this->elements[$token->name])) {
10604
$is_deleting = true;
10605
if ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text) {
10606
$result[] = $token;
10607
} elseif ($pcdata_allowed && $escape_invalid_children) {
10608
$result[] = new HTMLPurifier_Token_Text(
10609
$gen->generateFromToken($token)
10615
if (!$is_deleting || ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text)) {
10616
$result[] = $token;
10617
} elseif ($pcdata_allowed && $escape_invalid_children) {
10619
new HTMLPurifier_Token_Text(
10620
$gen->generateFromToken($token)
10626
if (empty($result)) return false;
10627
if ($all_whitespace) {
10628
$this->whitespace = true;
10631
if ($tokens_of_children == $result) return true;
10641
* Definition that allows a set of elements, and allows no children.
10642
* @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required,
10643
* really, one shouldn't inherit from the other. Only altered behavior
10644
* is to overload a returned false with an array. Thus, it will never
10647
class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
10649
public $allow_empty = true;
10650
public $type = 'optional';
10651
/**
 * Runs the parent validation, but replaces a false result so that the
 * parent node is never reported as invalid merely for lacking children.
 *
 * @param $tokens_of_children Child tokens to validate.
 * @param $config             Passed through to the parent implementation.
 * @param $context            Passed through to the parent implementation.
 * @return The parent's result when it is not false; otherwise true for an
 *         empty child list, the original tokens when they were all
 *         whitespace, or an empty array.
 */
public function validateChildren($tokens_of_children, $config, $context) {
    $validated = parent::validateChildren($tokens_of_children, $config, $context);
    if ($validated !== false) {
        return $validated;
    }
    // we assume that $tokens_of_children is not modified
    if (empty($tokens_of_children)) {
        return true;
    }
    // the parent call sets $this->whitespace when everything was whitespace
    return $this->whitespace ? $tokens_of_children : array();
}
10668
* Takes the contents of blockquote when in strict and reformats for validation.
10670
class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required
10672
protected $real_elements;
10673
protected $fake_elements;
10674
public $allow_empty = true;
10675
public $type = 'strictblockquote';
10676
protected $init = false;
10679
* @note We don't want MakeWellFormed to auto-close inline elements since
10680
* they might be allowed.
10682
public function getAllowedElements($config) {
10683
$this->init($config);
10684
return $this->fake_elements;
10687
public function validateChildren($tokens_of_children, $config, $context) {
10689
$this->init($config);
10691
// trick the parent class into thinking it allows more
10692
$this->elements = $this->fake_elements;
10693
$result = parent::validateChildren($tokens_of_children, $config, $context);
10694
$this->elements = $this->real_elements;
10696
if ($result === false) return array();
10697
if ($result === true) $result = $tokens_of_children;
10699
$def = $config->getHTMLDefinition();
10700
$block_wrap_start = new HTMLPurifier_Token_Start($def->info_block_wrapper);
10701
$block_wrap_end = new HTMLPurifier_Token_End( $def->info_block_wrapper);
10702
$is_inline = false;
10706
// assuming that there are no comment tokens
10707
foreach ($result as $i => $token) {
10708
$token = $result[$i];
10709
// ifs are nested for readability
10713
($token instanceof HTMLPurifier_Token_Text && !$token->is_whitespace) ||
10714
(!$token instanceof HTMLPurifier_Token_Text && !isset($this->elements[$token->name]))
10717
$ret[] = $block_wrap_start;
10722
// starting tokens have been inline text / empty
10723
if ($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) {
10724
if (isset($this->elements[$token->name])) {
10726
$ret[] = $block_wrap_end;
10727
$is_inline = false;
10733
if ($token instanceof HTMLPurifier_Token_Start) $depth++;
10734
if ($token instanceof HTMLPurifier_Token_End) $depth--;
10736
if ($is_inline) $ret[] = $block_wrap_end;
10740
private function init($config) {
10741
if (!$this->init) {
10742
$def = $config->getHTMLDefinition();
10743
// allow all inline elements
10744
$this->real_elements = $this->elements;
10745
$this->fake_elements = $def->info_content_sets['Flow'];
10746
$this->fake_elements['#PCDATA'] = true;
10747
$this->init = true;
10757
* Definition for tables
10759
class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
10761
public $allow_empty = false;
10762
public $type = 'table';
10763
public $elements = array('tr' => true, 'tbody' => true, 'thead' => true,
10764
'tfoot' => true, 'caption' => true, 'colgroup' => true, 'col' => true);
10765
public function __construct() {}
10766
public function validateChildren($tokens_of_children, $config, $context) {
10767
if (empty($tokens_of_children)) return false;
10769
// this ensures that the loop gets run one last time before closing
10770
// up. It's a little bit of a hack, but it works! Just make sure you
10771
// get rid of the token later.
10772
$tokens_of_children[] = false;
10774
// only one of these elements is allowed in a table
10779
// as many of these as you want
10781
$content = array();
10783
$nesting = 0; // current depth so we can determine nodes
10784
$is_collecting = false; // are we globbing together tokens to package
10785
// into one of the collectors?
10786
$collection = array(); // collected nodes
10787
$tag_index = 0; // the first node might be whitespace,
10788
// so this tells us where the start tag is
10790
foreach ($tokens_of_children as $token) {
10791
$is_child = ($nesting == 0);
10793
if ($token === false) {
10794
// terminating sequence started
10795
} elseif ($token instanceof HTMLPurifier_Token_Start) {
10797
} elseif ($token instanceof HTMLPurifier_Token_End) {
10801
// handle node collection
10802
if ($is_collecting) {
10804
// okay, let's stash the tokens away
10805
// first token tells us the type of the collection
10806
switch ($collection[$tag_index]->name) {
10809
$content[] = $collection;
10812
if ($caption !== false) break;
10813
$caption = $collection;
10817
// access the appropriate variable, $thead or $tfoot
10818
$var = $collection[$tag_index]->name;
10819
if ($$var === false) {
10820
$$var = $collection;
10822
// transmute the first and last entries into
10823
// tbody tags, and then put into content
10824
$collection[$tag_index]->name = 'tbody';
10825
$collection[count($collection)-1]->name = 'tbody';
10826
$content[] = $collection;
10830
$cols[] = $collection;
10833
$collection = array();
10834
$is_collecting = false;
10837
// add the node to the collection
10838
$collection[] = $token;
10843
if ($token === false) break;
10846
// determine what we're dealing with
10847
if ($token->name == 'col') {
10848
// the only empty tag in the posse, we can handle it
10850
$cols[] = array_merge($collection, array($token));
10851
$collection = array();
10855
switch($token->name) {
10862
$is_collecting = true;
10863
$collection[] = $token;
10866
if (!empty($token->is_whitespace)) {
10867
$collection[] = $token;
10875
if (empty($content)) return false;
10878
if ($caption !== false) $ret = array_merge($ret, $caption);
10879
if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
10880
if ($thead !== false) $ret = array_merge($ret, $thead);
10881
if ($tfoot !== false) $ret = array_merge($ret, $tfoot);
10882
foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
10883
if (!empty($collection) && $is_collecting == false){
10884
// grab the trailing space
10885
$ret = array_merge($ret, $collection);
10888
array_pop($tokens_of_children); // remove phantom token
10890
return ($ret === $tokens_of_children) ? true : $ret;
10899
class HTMLPurifier_DefinitionCache_Decorator extends HTMLPurifier_DefinitionCache
10903
* Cache object we are decorating
10907
public function __construct() {}
10910
* Lazy decorator function
10911
* @param $cache Reference to cache object to decorate
10913
public function decorate(&$cache) {
10914
$decorator = $this->copy();
10915
// reference is necessary for mocks in PHP 4
10916
$decorator->cache =& $cache;
10917
$decorator->type = $cache->type;
10922
* Cross-compatible clone substitute
10924
public function copy() {
10925
return new HTMLPurifier_DefinitionCache_Decorator();
10928
public function add($def, $config) {
10929
return $this->cache->add($def, $config);
10932
public function set($def, $config) {
10933
return $this->cache->set($def, $config);
10936
public function replace($def, $config) {
10937
return $this->cache->replace($def, $config);
10940
public function get($config) {
10941
return $this->cache->get($config);
10944
public function remove($config) {
10945
return $this->cache->remove($config);
10948
public function flush($config) {
10949
return $this->cache->flush($config);
10952
public function cleanup($config) {
10953
return $this->cache->cleanup($config);
10963
* Null cache object to use when no caching is on.
10965
class HTMLPurifier_DefinitionCache_Null extends HTMLPurifier_DefinitionCache
10968
public function add($def, $config) {
10972
public function set($def, $config) {
10976
public function replace($def, $config) {
10980
public function remove($config) {
10984
public function get($config) {
10988
public function flush($config) {
10992
public function cleanup($config) {
11002
class HTMLPurifier_DefinitionCache_Serializer extends
11003
HTMLPurifier_DefinitionCache
11006
public function add($def, $config) {
11007
if (!$this->checkDefType($def)) return;
11008
$file = $this->generateFilePath($config);
11009
if (file_exists($file)) return false;
11010
if (!$this->_prepareDir($config)) return false;
11011
return $this->_write($file, serialize($def));
11014
public function set($def, $config) {
11015
if (!$this->checkDefType($def)) return;
11016
$file = $this->generateFilePath($config);
11017
if (!$this->_prepareDir($config)) return false;
11018
return $this->_write($file, serialize($def));
11021
public function replace($def, $config) {
11022
if (!$this->checkDefType($def)) return;
11023
$file = $this->generateFilePath($config);
11024
if (!file_exists($file)) return false;
11025
if (!$this->_prepareDir($config)) return false;
11026
return $this->_write($file, serialize($def));
11029
/**
 * Loads the cached definition for this configuration from its serial file.
 *
 * @param $config Configuration used to derive the cache file path.
 * @return The unserialized definition, or false when the file is missing
 *         or cannot be read.
 */
public function get($config) {
    $file = $this->generateFilePath($config);
    if (!file_exists($file)) return false;
    // The file may vanish or be unreadable between the existence check and
    // the read; report that as a cache miss instead of unserializing false.
    $data = file_get_contents($file);
    if ($data === false) return false;
    // NOTE(review): unserialize() instantiates whatever the file describes,
    // so the cache directory must not be writable by untrusted parties.
    return unserialize($data);
}
11035
public function remove($config) {
11036
$file = $this->generateFilePath($config);
11037
if (!file_exists($file)) return false;
11038
return unlink($file);
11041
/**
 * Deletes every non-dot file in this cache type's directory.
 *
 * @param $config Configuration used to derive the cache directory.
 * @return False when the directory cannot be prepared or opened; nothing
 *         otherwise.
 */
public function flush($config) {
    if (!$this->_prepareDir($config)) return false;
    $dir = $this->generateDirectoryPath($config);
    $dh  = opendir($dir);
    // opendir() returns false on failure; bail out rather than hand a
    // non-resource to readdir().
    if ($dh === false) return false;
    while (false !== ($filename = readdir($dh))) {
        if (empty($filename)) continue;
        if ($filename[0] === '.') continue;
        unlink($dir . '/' . $filename);
    }
    // release the directory handle, which was previously left open
    closedir($dh);
}
11052
/**
 * Deletes the serial files in this cache type's directory whose key
 * isOld() reports as old for the given configuration.
 *
 * @param $config Configuration used to derive the directory and to judge
 *                whether a key is old.
 * @return False when the directory cannot be prepared or opened; nothing
 *         otherwise.
 */
public function cleanup($config) {
    if (!$this->_prepareDir($config)) return false;
    $dir = $this->generateDirectoryPath($config);
    $dh  = opendir($dir);
    // opendir() returns false on failure; bail out rather than hand a
    // non-resource to readdir().
    if ($dh === false) return false;
    while (false !== ($filename = readdir($dh))) {
        if (empty($filename)) continue;
        if ($filename[0] === '.') continue;
        // drop the trailing four characters (the ".ser" extension)
        $key = substr($filename, 0, strlen($filename) - 4);
        if ($this->isOld($key, $config)) unlink($dir . '/' . $filename);
    }
    // release the directory handle, which was previously left open
    closedir($dh);
}
11065
* Generates the file path to the serial file corresponding to
11066
* the configuration and definition name
11067
* @todo Make protected
11069
public function generateFilePath($config) {
11070
$key = $this->generateKey($config);
11071
return $this->generateDirectoryPath($config) . '/' . $key . '.ser';
11075
* Generates the path to the directory containing this cache's serial files
11076
* @note No trailing slash
11077
* @todo Make protected
11079
public function generateDirectoryPath($config) {
11080
$base = $this->generateBaseDirectoryPath($config);
11081
return $base . '/' . $this->type;
11085
* Generates path to base directory that contains all definition type
11087
* @todo Make protected
11089
public function generateBaseDirectoryPath($config) {
11090
$base = $config->get('Cache.SerializerPath');
11091
$base = is_null($base) ? HTMLPURIFIER_PREFIX . '/HTMLPurifier/DefinitionCache/Serializer' : $base;
11096
* Convenience wrapper function for file_put_contents
11097
* @param $file File name to write to
11098
* @param $data Data to write into file
11099
* @return Number of bytes written if success, or false if failure.
11101
private function _write($file, $data) {
11102
return file_put_contents($file, $data);
11106
* Prepares the directory that this type stores the serials in
11107
* @return True if successful
11109
private function _prepareDir($config) {
11110
$directory = $this->generateDirectoryPath($config);
11111
if (!is_dir($directory)) {
11112
$base = $this->generateBaseDirectoryPath($config);
11113
if (!is_dir($base)) {
11114
trigger_error('Base directory '.$base.' does not exist,
11115
please create or change using %Cache.SerializerPath',
11118
} elseif (!$this->_testPermissions($base)) {
11121
$old = umask(0022); // disable group and world writes
11124
} elseif (!$this->_testPermissions($directory)) {
11131
* Tests permissions on a directory and throws out friendly
11132
* error messages and attempts to chmod it itself if possible
11134
private function _testPermissions($dir) {
11135
// early abort, if it is writable, everything is hunky-dory
11136
if (is_writable($dir)) return true;
11137
if (!is_dir($dir)) {
11138
// generally, you'll want to handle this beforehand
11139
// so a more specific error message can be given
11140
trigger_error('Directory '.$dir.' does not exist',
11144
if (function_exists('posix_getuid')) {
11145
// POSIX system, we can give more specific advice
11146
if (fileowner($dir) === posix_getuid()) {
11147
// we can chmod it ourselves
11150
} elseif (filegroup($dir) === posix_getgid()) {
11153
// PHP's probably running as nobody, so we'll
11154
// need to give global permissions
11157
trigger_error('Directory '.$dir.' not writable, '.
11158
'please chmod to ' . $chmod,
11161
// generic error message
11162
trigger_error('Directory '.$dir.' not writable, '.
11163
'please alter file permissions',
11176
* Definition cache decorator class that cleans up the cache
11177
* whenever there is a cache miss.
11179
class HTMLPurifier_DefinitionCache_Decorator_Cleanup extends
11180
HTMLPurifier_DefinitionCache_Decorator
11183
public $name = 'Cleanup';
11185
public function copy() {
11186
return new HTMLPurifier_DefinitionCache_Decorator_Cleanup();
11189
public function add($def, $config) {
11190
$status = parent::add($def, $config);
11191
if (!$status) parent::cleanup($config);
11195
public function set($def, $config) {
11196
$status = parent::set($def, $config);
11197
if (!$status) parent::cleanup($config);
11201
public function replace($def, $config) {
11202
$status = parent::replace($def, $config);
11203
if (!$status) parent::cleanup($config);
11207
public function get($config) {
11208
$ret = parent::get($config);
11209
if (!$ret) parent::cleanup($config);
11220
* Definition cache decorator class that saves all cache retrievals
11221
* to PHP's memory; good for unit tests or circumstances where
11222
* there are lots of configuration objects floating around.
11224
class HTMLPurifier_DefinitionCache_Decorator_Memory extends
11225
HTMLPurifier_DefinitionCache_Decorator
11228
protected $definitions;
11229
public $name = 'Memory';
11231
public function copy() {
11232
return new HTMLPurifier_DefinitionCache_Decorator_Memory();
11235
public function add($def, $config) {
11236
$status = parent::add($def, $config);
11237
if ($status) $this->definitions[$this->generateKey($config)] = $def;
11241
public function set($def, $config) {
11242
$status = parent::set($def, $config);
11243
if ($status) $this->definitions[$this->generateKey($config)] = $def;
11247
public function replace($def, $config) {
11248
$status = parent::replace($def, $config);
11249
if ($status) $this->definitions[$this->generateKey($config)] = $def;
11253
public function get($config) {
11254
$key = $this->generateKey($config);
11255
if (isset($this->definitions[$key])) return $this->definitions[$key];
11256
$this->definitions[$key] = parent::get($config);
11257
return $this->definitions[$key];
11267
* XHTML 1.1 Bi-directional Text Module, defines elements that
11268
* declare directionality of content. Text Extension Module.
11270
class HTMLPurifier_HTMLModule_Bdo extends HTMLPurifier_HTMLModule
11273
public $name = 'Bdo';
11274
public $attr_collections = array(
11275
'I18N' => array('dir' => false)
11278
public function setup($config) {
11279
$bdo = $this->addElement(
11280
'bdo', 'Inline', 'Inline', array('Core', 'Lang'),
11282
'dir' => 'Enum#ltr,rtl', // required
11283
// The Abstract Module specification has the attribute
11284
// inclusions wrong for bdo: bdo allows Lang
11287
$bdo->attr_transform_post['required-dir'] = new HTMLPurifier_AttrTransform_BdoDir();
11289
$this->attr_collections['I18N']['dir'] = 'Enum#ltr,rtl';
11298
class HTMLPurifier_HTMLModule_CommonAttributes extends HTMLPurifier_HTMLModule
11300
public $name = 'CommonAttributes';
11302
public $attr_collections = array(
11304
0 => array('Style'),
11305
// 'xml:space' => false,
11306
'class' => 'Class',
11308
'title' => 'CDATA',
11312
0 => array('Lang'), // proprietary, for xml:lang/lang
11315
0 => array('Core', 'I18N')
11326
* XHTML 1.1 Edit Module, defines editing-related elements. Text Extension
11329
class HTMLPurifier_HTMLModule_Edit extends HTMLPurifier_HTMLModule
11332
public $name = 'Edit';
11334
public function setup($config) {
11335
$contents = 'Chameleon: #PCDATA | Inline ! #PCDATA | Flow';
11338
// 'datetime' => 'Datetime', // not implemented
11340
$this->addElement('del', 'Inline', $contents, 'Common', $attr);
11341
$this->addElement('ins', 'Inline', $contents, 'Common', $attr);
11344
// HTML 4.01 specifies that ins/del must not contain block
11345
// elements when used in an inline context, chameleon is
11346
// a complicated workaround to achieve this effect
11348
// Inline context ! Block context (exclamation mark is
11349
// separator, see getChildDef for parsing)
11351
public $defines_child_def = true;
11352
public function getChildDef($def) {
11353
if ($def->content_model_type != 'chameleon') return false;
11354
$value = explode('!', $def->content_model);
11355
return new HTMLPurifier_ChildDef_Chameleon($value[0], $value[1]);
11365
* XHTML 1.1 Forms module, defines all form-related elements found in HTML 4.
11367
class HTMLPurifier_HTMLModule_Forms extends HTMLPurifier_HTMLModule
11369
public $name = 'Forms';
11370
public $safe = false;
11372
public $content_sets = array(
11374
'Inline' => 'Formctrl',
11377
public function setup($config) {
11378
$form = $this->addElement('form', 'Form',
11379
'Required: Heading | List | Block | fieldset', 'Common', array(
11380
'accept' => 'ContentTypes',
11381
'accept-charset' => 'Charsets',
11382
'action*' => 'URI',
11383
'method' => 'Enum#get,post',
11384
// really ContentType, but these two are the only ones used today
11385
'enctype' => 'Enum#application/x-www-form-urlencoded,multipart/form-data',
11387
$form->excludes = array('form' => true);
11389
$input = $this->addElement('input', 'Formctrl', 'Empty', 'Common', array(
11390
'accept' => 'ContentTypes',
11391
'accesskey' => 'Character',
11393
'checked' => 'Bool#checked',
11394
'disabled' => 'Bool#disabled',
11395
'maxlength' => 'Number',
11397
'readonly' => 'Bool#readonly',
11398
'size' => 'Number',
11399
'src' => 'URI#embeds',
11400
'tabindex' => 'Number',
11401
'type' => 'Enum#text,password,checkbox,button,radio,submit,reset,file,hidden,image',
11402
'value' => 'CDATA',
11404
$input->attr_transform_post[] = new HTMLPurifier_AttrTransform_Input();
11406
$this->addElement('select', 'Formctrl', 'Required: optgroup | option', 'Common', array(
11407
'disabled' => 'Bool#disabled',
11408
'multiple' => 'Bool#multiple',
11410
'size' => 'Number',
11411
'tabindex' => 'Number',
11414
$this->addElement('option', false, 'Optional: #PCDATA', 'Common', array(
11415
'disabled' => 'Bool#disabled',
11417
'selected' => 'Bool#selected',
11418
'value' => 'CDATA',
11420
// It's illegal for there to be more than one selected, but not
11421
// be multiple. Also, no selected means undefined behavior. This might
11422
// be difficult to implement; perhaps an injector, or a context variable.
11424
$textarea = $this->addElement('textarea', 'Formctrl', 'Optional: #PCDATA', 'Common', array(
11425
'accesskey' => 'Character',
11426
'cols*' => 'Number',
11427
'disabled' => 'Bool#disabled',
11429
'readonly' => 'Bool#readonly',
11430
'rows*' => 'Number',
11431
'tabindex' => 'Number',
11433
$textarea->attr_transform_pre[] = new HTMLPurifier_AttrTransform_Textarea();
11435
$button = $this->addElement('button', 'Formctrl', 'Optional: #PCDATA | Heading | List | Block | Inline', 'Common', array(
11436
'accesskey' => 'Character',
11437
'disabled' => 'Bool#disabled',
11439
'tabindex' => 'Number',
11440
'type' => 'Enum#button,submit,reset',
11441
'value' => 'CDATA',
11444
// For exclusions, ideally we'd specify content sets, not literal elements
11445
$button->excludes = $this->makeLookup(
11446
'form', 'fieldset', // Form
11447
'input', 'select', 'textarea', 'label', 'button', // Formctrl
11448
'a' // as per HTML 4.01 spec, this is omitted by modularization
11451
// Extra exclusion: img usemap="" is not permitted within this element.
11452
// We'll omit this for now, since we don't have any good way of
11453
// indicating it yet.
11455
// This is HIGHLY user-unfriendly; we need a custom child-def for this
11456
$this->addElement('fieldset', 'Form', 'Custom: (#WS?,legend,(Flow|#PCDATA)*)', 'Common');
11458
$label = $this->addElement('label', 'Formctrl', 'Optional: #PCDATA | Inline', 'Common', array(
11459
'accesskey' => 'Character',
11460
// 'for' => 'IDREF', // IDREF not implemented, cannot allow
11462
$label->excludes = array('label' => true);
11464
$this->addElement('legend', false, 'Optional: #PCDATA | Inline', 'Common', array(
11465
'accesskey' => 'Character',
11468
$this->addElement('optgroup', false, 'Required: option', 'Common', array(
11469
'disabled' => 'Bool#disabled',
11470
'label*' => 'Text',
11473
// Don't forget an injector for <isindex>. This one's a little complex
11474
// because it maps to multiple elements.
11484
* XHTML 1.1 Hypertext Module, defines hypertext links. Core Module.
11486
class HTMLPurifier_HTMLModule_Hypertext extends HTMLPurifier_HTMLModule
11489
public $name = 'Hypertext';
11491
public function setup($config) {
11492
$a = $this->addElement(
11493
'a', 'Inline', 'Inline', 'Common',
11495
// 'accesskey' => 'Character',
11496
// 'charset' => 'Charset',
11498
// 'hreflang' => 'LanguageCode',
11499
'rel' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rel'),
11500
'rev' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rev'),
11501
// 'tabindex' => 'Number',
11502
// 'type' => 'ContentType',
11505
$a->formatting = true;
11506
$a->excludes = array('a' => true);
11516
* XHTML 1.1 Image Module provides basic image embedding.
11517
* @note There is specialized code for removing empty images in
11518
* HTMLPurifier_Strategy_RemoveForeignElements
11520
class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
11523
public $name = 'Image';
11525
public function setup($config) {
11526
$max = $config->get('HTML.MaxImgLength');
11527
$img = $this->addElement(
11528
'img', 'Inline', 'Empty', 'Common',
11531
// According to the spec, it's Length, but percents can
11532
// be abused, so we allow only Pixels.
11533
'height' => 'Pixels#' . $max,
11534
'width' => 'Pixels#' . $max,
11535
'longdesc' => 'URI',
11536
'src*' => new HTMLPurifier_AttrDef_URI(true), // embedded
11539
if ($max === null || $config->get('HTML.Trusted')) {
11540
$img->attr['height'] =
11541
$img->attr['width'] = 'Length';
11544
// kind of strange, but splitting things up would be inefficient
11545
$img->attr_transform_pre[] =
11546
$img->attr_transform_post[] =
11547
new HTMLPurifier_AttrTransform_ImgRequired();
11557
* XHTML 1.1 Legacy module defines elements that were previously
11560
* @note Not all legacy elements have been implemented yet, which
11561
* is a bit of a reverse problem as compared to browsers! In
11562
* addition, this legacy module may implement a bit more than
11563
* mandated by XHTML 1.1.
11565
* This module can be used in combination with TransformToStrict in order
11566
* to transform as many deprecated elements as possible, but retain
11567
* questionably deprecated elements that do not have good alternatives
11568
* as well as transform elements that don't have an implementation.
11569
* See docs/ref-strictness.txt for more details.
11572
class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule
11575
public $name = 'Legacy';
11577
public function setup($config) {
11579
$this->addElement('basefont', 'Inline', 'Empty', false, array(
11580
'color' => 'Color',
11581
'face' => 'Text', // extremely broad, we should
11582
'size' => 'Text', // tighten it
11585
$this->addElement('center', 'Block', 'Flow', 'Common');
11586
$this->addElement('dir', 'Block', 'Required: li', 'Common', array(
11587
'compact' => 'Bool#compact'
11589
$this->addElement('font', 'Inline', 'Inline', array('Core', 'I18N'), array(
11590
'color' => 'Color',
11591
'face' => 'Text', // extremely broad, we should
11592
'size' => 'Text', // tighten it
11594
$this->addElement('menu', 'Block', 'Required: li', 'Common', array(
11595
'compact' => 'Bool#compact'
11598
$s = $this->addElement('s', 'Inline', 'Inline', 'Common');
11599
$s->formatting = true;
11601
$strike = $this->addElement('strike', 'Inline', 'Inline', 'Common');
11602
$strike->formatting = true;
11604
$u = $this->addElement('u', 'Inline', 'Inline', 'Common');
11605
$u->formatting = true;
11607
// setup modifications to old elements
11609
$align = 'Enum#left,right,center,justify';
11611
$address = $this->addBlankElement('address');
11612
$address->content_model = 'Inline | #PCDATA | p';
11613
$address->content_model_type = 'optional';
11614
$address->child = false;
11616
$blockquote = $this->addBlankElement('blockquote');
11617
$blockquote->content_model = 'Flow | #PCDATA';
11618
$blockquote->content_model_type = 'optional';
11619
$blockquote->child = false;
11621
$br = $this->addBlankElement('br');
11622
$br->attr['clear'] = 'Enum#left,all,right,none';
11624
$caption = $this->addBlankElement('caption');
11625
$caption->attr['align'] = 'Enum#top,bottom,left,right';
11627
$div = $this->addBlankElement('div');
11628
$div->attr['align'] = $align;
11630
$dl = $this->addBlankElement('dl');
11631
$dl->attr['compact'] = 'Bool#compact';
11633
for ($i = 1; $i <= 6; $i++) {
11634
$h = $this->addBlankElement("h$i");
11635
$h->attr['align'] = $align;
11638
$hr = $this->addBlankElement('hr');
11639
$hr->attr['align'] = $align;
11640
$hr->attr['noshade'] = 'Bool#noshade';
11641
$hr->attr['size'] = 'Pixels';
11642
$hr->attr['width'] = 'Length';
11644
$img = $this->addBlankElement('img');
11645
$img->attr['align'] = 'Enum#top,middle,bottom,left,right';
11646
$img->attr['border'] = 'Pixels';
11647
$img->attr['hspace'] = 'Pixels';
11648
$img->attr['vspace'] = 'Pixels';
11650
// figure out this integer business
11652
$li = $this->addBlankElement('li');
11653
$li->attr['value'] = new HTMLPurifier_AttrDef_Integer();
11654
$li->attr['type'] = 'Enum#s:1,i,I,a,A,disc,square,circle';
11656
$ol = $this->addBlankElement('ol');
11657
$ol->attr['compact'] = 'Bool#compact';
11658
$ol->attr['start'] = new HTMLPurifier_AttrDef_Integer();
11659
$ol->attr['type'] = 'Enum#s:1,i,I,a,A';
11661
$p = $this->addBlankElement('p');
11662
$p->attr['align'] = $align;
11664
$pre = $this->addBlankElement('pre');
11665
$pre->attr['width'] = 'Number';
11669
$table = $this->addBlankElement('table');
11670
$table->attr['align'] = 'Enum#left,center,right';
11671
$table->attr['bgcolor'] = 'Color';
11673
$tr = $this->addBlankElement('tr');
11674
$tr->attr['bgcolor'] = 'Color';
11676
$th = $this->addBlankElement('th');
11677
$th->attr['bgcolor'] = 'Color';
11678
$th->attr['height'] = 'Length';
11679
$th->attr['nowrap'] = 'Bool#nowrap';
11680
$th->attr['width'] = 'Length';
11682
$td = $this->addBlankElement('td');
11683
$td->attr['bgcolor'] = 'Color';
11684
$td->attr['height'] = 'Length';
11685
$td->attr['nowrap'] = 'Bool#nowrap';
11686
$td->attr['width'] = 'Length';
11688
$ul = $this->addBlankElement('ul');
11689
$ul->attr['compact'] = 'Bool#compact';
11690
$ul->attr['type'] = 'Enum#square,disc,circle';
11701
* XHTML 1.1 List Module, defines list-oriented elements. Core Module.
11703
class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
11706
public $name = 'List';
11708
// According to the abstract schema, the List content set is a fully formed
11709
// one or more expr, but it invariably occurs in an optional declaration
11710
// so we're not going to do that subtlety. It might cause trouble
11711
// if a user defines "List" and expects that multiple lists are
11712
// allowed to be specified, but then again, that's not very intuitive.
11713
// Furthermore, the actual XML Schema may disagree. Regardless,
11714
// we don't have support for such nested expressions without using
11715
// the incredibly inefficient and draconian Custom ChildDef.
11717
public $content_sets = array('Flow' => 'List');
11719
public function setup($config) {
11720
$this->addElement('ol', 'List', 'Required: li', 'Common');
11721
$this->addElement('ul', 'List', 'Required: li', 'Common');
11722
$this->addElement('dl', 'List', 'Required: dt | dd', 'Common');
11724
$this->addElement('li', false, 'Flow', 'Common');
11726
$this->addElement('dd', false, 'Flow', 'Common');
11727
$this->addElement('dt', false, 'Inline', 'Common');
11736
class HTMLPurifier_HTMLModule_Name extends HTMLPurifier_HTMLModule
11739
public $name = 'Name';
11741
public function setup($config) {
11742
$elements = array('a', 'applet', 'form', 'frame', 'iframe', 'img', 'map');
11743
foreach ($elements as $name) {
11744
$element = $this->addBlankElement($name);
11745
$element->attr['name'] = 'CDATA';
11746
if (!$config->get('HTML.Attr.Name.UseCDATA')) {
11747
$element->attr_transform_post['NameSync'] = new HTMLPurifier_AttrTransform_NameSync();
11758
class HTMLPurifier_HTMLModule_NonXMLCommonAttributes extends HTMLPurifier_HTMLModule
11760
public $name = 'NonXMLCommonAttributes';
11762
public $attr_collections = array(
11764
'lang' => 'LanguageCode',
11774
* XHTML 1.1 Object Module, defines elements for generic object inclusion
11775
* @warning Users will commonly use <embed> to cater to legacy browsers: this
11776
* module does not allow this sort of behavior
11778
class HTMLPurifier_HTMLModule_Object extends HTMLPurifier_HTMLModule
11781
public $name = 'Object';
11782
public $safe = false;
11784
public function setup($config) {
11786
$this->addElement('object', 'Inline', 'Optional: #PCDATA | Flow | param', 'Common',
11788
'archive' => 'URI',
11789
'classid' => 'URI',
11790
'codebase' => 'URI',
11791
'codetype' => 'Text',
11793
'declare' => 'Bool#declare',
11794
'height' => 'Length',
11796
'standby' => 'Text',
11797
'tabindex' => 'Number',
11798
'type' => 'ContentType',
11799
'width' => 'Length'
11803
$this->addElement('param', false, 'Empty', false,
11809
'valuetype' => 'Enum#data,ref,object'
11822
* XHTML 1.1 Presentation Module, defines simple presentation-related
11823
* markup. Text Extension Module.
11824
* @note The official XML Schema and DTD specs further divide this into
11826
* - Block Presentation (hr)
11827
* - Inline Presentation (b, big, i, small, sub, sup, tt)
11828
* We have chosen not to heed this distinction, as content_sets
11829
* provides satisfactory disambiguation.
11831
class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule
11834
public $name = 'Presentation';
11836
public function setup($config) {
11837
$this->addElement('hr', 'Block', 'Empty', 'Common');
11838
$this->addElement('sub', 'Inline', 'Inline', 'Common');
11839
$this->addElement('sup', 'Inline', 'Inline', 'Common');
11840
$b = $this->addElement('b', 'Inline', 'Inline', 'Common');
11841
$b->formatting = true;
11842
$big = $this->addElement('big', 'Inline', 'Inline', 'Common');
11843
$big->formatting = true;
11844
$i = $this->addElement('i', 'Inline', 'Inline', 'Common');
11845
$i->formatting = true;
11846
$small = $this->addElement('small', 'Inline', 'Inline', 'Common');
11847
$small->formatting = true;
11848
$tt = $this->addElement('tt', 'Inline', 'Inline', 'Common');
11849
$tt->formatting = true;
11859
* Module defines proprietary tags and attributes in HTML.
11860
* @warning If this module is enabled, standards-compliance is off!
11862
class HTMLPurifier_HTMLModule_Proprietary extends HTMLPurifier_HTMLModule
11865
public $name = 'Proprietary';
11867
public function setup($config) {
11869
$this->addElement('marquee', 'Inline', 'Flow', 'Common',
11871
'direction' => 'Enum#left,right,up,down',
11872
'behavior' => 'Enum#alternate',
11873
'width' => 'Length',
11874
'height' => 'Length',
11875
'scrolldelay' => 'Number',
11876
'scrollamount' => 'Number',
11877
'loop' => 'Number',
11878
'bgcolor' => 'Color',
11879
'hspace' => 'Pixels',
11880
'vspace' => 'Pixels',
11893
* XHTML 1.1 Ruby Annotation Module, defines elements that indicate
11894
* short runs of text alongside base text for annotation or pronunciation.
11896
class HTMLPurifier_HTMLModule_Ruby extends HTMLPurifier_HTMLModule
11899
public $name = 'Ruby';
11901
public function setup($config) {
11902
$this->addElement('ruby', 'Inline',
11903
'Custom: ((rb, (rt | (rp, rt, rp))) | (rbc, rtc, rtc?))',
11905
$this->addElement('rbc', false, 'Required: rb', 'Common');
11906
$this->addElement('rtc', false, 'Required: rt', 'Common');
11907
$rb = $this->addElement('rb', false, 'Inline', 'Common');
11908
$rb->excludes = array('ruby' => true);
11909
$rt = $this->addElement('rt', false, 'Inline', 'Common', array('rbspan' => 'Number'));
11910
$rt->excludes = array('ruby' => true);
11911
$this->addElement('rp', false, 'Optional: #PCDATA', 'Common');
11921
* A "safe" embed module. See SafeObject. This is a proprietary element.
11923
class HTMLPurifier_HTMLModule_SafeEmbed extends HTMLPurifier_HTMLModule
11926
public $name = 'SafeEmbed';
11928
public function setup($config) {
11930
$max = $config->get('HTML.MaxImgLength');
11931
$embed = $this->addElement(
11932
'embed', 'Inline', 'Empty', 'Common',
11934
'src*' => 'URI#embedded',
11935
'type' => 'Enum#application/x-shockwave-flash',
11936
'width' => 'Pixels#' . $max,
11937
'height' => 'Pixels#' . $max,
11938
'allowscriptaccess' => 'Enum#never',
11939
'allownetworking' => 'Enum#internal',
11940
'wmode' => 'Enum#window',
11944
$embed->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeEmbed();
11955
* A "safe" object module. In theory, objects permitted by this module will
11956
* be safe, and untrusted users can be allowed to embed arbitrary flash objects
11957
* (maybe other types too, but only Flash is supported as of right now).
11958
* Highly experimental.
11960
class HTMLPurifier_HTMLModule_SafeObject extends HTMLPurifier_HTMLModule
11963
public $name = 'SafeObject';
11965
public function setup($config) {
11967
// These definitions are not intrinsically safe: the attribute transforms
11968
// are a vital part of ensuring safety.
11970
$max = $config->get('HTML.MaxImgLength');
11971
$object = $this->addElement(
11974
'Optional: param | Flow | #PCDATA',
11977
// While technically not required by the spec, we're forcing
11978
// it to this value.
11979
'type' => 'Enum#application/x-shockwave-flash',
11980
'width' => 'Pixels#' . $max,
11981
'height' => 'Pixels#' . $max,
11982
'data' => 'URI#embedded'
11985
$object->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeObject();
11987
$param = $this->addElement('param', false, 'Empty', false,
11994
$param->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeParam();
11995
$this->info_injector[] = 'SafeObject';
12007
WARNING: THIS MODULE IS EXTREMELY DANGEROUS AS IT ENABLES INLINE SCRIPTING
12008
INSIDE HTML PURIFIER DOCUMENTS. USE ONLY WITH TRUSTED USER INPUT!!!
12013
* XHTML 1.1 Scripting module, defines elements that are used to contain
12014
* information pertaining to executable scripts or the lack of support
12015
* for executable scripts.
12016
* @note This module does not contain inline scripting elements
12018
class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule
12020
public $name = 'Scripting';
12021
public $elements = array('script', 'noscript');
12022
public $content_sets = array('Block' => 'script | noscript', 'Inline' => 'script | noscript');
12023
public $safe = false;
12025
public function setup($config) {
12026
// TODO: create custom child-definition for noscript that
12027
// auto-wraps stray #PCDATA in a similar manner to
12028
// blockquote's custom definition (we would use it but
12029
// blockquote's contents are optional while noscript's contents
12032
// TODO: convert this to new syntax, main problem is getting
12033
// both content sets working
12035
// In theory, this could be safe, but I don't see any reason to
12037
$this->info['noscript'] = new HTMLPurifier_ElementDef();
12038
$this->info['noscript']->attr = array( 0 => array('Common') );
12039
$this->info['noscript']->content_model = 'Heading | List | Block';
12040
$this->info['noscript']->content_model_type = 'required';
12042
$this->info['script'] = new HTMLPurifier_ElementDef();
12043
$this->info['script']->attr = array(
12044
'defer' => new HTMLPurifier_AttrDef_Enum(array('defer')),
12045
'src' => new HTMLPurifier_AttrDef_URI(true),
12046
'type' => new HTMLPurifier_AttrDef_Enum(array('text/javascript'))
12048
$this->info['script']->content_model = '#PCDATA';
12049
$this->info['script']->content_model_type = 'optional';
12050
$this->info['script']->attr_transform_pre['type'] =
12051
$this->info['script']->attr_transform_post['type'] =
12052
new HTMLPurifier_AttrTransform_ScriptRequired();
12061
* XHTML 1.1 Style Attribute Module, defines the style attribute.
12064
class HTMLPurifier_HTMLModule_StyleAttribute extends HTMLPurifier_HTMLModule
12067
public $name = 'StyleAttribute';
12068
public $attr_collections = array(
12069
// The inclusion routine differs from the Abstract Modules but
12070
// is in line with the DTD and XML Schemas.
12071
'Style' => array('style' => false), // see constructor
12072
'Core' => array(0 => array('Style'))
12075
public function setup($config) {
12076
$this->attr_collections['Style']['style'] = new HTMLPurifier_AttrDef_CSS();
12086
* XHTML 1.1 Tables Module, fully defines accessible table elements.
12088
class HTMLPurifier_HTMLModule_Tables extends HTMLPurifier_HTMLModule
12091
public $name = 'Tables';
12093
public function setup($config) {
12095
$this->addElement('caption', false, 'Inline', 'Common');
12097
$this->addElement('table', 'Block',
12098
new HTMLPurifier_ChildDef_Table(), 'Common',
12100
'border' => 'Pixels',
12101
'cellpadding' => 'Length',
12102
'cellspacing' => 'Length',
12103
'frame' => 'Enum#void,above,below,hsides,lhs,rhs,vsides,box,border',
12104
'rules' => 'Enum#none,groups,rows,cols,all',
12105
'summary' => 'Text',
12106
'width' => 'Length'
12110
// common attributes
12111
$cell_align = array(
12112
'align' => 'Enum#left,center,right,justify,char',
12113
'charoff' => 'Length',
12114
'valign' => 'Enum#top,middle,bottom,baseline',
12117
$cell_t = array_merge(
12120
'colspan' => 'Number',
12121
'rowspan' => 'Number',
12125
$this->addElement('td', false, 'Flow', 'Common', $cell_t);
12126
$this->addElement('th', false, 'Flow', 'Common', $cell_t);
12128
$this->addElement('tr', false, 'Required: td | th', 'Common', $cell_align);
12130
$cell_col = array_merge(
12132
'span' => 'Number',
12133
'width' => 'MultiLength',
12137
$this->addElement('col', false, 'Empty', 'Common', $cell_col);
12138
$this->addElement('colgroup', false, 'Optional: col', 'Common', $cell_col);
12140
$this->addElement('tbody', false, 'Required: tr', 'Common', $cell_align);
12141
$this->addElement('thead', false, 'Required: tr', 'Common', $cell_align);
12142
$this->addElement('tfoot', false, 'Required: tr', 'Common', $cell_align);
12153
* XHTML 1.1 Target Module, defines target attribute in link elements.
12155
class HTMLPurifier_HTMLModule_Target extends HTMLPurifier_HTMLModule
12158
public $name = 'Target';
12160
public function setup($config) {
12161
$elements = array('a');
12162
foreach ($elements as $name) {
12163
$e = $this->addBlankElement($name);
12165
'target' => new HTMLPurifier_AttrDef_HTML_FrameTarget()
12177
* XHTML 1.1 Text Module, defines basic text containers. Core Module.
12178
* @note In the normative XML Schema specification, this module
12179
* is further abstracted into the following modules:
12180
* - Block Phrasal (address, blockquote, pre, h1, h2, h3, h4, h5, h6)
12181
* - Block Structural (div, p)
12182
* - Inline Phrasal (abbr, acronym, cite, code, dfn, em, kbd, q, samp, strong, var)
12183
* - Inline Structural (br, span)
12184
* This module, functionally, does not distinguish between these
12185
* sub-modules, but the code is internally structured to reflect
12186
* these distinctions.
12188
class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule
12191
public $name = 'Text';
12192
public $content_sets = array(
12193
'Flow' => 'Heading | Block | Inline'
12196
public function setup($config) {
12198
// Inline Phrasal -------------------------------------------------
12199
$this->addElement('abbr', 'Inline', 'Inline', 'Common');
12200
$this->addElement('acronym', 'Inline', 'Inline', 'Common');
12201
$this->addElement('cite', 'Inline', 'Inline', 'Common');
12202
$this->addElement('dfn', 'Inline', 'Inline', 'Common');
12203
$this->addElement('kbd', 'Inline', 'Inline', 'Common');
12204
$this->addElement('q', 'Inline', 'Inline', 'Common', array('cite' => 'URI'));
12205
$this->addElement('samp', 'Inline', 'Inline', 'Common');
12206
$this->addElement('var', 'Inline', 'Inline', 'Common');
12208
$em = $this->addElement('em', 'Inline', 'Inline', 'Common');
12209
$em->formatting = true;
12211
$strong = $this->addElement('strong', 'Inline', 'Inline', 'Common');
12212
$strong->formatting = true;
12214
$code = $this->addElement('code', 'Inline', 'Inline', 'Common');
12215
$code->formatting = true;
12217
// Inline Structural ----------------------------------------------
12218
$this->addElement('span', 'Inline', 'Inline', 'Common');
12219
$this->addElement('br', 'Inline', 'Empty', 'Core');
12221
// Block Phrasal --------------------------------------------------
12222
$this->addElement('address', 'Block', 'Inline', 'Common');
12223
$this->addElement('blockquote', 'Block', 'Optional: Heading | Block | List', 'Common', array('cite' => 'URI') );
12224
$pre = $this->addElement('pre', 'Block', 'Inline', 'Common');
12225
$pre->excludes = $this->makeLookup(
12226
'img', 'big', 'small', 'object', 'applet', 'font', 'basefont' );
12227
$this->addElement('h1', 'Heading', 'Inline', 'Common');
12228
$this->addElement('h2', 'Heading', 'Inline', 'Common');
12229
$this->addElement('h3', 'Heading', 'Inline', 'Common');
12230
$this->addElement('h4', 'Heading', 'Inline', 'Common');
12231
$this->addElement('h5', 'Heading', 'Inline', 'Common');
12232
$this->addElement('h6', 'Heading', 'Inline', 'Common');
12234
// Block Structural -----------------------------------------------
12235
$p = $this->addElement('p', 'Block', 'Inline', 'Common');
12236
$p->autoclose = array_flip(array("address", "blockquote", "center", "dir", "div", "dl", "fieldset", "ol", "p", "ul"));
12238
$this->addElement('div', 'Block', 'Flow', 'Common');
/**
 * Abstract class for a set of proprietary modules that clean up (tidy)
 * poorly written HTML.
 * @todo Figure out how to protect some of these methods/properties
 */
class HTMLPurifier_HTMLModule_Tidy extends HTMLPurifier_HTMLModule
{

    /**
     * List of supported levels. Index zero is a special case "no fixes"
     * level.
     */
    public $levels = array(0 => 'none', 'light', 'medium', 'heavy');

    /**
     * Default level to place all fixes in. Disabled by default
     */
    public $defaultLevel = null;

    /**
     * Lists of fixes used by getFixesForLevel(). Format is:
     *      HTMLModule_Tidy->fixesForLevel[$level] = array('fix-1', 'fix-2');
     */
    public $fixesForLevel = array(
        'light'  => array(),
        'medium' => array(),
        'heavy'  => array()
    );

    /**
     * Lazy load constructs the module by determining the necessary
     * fixes to create and then delegating to the populate() function.
     * @param $config Configuration object; HTML.TidyLevel, HTML.TidyAdd and
     *        HTML.TidyRemove are read from it
     * @todo Wildcard matching and error reporting when an added or
     *       subtracted fix has no effect.
     */
    public function setup($config) {

        // create fixes, initialize fixesForLevel
        $fixes = $this->makeFixes();
        // a makeFixes() override that forgets to return must not break
        // the array operations below
        if (!is_array($fixes)) $fixes = array();
        $this->makeFixesForLevel($fixes);

        // figure out which fixes to use
        $level = $config->get('HTML.TidyLevel');
        $fixes_lookup = $this->getFixesForLevel($level);

        // get custom fix declarations: these need namespace processing
        $add_fixes    = $config->get('HTML.TidyAdd');
        $remove_fixes = $config->get('HTML.TidyRemove');

        foreach ($fixes as $name => $fix) {
            // needs to be refactored a little to implement globbing
            if (
                isset($remove_fixes[$name]) ||
                (!isset($add_fixes[$name]) && !isset($fixes_lookup[$name]))
            ) {
                unset($fixes[$name]);
            }
        }

        // populate this module with necessary fixes
        $this->populate($fixes);

    }

    /**
     * Retrieves all fixes per a level, returning fixes for that specific
     * level as well as all levels below it.
     * @param $level String level identifier, see $levels for valid values
     * @return Lookup table of fixes (fix name => true); empty for the
     *         "none" level and for unrecognized levels
     */
    public function getFixesForLevel($level) {
        if ($level == $this->levels[0]) {
            return array();
        }
        $activated_levels = array();
        for ($i = 1, $c = count($this->levels); $i < $c; $i++) {
            $activated_levels[] = $this->levels[$i];
            if ($this->levels[$i] == $level) break;
        }
        if ($i == $c) {
            // the loop ran off the end without matching $level
            trigger_error(
                'Tidy level ' . htmlspecialchars($level) . ' not recognized',
                E_USER_WARNING
            );
            return array();
        }
        $ret = array();
        foreach ($activated_levels as $level) {
            foreach ($this->fixesForLevel[$level] as $fix) {
                $ret[$fix] = true;
            }
        }
        return $ret;
    }

    /**
     * Dynamically populates the $fixesForLevel member variable using
     * the fixes array. It may be custom overloaded, used in conjunction
     * with $defaultLevel, or not used at all.
     * @param $fixes Associative array of fix name to fix implementation
     */
    public function makeFixesForLevel($fixes) {
        if (!isset($this->defaultLevel)) return;
        if (!isset($this->fixesForLevel[$this->defaultLevel])) {
            trigger_error(
                'Default level ' . $this->defaultLevel . ' does not exist',
                E_USER_ERROR
            );
            return;
        }
        $this->fixesForLevel[$this->defaultLevel] = array_keys($fixes);
    }

    /**
     * Populates the module with transforms and other special-case code
     * based on a list of fixes passed to it
     * @param $fixes Associative array of fix name to fix implementation
     *        for every fix that should be activated
     */
    public function populate($fixes) {
        foreach ($fixes as $name => $fix) {
            // determine what the fix is for
            list($type, $params) = $this->getFixType($name);
            switch ($type) {
                case 'attr_transform_pre':
                case 'attr_transform_post':
                    $attr = $params['attr'];
                    if (isset($params['element'])) {
                        $element = $params['element'];
                        if (empty($this->info[$element])) {
                            $e = $this->addBlankElement($element);
                        } else {
                            $e = $this->info[$element];
                        }
                    } else {
                        // no element named: register on the module itself
                        $type = "info_$type";
                        $e = $this;
                    }
                    // The braces force "property named $type, then index
                    // [$attr]" regardless of how the PHP version in use
                    // would parse $e->$type[$attr].
                    $e->{$type}[$attr] = $fix;
                    break;
                case 'tag_transform':
                    $this->info_tag_transform[$params['element']] = $fix;
                    break;
                case 'content_model_type':
                    $element = $params['element'];
                    if (empty($this->info[$element])) {
                        $e = $this->addBlankElement($element);
                    } else {
                        $e = $this->info[$element];
                    }
                    $e->$type = $fix;
                    break;
                default:
                    trigger_error("Fix type $type not supported", E_USER_ERROR);
                    break;
            }
        }
    }

    /**
     * Parses a fix name and determines what kind of fix it is, as well
     * as other information defined by the fix
     * @param $name String name of fix, of the form element@attr#property
     *        where every part is optional
     * @return array(string $fix_type, array $fix_parameters)
     * @note $fix_parameters is type dependent, see populate() for usage
     *       of these parameters
     */
    public function getFixType($name) {
        // parse it
        $property = $attr = null;
        if (strpos($name, '#') !== false) list($name, $property) = explode('#', $name);
        if (strpos($name, '@') !== false) list($name, $attr)     = explode('@', $name);

        // figure out the parameters
        $params = array();
        if ($name !== '')    $params['element'] = $name;
        if (!is_null($attr)) $params['attr'] = $attr;

        // special case: attribute transform
        if (!is_null($attr)) {
            if (is_null($property)) $property = 'pre';
            $type = 'attr_transform_' . $property;
            return array($type, $params);
        }

        // special case: tag transform
        if (is_null($property)) {
            return array('tag_transform', $params);
        }

        return array($property, $params);
    }

    /**
     * Defines all fixes the module will perform in a compact
     * associative array of fix name to fix implementation.
     * @return Associative array; the base implementation defines no fixes
     */
    public function makeFixes() {
        return array();
    }

}
/**
 * Module that registers the xml:lang attribute, typed as a LanguageCode,
 * in an attribute collection.
 */
class HTMLPurifier_HTMLModule_XMLCommonAttributes extends HTMLPurifier_HTMLModule
{
    public $name = 'XMLCommonAttributes';

    // NOTE(review): the collection key 'Lang' follows the upstream
    // HTML Purifier layout for this module -- confirm against the
    // unbundled source file.
    public $attr_collections = array(
        'Lang' => array(
            'xml:lang' => 'LanguageCode',
        )
    );
}
/**
 * Name is deprecated, but allowed in strict doctypes, so it is only
 * transformed at the 'heavy' tidy level.
 */
class HTMLPurifier_HTMLModule_Tidy_Name extends HTMLPurifier_HTMLModule_Tidy
{
    public $name = 'Tidy_Name';
    public $defaultLevel = 'heavy';

    /**
     * @return Associative array of fix name to fix implementation
     */
    public function makeFixes() {

        $r = array();

        // @name for img, a -----------------------------------------------
        // Technically, it's allowed even on strict, so we allow authors to use
        // it. However, it's deprecated in future versions of XHTML.
        $name_transform = new HTMLPurifier_AttrTransform_Name();
        $r['img@name'] = $name_transform;
        $r['a@name']   = $name_transform;

        return $r;
    }

}
/**
 * Tidy module that registers a background attribute transform on the
 * table-related elements. Active from the 'light' level upwards.
 */
class HTMLPurifier_HTMLModule_Tidy_Proprietary extends HTMLPurifier_HTMLModule_Tidy
{

    public $name = 'Tidy_Proprietary';
    public $defaultLevel = 'light';

    /**
     * @return Associative array of fix name to fix implementation
     */
    public function makeFixes() {
        $r = array();
        // each element gets its own transform instance
        $elements = array('table', 'td', 'th', 'tr', 'thead', 'tfoot', 'tbody');
        foreach ($elements as $element) {
            $r[$element . '@background'] = new HTMLPurifier_AttrTransform_Background();
        }
        return $r;
    }

}
/**
 * Tidy fixes shared by the XHTML and HTML 4 doctypes: deprecated tags are
 * mapped onto other elements and deprecated presentational attributes are
 * mapped onto CSS declarations.
 */
class HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 extends HTMLPurifier_HTMLModule_Tidy
{

    /**
     * @return Associative array of fix name to fix implementation
     */
    public function makeFixes() {

        $r = array();

        // == deprecated tag transforms ===================================

        $r['font']   = new HTMLPurifier_TagTransform_Font();
        $r['menu']   = new HTMLPurifier_TagTransform_Simple('ul');
        $r['dir']    = new HTMLPurifier_TagTransform_Simple('ul');
        $r['center'] = new HTMLPurifier_TagTransform_Simple('div', 'text-align:center;');
        $r['u']      = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:underline;');
        $r['s']      = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:line-through;');
        $r['strike'] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:line-through;');

        // == deprecated attribute transforms =============================

        // @align for caption ---------------------------------------------
        $r['caption@align'] =
            new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
                // we're following IE's behavior, not Firefox's, due
                // to the fact that no one supports caption-side:right,
                // W3C included (with CSS 2.1). This is a slightly
                // unreasonable attribute!
                'left'   => 'text-align:left;',
                'right'  => 'text-align:right;',
                'top'    => 'caption-side:top;',
                'bottom' => 'caption-side:bottom;' // not supported by IE
            ));

        // @align for img -------------------------------------------------
        $r['img@align'] =
            new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
                'left'   => 'float:left;',
                'right'  => 'float:right;',
                'top'    => 'vertical-align:top;',
                'middle' => 'vertical-align:middle;',
                'bottom' => 'vertical-align:baseline;',
            ));

        // @align for table -----------------------------------------------
        $r['table@align'] =
            new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
                'left'   => 'float:left;',
                'center' => 'margin-left:auto;margin-right:auto;',
                'right'  => 'float:right;'
            ));

        // @align for hr -----------------------------------------------
        $r['hr@align'] =
            new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
                // we use both text-align and margin because these work
                // for different browsers (IE and Firefox, respectively)
                // and the melange makes for a pretty cross-compatible
                // solution
                'left'   => 'margin-left:0;margin-right:auto;text-align:left;',
                'center' => 'margin-left:auto;margin-right:auto;text-align:center;',
                'right'  => 'margin-left:auto;margin-right:0;text-align:right;'
            ));

        // @align for h1, h2, h3, h4, h5, h6, p, div ----------------------
        // {{{
            $align_lookup = array();
            $align_values = array('left', 'right', 'center', 'justify');
            foreach ($align_values as $v) $align_lookup[$v] = "text-align:$v;";
        // }}}
        // one transform instance is shared by all of these elements
        $text_align = new HTMLPurifier_AttrTransform_EnumToCSS('align', $align_lookup);
        foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div') as $element) {
            $r[$element . '@align'] = $text_align;
        }

        // @bgcolor for table, tr, td, th ---------------------------------
        // NOTE(review): the heading above lists tr, but only three
        // assignment targets precede this transform in the bundle; tr is
        // left unregistered here -- confirm against the unbundled source.
        $r['table@bgcolor'] =
        $r['td@bgcolor'] =
        $r['th@bgcolor'] =
            new HTMLPurifier_AttrTransform_BgColor();

        // @border for img ------------------------------------------------
        $r['img@border'] = new HTMLPurifier_AttrTransform_Border();

        // @clear for br --------------------------------------------------
        $r['br@clear'] =
            new HTMLPurifier_AttrTransform_EnumToCSS('clear', array(
                'left'  => 'clear:left;',
                'right' => 'clear:right;',
                'all'   => 'clear:both;',
                'none'  => 'clear:none;',
            ));

        // @height for td, th ---------------------------------------------
        $r['td@height'] =
        $r['th@height'] =
            new HTMLPurifier_AttrTransform_Length('height');

        // @hspace for img ------------------------------------------------
        $r['img@hspace'] = new HTMLPurifier_AttrTransform_ImgSpace('hspace');

        // @noshade for hr ------------------------------------------------
        // this transformation is not precise but often good enough.
        // different browsers use different styles to designate noshade
        $r['hr@noshade'] =
            new HTMLPurifier_AttrTransform_BoolToCSS(
                'noshade',
                'color:#808080;background-color:#808080;border:0;'
            );

        // @nowrap for td, th ---------------------------------------------
        $r['td@nowrap'] =
        $r['th@nowrap'] =
            new HTMLPurifier_AttrTransform_BoolToCSS(
                'nowrap',
                'white-space:nowrap;'
            );

        // @size for hr  --------------------------------------------------
        $r['hr@size'] = new HTMLPurifier_AttrTransform_Length('size', 'height');

        // @type for li, ol, ul -------------------------------------------
        // {{{
            $ul_types = array(
                'disc'   => 'list-style-type:disc;',
                'square' => 'list-style-type:square;',
                'circle' => 'list-style-type:circle;'
            );
            $ol_types = array(
                '1'   => 'list-style-type:decimal;',
                'i'   => 'list-style-type:lower-roman;',
                'I'   => 'list-style-type:upper-roman;',
                'a'   => 'list-style-type:lower-alpha;',
                'A'   => 'list-style-type:upper-alpha;'
            );
            $li_types = $ul_types + $ol_types;
        // }}}

        $r['ul@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ul_types);
        $r['ol@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ol_types, true);
        $r['li@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $li_types, true);

        // @vspace for img ------------------------------------------------
        $r['img@vspace'] = new HTMLPurifier_AttrTransform_ImgSpace('vspace');

        // @width for hr, td, th ------------------------------------------
        $r['td@width'] =
        $r['th@width'] =
        $r['hr@width'] = new HTMLPurifier_AttrTransform_Length('width');

        return $r;

    }

}
/**
 * Tidy module for strict doctypes: inherits the shared XHTML/HTML 4 fixes
 * and additionally switches blockquote to the 'strictblockquote' content
 * model, for which it supplies the child definition itself.
 */
class HTMLPurifier_HTMLModule_Tidy_Strict extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
{
    public $name = 'Tidy_Strict';
    public $defaultLevel = 'light';

    /**
     * @return Associative array of fix name to fix implementation
     */
    public function makeFixes() {
        $r = parent::makeFixes();
        $r['blockquote#content_model_type'] = 'strictblockquote';
        return $r;
    }

    public $defines_child_def = true;

    /**
     * @param $def Element definition whose content_model_type and
     *        content_model are inspected
     * @return HTMLPurifier_ChildDef_StrictBlockquote for the
     *         'strictblockquote' type, otherwise whatever the parent returns
     */
    public function getChildDef($def) {
        if ($def->content_model_type != 'strictblockquote') {
            return parent::getChildDef($def);
        }
        return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model);
    }
}
/**
 * Tidy module for transitional doctypes: the shared XHTML/HTML 4 fixes,
 * all placed in the 'heavy' level.
 */
class HTMLPurifier_HTMLModule_Tidy_Transitional extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
{
    public $name = 'Tidy_Transitional';
    public $defaultLevel = 'heavy';
}
/**
 * Tidy module that registers a lang attribute transform for all elements
 * (the fix name carries no element part). Active from 'medium' upwards.
 */
class HTMLPurifier_HTMLModule_Tidy_XHTML extends HTMLPurifier_HTMLModule_Tidy
{

    public $name = 'Tidy_XHTML';
    public $defaultLevel = 'medium';

    /**
     * @return Associative array of fix name to fix implementation
     */
    public function makeFixes() {
        $r = array();
        $r['@lang'] = new HTMLPurifier_AttrTransform_Lang();
        return $r;
    }

}
/**
 * Injector that auto paragraphs text in the root node based on
 * double newlines.
 * @todo Ensure all states are unit tested, including variations as well.
 * @todo Make a graph of the flow control for this Injector.
 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{

    public $name = 'AutoParagraph';
    public $needed = array('p');

    /**
     * Builds a <p> start token armored with the
     * MakeWellFormed_TagClosedError flag.
     */
    private function _pStart() {
        $par = new HTMLPurifier_Token_Start('p');
        $par->armor['MakeWellFormed_TagClosedError'] = true;
        return $par;
    }

    /**
     * Decides whether a text token must be wrapped in, or split into,
     * paragraphs. $token is replaced by an array of tokens when a rewrite
     * is needed and left untouched otherwise.
     */
    public function handleText(&$token) {
        $text = $token->data;
        // Does the current parent allow <p> tags?
        if ($this->allowsElement('p')) {
            if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
                // Note that we have differing behavior when dealing with text
                // in the anonymous root node, or a node inside the document.
                // If the text has a double-newline, the treatment is the same;
                // if it doesn't, see the next if-block if you're in the document.

                $i = $nesting = null;
                if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
                    // State 1.1: ...    ^ (whitespace, then document end)
                    // This is a degenerate case
                } else {
                    // State 1.2: PAR1
                    // State 1.3: PAR1\n\nPAR2
                    // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
                    $token = array($this->_pStart());
                    $this->_splitText($text, $token);
                }
            } else {
                // State 2:   <div>PAR1... (similar to 1.4)

                // We're in an element that allows paragraph tags, but we're not
                // sure if we're going to need them.
                if ($this->_pLookAhead()) {
                    // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                    // Note: This will always be the first child, since any
                    // previous inline element would have triggered this very
                    // same routine, and found the double newline. One possible
                    // exception would be a comment.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 2.2.1: <div>PAR1<div>
                    // State 2.2.2: <div>PAR1<b>PAR1</b></div>
                }
            }
        // Is the current parent a <p> tag?
        } elseif (
            !empty($this->currentNesting) &&
            $this->currentNesting[count($this->currentNesting)-1]->name == 'p'
        ) {
            // State 3.1: ...<p>PAR1
            // State 3.2: ...<p>PAR1\n\nPAR2
            $token = array();
            $this->_splitText($text, $token);
        // Abort!
        } else {
            // State 4.1: ...<b>PAR1
            // State 4.2: ...<b>PAR1\n\nPAR2
        }
    }

    /**
     * Decides whether a <p> start token (and, at the root, a separating
     * double newline) must be inserted in front of an element token.
     */
    public function handleElement(&$token) {
        // We don't have to check if we're already in a <p> tag for block
        // tokens, because the tag would have been autoclosed by MakeWellFormed.
        if ($this->allowsElement('p')) {
            if (!empty($this->currentNesting)) {
                if ($this->_isInline($token)) {
                    // State 1: <div>...<b>

                    // Check if this token is adjacent to the parent token
                    // (seek backwards until token isn't whitespace)
                    $i = null;
                    $this->backward($i, $prev);

                    if (!$prev instanceof HTMLPurifier_Token_Start) {
                        // Token wasn't adjacent

                        if (
                            $prev instanceof HTMLPurifier_Token_Text &&
                            substr($prev->data, -2) === "\n\n"
                        ) {
                            // State 1.1.4: <div><p>PAR1</p>\n\n<b>

                            // Quite frankly, this should be handled by splitText
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.1.1: <div><p>PAR1</p><b>
                            // State 1.1.2: <div><br /><b>
                            // State 1.1.3: <div>PAR<b>
                        }

                    } else {
                        // State 1.2.1: <div><b>

                        // Lookahead to see if <p> is needed.
                        if ($this->_pLookAhead()) {
                            // State 1.3.1: <div><b>PAR1\n\nPAR2
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.3.2: <div><b>PAR1</b></div>
                            // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                        }
                    }
                } else {
                    // State 2.3: ...<div>
                }
            } else {
                if ($this->_isInline($token)) {
                    // State 3.1: <b>
                    // This is where the {p} tag is inserted, not reflected in
                    // inputTokens yet, however.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 3.2: <div>
                }

                $i = null;
                if ($this->backward($i, $prev)) {
                    if (
                        !$prev instanceof HTMLPurifier_Token_Text
                    ) {
                        // State 3.1.1: ...</p>{p}<b>
                        // State 3.2.1: ...</p><div>

                        if (!is_array($token)) $token = array($token);
                        array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
                    } else {
                        // State 3.1.2: ...</p>\n\n{p}<b>
                        // State 3.2.2: ...</p>\n\n<div>

                        // Note: PAR<ELEM> cannot occur because PAR would have been
                        // wrapped in <p> tags.
                    }
                }
            }
        } else {
            // State 2.2: <ul><li>
            // State 2.4: <p><b>
        }
    }

    /**
     * Splits up a text in paragraph tokens and appends them
     * to the result stream that will replace the original
     * @param $data String text data that will be processed
     *    into paragraphs
     * @param $result Reference to array of tokens that the
     *    tags will be appended onto
     */
    private function _splitText($data, &$result) {
        $raw_paragraphs = explode("\n\n", $data);
        $paragraphs  = array(); // without empty paragraphs
        $needs_start = false;
        $needs_end   = false;

        $c = count($raw_paragraphs);
        if ($c == 1) {
            // There were no double-newlines, abort quickly. In theory this
            // should never happen.
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }
        for ($i = 0; $i < $c; $i++) {
            $par = $raw_paragraphs[$i];
            if (trim($par) !== '') {
                $paragraphs[] = $par;
            } else {
                if ($i == 0) {
                    // Double newline at the front
                    if (empty($result)) {
                        // The empty result indicates that the AutoParagraph
                        // injector did not add any start paragraph tokens.
                        // This means that we have been in a paragraph for
                        // a while, and the newline means we should start a new one.
                        $result[] = new HTMLPurifier_Token_End('p');
                        $result[] = new HTMLPurifier_Token_Text("\n\n");
                        // However, the start token should only be added if
                        // there is more processing to be done (i.e. there are
                        // real paragraphs in here). If there are none, the
                        // next start paragraph tag will be handled by the
                        // next call to the injector
                        $needs_start = true;
                    } else {
                        // We just started a new paragraph!
                        // Reinstate a double-newline for presentation's sake, since
                        // it was in the source code.
                        array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                    }
                } elseif ($i + 1 == $c) {
                    // Double newline at the end
                    // There should be a trailing </p> when we're finally done.
                    $needs_end = true;
                }
            }
        }

        // Check if this was just a giant blob of whitespace. Move this earlier,
        // perhaps?
        if (empty($paragraphs)) {
            return;
        }

        // Add the start tag indicated by \n\n at the beginning of $data
        if ($needs_start) {
            $result[] = $this->_pStart();
        }

        // Append the paragraphs onto the result
        foreach ($paragraphs as $par) {
            $result[] = new HTMLPurifier_Token_Text($par);
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
            $result[] = $this->_pStart();
        }

        // Remove trailing start token; Injector will handle this later if
        // it was indeed needed. This prevents from needing to do a lookahead,
        // at the cost of a lookbehind later.
        array_pop($result);

        // If there is no need for an end tag, remove all of it and let
        // MakeWellFormed close it later.
        if (!$needs_end) {
            array_pop($result); // removes \n\n
            array_pop($result); // removes </p>
        }

    }

    /**
     * Returns true if passed token is inline (and, ergo, allowed in
     * paragraph tags)
     */
    private function _isInline($token) {
        return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
    }

    /**
     * Looks ahead in the token list and determines whether or not we need
     * to insert a <p> tag.
     * @return Boolean; false when the scan finishes without a verdict
     */
    private function _pLookAhead() {
        $this->current($i, $current);
        $nesting = ($current instanceof HTMLPurifier_Token_Start) ? 1 : 0;
        $ok = false;
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {
            $result = $this->_checkNeedsP($current);
            if ($result !== null) {
                $ok = $result;
                break;
            }
        }
        return $ok;
    }

    /**
     * Determines if a particular token requires an earlier inline token
     * to get a paragraph. This should be used with _forwardUntilEndToken
     * @return true if a paragraph is needed, false if definitely not,
     *         null if this token does not settle the question
     */
    private function _checkNeedsP($current) {
        if ($current instanceof HTMLPurifier_Token_Start){
            if (!$this->_isInline($current)) {
                // <div>PAR1<div>
                // Terminate early, since we hit a block element
                return false;
            }
        } elseif ($current instanceof HTMLPurifier_Token_Text) {
            if (strpos($current->data, "\n\n") !== false) {
                // <div>PAR1<b>PAR1\n\nPAR2
                return true;
            } else {
                // <div>PAR1<b>PAR1...
            }
        }
        return null;
    }

}
/**
 * Injector that displays the URL of an anchor instead of linking to it, in addition to showing the text of the link.
 */
class HTMLPurifier_Injector_DisplayLinkURI extends HTMLPurifier_Injector
{

    public $name = 'DisplayLinkURI';
    public $needed = array('a');

    /**
     * Intentionally empty: all work happens when the end tag is seen.
     */
    public function handleElement(&$token) {
    }

    /**
     * If the matching start token carries an href, strips it and appends
     * the URL in parentheses as a text token after the end tag.
     */
    public function handleEnd(&$token) {
        if (!isset($token->start->attr['href'])) {
            // nothing to display
            return;
        }
        $url = $token->start->attr['href'];
        unset($token->start->attr['href']);
        $token = array($token, new HTMLPurifier_Token_Text(" ($url)"));
    }
}
/**
 * Injector that converts http, https and ftp text URLs to actual links.
 */
class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector
{

    public $name = 'Linkify';
    public $needed = array('a' => array('href'));

    /**
     * Replaces a text token containing URLs with an array of tokens in
     * which every URL is wrapped in an <a href="..."> element. The token
     * is left untouched when links are not allowed here or nothing matches.
     */
    public function handleText(&$token) {
        if (!$this->allowsElement('a')) return;

        if (strpos($token->data, '://') === false) {
            // our really quick heuristic failed, abort
            // this may not work so well if we want to match things like
            // "google.com", but then again, most people don't
            return;
        }

        // there is/are URL(s). Let's split the string:
        // Note: this regex is extremely permissive
        $bits = preg_split('#((?:https?|ftp)://[^\s\'"<>()]+)#S', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);

        // preg_split() yields false on a PCRE failure; keep the text as-is
        // rather than replacing it with nothing
        if ($bits === false) return;

        $token = array();

        // $i = index
        // $c = count
        // $l = is link (odd positions hold the captured URLs)
        for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) {
            if (!$l) {
                if ($bits[$i] === '') continue;
                $token[] = new HTMLPurifier_Token_Text($bits[$i]);
            } else {
                $token[] = new HTMLPurifier_Token_Start('a', array('href' => $bits[$i]));
                $token[] = new HTMLPurifier_Token_Text($bits[$i]);
                $token[] = new HTMLPurifier_Token_End('a');
            }
        }

    }

}
/**
 * Injector that converts configuration directive syntax %Namespace.Directive
 * into links, using the AutoFormat.PurifierLinkify.DocURL setting as the
 * href template (its %s placeholder receives the directive name).
 */
class HTMLPurifier_Injector_PurifierLinkify extends HTMLPurifier_Injector
{

    public $name = 'PurifierLinkify';
    public $docURL;
    public $needed = array('a' => array('href'));

    /**
     * Reads the documentation URL template, then defers to the parent.
     */
    public function prepare($config, $context) {
        $this->docURL = $config->get('AutoFormat.PurifierLinkify.DocURL');
        return parent::prepare($config, $context);
    }

    /**
     * Replaces a text token containing %Namespace.Directive references
     * with an array of tokens in which each reference is a link.
     */
    public function handleText(&$token) {
        if (!$this->allowsElement('a')) return;
        if (strpos($token->data, '%') === false) return;

        $bits = preg_split('#%([a-z0-9]+\.[a-z0-9]+)#Si', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);

        // preg_split() yields false on a PCRE failure; keep the text as-is
        if ($bits === false) return;

        $token = array();

        // $i = index
        // $c = count
        // $l = is link (odd positions hold the captured directive names)
        for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) {
            if (!$l) {
                if ($bits[$i] === '') continue;
                $token[] = new HTMLPurifier_Token_Text($bits[$i]);
            } else {
                $token[] = new HTMLPurifier_Token_Start('a',
                    array('href' => str_replace('%s', $bits[$i], $this->docURL)));
                $token[] = new HTMLPurifier_Token_Text('%' . $bits[$i]);
                $token[] = new HTMLPurifier_Token_End('a');
            }
        }

    }

}
/**
 * Injector that drops start tags whose content, up to the matching end
 * tag, is nothing but whitespace (and optionally non-breaking spaces).
 */
class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
{

    private $context, $config, $attrValidator, $removeNbsp, $removeNbspExceptions;

    /**
     * Caches the config/context pair and the RemoveNbsp settings, and
     * creates the attribute validator used by handleElement().
     */
    public function prepare($config, $context) {
        parent::prepare($config, $context);
        $this->config = $config;
        $this->context = $context;
        $this->removeNbsp = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp');
        $this->removeNbspExceptions = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp.Exceptions');
        $this->attrValidator = new HTMLPurifier_AttrValidator();
    }

    /**
     * Looks ahead from a start tag; if only ignorable text separates it
     * from its own end tag (or from the end of input), the element is
     * removed -- unless it is a colgroup or carries an id/name attribute.
     */
    public function handleElement(&$token) {
        if (!$token instanceof HTMLPurifier_Token_Start) return;
        $next = false;
        for ($i = $this->inputIndex + 1, $c = count($this->inputTokens); $i < $c; $i++) {
            $next = $this->inputTokens[$i];
            if ($next instanceof HTMLPurifier_Token_Text) {
                if ($next->is_whitespace) continue;
                if ($this->removeNbsp && !isset($this->removeNbspExceptions[$token->name])) {
                    // "\xC2\xA0" is the UTF-8 encoding of U+00A0 (nbsp)
                    $plain = str_replace("\xC2\xA0", "", $next->data);
                    $isWsOrNbsp = $plain === '' || ctype_space($plain);
                    if ($isWsOrNbsp) continue;
                }
            }
            break;
        }
        if (!$next || ($next instanceof HTMLPurifier_Token_End && $next->name == $token->name)) {
            if ($token->name == 'colgroup') return;
            $this->attrValidator->validateToken($token, $this->config, $this->context);
            $token->armor['ValidateAttributes'] = true;
            if (isset($token->attr['id']) || isset($token->attr['name'])) return;
            // Span from this start tag through the token the scan stopped
            // on; presumably read by the caller as a count of tokens to
            // delete -- TODO confirm against the injector contract.
            $token = $i - $this->inputIndex + 1;
            for ($b = $this->inputIndex - 1; $b > 0; $b--) {
                $prev = $this->inputTokens[$b];
                if ($prev instanceof HTMLPurifier_Token_Text && $prev->is_whitespace) continue;
                break;
            }
            // This is safe because we removed the token that triggered this.
            $this->rewind($b - 1);
            return;
        }
    }

}
/**
 * Adds important param elements to inside of object in order to make
 * things safe.
 */
class HTMLPurifier_Injector_SafeObject extends HTMLPurifier_Injector
{
    public $name = 'SafeObject';
    public $needed = array('object', 'param');

    protected $objectStack = array();
    protected $paramStack  = array();

    // Keep this synchronized with AttrTransform/SafeParam.php
    protected $addParam = array(
        'allowScriptAccess' => 'never',
        'allowNetworking' => 'internal',
    );
    // NOTE(review): 'movie' is required by the data-attribute fix in
    // handleElement(); 'wmode' follows the upstream HTML Purifier list --
    // confirm against AttrTransform/SafeParam.php.
    protected $allowedParam = array(
        'wmode' => true,
        'movie' => true,
    );

    // NOTE(review): the parent's return value is discarded here, unlike
    // HTMLPurifier_Injector_PurifierLinkify::prepare(); confirm whether
    // that is deliberate before changing it.
    public function prepare($config, $context) {
        parent::prepare($config, $context);
    }

    /**
     * On <object>: pushes the token on the object stack and appends one
     * param token per $addParam entry. On <param>: keeps it only when it
     * sits directly inside an object and its name is permitted; otherwise
     * the token is replaced by false.
     */
    public function handleElement(&$token) {
        if ($token->name == 'object') {
            $this->objectStack[] = $token;
            $this->paramStack[] = array();
            $new = array($token);
            foreach ($this->addParam as $name => $value) {
                $new[] = new HTMLPurifier_Token_Empty('param', array('name' => $name, 'value' => $value));
            }
            $token = $new;
        } elseif ($token->name == 'param') {
            $nest = count($this->currentNesting) - 1;
            if ($nest >= 0 && $this->currentNesting[$nest]->name === 'object') {
                $i = count($this->objectStack) - 1;
                if (!isset($token->attr['name'])) {
                    $token = false;
                    return;
                }
                $n = $token->attr['name'];
                // We need this fix because YouTube doesn't supply a data
                // attribute, which we need if a type is specified. This is
                // *very* Flash specific.
                // A movie param without a value attribute has nothing to
                // copy, so it is skipped instead of reading an unset index.
                if (
                    !isset($this->objectStack[$i]->attr['data']) &&
                    $n == 'movie' &&
                    isset($token->attr['value'])
                ) {
                    $this->objectStack[$i]->attr['data'] = $token->attr['value'];
                }
                // Check if the parameter is the correct value but has not
                // already been added
                // NOTE(review): this compares the param *name* with the
                // expected *value*, which cannot match for the current
                // $addParam entries, so author-supplied copies of those
                // params always fall through and are dropped below. Left
                // unchanged because altering it changes sanitized output.
                if (
                    !isset($this->paramStack[$i][$n]) &&
                    isset($this->addParam[$n]) &&
                    $token->attr['name'] === $this->addParam[$n]
                ) {
                    // keep token, and add to param stack
                    $this->paramStack[$i][$n] = true;
                } elseif (isset($this->allowedParam[$n])) {
                    // keep token, don't do anything to it
                    // (could possibly check for duplicates here)
                } else {
                    $token = false;
                }
            } else {
                // not directly inside an object, DENY!
                $token = false;
            }
        }
    }

    /**
     * Pops the object and param stacks when an </object> is seen.
     */
    public function handleEnd(&$token) {
        // This is the WRONG way of handling the object and param stacks;
        // we should be inserting them directly on the relevant object tokens
        // so that the global stack handling handles it.
        if ($token->name == 'object') {
            array_pop($this->objectStack);
            array_pop($this->paramStack);
        }
    }

}
13334
* Parser that uses PHP 5's DOM extension (part of the core).
13336
* In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
13337
* It gives us a forgiving HTML parser, which we use to transform the HTML
13338
* into a DOM, and then into the tokens. It is blazingly fast (for large
13339
* documents, it performs twenty times faster than
13340
 * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
13342
* @note Any empty elements will have empty tokens associated with them, even if
13343
 * this is prohibited by the spec. This cannot be fixed until the spec
13346
* @note PHP's DOM extension does not actually parse any entities, we use
13347
* our own function to do that.
13349
* @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
13350
* If this is a huge problem, due to the fact that HTML is hand
13351
* edited and you are unable to get a parser cache that caches the
13352
 * output of HTML Purifier while keeping the original HTML lying
13353
* around, you may want to run Tidy on the resulting output or use
13354
* HTMLPurifier_DirectLex
13357
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
/**
 * Runs the parent constructor, then creates the token factory used by
 * tokenizeDOM().
 */
public function __construct() {
    // setup the factory
    parent::__construct();
    $this->factory = new HTMLPurifier_TokenFactory();
}
13368
public function tokenizeHTML($html, $config, $context) {
13370
$html = $this->normalize($html, $config, $context);
13372
// attempt to armor stray angled brackets that cannot possibly
13373
// form tags and thus are probably being used as emoticons
13374
if ($config->get('Core.AggressivelyFixLt')) {
13375
$char = '[^a-z!\/]';
13376
$comment = "/<!--(.*?)(-->|\z)/is";
13377
$html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
13380
$html = preg_replace("/<($char)/i", '<\\1', $html);
13381
} while ($html !== $old);
13382
$html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
13385
// preprocess html, essential for UTF-8
13386
$html = $this->wrapHTML($html, $config, $context);
13388
$doc = new DOMDocument();
13389
$doc->encoding = 'UTF-8'; // theoretically, the above has this covered
13391
set_error_handler(array($this, 'muteErrorHandler'));
13392
$doc->loadHTML($html);
13393
restore_error_handler();
13396
$this->tokenizeDOM(
13397
$doc->getElementsByTagName('html')->item(0)-> // <html>
13398
getElementsByTagName('body')->item(0)-> // <body>
13399
getElementsByTagName('div')->item(0) // <div>
/**
 * Recursive function that tokenizes a node, putting it into an accumulator.
 *
 * Text, CDATA and comment nodes are turned into a single token each; any
 * other non-element node is ignored. Element nodes are emitted as an empty
 * token (no children) or as start token, recursively tokenized children and
 * end token.
 *
 * @param $node     DOMNode to be tokenized.
 * @param $tokens   Array-list of already tokenized tokens; passed by
 *                  reference and appended to in place.
 * @param $collect  Says whether or not start and close are collected, set to
 *                  false at first recursion because it's the implicit DIV
 *                  tag you're dealing with.
 * @returns Nothing; tokens of node are appended to previously passed tokens.
 */
protected function tokenizeDOM($node, &$tokens, $collect = false) {

    // intercept non element nodes. WE MUST catch all of them,
    // but we're not getting the character reference nodes because
    // those should have been preprocessed
    if ($node->nodeType === XML_TEXT_NODE) {
        $tokens[] = $this->factory->createText($node->data);
        return;
    } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
        // undo libxml's special treatment of <script> and <style> tags
        $last = end($tokens);
        $data = $node->data;
        // (note $node->tagname is already normalized)
        if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
            $new_data = trim($data);
            if (substr($new_data, 0, 4) === '<!--') {
                // strip the comment "armor" around the script/style body
                $data = substr($new_data, 4);
                if (substr($data, -3) === '-->') {
                    $data = substr($data, 0, -3);
                } else {
                    // Highly suspicious! Not sure what to do...
                }
            }
        }
        $tokens[] = $this->factory->createText($this->parseData($data));
        return;
    } elseif ($node->nodeType === XML_COMMENT_NODE) {
        // this code is only invoked for comments in script/style in versions
        // of libxml pre-2.6.28 (regular comments, of course, are still
        // handled regularly)
        $tokens[] = $this->factory->createComment($node->data);
        return;
    } elseif (
        // not-well tested: there may be other nodes we have to grab
        $node->nodeType !== XML_ELEMENT_NODE
    ) {
        return;
    }

    $attr = $node->hasAttributes() ?
        $this->transformAttrToAssoc($node->attributes) :
        array();

    // We still have to make sure that the element actually IS empty
    if (!$node->childNodes->length) {
        if ($collect) {
            $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
        }
    } else {
        if ($collect) { // don't wrap on first iteration
            $tokens[] = $this->factory->createStart($node->tagName, $attr);
        }
        // a distinct loop variable keeps $node (and its tagName) intact
        // for the end token emitted below
        foreach ($node->childNodes as $child) {
            // remember, it's an accumulator. Otherwise, we'd have
            // to use array_merge
            $this->tokenizeDOM($child, $tokens, true);
        }
        if ($collect) {
            $tokens[] = $this->factory->createEnd($node->tagName);
        }
    }

}
/**
 * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
 *
 * @param $node_map DOMNamedNodeMap of DOMAttr objects.
 * @returns Associative array of attributes (attribute name => value);
 *          an empty array when the map has no entries.
 */
protected function transformAttrToAssoc($node_map) {
    // NamedNodeMap is documented very well, so we're using undocumented
    // features, namely, the fact that it implements Iterator and
    // has a ->length attribute
    if ($node_map->length === 0) return array();
    $array = array();
    foreach ($node_map as $attr) {
        $array[$attr->name] = $attr->value;
    }
    return $array;
}
/**
 * An error handler that mutes all errors.
 *
 * The body is intentionally empty: both arguments are ignored and nothing
 * is returned.
 *
 * @param $errno  Error level passed by PHP (unused).
 * @param $errstr Error message passed by PHP (unused).
 */
public function muteErrorHandler($errno, $errstr) {}
/**
 * Callback function for undoing escaping of stray angled brackets
 * in comments.
 *
 * @param $matches Regex match array; index 1 is the comment body and
 *                 index 2 is whatever followed it in the match.
 * @returns The comment, re-prefixed with '<!--', with '&amp;' and '&lt;'
 *          in its body turned back into '&' and '<'.
 */
public function callbackUndoCommentSubst($matches) {
    // strtr() with an array never re-scans replaced text, so an armored
    // '&amp;amp;' comes out as '&amp;' rather than '&'
    return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
}
/**
 * Callback function that entity-izes ampersands in comments so that
 * callbackUndoCommentSubst doesn't clobber them.
 *
 * @param $matches Regex match array; index 1 is the comment body and
 *                 index 2 is whatever followed it in the match.
 * @returns The comment, re-prefixed with '<!--', with every '&' in its
 *          body replaced by '&amp;'.
 */
public function callbackArmorCommentEntities($matches) {
    return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
}
/**
 * Wraps an HTML fragment in the necessary HTML.
 *
 * The fragment is placed inside <html><head>...</head><body><div>, with a
 * UTF-8 Content-Type meta tag in the head. A DOCTYPE is prepended when the
 * HTML definition's doctype has a public or system identifier.
 *
 * @param $html    HTML fragment to wrap.
 * @param $config  Config object; only getDefinition('HTML') is used.
 * @param $context Context object (not used by this method).
 * @returns String containing the complete wrapped document.
 */
protected function wrapHTML($html, $config, $context) {
    $def = $config->getDefinition('HTML');
    $ret = '';

    if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
        $ret .= '<!DOCTYPE html ';
        if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
        if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
        $ret .= '>';
    }

    $ret .= '<html><head>';
    $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
    // No protection if $html contains a stray </div>!
    $ret .= '</head><body><div>'.$html.'</div></body></html>';
    return $ret;
}
/**
 * Our in-house implementation of a parser.
 *
 * A pure PHP parser, DirectLex has absolutely no dependencies, making
 * it a reasonably good default for PHP4.  Written with efficiency in mind,
 * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
 * pales in comparison to HTMLPurifier_Lexer_DOMLex.
 *
 * @todo Reread XML spec and document differences.
 */
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{

    public $tracksLineNumbers = true;

    /**
     * Whitespace characters for str(c)spn.
     */
    protected $_whitespace = "\x20\x09\x0D\x0A";

    /**
     * Callback function for script CDATA fudge
     * @param $matches, in form of array(opening tag, contents, closing tag)
     * @returns The matched script element with its contents passed through
     *          htmlspecialchars().
     */
    protected function scriptCallback($matches) {
        return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
    }

    /**
     * Lexes a string of HTML into an array of tokens.
     *
     * While running, the context variables 'CurrentLine' and 'CurrentCol'
     * are registered; they are destroyed again before returning.
     *
     * @param $html    String of HTML to tokenize.
     * @param $config  Config object; reads HTML.Trusted,
     *                 Core.MaintainLineNumbers, Core.CollectErrors and
     *                 Core.DirectLexLineNumberSyncInterval.
     * @param $context Context object; supplies 'ErrorCollector' when
     *                 Core.CollectErrors is on.
     * @returns Array of HTMLPurifier_Token_* objects in document order.
     */
    public function tokenizeHTML($html, $config, $context) {

        // special normalization for script tags without any armor
        // our "armor" heuristic is a < sign any number of whitespaces after
        // the first script tag
        if ($config->get('HTML.Trusted')) {
            $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                array($this, 'scriptCallback'), $html);
        }

        $html = $this->normalize($html, $config, $context);

        $cursor = 0; // our location in the text
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array

        // This is also treated to mean maintain *column* numbers too
        $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

        if ($maintain_line_numbers === null) {
            // automatically determine line numbering by checking
            // if error collection is on
            $maintain_line_numbers = $config->get('Core.CollectErrors');
        }

        if ($maintain_line_numbers) {
            $current_line = 1;
            $current_col  = 0;
            $length = strlen($html);
        } else {
            $current_line = false;
            $current_col  = false;
            $length = false;
        }
        $context->register('CurrentLine', $current_line);
        $context->register('CurrentCol',  $current_col);
        $nl = "\n";
        // how often to manually recalculate. This will ALWAYS be right,
        // but it's pretty wasteful. Set to 0 to turn off
        $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // for testing synchronization
        $loops = 0;

        while (++$loops) {

            // $cursor is either at the start of a token, or inside of
            // a tag (i.e. there was a < immediately before it), as indicated
            // by $inside_tag

            if ($maintain_line_numbers) {

                // $rcursor, however, is always at the start of a token.
                $rcursor = $cursor - (int) $inside_tag;

                // Column number is cheap, so we calculate it every round.
                // We're interested at the *end* of the newline string, so
                // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
                // from our "rcursor" position.
                $nl_pos = strrpos($html, $nl, $rcursor - $length);
                $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

                // recalculate lines
                if (
                    $synchronize_interval &&  // synchronization is on
                    $cursor > 0 &&            // cursor is further than zero
                    $loops % $synchronize_interval === 0 // time to synchronize!
                ) {
                    $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
                }

            }

            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);

            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
            }

            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
                $token = new HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr($html, $cursor, $position_next_lt - $cursor)
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                }
                $array[] = $token;
                $cursor  = $position_next_lt + 1;
                $inside_tag = true;
                continue;
            } elseif (!$inside_tag) {
                // We are not inside tag but there are no more tags
                // If we're already at the end, break
                if ($cursor === strlen($html)) break;
                // Create Text of rest of string
                $token = new HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr($html, $cursor)
                    )
                );
                if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
                // Grab the internals of the tag
                $strlen_segment = $position_next_gt - $cursor;

                if ($strlen_segment < 1) {
                    // there's nothing to process!
                    // NOTE(review): this token is built but never appended
                    // to $array -- confirm whether that is intentional
                    $token = new HTMLPurifier_Token_Text('<');
                    $cursor++;
                    continue;
                }

                $segment = substr($html, $cursor, $strlen_segment);

                if ($segment === false) {
                    // somehow, we attempted to access beyond the end of
                    // the string, defense-in-depth, reported by Nate Abele
                    break;
                }

                // Check if it's a comment
                if (
                    substr($segment, 0, 3) === '!--'
                ) {
                    // re-determine segment length, looking for -->
                    $position_comment_end = strpos($html, '-->', $cursor);
                    if ($position_comment_end === false) {
                        // uh oh, we have a comment that extends to
                        // infinity. Can't be helped: set comment
                        // end position to end of string
                        if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
                        $position_comment_end = strlen($html);
                        $end = true;
                    } else {
                        $end = false;
                    }
                    $strlen_segment = $position_comment_end - $cursor;
                    $segment = substr($html, $cursor, $strlen_segment);
                    $token = new HTMLPurifier_Token_Comment(
                        substr($segment, 3, $strlen_segment - 3)
                    );
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                    }
                    $array[] = $token;
                    // skip the closing '-->' only when there actually was one
                    $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                    $inside_tag = false;
                    continue;
                }

                // Check if it's an end tag
                $is_end_tag = (strpos($segment,'/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
                    $token = new HTMLPurifier_Token_End($type);
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Check leading character is alnum, if not, we may
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alpha($segment[0])) {
                    // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    $token = new HTMLPurifier_Token_Text('<');
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    // the cursor is left in place so the rest of the
                    // segment is re-read as plain text
                    $inside_tag = false;
                    continue;
                }

                // Check if it is explicitly self closing, if so, remove
                // trailing slash. Remember, we could have a tag like <br>, so
                // any later token processing scripts must convert improperly
                // classified EmptyTags from StartTags.
                $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
                if ($is_self_closing) {
                    $strlen_segment--;
                    $segment = substr($segment, 0, $strlen_segment);
                }

                // Check if there are any attributes
                $position_first_space = strcspn($segment, $this->_whitespace);

                if ($position_first_space >= $strlen_segment) {
                    if ($is_self_closing) {
                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
                        $token = new HTMLPurifier_Token_Start($segment);
                    }
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Grab out all the data
                $type = substr($segment, 0, $position_first_space);
                $attribute_string = trim(
                    substr($segment, $position_first_space)
                );
                if ($attribute_string) {
                    $attr = $this->parseAttributeString(
                        $attribute_string, $config, $context
                    );
                } else {
                    $attr = array();
                }

                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
                // inside tag, but there's no ending > sign
                if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
                $token = new HTMLPurifier_Token_Text(
                    '<' .
                    $this->parseData(
                        substr($html, $cursor)
                    )
                );
                if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
                // no cursor scroll? Hmm...
                $array[] = $token;
                break;
            }
            break;
        }

        $context->destroy('CurrentLine');
        $context->destroy('CurrentCol');
        return $array;
    }

    /**
     * PHP 5.0.x compatible substr_count that implements offset and length
     *
     * @param $haystack String to search in.
     * @param $needle   Substring to count.
     * @param $offset   Offset at which to start counting.
     * @param $length   Number of bytes after $offset to search.
     * @returns Number of occurrences of $needle in the selected range.
     */
    protected function substrCount($haystack, $needle, $offset, $length) {
        // the version check is done once and cached across calls
        static $oldVersion;
        if ($oldVersion === null) {
            $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
        }
        if ($oldVersion) {
            $haystack = substr($haystack, $offset, $length);
            return substr_count($haystack, $needle);
        } else {
            return substr_count($haystack, $needle, $offset, $length);
        }
    }

    /**
     * Takes the inside of an HTML tag and makes an assoc array of attributes.
     *
     * @param $string  Inside of tag excluding name.
     * @param $config  Config object; reads Core.CollectErrors.
     * @param $context Context object; supplies 'ErrorCollector' when
     *                 Core.CollectErrors is on.
     * @returns Assoc array of attributes.
     */
    public function parseAttributeString($string, $config, $context) {
        $string = (string) $string; // quick typecast

        if ($string == '') return array(); // no attributes

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // let's see if we can abort as quickly as possible
        // one equal sign, no spaces => one attribute
        $num_equal = substr_count($string, '=');
        $has_space = strpos($string, ' ');
        if ($num_equal === 0 && !$has_space) {
            // bool attribute
            return array($string => $string);
        } elseif ($num_equal === 1 && !$has_space) {
            // only one attribute
            list($key, $quoted_value) = explode('=', $string);
            $quoted_value = trim($quoted_value);
            if (!$key) {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                return array();
            }
            if (!$quoted_value) return array($key => '');
            $first_char = @$quoted_value[0];
            $last_char  = @$quoted_value[strlen($quoted_value)-1];

            $same_quote = ($first_char == $last_char);
            $open_quote = ($first_char == '"' || $first_char == "'");

            if ( $same_quote && $open_quote) {
                // well behaved
                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
            } else {
                // not well behaved
                if ($open_quote) {
                    if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
                    $value = substr($quoted_value, 1);
                } else {
                    $value = $quoted_value;
                }
            }
            if ($value === false) $value = '';
            return array($key => $value);
        }

        // setup loop environment
        $array  = array(); // return assoc array of attributes
        $cursor = 0; // current position in string (moves forward)
        $size   = strlen($string); // size of the string (stays the same)

        // if we have unquoted attributes, the parser expects a terminating
        // space, so let's guarantee that there's always a terminating space.
        $string .= ' ';

        while (true) {

            if ($cursor >= $size) {
                break;
            }

            $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
            // grab the key

            $key_begin = $cursor; //we're currently at the start of the key

            // scroll past all characters that are the key (not whitespace or =)
            $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

            $key_end = $cursor; // now at the end of the key

            $key = substr($string, $key_begin, $key_end - $key_begin);

            if (!$key) {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
                continue; // empty key
            }

            // scroll past all whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                $array[$key] = $key;
                break;
            }

            // if the next character is an equal sign, we've got a regular
            // pair, otherwise, it's a bool attribute
            $first_char = @$string[$cursor];

            if ($first_char == '=') {
                // key="value"

                $cursor++;
                $cursor += strspn($string, $this->_whitespace, $cursor);

                // NOTE(review): $cursor is an integer sum at this point, so
                // this comparison looks unreachable; kept as-is
                if ($cursor === false) {
                    $array[$key] = '';
                    break;
                }

                // we might be in front of a quote right now

                $char = @$string[$cursor];

                if ($char == '"' || $char == "'") {
                    // it's quoted, end bound is $char
                    $cursor++;
                    $value_begin = $cursor;
                    $cursor = strpos($string, $char, $cursor);
                    $value_end = $cursor;
                } else {
                    // it's not quoted, end bound is whitespace
                    $value_begin = $cursor;
                    $cursor += strcspn($string, $this->_whitespace, $cursor);
                    $value_end = $cursor;
                }

                // we reached a premature end
                // (strpos() above returns false when the closing quote
                // is missing)
                if ($cursor === false) {
                    $cursor = $size;
                    $value_end = $cursor;
                }

                $value = substr($string, $value_begin, $value_end - $value_begin);
                if ($value === false) $value = '';
                $array[$key] = $this->parseData($value);
                $cursor++;

            } else {
                // boolattr
                if ($key !== '') {
                    $array[$key] = $key;
                } else {
                    // purely theoretical
                    if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }

            }
        }
        return $array;
    }

}
/**
 * Composite strategy that runs multiple strategies on tokens.
 */
abstract class HTMLPurifier_Strategy_Composite extends HTMLPurifier_Strategy
{

    /**
     * List of strategies to run tokens through.
     */
    protected $strategies = array();

    /**
     * Subclasses must provide a constructor; the concrete subclass in this
     * file uses it to populate $strategies.
     */
    abstract public function __construct();

    /**
     * Runs the tokens through every strategy in $strategies, in order,
     * feeding the output of each one into the next.
     *
     * @param $tokens  Array of tokens to process.
     * @param $config  Config object, passed through to each strategy.
     * @param $context Context object, passed through to each strategy.
     * @returns Tokens as returned by the last strategy (the input unchanged
     *          when there are no strategies).
     */
    public function execute($tokens, $config, $context) {
        foreach ($this->strategies as $strategy) {
            $tokens = $strategy->execute($tokens, $config, $context);
        }
        return $tokens;
    }

}
/**
 * Core strategy composed of the big four strategies.
 */
class HTMLPurifier_Strategy_Core extends HTMLPurifier_Strategy_Composite
{

    /**
     * Registers the four strategies in the order they are executed.
     */
    public function __construct() {
        $this->strategies[] = new HTMLPurifier_Strategy_RemoveForeignElements();
        $this->strategies[] = new HTMLPurifier_Strategy_MakeWellFormed();
        $this->strategies[] = new HTMLPurifier_Strategy_FixNesting();
        $this->strategies[] = new HTMLPurifier_Strategy_ValidateAttributes();
    }

}
/**
 * Takes a well formed list of tokens and fixes their nesting.
 *
 * HTML elements dictate which elements are allowed to be their children,
 * for example, you can't have a p tag in a span tag.  Other elements have
 * much more rigorous definitions: tables, for instance, require a specific
 * order for their elements.  There are also constraints not expressible by
 * document type definitions, such as the chameleon nature of ins/del
 * tags and global child exclusions.
 *
 * The first major objective of this strategy is to iterate through all the
 * nodes (not tokens) of the list of tokens and determine whether or not
 * their children conform to the element's definition.  If they do not, the
 * child definition may optionally supply an amended list of elements that
 * is valid or require that the entire node be deleted (and the previous
 * node closed).
 *
 * The second objective is to ensure that explicitly excluded elements of
 * an element do not appear in its children.  Code that accomplishes this
 * task is pervasive through the strategy, though the two are distinct tasks
 * and could, theoretically, be separated (although it's not recommended).
 *
 * @note Whether or not unrecognized children are silently dropped or
 *       translated into text depends on the child definitions.
 *
 * @todo Enable nodes to be bubbled out of the structure.
 */

class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Validates the children of every node and removes or rewrites nodes
     * whose children do not conform.
     *
     * While running, the context variables 'IsInline' and 'CurrentToken'
     * are registered; they are destroyed again before returning.
     *
     * @param $tokens  Well-formed array of tokens.
     * @param $config  Config object; supplies the HTML definition.
     * @param $context Context object; an optional 'ErrorCollector' is read
     *                 from it.
     * @returns Array of tokens with nesting fixed.
     */
    public function execute($tokens, $config, $context) {
        //####################################################################//
        // Pre-processing

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();

        // insert implicit "parent" node, will be removed at end.
        // DEFINITION CALL
        $parent_name = $definition->info_parent;
        array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name));
        $tokens[] = new HTMLPurifier_Token_End($parent_name);

        // setup the context variable 'IsInline', for chameleon processing
        // is 'false' when we are not inline, 'true' when it must always
        // be inline, and an integer when it is inline for a certain
        // branch of the document tree
        $is_inline = $definition->info_parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // stack that contains the indexes of all parents,
        // $stack[count($stack)-1] being the current parent
        $stack = array();

        // stack that contains all elements that are excluded
        // it is organized by parent elements, similar to $stack,
        // but it is only populated when an element with exclusions is
        // processed, i.e. there won't be empty exclusions.
        $exclude_stack = array();

        // variable that contains the start token while we are processing
        // nodes. This enables error reporting to do its job
        $start_token = false;
        $context->register('CurrentToken', $start_token);

        //####################################################################//
        // Loop

        // iterate through all start nodes. Determining the start node
        // is complicated so it has been omitted from the loop construct
        for ($i = 0, $size = count($tokens) ; $i < $size; ) {

            //################################################################//
            // Gather information on children

            // child token accumulator
            $child_tokens = array();

            // scroll to the end of this node, report number, and collect
            // all children
            for ($j = $i, $depth = 0; ; $j++) {
                if ($tokens[$j] instanceof HTMLPurifier_Token_Start) {
                    $depth++;
                    // skip token assignment on first iteration, this is the
                    // token we currently are on
                    if ($depth == 1) continue;
                } elseif ($tokens[$j] instanceof HTMLPurifier_Token_End) {
                    $depth--;
                    // skip token assignment on last iteration, this is the
                    // end token of the token we're currently on
                    if ($depth == 0) break;
                }
                $child_tokens[] = $tokens[$j];
            }

            // $i is index of start token
            // $j is index of end token

            $start_token = $tokens[$i]; // to make token available via CurrentToken

            //################################################################//
            // Gather information on parent

            // calculate parent information
            if ($count = count($stack)) {
                $parent_index = $stack[$count-1];
                $parent_name  = $tokens[$parent_index]->name;
                if ($parent_index == 0) {
                    $parent_def   = $definition->info_parent_def;
                } else {
                    $parent_def   = $definition->info[$parent_name];
                }
            } else {
                // processing as if the parent were the "root" node
                // unknown info, it won't be used anyway, in the future,
                // we may want to enforce one element only (this is
                // necessary for HTML Purifier to clean entire documents
                $parent_index = $parent_name = $parent_def = null;
            }

            // calculate context
            if ($is_inline === false) {
                // check if conditions make it inline
                if (!empty($parent_def) && $parent_def->descendants_are_inline) {
                    $is_inline = $count - 1;
                }
            } else {
                // check if we're out of inline
                if ($count === $is_inline) {
                    $is_inline = false;
                }
            }

            //################################################################//
            // Determine whether element is explicitly excluded SGML-style

            // determine whether or not element is excluded by checking all
            // parent exclusions. The array should not be very large, two
            // elements at most.
            $excluded = false;
            if (!empty($exclude_stack)) {
                foreach ($exclude_stack as $lookup) {
                    if (isset($lookup[$tokens[$i]->name])) {
                        $excluded = true;
                        // no need to continue processing
                        break;
                    }
                }
            }

            //################################################################//
            // Perform child validation

            if ($excluded) {
                // there is an exclusion, remove the entire node
                $result = false;
                $excludes = array(); // not used, but good to initialize anyway
            } else {

                if ($i === 0) {
                    // special processing for the first node
                    $def = $definition->info_parent_def;
                } else {
                    $def = $definition->info[$tokens[$i]->name];
                }

                if (!empty($def->child)) {
                    // have DTD child def validate children
                    $result = $def->child->validateChildren(
                        $child_tokens, $config, $context);
                } else {
                    // weird, no child definition, get rid of everything
                    $result = false;
                }

                // determine whether or not this element has any exclusions
                $excludes = $def->excludes;
            }

            // $result is now a bool or array

            //################################################################//
            // Process result by interpreting $result

            if ($result === true || $child_tokens === $result) {
                // leave the node as is

                // register start token as a parental node start
                $stack[] = $i;

                // register exclusions if there are any
                if (!empty($excludes)) $exclude_stack[] = $excludes;

                // move cursor to next possible start node
                $i++;

            } elseif($result === false) {
                // remove entire node

                if ($e) {
                    if ($excluded) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                    } else {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                    }
                }

                // calculate length of inner tokens and current tokens
                $length = $j - $i + 1;

                // perform removal
                array_splice($tokens, $i, $length);

                // update size
                $size -= $length;

                // there is no start token to register,
                // current node is now the next possible start node
                // unless it turns out that we need to do a double-check

                // this is a rough heuristic that covers 100% of HTML's
                // cases and 99% of all other cases. A child definition
                // that would be tricked by this would be something like:
                // ( | a b c) where it's all or nothing. Fortunately,
                // our current implementation claims that that case would
                // not allow empty, even if it did
                //
                // $parent_def is null when the removed node was the implicit
                // root (empty $stack); there is no parent to re-check then,
                // so guard against dereferencing null.
                if ($parent_def !== null && !$parent_def->child->allow_empty) {
                    // we need to do a double-check
                    $i = $parent_index;
                    array_pop($stack);
                }

                // PROJECTED OPTIMIZATION: Process all children elements before
                // reprocessing parent node.

            } else {
                // replace node with $result

                // calculate length of inner tokens
                $length = $j - $i - 1;

                if ($e) {
                    if (empty($result) && $length) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                    } else {
                        $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                    }
                }

                // perform replacement
                array_splice($tokens, $i + 1, $length, $result);

                // update size
                $size -= $length;
                $size += count($result);

                // register start token as a parental node start
                $stack[] = $i;

                // register exclusions if there are any
                if (!empty($excludes)) $exclude_stack[] = $excludes;

                // move cursor to next possible start node
                $i++;

            }

            //################################################################//
            // Scroll to next start node

            // We assume, at this point, that $i is the index of the token
            // that is the first possible new start point for a node.

            // Test if the token indeed is a start tag, if not, move forward
            // and test again.
            $size = count($tokens);
            while ($i < $size and !$tokens[$i] instanceof HTMLPurifier_Token_Start) {
                if ($tokens[$i] instanceof HTMLPurifier_Token_End) {
                    // pop a token index off the stack if we ended a node
                    array_pop($stack);
                    // pop an exclusion lookup off exclusion stack if
                    // we ended node and that node had exclusions
                    if ($i == 0 || $i == $size - 1) {
                        // use specialized var if it's the super-parent
                        $s_excludes = $definition->info_parent_def->excludes;
                    } else {
                        $s_excludes = $definition->info[$tokens[$i]->name]->excludes;
                    }
                    if ($s_excludes) {
                        array_pop($exclude_stack);
                    }
                }
                $i++;
            }

        }

        //####################################################################//
        // Post-processing

        // remove implicit parent tokens at the beginning and end
        array_shift($tokens);
        array_pop($tokens);

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        return $tokens;

    }

}
14413
* Takes tokens and makes them well-formed (balance end tags, etc.)
14415
class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
14419
* Array stream of tokens being processed.
14424
* Current index in $tokens.
14429
* Current nesting of elements.
14434
* Injectors active in this stream processing.
14436
protected $injectors;
14439
* Current instance of HTMLPurifier_Config.
14444
* Current instance of HTMLPurifier_Context.
14446
protected $context;
14448
public function execute($tokens, $config, $context) {
14450
$definition = $config->getHTMLDefinition();
14453
$generator = new HTMLPurifier_Generator($config, $context);
14454
$escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
14455
$e = $context->get('ErrorCollector', true);
14456
$t = false; // token index
14457
$i = false; // injector index
14458
$token = false; // the current token
14459
$reprocess = false; // whether or not to reprocess the same token
14462
// member variables
14463
$this->stack =& $stack;
14465
$this->tokens =& $tokens;
14466
$this->config = $config;
14467
$this->context = $context;
14469
// context variables
14470
$context->register('CurrentNesting', $stack);
14471
$context->register('InputIndex', $t);
14472
$context->register('InputTokens', $tokens);
14473
$context->register('CurrentToken', $token);
14475
// -- begin INJECTOR --
14477
$this->injectors = array();
14479
$injectors = $config->getBatch('AutoFormat');
14480
$def_injectors = $definition->info_injector;
14481
$custom_injectors = $injectors['Custom'];
14482
unset($injectors['Custom']); // special case
14483
foreach ($injectors as $injector => $b) {
14484
// XXX: Fix with a legitimate lookup table of enabled filters
14485
if (strpos($injector, '.') !== false) continue;
14486
$injector = "HTMLPurifier_Injector_$injector";
14488
$this->injectors[] = new $injector;
14490
foreach ($def_injectors as $injector) {
14491
// assumed to be objects
14492
$this->injectors[] = $injector;
14494
foreach ($custom_injectors as $injector) {
14495
if (is_string($injector)) {
14496
$injector = "HTMLPurifier_Injector_$injector";
14497
$injector = new $injector;
14499
$this->injectors[] = $injector;
14502
// give the injectors references to the definition and context
14503
// variables for performance reasons
14504
foreach ($this->injectors as $ix => $injector) {
14505
$error = $injector->prepare($config, $context);
14506
if (!$error) continue;
14507
array_splice($this->injectors, $ix, 1); // rm the injector
14508
trigger_error("Cannot enable {$injector->name} injector because $error is not allowed", E_USER_WARNING);
14511
// -- end INJECTOR --
14513
// a note on punting:
14514
// In order to reduce code duplication, whenever some code needs
14515
// to make HTML changes in order to make things "correct", the
14516
// new HTML gets sent through the purifier, regardless of its
14517
// status. This means that if we add a start token, because it
14518
// was totally necessary, we don't have to update nesting; we just
14519
// punt ($reprocess = true; continue;) and it does that for us.
14521
// isset is in loop because $tokens size changes during loop exec
14524
$t == 0 || isset($tokens[$t - 1]);
14525
// only increment if we don't need to reprocess
14526
$reprocess ? $reprocess = false : $t++
14529
// check for a rewind
14530
if (is_int($i) && $i >= 0) {
14531
// possibility: disable rewinding if the current token has a
14532
// rewind set on it already. This would offer protection from
14533
// infinite loop, but might hinder some advanced rewinding.
14534
$rewind_to = $this->injectors[$i]->getRewind();
14535
if (is_int($rewind_to) && $rewind_to < $t) {
14536
if ($rewind_to < 0) $rewind_to = 0;
14537
while ($t > $rewind_to) {
14539
$prev = $tokens[$t];
14540
// indicate that other injectors should not process this token,
14541
// but we need to reprocess it
14542
unset($prev->skip[$i]);
14543
$prev->rewind = $i;
14544
if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->stack);
14545
elseif ($prev instanceof HTMLPurifier_Token_End) $this->stack[] = $prev->start;
14551
// handle case of document end
14552
if (!isset($tokens[$t])) {
14553
// kill processing if stack is empty
14554
if (empty($this->stack)) break;
14557
$top_nesting = array_pop($this->stack);
14558
$this->stack[] = $top_nesting;
14561
if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
14562
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
14565
// append, don't splice, since this is the end
14566
$tokens[] = new HTMLPurifier_Token_End($top_nesting->name);
14573
$token = $tokens[$t];
14575
//echo '<br>'; printTokens($tokens, $t); printTokens($this->stack);
14577
// quick-check: if it's not a tag, no need to process
14578
if (empty($token->is_tag)) {
14579
if ($token instanceof HTMLPurifier_Token_Text) {
14580
foreach ($this->injectors as $i => $injector) {
14581
if (isset($token->skip[$i])) continue;
14582
if ($token->rewind !== null && $token->rewind !== $i) continue;
14583
$injector->handleText($token);
14584
$this->processToken($token, $i);
14589
// another possibility is a comment
14593
if (isset($definition->info[$token->name])) {
14594
$type = $definition->info[$token->name]->child->type;
14596
$type = false; // Type is unknown, treat accordingly
14599
// quick tag checks: anything that's *not* an end tag
14601
if ($type === 'empty' && $token instanceof HTMLPurifier_Token_Start) {
14602
// claims to be a start tag but is empty
14603
$token = new HTMLPurifier_Token_Empty($token->name, $token->attr);
14605
} elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
14606
// claims to be empty but really is a start tag
14607
$this->swap(new HTMLPurifier_Token_End($token->name));
14608
$this->insertBefore(new HTMLPurifier_Token_Start($token->name, $token->attr));
14609
// punt (since we had to modify the input stream in a non-trivial way)
14612
} elseif ($token instanceof HTMLPurifier_Token_Empty) {
14613
// real empty token
14615
} elseif ($token instanceof HTMLPurifier_Token_Start) {
14618
// ...unless they also have to close their parent
14619
if (!empty($this->stack)) {
14621
$parent = array_pop($this->stack);
14622
$this->stack[] = $parent;
14624
if (isset($definition->info[$parent->name])) {
14625
$elements = $definition->info[$parent->name]->child->getAllowedElements($config);
14626
$autoclose = !isset($elements[$token->name]);
14628
$autoclose = false;
14631
$carryover = false;
14632
if ($autoclose && $definition->info[$parent->name]->formatting) {
14637
// errors need to be updated
14638
$new_token = new HTMLPurifier_Token_End($parent->name);
14639
$new_token->start = $parent;
14641
$element = clone $parent;
14642
$element->armor['MakeWellFormed_TagClosedError'] = true;
14643
$element->carryover = true;
14644
$this->processToken(array($new_token, $token, $element));
14646
$this->insertBefore($new_token);
14648
if ($e && !isset($parent->armor['MakeWellFormed_TagClosedError'])) {
14650
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
14652
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag carryover', $parent);
14664
foreach ($this->injectors as $i => $injector) {
14665
if (isset($token->skip[$i])) continue;
14666
if ($token->rewind !== null && $token->rewind !== $i) continue;
14667
$injector->handleElement($token);
14668
$this->processToken($token, $i);
14673
// ah, nothing interesting happened; do normal processing
14674
$this->swap($token);
14675
if ($token instanceof HTMLPurifier_Token_Start) {
14676
$this->stack[] = $token;
14677
} elseif ($token instanceof HTMLPurifier_Token_End) {
14678
throw new HTMLPurifier_Exception('Improper handling of end tag in start code; possible error in MakeWellFormed');
14684
// sanity check: we should be dealing with a closing tag
14685
if (!$token instanceof HTMLPurifier_Token_End) {
14686
throw new HTMLPurifier_Exception('Unaccounted for tag token in input stream, bug in HTML Purifier');
14689
// make sure that we have something open
14690
if (empty($this->stack)) {
14691
if ($escape_invalid_tags) {
14692
if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
14693
$this->swap(new HTMLPurifier_Token_Text(
14694
$generator->generateFromToken($token)
14698
if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
14704
// first, check for the simplest case: everything closes neatly.
14705
// Eventually, everything passes through here; if there are problems
14706
// we modify the input stream accordingly and then punt, so that
14707
// the tokens get processed again.
14708
$current_parent = array_pop($this->stack);
14709
if ($current_parent->name == $token->name) {
14710
$token->start = $current_parent;
14711
foreach ($this->injectors as $i => $injector) {
14712
if (isset($token->skip[$i])) continue;
14713
if ($token->rewind !== null && $token->rewind !== $i) continue;
14714
$injector->handleEnd($token);
14715
$this->processToken($token, $i);
14716
$this->stack[] = $current_parent;
14723
// okay, so we're trying to close the wrong tag
14725
// undo the pop previous pop
14726
$this->stack[] = $current_parent;
14728
// scroll back the entire nest, trying to find our tag.
14729
// (feature could be to specify how far you'd like to go)
14730
$size = count($this->stack);
14731
// -2 because -1 is the last element, but we already checked that
14732
$skipped_tags = false;
14733
for ($j = $size - 2; $j >= 0; $j--) {
14734
if ($this->stack[$j]->name == $token->name) {
14735
$skipped_tags = array_slice($this->stack, $j);
14740
// we didn't find the tag, so remove
14741
if ($skipped_tags === false) {
14742
if ($escape_invalid_tags) {
14743
$this->swap(new HTMLPurifier_Token_Text(
14744
$generator->generateFromToken($token)
14746
if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
14749
if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
14755
// do errors, in REVERSE $j order: a,b,c with </a></b></c>
14756
$c = count($skipped_tags);
14758
for ($j = $c - 1; $j > 0; $j--) {
14759
// notice we exclude $j == 0, i.e. the current ending tag, from
14761
if (!isset($skipped_tags[$j]->armor['MakeWellFormed_TagClosedError'])) {
14762
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$j]);
14767
// insert tags, in FORWARD $j order: c,b,a with </a></b></c>
14768
$replace = array($token);
14769
for ($j = 1; $j < $c; $j++) {
14770
// ...as well as from the insertions
14771
$new_token = new HTMLPurifier_Token_End($skipped_tags[$j]->name);
14772
$new_token->start = $skipped_tags[$j];
14773
array_unshift($replace, $new_token);
14774
if (isset($definition->info[$new_token->name]) && $definition->info[$new_token->name]->formatting) {
14775
$element = clone $skipped_tags[$j];
14776
$element->carryover = true;
14777
$element->armor['MakeWellFormed_TagClosedError'] = true;
14778
$replace[] = $element;
14781
$this->processToken($replace);
14786
$context->destroy('CurrentNesting');
14787
$context->destroy('InputTokens');
14788
$context->destroy('InputIndex');
14789
$context->destroy('CurrentToken');
14791
unset($this->injectors, $this->stack, $this->tokens, $this->t);
14796
/**
 * Applies a substitution to the token stream at the current cursor.
 *
 * Accepted forms of $token:
 *  - array: list of tokens that replace the current token; a leading
 *    integer, if present, is the number of stream tokens to remove
 *    (defaults to 1 when absent).
 *  - token object: replaces the current token.
 *  - false: the current token is deleted.
 *  - integer: that many tokens, starting with the current one, are deleted.
 *
 * @param $token Token substitution value
 * @param $injector Index of the injector that performed the substitution;
 *        the default of -1 means the operation is not injector related.
 * @throws HTMLPurifier_Exception if $token has an unsupported type, or if
 *         the substitution asks for zero tokens to be deleted.
 */
protected function processToken($token, $injector = -1) {

    // bring every accepted form into array(delete count, token, token, ...)
    if (is_object($token)) {
        $token = array(1, $token);
    } elseif (is_int($token)) {
        $token = array($token);
    } elseif ($token === false) {
        $token = array(1);
    } elseif (!is_array($token)) {
        throw new HTMLPurifier_Exception('Invalid token type from injector');
    }
    if (!is_int($token[0])) {
        array_unshift($token, 1);
    }
    if ($token[0] === 0) {
        throw new HTMLPurifier_Exception('Deleting zero tokens is not valid');
    }

    // split the normalized form back into its count and replacement list
    $delete_count = array_shift($token);
    $removed = array_splice($this->tokens, $this->t, $delete_count, $token);

    if ($injector > -1) {
        // replacements inherit the skips of the first removed token and
        // are additionally marked as already seen by this injector
        $inherited_skip = array();
        if (isset($removed[0])) {
            $inherited_skip = $removed[0]->skip;
        }
        foreach ($token as $replacement) {
            $replacement->skip = $inherited_skip;
            $replacement->skip[$injector] = true;
        }
    }

}
14843
/**
 * Inserts a token before the current token. The cursor index is left
 * untouched, so it now points at the inserted token.
 *
 * @param $token Token to insert
 */
private function insertBefore($token) {
    $insertion = array($token);
    // a length of zero makes array_splice insert without removing anything
    array_splice($this->tokens, $this->t, 0, $insertion);
}
14850
/**
 * Removes the current token. The cursor index is left untouched, so it
 * now points at whichever token moved into the vacated position.
 */
private function remove() {
    $cursor = $this->t;
    array_splice($this->tokens, $cursor, 1);
}
14858
/**
 * Overwrites the token at the cursor with a new token. The cursor itself
 * does not move.
 *
 * @param $token Replacement token
 */
private function swap($token) {
    $cursor = $this->t;
    $this->tokens[$cursor] = $token;
}
14871
/**
 * Removes all unrecognized tags from the list of tokens.
 *
 * This strategy walks the token list once and drops every token it does not
 * recognize. If an unrecognized tag has a TagTransform registered for its
 * name, the tag is transformed first and then re-examined.
 */
class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
{

    /**
     * @param $tokens Array of tokens to filter
     * @param $config Configuration object; the Core.EscapeInvalidTags,
     *        Core.RemoveInvalidImg, HTML.Trusted, Core.RemoveScriptContents,
     *        Core.HiddenElements and Core.CollectErrors directives are read
     * @param $context Context object; 'CurrentToken' is registered for the
     *        duration of the call
     * @return Array of the tokens that were kept
     */
    public function execute($tokens, $config, $context) {
        $definition = $config->getHTMLDefinition();
        $generator  = new HTMLPurifier_Generator($config, $context);
        $result     = array();

        $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
        $remove_invalid_img  = $config->get('Core.RemoveInvalidImg');

        // currently only used to determine if comments should be kept
        $trusted = $config->get('HTML.Trusted');

        $remove_script_contents = $config->get('Core.RemoveScriptContents');
        $hidden_elements        = $config->get('Core.HiddenElements');

        // remove script contents compatibility: a strict boolean value of
        // Core.RemoveScriptContents overrides the 'script' entry
        if ($remove_script_contents === true) {
            $hidden_elements['script'] = true;
        } elseif ($remove_script_contents === false && isset($hidden_elements['script'])) {
            unset($hidden_elements['script']);
        }

        $attr_validator = new HTMLPurifier_AttrValidator();

        // while set to a tag name, tokens are dropped until a tag with
        // that name is encountered
        $remove_until = false;

        // while set to a tag name, comments are converted into text tokens
        $textify_comments = false;

        // $token is registered by reference: every assignment to it below
        // is what the context reports as 'CurrentToken'
        $token = false;
        $context->register('CurrentToken', $token);

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        foreach ($tokens as $token) {

            // inside a removed element: skip anything that is not a tag
            // carrying the awaited name
            if ($remove_until && (empty($token->is_tag) || $token->name !== $remove_until)) {
                continue;
            }

            if (!empty($token->is_tag)) {

                // before any processing, try to transform the element
                if (isset($definition->info_tag_transform[$token->name])) {
                    $original_name = $token->name;
                    $transform = $definition->info_tag_transform[$original_name];
                    $token = $transform->transform($token, $config, $context);
                    if ($e) $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Tag transform', $original_name);
                }

                if (isset($definition->info[$token->name])) {

                    // the element is known, but its required attributes
                    // must survive validation for the tag to be kept
                    $opens_element =
                        $token instanceof HTMLPurifier_Token_Start ||
                        $token instanceof HTMLPurifier_Token_Empty;
                    if (
                        $opens_element &&
                        $definition->info[$token->name]->required_attr &&
                        ($token->name != 'img' || $remove_invalid_img) // ensure config option still works
                    ) {
                        $attr_validator->validateToken($token, $config, $context);
                        $missing = null;
                        foreach ($definition->info[$token->name]->required_attr as $name) {
                            if (!isset($token->attr[$name])) {
                                $missing = $name;
                                break;
                            }
                        }
                        if ($missing !== null) {
                            if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Missing required attribute', $missing);
                            continue;
                        }
                        // attributes were validated here already
                        $token->armor['ValidateAttributes'] = true;
                    }

                    // track whether we are inside an allowed hidden element
                    if (isset($hidden_elements[$token->name]) && $token instanceof HTMLPurifier_Token_Start) {
                        $textify_comments = $token->name;
                    } elseif ($token->name === $textify_comments && $token instanceof HTMLPurifier_Token_End) {
                        $textify_comments = false;
                    }

                } elseif ($escape_invalid_tags) {
                    // invalid tag: keep its generated HTML as plain text
                    if ($e) $e->send(E_WARNING, 'Strategy_RemoveForeignElements: Foreign element to text');
                    $token = new HTMLPurifier_Token_Text(
                        $generator->generateFromToken($token)
                    );
                } else {
                    // invalid tag that is dropped; for hidden elements the
                    // contents have to be dropped as well
                    // CAN BE GENERICIZED
                    if (!isset($hidden_elements[$token->name])) {
                        if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed');
                    } else {
                        if ($token instanceof HTMLPurifier_Token_Start) {
                            $remove_until = $token->name;
                        } elseif ($token instanceof HTMLPurifier_Token_Empty) {
                            // do nothing: we're still looking
                        } else {
                            $remove_until = false;
                        }
                        if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign meta element removed');
                    }
                    continue;
                }

            } elseif ($token instanceof HTMLPurifier_Token_Comment) {

                if ($textify_comments !== false) {
                    // textify comments in script tags when they are allowed
                    $data = $token->data;
                    $token = new HTMLPurifier_Token_Text($data);
                } elseif ($trusted) {
                    // keep, but perform comment cleaning
                    if ($e && substr($token->data, -1) == '-') {
                        $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Trailing hyphen in comment removed');
                    }
                    $token->data = rtrim($token->data, '-');
                    $found_double_hyphen = false;
                    while (strpos($token->data, '--') !== false) {
                        if ($e && !$found_double_hyphen) {
                            $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed');
                        }
                        $found_double_hyphen = true; // prevent double-erroring
                        $token->data = str_replace('--', '-', $token->data);
                    }
                } else {
                    // strip comments
                    if ($e) $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed');
                    continue;
                }

            } elseif (!($token instanceof HTMLPurifier_Token_Text)) {
                // neither tag, comment nor text: drop it
                continue;
            }

            $result[] = $token;
        }

        if ($remove_until && $e) {
            // we removed tokens until the end, throw error
            $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Token removed to end', $remove_until);
        }

        $context->destroy('CurrentToken');

        return $result;
    }

}
15043
/**
 * Validate all attributes in the tokens.
 */
class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
{

    /**
     * Runs the attribute validator over every start and empty tag that is
     * not armored against it.
     *
     * @param $tokens Array of tokens
     * @param $config Configuration object, passed on to the validator
     * @param $context Context object; 'CurrentToken' is registered for the
     *        duration of the call
     * @return Array of tokens, same keys as the input
     */
    public function execute($tokens, $config, $context) {

        // setup validator
        $validator = new HTMLPurifier_AttrValidator();

        // registered by reference, so the loop variable below doubles as
        // the context's 'CurrentToken'
        $token = false;
        $context->register('CurrentToken', $token);

        foreach ($tokens as $key => $token) {

            // only start and empty tags carry attributes
            $has_attributes =
                $token instanceof HTMLPurifier_Token_Start ||
                $token instanceof HTMLPurifier_Token_Empty;
            if (!$has_attributes) {
                continue;
            }

            // skip tokens that are armored
            if (!empty($token->armor['ValidateAttributes'])) {
                continue;
            }

            // note that we have no facilities here for removing tokens
            $validator->validateToken($token, $config, $context);

            $tokens[$key] = $token; // for PHP 4
        }

        $context->destroy('CurrentToken');

        return $tokens;
    }

}
15083
/**
 * Transforms FONT tags to the proper form (SPAN with CSS styling)
 *
 * This transformation takes the three proprietary attributes of FONT and
 * transforms them into their corresponding CSS attributes. These are color,
 * face, and size.
 *
 * @note Size is an interesting case because it doesn't map cleanly to CSS.
 *       Thanks to
 *       http://style.cleverchimp.com/font_size_intervals/altintervals.html
 *       for reasonable mappings.
 * @warning This doesn't work completely correctly; specifically, this
 *          TagTransform operates before well-formedness is enforced, so
 *          the "active formatting elements" algorithm doesn't get applied.
 */
class HTMLPurifier_TagTransform_Font extends HTMLPurifier_TagTransform
{

    public $transform_to = 'span';

    /**
     * Lookup from (normalized) FONT size attribute value to CSS font-size.
     */
    protected $_size_lookup = array(
        '0' => 'xx-small',
        '1' => 'xx-small',
        '2' => 'small',
        '3' => 'medium',
        '4' => 'large',
        '5' => 'x-large',
        '6' => 'xx-large',
        '7' => '300%',
        '-1' => 'smaller',
        '-2' => '60%',
        '+1' => 'larger',
        '+2' => '150%',
        '+3' => '200%',
        '+4' => '300%'
    );

    /**
     * @param $tag Tag token to transform; it is cloned, never modified
     * @param $config Configuration object (unused here)
     * @param $context Context object (unused here)
     * @return Clone of $tag renamed to $transform_to; for non-end tags the
     *         color, face and size attributes are folded into the front of
     *         the style attribute
     */
    public function transform($tag, $config, $context) {

        // end tags have no attributes to convert: rename only
        if ($tag instanceof HTMLPurifier_Token_End) {
            $new_tag = clone $tag;
            $new_tag->name = $this->transform_to;
            return $new_tag;
        }

        $attr = $tag->attr;
        $prepend_style = '';

        // handle color transform
        if (isset($attr['color'])) {
            $prepend_style .= 'color:' . $attr['color'] . ';';
            unset($attr['color']);
        }

        // handle face transform
        if (isset($attr['face'])) {
            $prepend_style .= 'font-family:' . $attr['face'] . ';';
            unset($attr['face']);
        }

        // handle size transform
        if (isset($attr['size'])) {
            $size_value = (string) $attr['size'];
            // An empty size attribute has no first character to inspect
            // and no lookup entry, so it is simply dropped. Square-bracket
            // offsets are used because the curly-brace form is a parse
            // error on PHP 8.
            if ($size_value !== '') {
                // normalize large numbers
                if ($size_value[0] == '+' || $size_value[0] == '-') {
                    // relative size: clamp to the range -2 .. +4
                    $size = (int) $size_value;
                    if ($size < -2) $size_value = '-2';
                    if ($size > 4)  $size_value = '+4';
                } else {
                    // absolute size: anything above 7 is treated as 7
                    $size = (int) $size_value;
                    if ($size > 7) $size_value = '7';
                }
                if (isset($this->_size_lookup[$size_value])) {
                    $prepend_style .= 'font-size:' .
                        $this->_size_lookup[$size_value] . ';';
                }
            }
            unset($attr['size']);
        }

        if ($prepend_style) {
            // generated declarations go first so existing style wins
            $attr['style'] = isset($attr['style']) ?
                $prepend_style . $attr['style'] :
                $prepend_style;
        }

        $new_tag = clone $tag;
        $new_tag->name = $this->transform_to;
        $new_tag->attr = $attr;

        return $new_tag;

    }
}
15180
/**
 * Simple transformation, just change tag name to something else,
 * and possibly add some styling. This will cover most of the deprecated
 * tag cases.
 */
class HTMLPurifier_TagTransform_Simple extends HTMLPurifier_TagTransform
{

    /**
     * CSS to prepend to the transformed tag, or null for none.
     */
    protected $style;

    /**
     * @param $transform_to Tag name to transform to.
     * @param $style CSS style to add to the tag
     */
    public function __construct($transform_to, $style = null) {
        $this->transform_to = $transform_to;
        $this->style = $style;
    }

    /**
     * @param $tag Tag token to transform; it is cloned, never modified
     * @param $config Configuration object (unused here)
     * @param $context Context object (unused here)
     * @return Renamed clone of $tag, with the configured CSS prepended
     *         when the tag is a start or empty tag
     */
    public function transform($tag, $config, $context) {
        $new_tag = clone $tag;
        $new_tag->name = $this->transform_to;

        if (is_null($this->style)) {
            return $new_tag;
        }

        // only tags that can carry attributes receive the style
        $carries_attributes =
            $new_tag instanceof HTMLPurifier_Token_Start ||
            $new_tag instanceof HTMLPurifier_Token_Empty;
        if ($carries_attributes) {
            $this->prependCSS($new_tag->attr, $this->style);
        }

        return $new_tag;
    }

}
15216
/**
 * Concrete comment token class. Generally will be ignored.
 */
class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
{
    public $data; /**< Character data within comment. */
    public $is_whitespace = true;

    /**
     * Transparent constructor: stores its arguments unchanged.
     *
     * @param $data String comment data.
     * @param $line Value stored in the line property
     * @param $col Value stored in the col property
     */
    public function __construct($data, $line = null, $col = null) {
        $this->data = $data;
        $this->line = $line;
        $this->col  = $col;
    }
}
15239
/**
 * Abstract class of a tag token (start, end or empty), and its behavior.
 */
class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
{
    /**
     * Static bool marker that indicates the class is a tag.
     *
     * This allows us to check objects with <tt>!empty($obj->is_tag)</tt>
     * without having to use a function call <tt>is_a()</tt>.
     */
    public $is_tag = true;

    /**
     * The lower-case name of the tag, like 'a', 'b' or 'blockquote'.
     *
     * @note Strictly speaking, XML tags are case sensitive, so we shouldn't
     * be lower-casing them, but these tokens cater to HTML tags, which are
     * insensitive.
     */
    public $name;

    /**
     * Associative array of the tag's attributes.
     */
    public $attr = array();

    /**
     * Non-overloaded constructor, which lower-cases passed tag name.
     *
     * Attribute keys are lower-cased as well; when a lower-case key is
     * already present, its value is kept and the mixed-case duplicate is
     * dropped.
     *
     * @param $name String name.
     * @param $attr Associative array of attributes.
     * @param $line Value stored in the line property
     * @param $col Value stored in the col property
     */
    public function __construct($name, $attr = array(), $line = null, $col = null) {
        if (!ctype_lower($name)) {
            $name = strtolower($name);
        }
        $this->name = $name;

        foreach ($attr as $key => $value) {
            // normalization only necessary when key is not lowercase
            if (ctype_lower($key)) {
                continue;
            }
            $lowered = strtolower($key);
            if (!isset($attr[$lowered])) {
                $attr[$lowered] = $attr[$key];
            }
            // keys without letters lower-case to themselves; keep those
            if ($lowered !== $key) {
                unset($attr[$key]);
            }
        }

        $this->attr = $attr;
        $this->line = $line;
        $this->col  = $col;
    }
}
15296
/**
 * Concrete empty token class. Adds nothing to the tag base class; it
 * exists so empty tags can be told apart with instanceof.
 */
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
{
}
15308
/**
 * Concrete end token class.
 *
 * @warning This class accepts attributes even though end tags cannot. This
 * is for optimization reasons, as under normal circumstances, the Lexers
 * do not pass attributes.
 */
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
    /**
     * Token that started this node. Added by MakeWellFormed. Please
     * do not edit this!
     */
    public $start;
}
15328
/**
 * Concrete start token class. Adds nothing to the tag base class; it
 * exists so start tags can be told apart with instanceof.
 */
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
{
}
15340
/**
 * Concrete text token class.
 *
 * Text tokens comprise of regular parsed character data (PCDATA) and raw
 * character data (from the CDATA sections). Internally, their
 * data is parsed with all entities expanded. Surprisingly, the text token
 * does have a "tag name" called #PCDATA, which is how the DTD represents it
 * in permissible child nodes.
 */
class HTMLPurifier_Token_Text extends HTMLPurifier_Token
{

    public $name = '#PCDATA'; /**< PCDATA tag name compatible with DTD. */
    public $data; /**< Parsed character data of text. */
    public $is_whitespace; /**< Bool indicating if node is whitespace. */

    /**
     * Constructor, accepts data and determines if it is whitespace.
     *
     * @param $data String parsed character data.
     * @param $line Value stored in the line property
     * @param $col Value stored in the col property
     */
    public function __construct($data, $line = null, $col = null) {
        $whitespace_only = ctype_space($data);

        $this->data = $data;
        $this->is_whitespace = $whitespace_only;
        $this->line = $line;
        $this->col  = $col;
    }

}
15373
/**
 * URI filter that rejects URIs whose host is not our own host or a
 * subdomain of it. URIs without a host are always accepted.
 */
class HTMLPurifier_URIFilter_DisableExternal extends HTMLPurifier_URIFilter
{
    public $name = 'DisableExternal';

    /**
     * Labels of our own host in reverse order (top-level label first), or
     * false when the URI definition does not specify a host.
     */
    protected $ourHostParts = false;

    /**
     * Reads our host from the URI definition and splits it into labels.
     *
     * @param $config Configuration object
     */
    public function prepare($config) {
        $our_host = $config->getDefinition('URI')->host;
        if ($our_host !== null) {
            $this->ourHostParts = array_reverse(explode('.', $our_host));
        }
    }

    /**
     * @param $uri URI object to check (not modified)
     * @param $config Configuration object (unused here)
     * @param $context Context object (unused here)
     * @return true if the URI has no host or its host ends with all labels
     *         of our host; false otherwise, including when our host is
     *         unknown
     */
    public function filter(&$uri, $config, $context) {
        if (is_null($uri->host)) return true;
        if ($this->ourHostParts === false) return false;
        $host_parts = array_reverse(explode('.', $uri->host));
        foreach ($this->ourHostParts as $i => $our_part) {
            if (!isset($host_parts[$i])) return false;
            // Strict comparison: with a loose != PHP compares
            // numeric-looking labels as numbers, so e.g. "1e3" and "1000"
            // would be considered the same label.
            if ($host_parts[$i] !== $our_part) return false;
        }
        return true;
    }
}
15397
/**
 * Variant of the DisableExternal filter that is only applied while the
 * context's 'EmbeddedURI' value is truthy.
 */
class HTMLPurifier_URIFilter_DisableExternalResources extends HTMLPurifier_URIFilter_DisableExternal
{
    public $name = 'DisableExternalResources';

    /**
     * @param $uri URI object to check
     * @param $config Configuration object
     * @param $context Context object; 'EmbeddedURI' is read from it
     * @return true when 'EmbeddedURI' is not set or falsy, otherwise the
     *         parent filter's verdict
     */
    public function filter(&$uri, $config, $context) {
        $is_embedded = $context->get('EmbeddedURI', true);
        if ($is_embedded) {
            return parent::filter($uri, $config, $context);
        }
        return true;
    }
}
15410
/**
 * URI filter that rejects any URI whose host contains one of the fragments
 * listed in the URI.HostBlacklist directive.
 */
class HTMLPurifier_URIFilter_HostBlacklist extends HTMLPurifier_URIFilter
{
    public $name = 'HostBlacklist';

    /**
     * Host fragments as returned by the URI.HostBlacklist directive.
     */
    protected $blacklist = array();

    /**
     * @param $config Configuration object
     * @return true
     */
    public function prepare($config) {
        $this->blacklist = $config->get('URI.HostBlacklist');
        return true;
    }

    /**
     * @param $uri URI object to check (not modified)
     * @param $config Configuration object (unused here)
     * @param $context Context object (unused here)
     * @return false as soon as a blacklisted fragment occurs anywhere in
     *         the host, true otherwise
     */
    public function filter(&$uri, $config, $context) {
        $host = $uri->host;
        foreach ($this->blacklist as $fragment) {
            $position = strpos($host, $fragment);
            if ($position !== false) {
                return false;
            }
        }
        return true;
    }
}
15432
// does not support network paths

/**
 * URI filter that resolves relative URIs against the base URI of the URI
 * definition.
 */
class HTMLPurifier_URIFilter_MakeAbsolute extends HTMLPurifier_URIFilter
{
    public $name = 'MakeAbsolute';

    /**
     * Base URI object taken from the URI definition; null if unset.
     */
    protected $base;

    /**
     * Collapsed path segments of the base URI, last segment discarded.
     */
    protected $basePathStack = array();

    /**
     * Loads the base URI and pre-computes its path stack.
     *
     * @param $config Configuration object
     * @return false (after raising E_USER_WARNING) when no base URI is
     *         available, true otherwise
     */
    public function prepare($config) {
        $def = $config->getDefinition('URI');
        $this->base = $def->base;
        if (is_null($this->base)) {
            trigger_error('URI.MakeAbsolute is being ignored due to lack of value for URI.Base configuration', E_USER_WARNING);
            return false;
        }
        $this->base->fragment = null; // fragment is invalid for base URI
        $stack = explode('/', $this->base->path);
        array_pop($stack); // discard last segment
        $stack = $this->_collapseStack($stack); // do pre-parsing
        $this->basePathStack = $stack;
        return true;
    }

    /**
     * @param $uri URI object; rewritten in place into its absolute form
     * @param $config Configuration object, used for scheme lookup
     * @param $context Context object, used for scheme lookup
     * @return false if the URI names a scheme that is not recognized,
     *         true otherwise
     */
    public function filter(&$uri, $config, $context) {
        if (is_null($this->base)) return true; // abort early
        if (
            $uri->path === '' && is_null($uri->scheme) &&
            is_null($uri->host) && is_null($uri->query) && is_null($uri->fragment)
        ) {
            // reference to current document
            $uri = clone $this->base;
            return true;
        }
        if (!is_null($uri->scheme)) {
            // absolute URI already: don't change
            if (!is_null($uri->host)) return true;
            $scheme_obj = $uri->getSchemeObj($config, $context);
            if (!$scheme_obj) {
                // scheme not recognized
                return false;
            }
            if (!$scheme_obj->hierarchical) {
                // non-hierarchal URI with explicit scheme, don't change
                return true;
            }
            // special case: had a scheme but always is hierarchical and had no authority
        }
        if (!is_null($uri->host)) {
            // network path, don't bother
            return true;
        }
        if ($uri->path === '') {
            $uri->path = $this->base->path;
        } elseif ($uri->path[0] !== '/') {
            // relative path, needs more complicated processing
            $stack = explode('/', $uri->path);
            $new_stack = array_merge($this->basePathStack, $stack);
            if ($new_stack[0] !== '' && !is_null($this->base->host)) {
                // a base with a host needs a rooted path
                array_unshift($new_stack, '');
            }
            $new_stack = $this->_collapseStack($new_stack);
            $uri->path = implode('/', $new_stack);
        } else {
            // absolute path, but still we should collapse
            $uri->path = implode('/', $this->_collapseStack(explode('/', $uri->path)));
        }
        // re-combine
        $uri->scheme = $this->base->scheme;
        if (is_null($uri->userinfo)) $uri->userinfo = $this->base->userinfo;
        if (is_null($uri->host))     $uri->host     = $this->base->host;
        if (is_null($uri->port))     $uri->port     = $this->base->port;
        return true;
    }

    /**
     * Resolve dots and double-dots in a path stack
     *
     * @param $stack List of path segments, as produced by explode('/', ...)
     * @return List of segments with '.' removed and '..' applied; a
     *         trailing '' is appended when the last segment processed was
     *         a dot segment
     */
    private function _collapseStack($stack) {
        $result = array();
        $is_folder = false;
        for ($i = 0; isset($stack[$i]); $i++) {
            $is_folder = false;
            // absorb an internally duplicated slash
            if ($stack[$i] == '' && $i && isset($stack[$i+1])) continue;
            if ($stack[$i] == '..') {
                if (!empty($result)) {
                    $segment = array_pop($result);
                    if ($segment === '' && empty($result)) {
                        // error case: attempted to back out too far:
                        // restore the leading slash
                        $result[] = '';
                    } elseif ($segment === '..') {
                        // cannot remove .. with ..: put back the one just
                        // popped AND keep the current one, otherwise
                        // "../../x" would collapse to "../x"
                        $result[] = '..';
                        $result[] = '..';
                    }
                } else {
                    // relative path, preserve the double-dots
                    $result[] = '..';
                }
                $is_folder = true;
                continue;
            }
            if ($stack[$i] == '.') {
                // silently absorb
                $is_folder = true;
                continue;
            }
            $result[] = $stack[$i];
        }
        if ($is_folder) $result[] = '';
        return $result;
    }
}
15547
/**
 * Post-filter that replaces a browsable URI pointing at a foreign host
 * with the URI built from the configured munge target template.
 */
class HTMLPurifier_URIFilter_Munge extends HTMLPurifier_URIFilter
{
    public $name = 'Munge';
    public $post = true;
    private $target, $parser, $doEmbed, $secretKey;

    /**
     * Placeholder => value map substituted into the target template.
     */
    protected $replace = array();

    /**
     * @param $config Configuration object; the 'URI.' . $name,
     *        URI.MungeResources and URI.MungeSecretKey directives are read
     * @return true
     */
    public function prepare($config) {
        $this->target    = $config->get('URI.' . $this->name);
        $this->parser    = new HTMLPurifier_URIParser();
        $this->doEmbed   = $config->get('URI.MungeResources');
        $this->secretKey = $config->get('URI.MungeSecretKey');
        return true;
    }

    /**
     * @param $uri URI object; replaced by the munged URI when applicable
     * @param $config Configuration object
     * @param $context Context object
     * @return true in every case
     */
    public function filter(&$uri, $config, $context) {
        // embedded resources are only munged when explicitly enabled
        $is_embedded = $context->get('EmbeddedURI', true);
        if ($is_embedded && !$this->doEmbed) {
            return true;
        }

        $scheme_obj = $uri->getSchemeObj($config, $context);
        if (!$scheme_obj) {
            // ignore unknown schemes, maybe another postfilter did it
            return true;
        }
        if (is_null($uri->host) || empty($scheme_obj->browsable)) {
            return true;
        }
        // don't redirect if target host is our host
        if ($uri->host === $config->getDefinition('URI')->host) {
            return true;
        }

        $this->makeReplace($uri, $config, $context);
        $this->replace = array_map('rawurlencode', $this->replace);

        $munged = $this->parser->parse(strtr($this->target, $this->replace));
        // don't redirect if the target host is the same as the
        // starting host
        if ($uri->host !== $munged->host) {
            $uri = $munged; // overwrite
        }
        return true;
    }

    /**
     * Fills $this->replace with the placeholder values for one URI.
     *
     * @param $uri URI object being munged
     * @param $config Configuration object (unused here)
     * @param $context Context object supplying the optional values
     */
    protected function makeReplace($uri, $config, $context) {
        $string = $uri->toString();
        $token  = $context->get('CurrentToken', true);
        // always available
        $this->replace['%s'] = $string;
        $this->replace['%r'] = $context->get('EmbeddedURI', true);
        if ($token) {
            $this->replace['%n'] = $token->name;
        } else {
            $this->replace['%n'] = null;
        }
        $this->replace['%m'] = $context->get('CurrentAttr', true);
        $this->replace['%p'] = $context->get('CurrentCSSProperty', true);
        // not always available
        if ($this->secretKey) {
            $this->replace['%t'] = sha1($this->secretKey . ':' . $string);
        }
    }
}
15607
/**
 * Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738.
 */
class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {

    public $default_port = 21;
    public $browsable = true; // usually
    public $hierarchical = true;

    /**
     * Drops the query and normalizes a trailing ";type=" typecode.
     *
     * @param $uri URI object, modified in place
     * @param $config Configuration object
     * @param $context Context object
     * @return true
     */
    public function validate(&$uri, $config, $context) {
        parent::validate($uri, $config, $context);
        $uri->query = null;

        // typecode check: only the part after the LAST semicolon counts
        $semicolon_pos = strrpos($uri->path, ';'); // reverse
        if ($semicolon_pos === false) {
            return true;
        }

        $type = substr($uri->path, $semicolon_pos + 1); // no semicolon
        $uri->path = substr($uri->path, 0, $semicolon_pos);
        $type_ret = '';

        if (strpos($type, '=') === false) {
            // not a key=value pair, tack it back on encoded
            $uri->path .= '%3B' . $type;
        } else {
            // figure out whether or not the declaration is correct
            list($key, $typecode) = explode('=', $type, 2);
            if ($key !== 'type') {
                // invalid key, tack it back on encoded
                $uri->path .= '%3B' . $type;
            } elseif ($typecode === 'a' || $typecode === 'i' || $typecode === 'd') {
                $type_ret = ";type=$typecode";
            }
        }

        // any semicolon still in the path is data, so encode it
        $uri->path = str_replace(';', '%3B', $uri->path);
        $uri->path .= $type_ret;

        return true;
    }

}
15651
/**
 * Validates http (HyperText Transfer Protocol) as defined by RFC 2616
 */
class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme {

    public $default_port = 80;
    public $browsable = true;
    public $hierarchical = true;

    /**
     * Runs the generic validation, then discards any userinfo component.
     *
     * @param $uri URI object, modified in place
     * @param $config Configuration object
     * @param $context Context object
     * @return true
     */
    public function validate(&$uri, $config, $context) {
        parent::validate($uri, $config, $context);

        $uri->userinfo = null;

        return true;
    }

}
15672
/**
 * Validates https (Secure HTTP) according to http scheme.
 *
 * Only the default port differs from the http scheme class.
 */
class HTMLPurifier_URIScheme_https extends HTMLPurifier_URIScheme_http {

    public $default_port = 443;

}
15684
// VERY RELAXED! Shouldn't cause problems, not even Firefox checks if the
// email is valid, but be careful!

/**
 * Validates mailto (for E-mail) according to RFC 2368
 * @todo Validate the email address
 * @todo Filter allowed query parameters
 */
class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme {

    public $browsable = false;

    /**
     * Runs the generic validation, then clears userinfo, host and port.
     *
     * @param $uri URI object, modified in place
     * @param $config Configuration object
     * @param $context Context object
     * @return true
     */
    public function validate(&$uri, $config, $context) {
        parent::validate($uri, $config, $context);
        foreach (array('userinfo', 'host', 'port') as $component) {
            $uri->$component = null;
        }
        // we need to validate path against RFC 2368's addr-spec
        return true;
    }

}
15713
/**
 * Validates news (Usenet) as defined by generic RFC 1738
 */
class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme {

    public $browsable = false;

    /**
     * Runs the generic validation, then clears every component except
     * scheme, path and fragment.
     *
     * @param $uri URI object, modified in place
     * @param $config Configuration object
     * @param $context Context object
     * @return true
     */
    public function validate(&$uri, $config, $context) {
        parent::validate($uri, $config, $context);
        foreach (array('userinfo', 'host', 'port', 'query') as $component) {
            $uri->$component = null;
        }
        // typecode check needed on path
        return true;
    }

}
15736
/**
 * Validates nntp (Network News Transfer Protocol) as defined by generic RFC 1738
 */
class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme {

    public $default_port = 119;
    public $browsable = false;

    /**
     * Runs the generic validation, then clears userinfo and query.
     *
     * @param $uri URI object, modified in place
     * @param $config Configuration object
     * @param $context Context object
     * @return true
     */
    public function validate(&$uri, $config, $context) {
        parent::validate($uri, $config, $context);
        foreach (array('userinfo', 'query') as $component) {
            $uri->$component = null;
        }
        return true;
    }

}
15757
/**
 * Performs safe variable parsing based on types which can be used by
 * users. This may not be able to represent all possible data inputs,
 * however.
 */
class HTMLPurifier_VarParser_Flexible extends HTMLPurifier_VarParser
{

    /**
     * Converts loosely typed input (typically strings) towards $type.
     *
     * @param $var Value to convert
     * @param $type One of the type constants of the parser
     * @param $allow_null Whether a null $var is returned unchanged
     * @return The converted value; values that cannot be converted by the
     *         INT and FLOAT cases are returned as they came in
     * @throws HTMLPurifier_VarParserException for an unrecognized boolean
     *         string
     */
    protected function parseImplementation($var, $type, $allow_null) {
        if ($allow_null && $var === null) {
            return null;
        }
        switch ($type) {
            // Note: leaving the switch with "break" ends up in the call to
            // errorGeneric() at the bottom. Specific errors can be raised
            // directly inside a case instead.
            case self::MIXED :
            case self::ISTRING :
            case self::STRING :
            case self::TEXT :
            case self::ITEXT :
                return $var;

            case self::INT :
                if (is_string($var) && ctype_digit($var)) {
                    $var = (int) $var;
                }
                return $var;

            case self::FLOAT :
                if (is_int($var) || (is_string($var) && is_numeric($var))) {
                    $var = (float) $var;
                }
                return $var;

            case self::BOOL :
                if (is_int($var) && ($var === 0 || $var === 1)) {
                    $var = (bool) $var;
                } elseif (is_string($var)) {
                    // non-strict in_array: same loose comparison as ==
                    if (in_array($var, array('on', 'true', '1'))) {
                        $var = true;
                    } elseif (in_array($var, array('off', 'false', '0'))) {
                        $var = false;
                    } else {
                        throw new HTMLPurifier_VarParserException("Unrecognized value '$var' for $type");
                    }
                }
                return $var;

            case self::ALIST :
            case self::HASH :
            case self::LOOKUP :
                if (is_string($var)) {
                    // special case: technically, this is an array with
                    // a single empty string item, but having an empty
                    // array is more intuitive
                    if ($var == '') {
                        return array();
                    }
                    $multiline = strpos($var, "\n") !== false || strpos($var, "\r") !== false;
                    if ($multiline) {
                        $var = preg_split('/(,|[\n\r]+)/', $var);
                    } else {
                        // simplistic string to array method that only works
                        // for simple lists of tag names or alphanumeric characters
                        $var = explode(',',$var);
                    }
                    // remove spaces
                    $var = array_map('trim', $var);
                    if ($type === self::HASH) {
                        // key:value,key2:value2
                        $pairs = array();
                        foreach ($var as $keypair) {
                            $parts = explode(':', $keypair, 2);
                            if (isset($parts[1])) {
                                $pairs[$parts[0]] = $parts[1];
                            }
                        }
                        $var = $pairs;
                    }
                }
                if (!is_array($var)) break;
                $keys = array_keys($var);
                if ($keys === array_keys($keys)) {
                    // keys are 0..n-1, i.e. this is a plain list
                    if ($type == self::ALIST) {
                        return $var;
                    }
                    if ($type == self::LOOKUP) {
                        // turn the list values into lookup keys
                        $lookup = array();
                        foreach ($var as $item) {
                            $lookup[$item] = true;
                        }
                        return $lookup;
                    }
                    break;
                }
                if ($type === self::LOOKUP) {
                    // an associative lookup gets all of its values forced
                    // to true
                    foreach ($var as $key => $value) {
                        $var[$key] = true;
                    }
                }
                return $var;

            default:
                $this->errorInconsistent(__CLASS__, $type);
        }
        $this->errorGeneric($var, $type);
    }

}
15854
* This variable parser uses PHP's internal code engine. Because it does
15855
* this, it can represent all inputs; however, it is dangerous and cannot
15856
* be used by users.
15858
class HTMLPurifier_VarParser_Native extends HTMLPurifier_VarParser
15861
protected function parseImplementation($var, $type, $allow_null) {
15862
return $this->evalExpression($var);
15865
protected function evalExpression($expr) {
15867
$result = eval("\$var = $expr;");
15868
if ($result === false) {
15869
throw new HTMLPurifier_VarParserException("Fatal error in evaluated code");