5 * This file was auto-generated by generate-includes.php and includes all of
6 * the core files required by HTML Purifier. Use this if performance is a
7 * primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS
8 * FILE, changes will be overwritten the next time the script is run.
13 * You must *not* include any other HTML Purifier files before this file,
14 * because 'require' not 'require_once' is used.
17 * This file requires that the include path contains the HTML Purifier
18 * library directory; this is not auto-set.
25 * HTML Purifier is an HTML filter that will take an arbitrary snippet of
26 * HTML and rigorously test, validate and filter it into a version that
27 * is safe for output onto webpages. It achieves this by:
29 * -# Lexing (parsing into tokens) the document,
30 * -# Executing various strategies on the tokens:
31 * -# Removing all elements not in the whitelist,
32 * -# Making the tokens well-formed,
33 * -# Fixing the nesting of the nodes, and
34 * -# Validating attributes of the nodes; and
35 * -# Generating HTML from the purified tokens.
37 * However, most users will only need to interface with the HTMLPurifier
38 * and HTMLPurifier_Config.
42 HTML Purifier 4.3.0 - Standards Compliant HTML Filtering
43 Copyright (C) 2006-2008 Edward Z. Yang
45 This library is free software; you can redistribute it and/or
46 modify it under the terms of the GNU Lesser General Public
47 License as published by the Free Software Foundation; either
48 version 2.1 of the License, or (at your option) any later version.
50 This library is distributed in the hope that it will be useful,
51 but WITHOUT ANY WARRANTY; without even the implied warranty of
52 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
53 Lesser General Public License for more details.
55 You should have received a copy of the GNU Lesser General Public
56 License along with this library; if not, write to the Free Software
57 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
61 * Facade that coordinates HTML Purifier's subsystems in order to purify HTML.
63 * @note There are several points in which configuration can be specified
64 * for HTML Purifier. The precedence of these (from lowest to
65 * highest) is as follows:
66 * -# Instance: new HTMLPurifier($config)
67 * -# Invocation: purify($html, $config)
68 * These configurations are entirely independent of each other and
69 * are *not* merged (this behavior may change in the future).
71 * @todo We need an easier way to inject strategies using the configuration
77 /** Version of HTML Purifier */
78 public $version = '4.3.0';
80 /** Constant with version of HTML Purifier */
81 const VERSION = '4.3.0';
83 /** Global configuration object */
86 /** Array of extra HTMLPurifier_Filter objects to run on HTML, for backwards compatibility */
87 private $filters = array();
89 /** Single instance of HTML Purifier */
90 private static $instance;
92 protected $strategy, $generator;
95 * Resultant HTMLPurifier_Context of last run purification. Is an array
96 * of contexts if the last called method was purifyArray().
/**
 * Builds a purifier around a default configuration and the core strategy.
 *
 * @param $config Optional configuration shared by every purify() call that
 *                does not bring its own. The value is handed straight to
 *                HTMLPurifier_Config::create(), so anything that factory
 *                accepts is valid here; when omitted, a default
 *                configuration is supplied.
 */
public function __construct($config = null) {
    // normalise whatever the caller passed into a config object
    $default_config = HTMLPurifier_Config::create($config);
    $this->config = $default_config;

    // the strategy that purify() later runs over the token stream
    $core_strategy = new HTMLPurifier_Strategy_Core();
    $this->strategy = $core_strategy;
}
/**
 * Queues an extra filter on this instance. Queued filters are appended
 * after the configuration-defined ones and keep the order in which they
 * were added (first come, first served).
 *
 * Every call raises an E_USER_WARNING because this entry point is
 * deprecated in favour of the Filter configuration namespace.
 *
 * @param $filter HTMLPurifier_Filter object
 */
public function addFilter($filter) {
    $notice = 'HTMLPurifier->addFilter() is deprecated, use configuration directives in the Filter namespace or Filter.Custom';
    trigger_error($notice, E_USER_WARNING);
    array_push($this->filters, $filter);
}
/**
 * Filters an HTML snippet/document to be XSS-free and standards-compliant.
 *
 * The context built for this run is kept in $this->context afterwards.
 *
 * @param $html   String of HTML to purify
 * @param $config Configuration for this call only. Anything that
 *                HTMLPurifier_Config::create() supports is accepted; when
 *                empty, the configuration given at construction is used.
 *                The two are not merged: a per-call configuration replaces
 *                the instance one outright.
 * @return Purified HTML
 */
public function purify($html, $config = null) {

    // :TODO: merge the per-call config into the instance config instead of
    // replacing it
    if ($config) {
        $config = HTMLPurifier_Config::create($config);
    } else {
        $config = $this->config;
    }

    // the lexer implementation is partially environment dependent,
    // partially configuration dependent
    $lexer   = HTMLPurifier_Lexer::create($config);
    $context = new HTMLPurifier_Context();

    // set up the HTML generator and publish it through the context
    $this->generator = new HTMLPurifier_Generator($config, $context);
    $context->register('Generator', $this->generator);

    // error collection (and the locale it reports in) is opt-in;
    // may get moved out if other facilities use it
    if ($config->get('Core.CollectErrors')) {
        $language = HTMLPurifier_LanguageFactory::instance()->create($config, $context);
        $context->register('Locale', $language);

        $error_collector = new HTMLPurifier_ErrorCollector($context);
        $context->register('ErrorCollector', $error_collector);
    }

    // the id accumulator lives in the context because AttrValidator can be
    // called from many places
    $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
    $context->register('IDAccumulator', $id_accumulator);

    $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);

    // assemble the filter chain: flag-enabled filters first, then custom
    // filter objects, then anything queued through addFilter()
    $filter_flags   = $config->getBatch('Filter');
    $custom_filters = $filter_flags['Custom'];
    unset($filter_flags['Custom']);

    $filters = array();
    foreach ($filter_flags as $name => $enabled) {
        // skip switched-off flags and keys that contain a dot
        if (!$enabled || strpos($name, '.') !== false) {
            continue;
        }
        $class = "HTMLPurifier_Filter_$name";
        $filters[] = new $class;
    }
    foreach ($custom_filters as $custom) {
        // maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat
        $filters[] = $custom;
    }
    $filters = array_merge($filters, $this->filters);
    // maybe prepare(), but later

    // pre-filters run front to back
    foreach ($filters as $filter) {
        $html = $filter->preFilter($html, $config, $context);
    }

    // un-purified HTML -> un-purified tokens -> purified tokens -> purified HTML
    $tokens = $lexer->tokenizeHTML($html, $config, $context);
    $tokens = $this->strategy->execute($tokens, $config, $context);
    $html   = $this->generator->generateFromTokens($tokens);

    // post-filters run back to front, mirroring the pre-filter pass
    foreach (array_reverse($filters) as $filter) {
        $html = $filter->postFilter($html, $config, $context);
    }

    $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
    $this->context =& $context;
    return $html;
}
/**
 * Purifies every entry of an array of HTML snippets, preserving keys.
 *
 * After the call $this->context holds an array, keyed like the input,
 * with the context left behind by each individual purify() run.
 *
 * @param $array_of_html Array of HTML strings to purify
 * @param $config Optional HTMLPurifier_Config object for this operation.
 *                See HTMLPurifier::purify() for more details.
 * @return Array of purified HTML
 */
public function purifyArray($array_of_html, $config = null) {
    $contexts_by_key = array();
    foreach ($array_of_html as $key => $snippet) {
        $array_of_html[$key] = $this->purify($snippet, $config);
        // purify() leaves the context of the run it just finished here
        $contexts_by_key[$key] = $this->context;
    }
    $this->context = $contexts_by_key;
    return $array_of_html;
}
/**
 * Singleton accessor for a shared HTML Purifier.
 *
 * @param $prototype Optional. An HTMLPurifier instance replaces the
 *                   singleton with that object; any other non-empty value
 *                   is passed to the constructor of a fresh singleton.
 *                   When empty, the existing singleton is reused, or a
 *                   default one is created if none exists yet.
 * @return The shared HTMLPurifier instance
 */
public static function instance($prototype = null) {
    // nothing to (re)build: a singleton exists and no override was given
    if (self::$instance && !$prototype) {
        return self::$instance;
    }

    if ($prototype instanceof HTMLPurifier) {
        self::$instance = $prototype;
    } else {
        self::$instance = $prototype
            ? new HTMLPurifier($prototype)
            : new HTMLPurifier();
    }
    return self::$instance;
}
/**
 * Alias of instance(), kept for backwards compatibility.
 *
 * @param $prototype Passed through unchanged to instance()
 * @return Whatever instance() returns
 */
public static function getInstance($prototype = null) {
    $purifier = self::instance($prototype);
    return $purifier;
}
/**
 * Defines common attribute collections that modules reference
 */
class HTMLPurifier_AttrCollections
{

    /**
     * Associative array of attribute collections, indexed by name
     */
    public $info = array();

    /**
     * Collects the attribute collections declared by every module, then
     * resolves inclusions and string identifiers for each collection.
     *
     * @param $attr_types HTMLPurifier_AttrTypes instance
     * @param $modules Hash array of HTMLPurifier_HTMLModule members
     */
    public function __construct($attr_types, $modules) {
        // load extensions from the modules
        foreach ($modules as $module) {
            foreach ($module->attr_collections as $coll_name => $members) {
                if (!isset($this->info[$coll_name])) {
                    $this->info[$coll_name] = array();
                }
                foreach ($members as $attr_name => $attr_def) {
                    // index 0 is the include list; lists from several
                    // modules are concatenated rather than overwritten
                    $already_has_includes = ($attr_name === 0)
                        && isset($this->info[$coll_name][0]);
                    if ($already_has_includes) {
                        $this->info[$coll_name][0] = array_merge(
                            $this->info[$coll_name][0],
                            $attr_def
                        );
                    } else {
                        $this->info[$coll_name][$attr_name] = $attr_def;
                    }
                }
            }
        }

        // perform internal expansions and inclusions, one collection at a time
        foreach (array_keys($this->info) as $coll_name) {
            // merge attribute collections that include others
            $this->performInclusions($this->info[$coll_name]);
            // replace string identifiers with actual attribute objects
            $this->expandIdentifiers($this->info[$coll_name], $attr_types);
        }
    }

    /**
     * Copies into $attr every attribute of the collections named in its
     * zero index (following their own include lists as well), then removes
     * the zero index. Attributes already set in $attr are left alone.
     *
     * @param &$attr Reference to attribute array
     */
    public function performInclusions(&$attr) {
        if (!isset($attr[0])) {
            return;
        }

        $pending = $attr[0];
        $visited = array(); // recursion guard

        // $pending grows while we walk it, so test the index on every pass
        for ($i = 0; isset($pending[$i]); $i++) {
            $coll_name = $pending[$i];
            if (isset($visited[$coll_name])) {
                continue;
            }
            $visited[$coll_name] = true;

            if (!isset($this->info[$coll_name])) {
                continue;
            }
            $included = $this->info[$coll_name];

            // existing keys win; this also keeps our own include list intact
            foreach ($included as $key => $value) {
                if (!isset($attr[$key])) {
                    $attr[$key] = $value;
                }
            }

            // queue whatever the included collection includes in turn
            if (isset($included[0])) {
                $pending = array_merge($pending, $included[0]);
            }
        }

        unset($attr[0]);
    }

    /**
     * Expands all string identifiers in an attribute array by replacing
     * them with the objects returned by $attr_types->get(). A '*' in an
     * attribute name marks it as required and is stripped from the name;
     * entries whose definition is false, or whose type cannot be resolved,
     * are removed.
     *
     * @param &$attr Reference to attribute array
     * @param $attr_types HTMLPurifier_AttrTypes instance
     */
    public function expandIdentifiers(&$attr, $attr_types) {

        // renamed entries are re-added to $attr while we iterate; remember
        // what has been handled so nothing is processed twice
        $handled = array();

        foreach ($attr as $name => $def) {
            // skip the include list and anything already dealt with
            if ($name === 0 || isset($handled[$name])) {
                continue;
            }

            // a '*' flags the attribute as required; store it under the
            // clean name
            $required = (strpos($name, '*') !== false);
            if ($required) {
                unset($attr[$name]);
                $name = trim($name, '*');
                $attr[$name] = $def;
            }

            $handled[$name] = true;

            if (is_object($def)) {
                // already a literal object: only carry the required flag over
                $attr[$name]->required = ($required || $attr[$name]->required);
            } elseif ($def === false) {
                unset($attr[$name]);
            } else {
                $resolved = $attr_types->get($def);
                if ($resolved) {
                    $attr[$name] = $resolved;
                    $attr[$name]->required = $required;
                } else {
                    unset($attr[$name]);
                }
            }
        }

    }

}
/**
 * Base class for all validating attribute definitions.
 *
 * This family of classes forms the core for not only HTML attribute validation,
 * but also any sort of string that needs to be validated or cleaned (which
 * means CSS properties and composite definitions are defined here too).
 * Besides defining (through code) what precisely makes the string valid,
 * subclasses are also responsible for cleaning the code if possible.
 */
abstract class HTMLPurifier_AttrDef
{

    /**
     * Tells us whether or not an HTML attribute is minimized. Has no
     * meaning in other contexts.
     */
    public $minimized = false;

    /**
     * Tells us whether or not an HTML attribute is required. Has no
     * meaning in other contexts
     */
    public $required = false;

    /**
     * Validates and cleans passed string according to a definition.
     *
     * @param $string String to be validated and cleaned.
     * @param $config Mandatory HTMLPurifier_Config object.
     * @param $context Mandatory HTMLPurifier_AttrContext object.
     */
    abstract public function validate($string, $config, $context);

    /**
     * Convenience method that parses a string as if it were CDATA.
     *
     * Leading and trailing whitespace is trimmed, then every remaining
     * line feed, tab and carriage return is replaced with a single space.
     * This follows <http://www.w3.org/TR/html4/types.html#h-6.2>, except
     * that the spec has line feeds ignored whereas this method turns them
     * into spaces. While most useful for HTML attributes specified as
     * CDATA, it can also be applied to most CSS values.
     *
     * @note This method is not entirely standards compliant, as trim() removes
     *       more types of whitespace than specified in the spec. In practice,
     *       this is rarely a problem, as those extra characters usually have
     *       already been removed by HTMLPurifier_Encoder.
     *
     * @warning This processing is inconsistent with XML's whitespace handling
     *          as specified by section 3.3.3 and referenced XHTML 1.0 section
     *          4.7. However, note that we are NOT necessarily
     *          parsing XML, thus, this behavior may still be correct. We
     *          assume that newlines have been normalized.
     *
     * @param $string String to normalize
     * @return Normalized string
     */
    public function parseCDATA($string) {
        $string = trim($string);
        $string = str_replace(array("\n", "\t", "\r"), ' ', $string);
        return $string;
    }

    /**
     * Factory method for creating this class from a string.
     * @param $string String construction info
     * @return Created AttrDef object corresponding to $string
     */
    public function make($string) {
        // default implementation, return a flyweight of this object.
        // If $string has an effect on the returned object (i.e. you
        // need to overload this method), it is best
        // to clone or instantiate new copies. (Instantiation is safer.)
        return $this;
    }

    /**
     * Removes spaces from rgb(0, 0, 0) so that shorthand CSS properties work
     * properly. THIS IS A HACK!
     *
     * Each of the three components may be an integer or a percentage
     * (e.g. "10%" or "12.5%") and may be padded with whitespace on either
     * side, including directly inside the parentheses. Anything that does
     * not match that shape is returned untouched; no validation happens here.
     *
     * @param $string CSS value that may contain rgb() colors
     * @return The value with whitespace stripped from inside each rgb()
     */
    protected function mungeRgb($string) {
        // percentage alternative first so "10%" is not cut short at "10"
        $component = '\s*(\d+(?:\.\d+)?%|\d+)\s*';
        $pattern = '/rgb\(' . $component . ',' . $component . ',' . $component . '\)/';
        return preg_replace($pattern, 'rgb(\1,\2,\3)', $string);
    }

    /**
     * Parses a possibly escaped CSS string and returns the "pure"
     * version of it.
     *
     * Handling per backslash:
     *  - followed by one to six hex digits: replaced by the character
     *    HTMLPurifier_Encoder::unichr() yields for that code point, provided
     *    HTMLPurifier_Encoder::cleanUTF8() does not reduce it to ''; a
     *    single whitespace character right after the digits is consumed;
     *  - followed by a line feed: both are dropped;
     *  - followed by anything else: the backslash is dropped and the
     *    character kept;
     *  - at the very end of the string: kept as a literal backslash.
     *
     * @param $string CSS string with escapes
     * @return String with the escapes expanded
     */
    protected function expandCSSEscape($string) {
        // flexibly parse it
        $ret = '';
        for ($i = 0, $c = strlen($string); $i < $c; $i++) {
            if ($string[$i] === '\\') {
                $i++;
                if ($i >= $c) {
                    // dangling backslash at end of input
                    $ret .= '\\';
                    break;
                }
                if (ctype_xdigit($string[$i])) {
                    // gather up to six hex digits
                    $code = $string[$i];
                    for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
                        if (!ctype_xdigit($string[$i])) break;
                        $code .= $string[$i];
                    }
                    // We have to be extremely careful when adding
                    // new characters, to make sure we're not breaking
                    // the encoding.
                    $char = HTMLPurifier_Encoder::unichr(hexdec($code));
                    // NOTE(review): on this path the character following the
                    // escape is skipped too, whitespace or not -- confirm
                    // that is intended.
                    if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
                    $ret .= $char;
                    // $i sits on the first character after the digits: step
                    // back unless it is whitespace, so only a whitespace
                    // terminator gets swallowed by the loop increment
                    if ($i < $c && trim($string[$i]) !== '') $i--;
                    continue;
                }
                if ($string[$i] === "\n") continue;
            }
            $ret .= $string[$i];
        }
        return $ret;
    }

}
/**
 * Processes an entire attribute array for corrections needing multiple values.
 *
 * Occasionally, a certain attribute will need to be removed and popped onto
 * another value. Instead of creating a complex return syntax for
 * HTMLPurifier_AttrDef, we just pass the whole attribute array to a
 * specialized object and have that do the special work. That is the
 * family of HTMLPurifier_AttrTransform.
 *
 * An attribute transformation can be assigned to run before or after
 * HTMLPurifier_AttrDef validation. See HTMLPurifier_HTMLDefinition for
 * more details.
 */
abstract class HTMLPurifier_AttrTransform
{

    /**
     * Abstract: makes changes to the attributes dependent on multiple values.
     *
     * @param $attr Assoc array of attributes, usually from
     *              HTMLPurifier_Token_Tag::$attr
     * @param $config Mandatory HTMLPurifier_Config object.
     * @param $context Mandatory HTMLPurifier_Context object
     * @returns Processed attribute array.
     */
    abstract public function transform($attr, $config, $context);

    /**
     * Puts CSS in front of whatever the style attribute already holds; a
     * missing style attribute is treated as empty and gets created.
     *
     * @param $attr Attribute array to process (passed by reference)
     * @param $css CSS to prepend
     */
    public function prependCSS(&$attr, $css) {
        $existing = isset($attr['style']) ? $attr['style'] : '';
        $attr['style'] = $css . $existing;
    }

    /**
     * Takes an attribute out of the array and hands its value back.
     *
     * @param $attr Attribute array to process (passed by reference)
     * @param $key Key of attribute to confiscate
     * @return The removed value, or null when the attribute is not set
     */
    public function confiscateAttr(&$attr, $key) {
        if (isset($attr[$key])) {
            $taken = $attr[$key];
            unset($attr[$key]);
            return $taken;
        }
        return null;
    }

}
/**
 * Provides lookup array of attribute types to HTMLPurifier_AttrDef objects
 */
class HTMLPurifier_AttrTypes
{
    /**
     * Lookup array of attribute string identifiers to concrete implementations
     */
    protected $info = array();

    /**
     * Fills the lookup array with one default implementation per attribute
     * type.
     */
    public function __construct() {
        // type name => class instantiated without constructor arguments
        $defaults = array(
            // pseudo-types, must be instantiated via shorthand
            'Enum'          => 'HTMLPurifier_AttrDef_Enum',
            'Bool'          => 'HTMLPurifier_AttrDef_HTML_Bool',

            'CDATA'         => 'HTMLPurifier_AttrDef_Text',
            'ID'            => 'HTMLPurifier_AttrDef_HTML_ID',
            'Length'        => 'HTMLPurifier_AttrDef_HTML_Length',
            'MultiLength'   => 'HTMLPurifier_AttrDef_HTML_MultiLength',
            'NMTOKENS'      => 'HTMLPurifier_AttrDef_HTML_Nmtokens',
            'Pixels'        => 'HTMLPurifier_AttrDef_HTML_Pixels',
            'Text'          => 'HTMLPurifier_AttrDef_Text',
            'URI'           => 'HTMLPurifier_AttrDef_URI',
            'LanguageCode'  => 'HTMLPurifier_AttrDef_Lang',
            'Color'         => 'HTMLPurifier_AttrDef_HTML_Color',

            // unimplemented aliases
            'ContentType'   => 'HTMLPurifier_AttrDef_Text',
            'ContentTypes'  => 'HTMLPurifier_AttrDef_Text',
            'Charsets'      => 'HTMLPurifier_AttrDef_Text',
            'Character'     => 'HTMLPurifier_AttrDef_Text',

            // "proprietary" types
            'Class'         => 'HTMLPurifier_AttrDef_HTML_Class',
        );
        foreach ($defaults as $type_name => $class) {
            $this->info[$type_name] = new $class();
        }

        // number is really a positive integer (one or more digits)
        // FIXME: ^^ not always, see start and value of list items
        $this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true);
    }

    /**
     * Retrieves a type. Text after the first '#' in $type is split off
     * and passed to the implementation's make(); without a '#', make()
     * receives an empty string. An unknown type name raises an
     * E_USER_ERROR and yields no value.
     *
     * @param $type String type name
     * @return Object AttrDef for type
     */
    public function get($type) {

        // determine if there is any extra info tacked on
        $string = '';
        if (strpos($type, '#') !== false) {
            list($type, $string) = explode('#', $type, 2);
        }

        if (isset($this->info[$type])) {
            return $this->info[$type]->make($string);
        }

        trigger_error('Cannot retrieve undefined attribute type ' . $type, E_USER_ERROR);
        return;

    }

    /**
     * Sets a new implementation for a type
     * @param $type String type name
     * @param $impl Object AttrDef for type
     */
    public function set($type, $impl) {
        $this->info[$type] = $impl;
    }
}
/**
 * Validates the attributes of a token. Doesn't manage required attributes
 * very well. The only reason we factored this out was because RemoveForeignElements
 * also needed it besides ValidateAttributes.
 */
class HTMLPurifier_AttrValidator
{

    /**
     * Validates the attributes of a token in place: invalid or unknown
     * attributes are removed, corrected values are substituted, and the
     * configured attribute transforms are run before and after validation.
     * The token's attr array is only replaced once, at the very end.
     *
     * Tokens that are neither HTMLPurifier_Token_Start nor
     * HTMLPurifier_Token_Empty are returned untouched, and no CurrentToken
     * entry is left behind in the context for them.
     *
     * @param $token Reference to token to validate. We require a reference
     *     because the operation this class performs on the token are
     *     not atomic, so the context CurrentToken to be updated
     *     throughout
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return The token itself when it is not a start/empty tag; otherwise
     *     nothing (the token is modified through the reference)
     */
    public function validateToken(&$token, &$config, $context) {

        $definition = $config->getHTMLDefinition();
        $e =& $context->get('ErrorCollector', true);

        // initialize IDAccumulator if necessary
        $ok =& $context->get('IDAccumulator', true);
        if (!$ok) {
            $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
            $context->register('IDAccumulator', $id_accumulator);
        }

        // Only start and empty tags carry attributes. Bail out BEFORE
        // registering CurrentToken: returning after the registration would
        // leave a stale entry in the context that nobody destroys.
        if (
            !$token instanceof HTMLPurifier_Token_Start &&
            !$token instanceof HTMLPurifier_Token_Empty
        ) return $token;

        // initialize CurrentToken if necessary
        $current_token =& $context->get('CurrentToken', true);
        if (!$current_token) $context->register('CurrentToken', $token);

        // create alias to global definition array, see also $defs
        // DEFINITION CALL
        $d_defs = $definition->info_global_attr;

        // don't update token until the very end, to ensure an atomic update
        $attr = $token->attr;

        // do global transformations (pre)
        // nothing currently utilizes this
        foreach ($definition->info_attr_transform_pre as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        // do local transformations only applicable to this element (pre)
        // ex. <p align="right"> to <p style="text-align:right;">
        foreach ($definition->info[$token->name]->attr_transform_pre as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        // create alias to this element's attribute definition array, see
        // also $d_defs (global attribute definition array)
        // DEFINITION CALL
        $defs = $definition->info[$token->name]->attr;

        // registered by reference, so the context follows the loop variable
        $attr_key = false;
        $context->register('CurrentAttr', $attr_key);

        // iterate through all the attribute keypairs
        // Watch out for name collisions: $key has previously been used
        foreach ($attr as $attr_key => $value) {

            // call the definition
            if ( isset($defs[$attr_key]) ) {
                // there is a local definition defined
                if ($defs[$attr_key] === false) {
                    // We've explicitly been told not to allow this element.
                    // This is usually when there's a global definition
                    // that must be overridden.
                    // Theoretically speaking, we could have a
                    // AttrDef_DenyAll, but this is faster!
                    $result = false;
                } else {
                    // validate according to the element's definition
                    $result = $defs[$attr_key]->validate(
                                    $value, $config, $context
                               );
                }
            } elseif ( isset($d_defs[$attr_key]) ) {
                // there is a global definition defined, validate according
                // to the global definition
                $result = $d_defs[$attr_key]->validate(
                                $value, $config, $context
                           );
            } else {
                // system never heard of the attribute? DELETE!
                $result = false;
            }

            // put the results into effect
            if ($result === false || $result === null) {
                // this is a generic error message that should replaced
                // with more specific ones when possible
                if ($e) $e->send(E_ERROR, 'AttrValidator: Attribute removed');

                // remove the attribute
                unset($attr[$attr_key]);
            } elseif (is_string($result)) {
                // generally, if a substitution is happening, there
                // was some sort of implicit correction going on. We'll
                // delegate it to the attribute classes to say exactly what.

                // simple substitution
                $attr[$attr_key] = $result;
            } else {
                // nothing happens
            }

            // we'd also want slightly more complicated substitution
            // involving an array as the return value,
            // although we're not sure how colliding attributes would
            // resolve (certain ones would be completely overridden,
            // others would prepend themselves).
        }

        $context->destroy('CurrentAttr');

        // post transforms

        // global (error reporting untested)
        foreach ($definition->info_attr_transform_post as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        // local (error reporting untested)
        foreach ($definition->info[$token->name]->attr_transform_post as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        $token->attr = $attr;

        // destroy CurrentToken if we made it ourselves
        if (!$current_token) $context->destroy('CurrentToken');

    }

}
// constants are slow, so we use as few as possible
if (!defined('HTMLPURIFIER_PREFIX')) {
    define('HTMLPURIFIER_PREFIX', dirname(__FILE__) . '/standalone');
}

// accommodations for versions earlier than 5.0.2
// borrowed from PHP_Compat, LGPL licensed, by Aidan Lister <aidan@php.net>
if (!defined('PHP_EOL')) {
    // Only Windows differs from "\n". Darwin (Mac OS X) is a Unix and ends
    // lines with "\n"; "\r" belonged to classic Mac OS, which does not
    // report itself as Darwin.
    if (strtoupper(substr(PHP_OS, 0, 3)) === 'WIN') {
        define('PHP_EOL', "\r\n");
    } else {
        define('PHP_EOL', "\n");
    }
}
/**
 * Bootstrap class that contains meta-functionality for HTML Purifier such as
 * the autoload function.
 *
 * @note
 *      This class may be used without any other files from HTML Purifier.
 */
class HTMLPurifier_Bootstrap
{

    /**
     * Autoload function for HTML Purifier
     *
     * @param $class Class to load
     * @return True when a file for the class was included, false when
     *         getPath() knows no file for it
     */
    public static function autoload($class) {
        $file = self::getPath($class);
        if ($file) {
            // Technically speaking, it should be ok and more efficient to
            // just do 'require', but Antonio Parraga reports that with
            // Zend extensions such as Zend debugger and APC, this invariant
            // may be broken. Since we have efficient alternatives, pay
            // the cost here and avoid the bug.
            require_once HTMLPURIFIER_PREFIX . '/' . $file;
            return true;
        }
        return false;
    }

    /**
     * Returns the path for a specific class, relative to
     * HTMLPURIFIER_PREFIX.
     *
     * @param $class Class name
     * @return Relative file path, or false when the name does not start
     *         with "HTMLPurifier" or the file does not exist
     */
    public static function getPath($class) {
        if (strncmp('HTMLPurifier', $class, 12) !== 0) {
            return false;
        }

        // Custom implementations: language classes live in one directory
        // and use dashes where the class name has underscores
        if (strncmp('HTMLPurifier_Language_', $class, 22) === 0) {
            $language_code = str_replace('_', '-', substr($class, 22));
            $file = 'HTMLPurifier/Language/classes/' . $language_code . '.php';
        } else {
            $file = str_replace('_', '/', $class) . '.php';
        }

        return file_exists(HTMLPURIFIER_PREFIX . '/' . $file) ? $file : false;
    }

    /**
     * "Pre-registers" our autoloader on the SPL stack: the autoloaders
     * already registered are unregistered, ours is registered, and the
     * previous ones are registered again after it.
     */
    public static function registerAutoload() {
        $autoload = array('HTMLPurifier_Bootstrap', 'autoload');
        $funcs = spl_autoload_functions();
        if ($funcs === false) {
            spl_autoload_register($autoload);
        } elseif (function_exists('spl_autoload_unregister')) {
            $buggy  = version_compare(PHP_VERSION, '5.2.11', '<');
            $compat = version_compare(PHP_VERSION, '5.1.2', '<=') &&
                      version_compare(PHP_VERSION, '5.1.0', '>=');
            foreach ($funcs as $func) {
                if ($buggy && is_array($func)) {
                    // :TRICKY: There are some compatibility issues and some
                    // places where we need to error out
                    $reflector = new ReflectionMethod($func[0], $func[1]);
                    if (!$reflector->isStatic()) {
                        throw new Exception('
                            HTML Purifier autoloader registrar is not compatible
                            with non-static object methods due to PHP Bug #44144;
                            Please do not use HTMLPurifier.autoload.php (or any
                            file that includes this file); instead, place the code:
                            spl_autoload_register(array(\'HTMLPurifier_Bootstrap\', \'autoload\'))
                            after your own autoloaders.
                        ');
                    }
                    // Surprisingly, spl_autoload_register supports the
                    // Class::staticMethod callback format, although call_user_func doesn't
                    if ($compat) $func = implode('::', $func);
                }
                spl_autoload_unregister($func);
            }
            // ours first, then everything that was there before
            spl_autoload_register($autoload);
            foreach ($funcs as $func) {
                spl_autoload_register($func);
            }
        }
    }

}
/**
 * Super-class for definition datatype objects, implements serialization
 * functions for the class.
 */
abstract class HTMLPurifier_Definition
{

    /**
     * Has setup() been called yet?
     */
    public $setup = false;

    /**
     * If true, write out the final definition object to the cache after
     * setup. This will be true only if all invocations to get a raw
     * definition object are also optimized. This does not cause file
     * system thrashing because on subsequent calls the cached object
     * is used and any writes to the raw definition object are short
     * circuited. See enduser-customize.html for the high-level
     * picture.
     */
    public $optimized = null;

    /**
     * What type of definition is it?
     */
    public $type;

    /**
     * Sets up the definition object into the final form, something
     * not done by the constructor
     * @param $config HTMLPurifier_Config instance
     */
    abstract protected function doSetup($config);

    /**
     * Runs doSetup() at most once per object; later calls do nothing.
     * The setup flag is raised before doSetup() is invoked.
     * @param $config HTMLPurifier_Config instance
     */
    public function setup($config) {
        if (!$this->setup) {
            $this->setup = true;
            $this->doSetup($config);
        }
    }

}
969 * Defines allowed CSS attributes and what their values are.
970 * @see HTMLPurifier_HTMLDefinition
972 class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
975 public $type = 'CSS';
978 * Assoc array of attribute name to definition object.
980 public $info = array();
983 * Constructs the info array. The meat of this class.
985 protected function doSetup($config) {
987 $this->info['text-align'] = new HTMLPurifier_AttrDef_Enum(
988 array('left', 'right', 'center', 'justify'), false);
991 $this->info['border-bottom-style'] =
992 $this->info['border-right-style'] =
993 $this->info['border-left-style'] =
994 $this->info['border-top-style'] = new HTMLPurifier_AttrDef_Enum(
995 array('none', 'hidden', 'dotted', 'dashed', 'solid', 'double',
996 'groove', 'ridge', 'inset', 'outset'), false);
998 $this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_style);
1000 $this->info['clear'] = new HTMLPurifier_AttrDef_Enum(
1001 array('none', 'left', 'right', 'both'), false);
1002 $this->info['float'] = new HTMLPurifier_AttrDef_Enum(
1003 array('none', 'left', 'right'), false);
1004 $this->info['font-style'] = new HTMLPurifier_AttrDef_Enum(
1005 array('normal', 'italic', 'oblique'), false);
1006 $this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum(
1007 array('normal', 'small-caps'), false);
1009 $uri_or_none = new HTMLPurifier_AttrDef_CSS_Composite(
1011 new HTMLPurifier_AttrDef_Enum(array('none')),
1012 new HTMLPurifier_AttrDef_CSS_URI()
1016 $this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum(
1017 array('inside', 'outside'), false);
1018 $this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum(
1019 array('disc', 'circle', 'square', 'decimal', 'lower-roman',
1020 'upper-roman', 'lower-alpha', 'upper-alpha', 'none'), false);
1021 $this->info['list-style-image'] = $uri_or_none;
1023 $this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config);
1025 $this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum(
1026 array('capitalize', 'uppercase', 'lowercase', 'none'), false);
1027 $this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color();
1029 $this->info['background-image'] = $uri_or_none;
1030 $this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum(
1031 array('repeat', 'repeat-x', 'repeat-y', 'no-repeat')
1033 $this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum(
1034 array('scroll', 'fixed')
1036 $this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition();
1039 $this->info['border-top-color'] =
1040 $this->info['border-bottom-color'] =
1041 $this->info['border-left-color'] =
1042 $this->info['border-right-color'] =
1043 $this->info['background-color'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1044 new HTMLPurifier_AttrDef_Enum(array('transparent')),
1045 new HTMLPurifier_AttrDef_CSS_Color()
1048 $this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config);
1050 $this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_color);
1053 $this->info['border-top-width'] =
1054 $this->info['border-bottom-width'] =
1055 $this->info['border-left-width'] =
1056 $this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1057 new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')),
1058 new HTMLPurifier_AttrDef_CSS_Length('0') //disallow negative
1061 $this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_width);
1063 $this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1064 new HTMLPurifier_AttrDef_Enum(array('normal')),
1065 new HTMLPurifier_AttrDef_CSS_Length()
1068 $this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1069 new HTMLPurifier_AttrDef_Enum(array('normal')),
1070 new HTMLPurifier_AttrDef_CSS_Length()
1073 $this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1074 new HTMLPurifier_AttrDef_Enum(array('xx-small', 'x-small',
1075 'small', 'medium', 'large', 'x-large', 'xx-large',
1076 'larger', 'smaller')),
1077 new HTMLPurifier_AttrDef_CSS_Percentage(),
1078 new HTMLPurifier_AttrDef_CSS_Length()
1081 $this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1082 new HTMLPurifier_AttrDef_Enum(array('normal')),
1083 new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives
1084 new HTMLPurifier_AttrDef_CSS_Length('0'),
1085 new HTMLPurifier_AttrDef_CSS_Percentage(true)
1089 $this->info['margin-top'] =
1090 $this->info['margin-bottom'] =
1091 $this->info['margin-left'] =
1092 $this->info['margin-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1093 new HTMLPurifier_AttrDef_CSS_Length(),
1094 new HTMLPurifier_AttrDef_CSS_Percentage(),
1095 new HTMLPurifier_AttrDef_Enum(array('auto'))
1098 $this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($margin);
1102 $this->info['padding-top'] =
1103 $this->info['padding-bottom'] =
1104 $this->info['padding-left'] =
1105 $this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1106 new HTMLPurifier_AttrDef_CSS_Length('0'),
1107 new HTMLPurifier_AttrDef_CSS_Percentage(true)
1110 $this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($padding);
1112 $this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1113 new HTMLPurifier_AttrDef_CSS_Length(),
1114 new HTMLPurifier_AttrDef_CSS_Percentage()
1117 $trusted_wh = new HTMLPurifier_AttrDef_CSS_Composite(array(
1118 new HTMLPurifier_AttrDef_CSS_Length('0'),
1119 new HTMLPurifier_AttrDef_CSS_Percentage(true),
1120 new HTMLPurifier_AttrDef_Enum(array('auto'))
1122 $max = $config->get('CSS.MaxImgLength');
1124 $this->info['width'] =
1125 $this->info['height'] =
1128 new HTMLPurifier_AttrDef_Switch('img',
1130 new HTMLPurifier_AttrDef_CSS_Composite(array(
1131 new HTMLPurifier_AttrDef_CSS_Length('0', $max),
1132 new HTMLPurifier_AttrDef_Enum(array('auto'))
1134 // For everyone else:
1138 $this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();
1140 $this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily();
1142 // this could use specialized code
1143 $this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum(
1144 array('normal', 'bold', 'bolder', 'lighter', '100', '200', '300',
1145 '400', '500', '600', '700', '800', '900'), false);
1147 // MUST be called after other font properties, as it references
1148 // a CSSDefinition object
1149 $this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config);
1152 $this->info['border'] =
1153 $this->info['border-bottom'] =
1154 $this->info['border-top'] =
1155 $this->info['border-left'] =
1156 $this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config);
1158 $this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(array(
1159 'collapse', 'separate'));
1161 $this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(array(
1164 $this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(array(
1167 $this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1168 new HTMLPurifier_AttrDef_Enum(array('baseline', 'sub', 'super',
1169 'top', 'text-top', 'middle', 'bottom', 'text-bottom')),
1170 new HTMLPurifier_AttrDef_CSS_Length(),
1171 new HTMLPurifier_AttrDef_CSS_Percentage()
1174 $this->info['border-spacing'] = new HTMLPurifier_AttrDef_CSS_Multiple(new HTMLPurifier_AttrDef_CSS_Length(), 2);
1177 $this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(array('nowrap'));
1179 if ($config->get('CSS.Proprietary')) {
1180 $this->doSetupProprietary($config);
1183 if ($config->get('CSS.AllowTricky')) {
1184 $this->doSetupTricky($config);
1187 if ($config->get('CSS.Trusted')) {
1188 $this->doSetupTrusted($config);
1191 $allow_important = $config->get('CSS.AllowImportant');
1192 // wrap all attr-defs with decorator that handles !important
1193 foreach ($this->info as $k => $v) {
1194 $this->info[$k] = new HTMLPurifier_AttrDef_CSS_ImportantDecorator($v, $allow_important);
1197 $this->setupConfigStuff($config);
/**
 * Registers proprietary and poorly supported CSS properties in $this->info.
 * Invoked from the main setup routine when CSS.Proprietary is enabled.
 *
 * @param $config Configuration object forwarded by the caller; not read here
 */
protected function doSetupProprietary($config)
{
    // Internet Explorer only scrollbar colors
    $scrollbar_properties = array(
        'scrollbar-arrow-color',
        'scrollbar-base-color',
        'scrollbar-darkshadow-color',
        'scrollbar-face-color',
        'scrollbar-highlight-color',
        'scrollbar-shadow-color',
    );
    foreach ($scrollbar_properties as $property) {
        // each property gets its own validator instance
        $this->info[$property] = new HTMLPurifier_AttrDef_CSS_Color();
    }

    // technically not proprietary, but CSS3, and no one supports it
    $opacity_properties = array('opacity', '-moz-opacity', '-khtml-opacity');
    foreach ($opacity_properties as $property) {
        $this->info[$property] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
    }

    // only opacity, for now
    $this->info['filter'] = new HTMLPurifier_AttrDef_CSS_Filter();
}
/**
 * Registers the "tricky" layout-affecting CSS properties (display,
 * visibility, overflow) in $this->info. Invoked from the main setup
 * routine when CSS.AllowTricky is enabled.
 *
 * @param $config Configuration object forwarded by the caller; not read here
 */
protected function doSetupTricky($config)
{
    $display_values = array(
        'inline', 'block', 'list-item', 'run-in', 'compact',
        'marker', 'table', 'inline-table', 'table-row-group',
        'table-header-group', 'table-footer-group', 'table-row',
        'table-column-group', 'table-column', 'table-cell', 'table-caption', 'none'
    );
    $this->info['display'] = new HTMLPurifier_AttrDef_Enum($display_values);

    $visibility_values = array('visible', 'hidden', 'collapse');
    $this->info['visibility'] = new HTMLPurifier_AttrDef_Enum($visibility_values);

    $overflow_values = array('visible', 'hidden', 'auto', 'scroll');
    $this->info['overflow'] = new HTMLPurifier_AttrDef_Enum($overflow_values);
}
/**
 * Registers positioning-related CSS properties (position, the four
 * offsets, z-index) in $this->info. Invoked from the main setup routine
 * when CSS.Trusted is enabled.
 *
 * @param $config Configuration object forwarded by the caller; not read here
 */
protected function doSetupTrusted($config)
{
    $position_values = array('static', 'relative', 'absolute', 'fixed');
    $this->info['position'] = new HTMLPurifier_AttrDef_Enum($position_values);

    // A single validator (length, percentage or 'auto') is shared by all
    // four offset properties.
    $offset = new HTMLPurifier_AttrDef_CSS_Composite(array(
        new HTMLPurifier_AttrDef_CSS_Length(),
        new HTMLPurifier_AttrDef_CSS_Percentage(),
        new HTMLPurifier_AttrDef_Enum(array('auto')),
    ));
    $this->info['top'] =
    $this->info['left'] =
    $this->info['right'] =
    $this->info['bottom'] = $offset;

    // z-index: an integer or 'auto'
    $stacking = new HTMLPurifier_AttrDef_CSS_Composite(array(
        new HTMLPurifier_AttrDef_Integer(),
        new HTMLPurifier_AttrDef_Enum(array('auto')),
    ));
    $this->info['z-index'] = $stacking;
}
1251 * Performs extra config-based processing. Based off of
1252 * HTMLPurifier_HTMLDefinition.
1253 * @todo Refactor duplicate elements into common class (probably using
1254 * composition, not inheritance).
1256 protected function setupConfigStuff($config) {
1258 // setup allowed elements
1259 $support = "(for information on implementing this, see the ".
1261 $allowed_properties = $config->get('CSS.AllowedProperties');
1262 if ($allowed_properties !== null) {
1263 foreach ($this->info as $name => $d) {
1264 if(!isset($allowed_properties[$name])) unset($this->info[$name]);
1265 unset($allowed_properties[$name]);
1268 foreach ($allowed_properties as $name => $d) {
1269 // :TODO: Is this htmlspecialchars() call really necessary?
1270 $name = htmlspecialchars($name);
1271 trigger_error("Style attribute '$name' is not supported $support", E_USER_WARNING);
1275 $forbidden_properties = $config->get('CSS.ForbiddenProperties');
1276 if ($forbidden_properties !== null) {
1277 foreach ($this->info as $name => $d) {
1278 if (isset($forbidden_properties[$name])) {
1279 unset($this->info[$name]);
1292 * Defines allowed child nodes and validates tokens against it.
1294 abstract class HTMLPurifier_ChildDef
1297 * Type of child definition, usually right-most part of class name lowercase.
1298 * Used occasionally in terms of context.
1303 * Bool that indicates whether or not an empty array of children is okay
1305 * This is necessary for redundant checking when changes affecting
1306 * a child node may cause a parent node to now be disallowed.
1308 public $allow_empty;
1311 * Lookup array of all elements that this definition could possibly allow
1313 public $elements = array();
/**
 * Get lookup of tag names that should not close this element automatically.
 * All other elements will do so.
 *
 * @param $config Configuration object; not consulted by this implementation
 * @return The $elements lookup array of this definition
 */
public function getAllowedElements($config)
{
    $lookup = $this->elements;
    return $lookup;
}
1324 * Validates nodes according to definition and returns modification.
1326 * @param $tokens_of_children Array of HTMLPurifier_Token
1327 * @param $config HTMLPurifier_Config object
1328 * @param $context HTMLPurifier_Context object
1329 * @return bool true to leave nodes as is
1330 * @return bool false to remove parent node
1331 * @return array of replacement child tokens
1333 abstract public function validateChildren($tokens_of_children, $config, $context);
1341 * Configuration object that triggers customizable behavior.
1343 * @warning This class is strongly defined: that means that the class
1344 * will fail if an undefined directive is retrieved or set.
1346 * @note Many classes that could (although many times don't) use the
1347 * configuration object make it a mandatory parameter. This is
1348 * because a configuration object should always be forwarded,
1349 * otherwise, you run the risk of missing a parameter and then
1350 * being stumped when a configuration directive doesn't work.
1352 * @todo Reconsider some of the public member variables
1354 class HTMLPurifier_Config
1358 * HTML Purifier's version
1360 public $version = '4.3.0';
1363 * Bool indicator whether or not to automatically finalize
1364 * the object if a read operation is done
1366 public $autoFinalize = true;
1368 // protected member variables
1371 * Namespace indexed array of serials for specific namespaces (see
1372 * getSerial() for more info).
1374 protected $serials = array();
1377 * Serial for entire configuration object
1382 * Parser for variables
1387 * Reference HTMLPurifier_ConfigSchema for value checking
1388 * @note This is public for introspective purposes. Please don't
1394 * Indexed array of definitions
1396 protected $definitions;
1399 * Bool indicator whether or not config is finalized
1401 protected $finalized = false;
1404 * Property list containing configuration directives.
1409 * Whether or not a set is taking place due to an
1415 * Set to false if you do not want line and file numbers in errors
1416 * (useful when unit testing). This will also compress some errors
1419 public $chatty = true;
1422 * Current lock; only gets to this namespace are allowed.
/**
 * @param $definition HTMLPurifier_ConfigSchema that defines the directives;
 *                    a reference is kept in $this->def for checking.
 * @param $parent     Optional parent handed to the new property list. When
 *                    it is omitted (or otherwise falsy) the schema's
 *                    defaultPlist is used instead.
 */
public function __construct($definition, $parent = null)
{
    if (!$parent) {
        // no explicit parent: fall back to the schema defaults
        $parent = $definition->defaultPlist;
    }
    $this->plist  = new HTMLPurifier_PropertyList($parent);
    $this->def    = $definition; // keep a copy around for checking
    $this->parser = new HTMLPurifier_VarParser_Flexible();
}
1438 * Convenience constructor that creates a config object based on a mixed var
1439 * @param mixed $config Variable that defines the state of the config
1440 * object. Can be: a HTMLPurifier_Config() object,
1441 * an array of directives based on loadArray(),
1442 * or a string filename of an ini file.
1443 * @param HTMLPurifier_ConfigSchema Schema object
1444 * @return Configured HTMLPurifier_Config object
1446 public static function create($config, $schema = null) {
1447 if ($config instanceof HTMLPurifier_Config) {
1452 $ret = HTMLPurifier_Config::createDefault();
1454 $ret = new HTMLPurifier_Config($schema);
1456 if (is_string($config)) $ret->loadIni($config);
1457 elseif (is_array($config)) $ret->loadArray($config);
/**
 * Creates a new config object that inherits from a previous one.
 *
 * The new object shares the schema of $config and uses the property list
 * of $config as the parent of its own property list.
 *
 * @param HTMLPurifier_Config $config Configuration object to inherit from
 * @return HTMLPurifier_Config object with $config as its parent.
 */
public static function inherit(HTMLPurifier_Config $config)
{
    $schema = $config->def;
    $parent_plist = $config->plist;
    return new HTMLPurifier_Config($schema, $parent_plist);
}
/**
 * Convenience constructor that creates a default configuration object,
 * i.e. one built on the application-wide HTMLPurifier_ConfigSchema instance.
 *
 * @return Default HTMLPurifier_Config object.
 */
public static function createDefault()
{
    return new HTMLPurifier_Config(HTMLPurifier_ConfigSchema::instance());
}
1482 * Retrieves a value from the configuration.
1483 * @param $key String key
1485 public function get($key, $a = null) {
1487 $this->triggerError("Using deprecated API: use \$config->get('$key.$a') instead", E_USER_WARNING);
1490 if (!$this->finalized) $this->autoFinalize();
1491 if (!isset($this->def->info[$key])) {
1492 // can't add % due to SimpleTest bug
1493 $this->triggerError('Cannot retrieve value of undefined directive ' . htmlspecialchars($key),
1497 if (isset($this->def->info[$key]->isAlias)) {
1498 $d = $this->def->info[$key];
1499 $this->triggerError('Cannot get value from aliased directive, use real name ' . $d->key,
1504 list($ns) = explode('.', $key);
1505 if ($ns !== $this->lock) {
1506 $this->triggerError('Cannot get value of namespace ' . $ns . ' when lock for ' . $this->lock . ' is active, this probably indicates a Definition setup method is accessing directives that are not within its namespace', E_USER_ERROR);
1510 return $this->plist->get($key);
1514 * Retrieves an array of directives to values from a given namespace
1515 * @param $namespace String namespace
1517 public function getBatch($namespace) {
1518 if (!$this->finalized) $this->autoFinalize();
1519 $full = $this->getAll();
1520 if (!isset($full[$namespace])) {
1521 $this->triggerError('Cannot retrieve undefined namespace ' . htmlspecialchars($namespace),
1525 return $full[$namespace];
/**
 * Returns a md5 signature of a segment of the configuration object
 * that uniquely identifies that particular configuration
 *
 * The signature is memoized per namespace in $this->serials.
 *
 * @note Revision is handled specially and is removed from the batch
 *       before processing!
 * @param $namespace Namespace to get serial for
 * @return md5 string of the serialized namespace batch
 */
public function getBatchSerial($namespace)
{
    if (!empty($this->serials[$namespace])) {
        // already computed since the last change to this namespace
        return $this->serials[$namespace];
    }
    $directives = $this->getBatch($namespace);
    unset($directives['DefinitionRev']);
    $this->serials[$namespace] = md5(serialize($directives));
    return $this->serials[$namespace];
}
/**
 * Returns a md5 signature for the entire configuration object
 * that uniquely identifies that particular configuration
 *
 * The signature is computed on first use and kept in $this->serial.
 *
 * @return md5 string of the serialized result of getAll()
 */
public function getSerial()
{
    if (!empty($this->serial)) {
        return $this->serial;
    }
    $everything = $this->getAll();
    $this->serial = md5(serialize($everything));
    return $this->serial;
}
/**
 * Retrieves all directives, organized by namespace
 *
 * @warning This is a pretty inefficient function, avoid if you can
 * @return Two-level array: namespace => (directive => value)
 */
public function getAll()
{
    if (!$this->finalized) {
        $this->autoFinalize();
    }
    $by_namespace = array();
    foreach ($this->plist->squash() as $full_name => $setting) {
        // split "Namespace.Directive" on the first dot only
        $pieces = explode('.', $full_name, 2);
        $by_namespace[$pieces[0]][$pieces[1]] = $setting;
    }
    return $by_namespace;
}
1570 * Sets a value to configuration.
1571 * @param $key String key
1572 * @param $value Mixed value
1574 public function set($key, $value, $a = null) {
1575 if (strpos($key, '.') === false) {
1577 $directive = $value;
1579 $key = "$key.$directive";
1580 $this->triggerError("Using deprecated API: use \$config->set('$key', ...) instead", E_USER_NOTICE);
1582 list($namespace) = explode('.', $key);
1584 if ($this->isFinalized('Cannot set directive after finalization')) return;
1585 if (!isset($this->def->info[$key])) {
1586 $this->triggerError('Cannot set undefined directive ' . htmlspecialchars($key) . ' to value',
1590 $def = $this->def->info[$key];
1592 if (isset($def->isAlias)) {
1593 if ($this->aliasMode) {
1594 $this->triggerError('Double-aliases not allowed, please fix '.
1595 'ConfigSchema bug with' . $key, E_USER_ERROR);
1598 $this->aliasMode = true;
1599 $this->set($def->key, $value);
1600 $this->aliasMode = false;
1601 $this->triggerError("$key is an alias, preferred directive name is {$def->key}", E_USER_NOTICE);
1605 // Raw type might be negative when using the fully optimized form
1606 // of stdclass, which indicates allow_null == true
1607 $rtype = is_int($def) ? $def : $def->type;
1613 $allow_null = isset($def->allow_null);
1617 $value = $this->parser->parse($value, $type, $allow_null);
1618 } catch (HTMLPurifier_VarParserException $e) {
1619 $this->triggerError('Value for ' . $key . ' is of invalid type, should be ' . HTMLPurifier_VarParser::getTypeName($type), E_USER_WARNING);
1622 if (is_string($value) && is_object($def)) {
1623 // resolve value alias if defined
1624 if (isset($def->aliases[$value])) {
1625 $value = $def->aliases[$value];
1627 // check to see if the value is allowed
1628 if (isset($def->allowed) && !isset($def->allowed[$value])) {
1629 $this->triggerError('Value not supported, valid values are: ' .
1630 $this->_listify($def->allowed), E_USER_WARNING);
1634 $this->plist->set($key, $value);
1636 // reset definitions if the directives they depend on changed
1637 // this is a very costly process, so it's discouraged
1638 // with finalization
1639 if ($namespace == 'HTML' || $namespace == 'CSS' || $namespace == 'URI') {
1640 $this->definitions[$namespace] = null;
1643 $this->serials[$namespace] = false;
/**
 * Convenience function for error reporting
 *
 * @param $lookup Lookup array whose keys are the values to list
 * @return String of the keys of $lookup, joined with ", "
 */
private function _listify($lookup)
{
    return implode(', ', array_keys($lookup));
}
/**
 * Retrieves object reference to the HTML definition.
 *
 * Thin wrapper around getDefinition() with the type fixed to 'HTML'.
 *
 * @param $raw Return a copy that has not been setup yet. Must be
 *             called before it's been setup, otherwise won't work.
 * @param $optimized If true, this method may return null, to
 *             indicate that a cached version of the modified
 *             definition object is available and no further edits
 *             are necessary. Consider using
 *             maybeGetRawHTMLDefinition, which is more explicitly
 *             named.
 */
public function getHTMLDefinition($raw = false, $optimized = false)
{
    $type = 'HTML';
    return $this->getDefinition($type, $raw, $optimized);
}
/**
 * Retrieves object reference to the CSS definition
 *
 * Thin wrapper around getDefinition() with the type fixed to 'CSS'.
 *
 * @param $raw Return a copy that has not been setup yet. Must be
 *             called before it's been setup, otherwise won't work.
 * @param $optimized If true, this method may return null, to
 *             indicate that a cached version of the modified
 *             definition object is available and no further edits
 *             are necessary. Consider using
 *             maybeGetRawCSSDefinition, which is more explicitly
 *             named.
 */
public function getCSSDefinition($raw = false, $optimized = false)
{
    $type = 'CSS';
    return $this->getDefinition($type, $raw, $optimized);
}
/**
 * Retrieves object reference to the URI definition
 *
 * Thin wrapper around getDefinition() with the type fixed to 'URI'.
 *
 * @param $raw Return a copy that has not been setup yet. Must be
 *             called before it's been setup, otherwise won't work.
 * @param $optimized If true, this method may return null, to
 *             indicate that a cached version of the modified
 *             definition object is available and no further edits
 *             are necessary. Consider using
 *             maybeGetRawURIDefinition, which is more explicitly
 *             named.
 */
public function getURIDefinition($raw = false, $optimized = false)
{
    $type = 'URI';
    return $this->getDefinition($type, $raw, $optimized);
}
1701 * Retrieves a definition
1702 * @param $type Type of definition: HTML, CSS, etc
1703 * @param $raw Whether or not definition should be returned raw
1704 * @param $optimized Only has an effect when $raw is true. Whether
1705 * or not to return null if the result is already present in
1706 * the cache. This is off by default for backwards
1707 * compatibility reasons, but you need to do things this
1708 * way in order to ensure that caching is done properly.
1709 * Check out enduser-customize.html for more details.
1710 * We probably won't ever change this default, as much as the
1711 * maybe semantics is the "right thing to do."
1713 public function getDefinition($type, $raw = false, $optimized = false) {
1714 if ($optimized && !$raw) {
1715 throw new HTMLPurifier_Exception("Cannot set optimized = true when raw = false");
1717 if (!$this->finalized) $this->autoFinalize();
1718 // temporarily suspend locks, so we can handle recursive definition calls
1719 $lock = $this->lock;
1721 $factory = HTMLPurifier_DefinitionCacheFactory::instance();
1722 $cache = $factory->create($type, $this);
1723 $this->lock = $lock;
1727 // check if definition is in memory
1728 if (!empty($this->definitions[$type])) {
1729 $def = $this->definitions[$type];
1730 // check if the definition is setup
1735 if ($def->optimized) $cache->add($def, $this);
1739 // check if definition is in cache
1740 $def = $cache->get($this);
1742 // definition in cache, save to memory and return it
1743 $this->definitions[$type] = $def;
1747 $def = $this->initDefinition($type);
1749 $this->lock = $type;
1753 $cache->add($def, $this);
1759 // check preconditions
1762 if (is_null($this->get($type . '.DefinitionID'))) {
1763 // fatally error out if definition ID not set
1764 throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
1767 if (!empty($this->definitions[$type])) {
1768 $def = $this->definitions[$type];
1769 if ($def->setup && !$optimized) {
1770 $extra = $this->chatty ? " (try moving this code block earlier in your initialization)" : "";
1771 throw new HTMLPurifier_Exception("Cannot retrieve raw definition after it has already been setup" . $extra);
1773 if ($def->optimized === null) {
1774 $extra = $this->chatty ? " (try flushing your cache)" : "";
1775 throw new HTMLPurifier_Exception("Optimization status of definition is unknown" . $extra);
1777 if ($def->optimized !== $optimized) {
1778 $msg = $optimized ? "optimized" : "unoptimized";
1779 $extra = $this->chatty ? " (this backtrace is for the first inconsistent call, which was for a $msg raw definition)" : "";
1780 throw new HTMLPurifier_Exception("Inconsistent use of optimized and unoptimized raw definition retrievals" . $extra);
1783 // check if definition was in memory
1786 // invariant: $optimized === true (checked above)
1792 // if optimized, check if definition was in cache
1793 // (because we do the memory check first, this formulation
1794 // is prone to cache slamming, but I think
1795 // guaranteeing that either /all/ of the raw
1796 // setup code or /none/ of it is run is more important.)
1798 // This code path only gets run once; once we put
1799 // something in $definitions (which is guaranteed by the
1800 // trailing code), we always short-circuit above.
1801 $def = $cache->get($this);
1803 // save the full definition for later, but don't
1805 $this->definitions[$type] = $def;
1809 // check invariants for creation
1811 if (!is_null($this->get($type . '.DefinitionID'))) {
1812 if ($this->chatty) {
1813 $this->triggerError("Due to a documentation error in previous version of HTML Purifier, your definitions are not being cached. If this is OK, you can remove the %$type.DefinitionRev and %$type.DefinitionID declaration. Otherwise, modify your code to use maybeGetRawDefinition, and test if the returned value is null before making any edits (if it is null, that means that a cached version is available, and no raw operations are necessary). See <a href='http://htmlpurifier.org/docs/enduser-customize.html#optimized'>Customize</a> for more details", E_USER_WARNING);
1815 $this->triggerError("Useless DefinitionID declaration", E_USER_WARNING);
1820 $def = $this->initDefinition($type);
1821 $def->optimized = $optimized;
1824 throw new HTMLPurifier_Exception("The impossible happened!");
/**
 * Creates a fresh definition object of the requested type and stores it
 * in $this->definitions under that type.
 *
 * @param $type Type of definition: 'HTML', 'CSS' or 'URI'
 * @return The newly created definition object
 * @throws HTMLPurifier_Exception when $type is none of the supported types
 */
private function initDefinition($type)
{
    // quick checks failed, let's create the object
    switch ($type) {
        case 'HTML':
            $def = new HTMLPurifier_HTMLDefinition();
            break;
        case 'CSS':
            $def = new HTMLPurifier_CSSDefinition();
            break;
        case 'URI':
            $def = new HTMLPurifier_URIDefinition();
            break;
        default:
            throw new HTMLPurifier_Exception("Definition of $type type not supported");
    }
    $this->definitions[$type] = $def;
    return $def;
}
/**
 * Requests the raw definition named $name in optimized mode, i.e. calls
 * getDefinition() with both $raw and $optimized set to true.
 *
 * @param $name Type of definition: HTML, CSS, etc
 * @return Whatever getDefinition() returns for a raw, optimized request
 */
public function maybeGetRawDefinition($name)
{
    $raw = true;
    $optimized = true;
    return $this->getDefinition($name, $raw, $optimized);
}
/**
 * Requests the raw HTML definition in optimized mode, i.e. calls
 * getDefinition('HTML') with both $raw and $optimized set to true.
 *
 * @return Whatever getDefinition() returns for a raw, optimized request
 */
public function maybeGetRawHTMLDefinition()
{
    $raw = true;
    $optimized = true;
    return $this->getDefinition('HTML', $raw, $optimized);
}
/**
 * Requests the raw CSS definition in optimized mode, i.e. calls
 * getDefinition('CSS') with both $raw and $optimized set to true.
 *
 * @return Whatever getDefinition() returns for a raw, optimized request
 */
public function maybeGetRawCSSDefinition()
{
    $raw = true;
    $optimized = true;
    return $this->getDefinition('CSS', $raw, $optimized);
}
/**
 * Requests the raw URI definition in optimized mode, i.e. calls
 * getDefinition('URI') with both $raw and $optimized set to true.
 *
 * @return Whatever getDefinition() returns for a raw, optimized request
 */
public function maybeGetRawURIDefinition()
{
    $raw = true;
    $optimized = true;
    return $this->getDefinition('URI', $raw, $optimized);
}
/**
 * Loads configuration values from an array with the following structure:
 * Namespace.Directive => Value
 *
 * Underscores in keys are converted to dots first. A key that still
 * contains no dot is taken as a namespace whose value is an array of
 * Directive => Value pairs.
 *
 * @param $config_array Configuration associative array
 */
public function loadArray($config_array)
{
    if ($this->isFinalized('Cannot load directives after finalization')) {
        return;
    }
    foreach ($config_array as $raw_key => $entry) {
        $name = str_replace('_', '.', $raw_key);
        if (strpos($name, '.') !== false) {
            // fully qualified directive name
            $this->set($name, $entry);
            continue;
        }
        // bare namespace: $entry maps directive names to values
        foreach ($entry as $directive => $setting) {
            $this->set($name .'.'. $directive, $setting);
        }
    }
}
1880 * Returns a list of array(namespace, directive) for all directives
1881 * that are allowed in a web-form context as per an allowed
1882 * namespaces/directives list.
1883 * @param $allowed List of allowed namespaces/directives
1885 public static function getAllowedDirectivesForForm($allowed, $schema = null) {
1887 $schema = HTMLPurifier_ConfigSchema::instance();
1889 if ($allowed !== true) {
1890 if (is_string($allowed)) $allowed = array($allowed);
1891 $allowed_ns = array();
1892 $allowed_directives = array();
1893 $blacklisted_directives = array();
1894 foreach ($allowed as $ns_or_directive) {
1895 if (strpos($ns_or_directive, '.') !== false) {
1897 if ($ns_or_directive[0] == '-') {
1898 $blacklisted_directives[substr($ns_or_directive, 1)] = true;
1900 $allowed_directives[$ns_or_directive] = true;
1904 $allowed_ns[$ns_or_directive] = true;
1909 foreach ($schema->info as $key => $def) {
1910 list($ns, $directive) = explode('.', $key, 2);
1911 if ($allowed !== true) {
1912 if (isset($blacklisted_directives["$ns.$directive"])) continue;
1913 if (!isset($allowed_directives["$ns.$directive"]) && !isset($allowed_ns[$ns])) continue;
1915 if (isset($def->isAlias)) continue;
1916 if ($directive == 'DefinitionID' || $directive == 'DefinitionRev') continue;
1917 $ret[] = array($ns, $directive);
/**
 * Loads configuration values from $_GET/$_POST that were posted from a
 * form, and builds a configuration object from them.
 *
 * The form data is first normalised by prepareArrayFromForm() and the
 * result is handed to create().
 *
 * @param $array $_GET or $_POST array to import
 * @param $index Index/name that the config variables are in
 * @param $allowed List of allowed namespaces/directives
 * @param $mq_fix Boolean whether or not to enable magic quotes fix
 * @param $schema Instance of HTMLPurifier_ConfigSchema to use, if not global copy
 * @return HTMLPurifier_Config object produced by create()
 */
public static function loadArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null)
{
    $directives = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $schema);
    return HTMLPurifier_Config::create($directives, $schema);
}
/**
 * Merges in configuration values from $_GET/$_POST to object. NOT STATIC.
 *
 * The form data is normalised by prepareArrayFromForm() against this
 * object's own schema ($this->def) and then applied with loadArray().
 *
 * @note Same parameters as loadArrayFromForm
 */
public function mergeArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true)
{
    $schema = $this->def;
    $directives = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $schema);
    $this->loadArray($directives);
}
1947 * Prepares an array from a form into something usable for the more
1948 * strict parts of HTMLPurifier_Config
1950 public static function prepareArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
1951 if ($index !== false) $array = (isset($array[$index]) && is_array($array[$index])) ? $array[$index] : array();
1952 $mq = $mq_fix && function_exists('get_magic_quotes_gpc') && get_magic_quotes_gpc();
1954 $allowed = HTMLPurifier_Config::getAllowedDirectivesForForm($allowed, $schema);
1956 foreach ($allowed as $key) {
1957 list($ns, $directive) = $key;
1958 $skey = "$ns.$directive";
1959 if (!empty($array["Null_$skey"])) {
1960 $ret[$ns][$directive] = null;
1963 if (!isset($array[$skey])) continue;
1964 $value = $mq ? stripslashes($array[$skey]) : $array[$skey];
1965 $ret[$ns][$directive] = $value;
/**
 * Loads configuration values from an ini file
 *
 * The file is parsed with sections enabled and the result is applied via
 * loadArray(). Nothing happens (apart from the error raised by
 * isFinalized()) when the configuration is already finalized.
 *
 * @param $filename Name of ini file
 */
public function loadIni($filename)
{
    if ($this->isFinalized('Cannot load directives after finalization')) {
        return;
    }
    $array = parse_ini_file($filename, true);
    if (!is_array($array)) {
        // parse_ini_file() returns false on failure; passing that on would
        // only surface as an unrelated foreach() warning inside loadArray().
        $this->triggerError(
            'Cannot load configuration from ini file ' . htmlspecialchars($filename),
            E_USER_WARNING
        );
        return;
    }
    $this->loadArray($array);
}
/**
 * Checks whether or not the configuration object is finalized.
 *
 * When the object is finalized and an error message was supplied, the
 * message is raised through triggerError() as E_USER_ERROR.
 *
 * @param $error String error message, or false for no error
 * @return The current value of the finalized flag
 */
public function isFinalized($error = false)
{
    if ($this->finalized) {
        if ($error) {
            $this->triggerError($error, E_USER_ERROR);
        }
    }
    return $this->finalized;
}
1992 * Finalizes configuration only if auto finalize is on and not
1995 public function autoFinalize() {
1996 if ($this->autoFinalize) {
1999 $this->plist->squash(true);
/**
 * Finalizes a configuration object, prohibiting further change
 *
 * Sets the finalized flag and discards the variable parser held in
 * $this->parser.
 */
public function finalize()
{
    $this->finalized = true;
    // the parser property is removed from the finalized object
    unset($this->parser);
}
/**
 * Produces a nicely formatted error message by supplying the
 * stack frame information OUTSIDE of HTMLPurifier_Config.
 *
 * When $this->chatty is off, the message is raised unchanged.
 *
 * @param $msg String error message
 * @param $no  Error level passed on to trigger_error(), e.g. E_USER_WARNING
 */
protected function triggerError($msg, $no)
{
    // determine previous stack frame
    $extra = '';
    if ($this->chatty) {
        $trace = debug_backtrace();
        // zip(tail(trace), trace) -- but PHP is not Haskell har har
        for ($i = 0, $c = count($trace); $i < $c - 1; $i++) {
            // Frames of plain functions carry no 'class' key, so it has to
            // be checked before it is compared.
            $caller = $trace[$i + 1];
            if (isset($caller['class']) && $caller['class'] === 'HTMLPurifier_Config') {
                // the call still originates from inside this class
                continue;
            }
            $frame = $trace[$i];
            $extra = " invoked on line {$frame['line']} in file {$frame['file']}";
            break;
        }
    }
    trigger_error($msg . $extra, $no);
}
/**
 * Returns a serialized form of the configuration object.
 *
 * The HTML, CSS and URI definitions are requested (in that order) before
 * the object is passed to serialize() -- presumably so that they are
 * part of the serialized state.
 *
 * @return String produced by serialize($this)
 */
public function serialize()
{
    foreach (array('HTML', 'CSS', 'URI') as $type) {
        $this->getDefinition($type);
    }
    return serialize($this);
}
2051 * Configuration definition, defines directives and their defaults.
2053 class HTMLPurifier_ConfigSchema {
2056 * Defaults of the directives and namespaces.
2057 * @note This shares the exact same structure as HTMLPurifier_Config::$conf
2059 public $defaults = array();
2062 * The default property list. Do not edit this property list.
2064 public $defaultPlist;
2067 * Definition of the directives. The structure of this is:
2070 * 'Namespace' => array(
2071 * 'Directive' => new stdclass(),
2075 * The stdclass may have the following properties:
2077 * - If isAlias isn't set:
2078 * - type: Integer type of directive, see HTMLPurifier_VarParser for definitions
2079 * - allow_null: If set, this directive allows null values
2080 * - aliases: If set, an associative array of value aliases to real values
2081 * - allowed: If set, a lookup array of allowed (string) values
2082 * - If isAlias is set:
2083 * - namespace: Namespace this directive aliases to
2084 * - name: Directive name this directive aliases to
2086 * In certain degenerate cases, stdclass will actually be an integer. In
2087 * that case, the value is equivalent to an stdclass with the type
2088 * property set to the integer. If the integer is negative, type is
2089 * equal to the absolute value of integer, and allow_null is true.
2091 * This class is friendly with HTMLPurifier_Config. If you need introspection
2092 * about the schema, you're better off using the ConfigSchema_Interchange,
2093 * which uses more memory but has much richer information.
2095 public $info = array();
2098 * Application-wide singleton
2100 static protected $singleton;
/**
 * Creates a schema with a fresh, empty default property list.
 */
public function __construct()
{
    $plist = new HTMLPurifier_PropertyList();
    $this->defaultPlist = $plist;
}
2107 * Unserializes the default ConfigSchema.
2109 public static function makeFromSerial() {
2110 $contents = file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser');
2111 $r = unserialize($contents);
2113 $hash = sha1($contents);
2114 trigger_error("Unserialization of configuration schema failed, sha1 of file was $hash", E_USER_ERROR);
/**
 * Retrieves an instance of the application-wide configuration definition.
 *
 * @param $prototype Controls which schema becomes the singleton:
 *          - null (default): keep the current singleton, loading it with
 *            makeFromSerial() if none exists yet;
 *          - true: discard the current singleton and reload it with
 *            makeFromSerial();
 *          - anything else: install that value as the singleton.
 * @return The application-wide singleton
 */
public static function instance($prototype = null)
{
    // The reload request has to be tested before the generic "not null"
    // case, otherwise the literal true would be stored as the singleton.
    if ($prototype === true) {
        HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial();
    } elseif ($prototype !== null) {
        HTMLPurifier_ConfigSchema::$singleton = $prototype;
    } elseif (HTMLPurifier_ConfigSchema::$singleton === null) {
        HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial();
    }
    return HTMLPurifier_ConfigSchema::$singleton;
}
2132 * Defines a directive for configuration
2133 * @warning Will fail if directive's namespace is defined.
2134 * @warning This method's signature is slightly different from the legacy
2135 * define() static method! Beware!
2136 * @param $key Key of the directive, i.e. its index in $this->info and
2137 * $this->defaults
2138 * @param $default Default value of directive
2139 * @param $type Allowed type of the directive: either an integer type or a
2140 * string key into HTMLPurifier_VarParser::$types
2141 * @param $allow_null Whether or not to allow null values
2143 public function add($key, $default, $type, $allow_null) {
2144 $obj = new stdclass();
2145 $obj->type = is_int($type) ? $type : HTMLPurifier_VarParser::$types[$type];
2146 if ($allow_null) $obj->allow_null = true;
2147 $this->info[$key] = $obj;
2148 $this->defaults[$key] = $default;
2149 $this->defaultPlist->set($key, $default);
2153 * Defines a directive value alias.
2155 * Directive value aliases are convenient for developers because it lets
2156 * them set a directive to several values and get the same result.
2157 * @param $key Key of the directive (its index in $this->info)
2159 * @param $aliases Hash of aliased values to the real values
2161 public function addValueAliases($key, $aliases) {
2162 if (!isset($this->info[$key]->aliases)) {
2163 $this->info[$key]->aliases = array();
2165 foreach ($aliases as $alias => $real) {
2166 $this->info[$key]->aliases[$alias] = $real;
2171 * Defines a set of allowed values for a directive.
2172 * @warning This is slightly different from the corresponding static
2173 * method definition.
2174 * @param $key Key of the directive (its index in $this->info)
2176 * @param $allowed Lookup array of allowed values
2178 public function addAllowedValues($key, $allowed) {
2179 $this->info[$key]->allowed = $allowed;
2183 * Defines a directive alias for backwards compatibility
2185 * @param $key Key of the directive that will be aliased
2187 * @param $new_key Key of the directive that the alias will point to
2189 public function addAlias($key, $new_key) {
2190 $obj = new stdclass;
2191 $obj->key = $new_key;
2192 $obj->isAlias = true;
2193 $this->info[$key] = $obj;
2197 * Replaces any stdclass that only has the type property with type integer.
2199 public function postProcess() {
2200 foreach ($this->info as $key => $v) {
2201 if (count((array) $v) == 1) {
2202 $this->info[$key] = $v->type;
2203 } elseif (count((array) $v) == 2 && isset($v->allow_null)) {
2204 $this->info[$key] = -$v->type;
2218 class HTMLPurifier_ContentSets
2222 * List of content set strings (pipe separators) indexed by name.
2224 public $info = array();
2227 * List of content set lookups (element => true) indexed by name.
2228 * @note This is in HTMLPurifier_HTMLDefinition->info_content_sets
2230 public $lookup = array();
2233 * Synchronized list of defined content sets (keys of info)
2235 protected $keys = array();
2237 * Synchronized list of defined content values (values of info)
2239 protected $values = array();
2242 * Merges in module's content sets, expands identifiers in the content
2243 * sets and populates the keys, values and lookup member variables.
2244 * @param $modules List of HTMLPurifier_HTMLModule
2246 public function __construct($modules) {
2247 if (!is_array($modules)) $modules = array($modules);
2248 // populate content_sets based on module hints
2249 // sorry, no way of overloading
2250 foreach ($modules as $module_i => $module) {
2251 foreach ($module->content_sets as $key => $value) {
2252 $temp = $this->convertToLookup($value);
2253 if (isset($this->lookup[$key])) {
2254 // add it into the existing content set
2255 $this->lookup[$key] = array_merge($this->lookup[$key], $temp);
2257 $this->lookup[$key] = $temp;
2261 $old_lookup = false;
2262 while ($old_lookup !== $this->lookup) {
2263 $old_lookup = $this->lookup;
2264 foreach ($this->lookup as $i => $set) {
2266 foreach ($set as $element => $x) {
2267 if (isset($this->lookup[$element])) {
2268 $add += $this->lookup[$element];
2269 unset($this->lookup[$i][$element]);
2272 $this->lookup[$i] += $add;
2276 foreach ($this->lookup as $key => $lookup) {
2277 $this->info[$key] = implode(' | ', array_keys($lookup));
2279 $this->keys = array_keys($this->info);
2280 $this->values = array_values($this->info);
2284 * Accepts a definition; generates and assigns a ChildDef for it
2285 * @param $def HTMLPurifier_ElementDef reference
2286 * @param $module Module that defined the ElementDef
2288 public function generateChildDef(&$def, $module) {
2289 if (!empty($def->child)) return; // already done!
2290 $content_model = $def->content_model;
2291 if (is_string($content_model)) {
2292 // Assume that $this->keys is alphanumeric
2293 $def->content_model = preg_replace_callback(
2294 '/\b(' . implode('|', $this->keys) . ')\b/',
2295 array($this, 'generateChildDefCallback'),
2298 //$def->content_model = str_replace(
2299 // $this->keys, $this->values, $content_model);
2301 $def->child = $this->getChildDef($def, $module);
2304 public function generateChildDefCallback($matches) {
2305 return $this->info[$matches[0]];
2309 * Instantiates a ChildDef based on content_model and content_model_type
2310 * member variables in HTMLPurifier_ElementDef
2311 * @note This will also defer to modules for custom HTMLPurifier_ChildDef
2312 * subclasses that need content set expansion
2313 * @param $def HTMLPurifier_ElementDef to have ChildDef extracted
2314 * @return HTMLPurifier_ChildDef corresponding to ElementDef
2316 public function getChildDef($def, $module) {
2317 $value = $def->content_model;
2318 if (is_object($value)) {
2320 'Literal object child definitions should be stored in '.
2321 'ElementDef->child not ElementDef->content_model',
2326 switch ($def->content_model_type) {
2328 return new HTMLPurifier_ChildDef_Required($value);
2330 return new HTMLPurifier_ChildDef_Optional($value);
2332 return new HTMLPurifier_ChildDef_Empty();
2334 return new HTMLPurifier_ChildDef_Custom($value);
2336 // defer to its module
2338 if ($module->defines_child_def) { // save a func call
2339 $return = $module->getChildDef($def);
2341 if ($return !== false) return $return;
2344 'Could not determine which ChildDef class to instantiate',
2351 * Converts a string list of elements separated by pipes into a lookup array.
2353 * @param $string List of elements
2354 * @return Lookup array of elements
2356 protected function convertToLookup($string) {
2357 $array = explode('|', str_replace(' ', '', $string));
2359 foreach ($array as $i => $k) {
2372 * Registry object that contains information about the current context.
2373 * @warning Is a bit buggy when variables are set to null: it thinks
2374 * they don't exist! So use false instead, please.
2375 * @note Since the variables Context deals with may not be objects,
2376 * references are very important here! Do not remove!
2378 class HTMLPurifier_Context
2382 * Private array that stores the references.
2384 private $_storage = array();
2387 * Registers a variable into the context.
2388 * @param $name String name
2389 * @param $ref Reference to variable to be registered
2391 public function register($name, &$ref) {
2392 if (isset($this->_storage[$name])) {
2393 trigger_error("Name $name produces collision, cannot re-register",
2397 $this->_storage[$name] =& $ref;
2401 * Retrieves a variable reference from the context.
2402 * @param $name String name
2403 * @param $ignore_error Boolean whether or not to ignore error
2405 public function &get($name, $ignore_error = false) {
2406 if (!isset($this->_storage[$name])) {
2407 if (!$ignore_error) {
2408 trigger_error("Attempted to retrieve non-existent variable $name",
2411 $var = null; // so we can return by reference
2414 return $this->_storage[$name];
2418 * Destroys a variable in the context.
2419 * @param $name String name
2421 public function destroy($name) {
2422 if (!isset($this->_storage[$name])) {
2423 trigger_error("Attempted to destroy non-existent variable $name",
2427 unset($this->_storage[$name]);
2431 * Checks whether or not the variable exists.
2432 * @param $name String name
2434 public function exists($name) {
2435 return isset($this->_storage[$name]);
2439 * Loads a series of variables from an associative array
2440 * @param $context_array Assoc array of variables to load
2442 public function loadArray($context_array) {
2443 foreach ($context_array as $key => $discard) {
2444 $this->register($key, $context_array[$key]);
2455 * Abstract class representing Definition cache managers that implements
2456 * useful common methods and is a factory.
2457 * @todo Create a separate maintenance file advanced users can use to
2458 * cache their custom HTMLDefinition, which can be loaded
2459 * via a configuration directive
2460 * @todo Implement memcached
2462 abstract class HTMLPurifier_DefinitionCache
2468 * @param $type Type of definition objects this instance of the
2469 * cache will handle.
2471 public function __construct($type) {
2472 $this->type = $type;
2476 * Generates a unique identifier for a particular configuration
2477 * @param $config Instance of HTMLPurifier_Config
2479 public function generateKey($config) {
2480 return $config->version . ',' . // possibly replace with function calls
2481 $config->getBatchSerial($this->type) . ',' .
2482 $config->get($this->type . '.DefinitionRev');
2486 * Tests whether or not a key is old with respect to the configuration's
2487 * version and revision number.
2488 * @param $key Key to test
2489 * @param $config Instance of HTMLPurifier_Config to test against
2491 public function isOld($key, $config) {
2492 if (substr_count($key, ',') < 2) return true;
2493 list($version, $hash, $revision) = explode(',', $key, 3);
2494 $compare = version_compare($version, $config->version);
2495 // version mismatch, is always old
2496 if ($compare != 0) return true;
2497 // versions match, ids match, check revision number
2499 $hash == $config->getBatchSerial($this->type) &&
2500 $revision < $config->get($this->type . '.DefinitionRev')
2506 * Checks if a definition's type matches the cache's type
2507 * @note Throws an error on failure
2508 * @param $def Definition object to check
2509 * @return Boolean true if good, false if not
2511 public function checkDefType($def) {
2512 if ($def->type !== $this->type) {
2513 trigger_error("Cannot use definition of type {$def->type} in cache for {$this->type}");
2520 * Adds a definition object to the cache
2522 abstract public function add($def, $config);
2525 * Unconditionally saves a definition object to the cache
2527 abstract public function set($def, $config);
2530 * Replace an object in the cache
2532 abstract public function replace($def, $config);
2535 * Retrieves a definition object from the cache
2537 abstract public function get($config);
2540 * Removes a definition object from the cache
2542 abstract public function remove($config);
2545 * Clears all objects from cache
2547 abstract public function flush($config);
2550 * Clears all expired (older version or revision) objects from cache
2551 * @note Be careful implementing this method as flush. Flush must
2552 * not interfere with other Definition types, and cleanup()
2553 * should not be repeatedly called by userland code.
2555 abstract public function cleanup($config);
2564 * Responsible for creating definition caches.
2566 class HTMLPurifier_DefinitionCacheFactory
2569 protected $caches = array('Serializer' => array());
2570 protected $implementations = array();
2571 protected $decorators = array();
2574 * Initialize default decorators
2576 public function setup() {
2577 $this->addDecorator('Cleanup');
2581 * Retrieves an instance of global definition cache factory.
2583 public static function instance($prototype = null) {
2585 if ($prototype !== null) {
2586 $instance = $prototype;
2587 } elseif ($instance === null || $prototype === true) {
2588 $instance = new HTMLPurifier_DefinitionCacheFactory();
2595 * Registers a new definition cache object
2596 * @param $short Short name of cache object, for reference
2597 * @param $long Full class name of cache object, for construction
2599 public function register($short, $long) {
2600 $this->implementations[$short] = $long;
2604 * Factory method that creates a cache object based on configuration
2605 * @param $type Type of definitions handled by cache
2606 * @param $config Instance of HTMLPurifier_Config
2608 public function create($type, $config) {
2609 $method = $config->get('Cache.DefinitionImpl');
2610 if ($method === null) {
2611 return new HTMLPurifier_DefinitionCache_Null($type);
2613 if (!empty($this->caches[$method][$type])) {
2614 return $this->caches[$method][$type];
2617 isset($this->implementations[$method]) &&
2618 class_exists($class = $this->implementations[$method], false)
2620 $cache = new $class($type);
2622 if ($method != 'Serializer') {
2623 trigger_error("Unrecognized DefinitionCache $method, using Serializer instead", E_USER_WARNING);
2625 $cache = new HTMLPurifier_DefinitionCache_Serializer($type);
2627 foreach ($this->decorators as $decorator) {
2628 $new_cache = $decorator->decorate($cache);
2629 // prevent infinite recursion in PHP 4
2631 $cache = $new_cache;
2633 $this->caches[$method][$type] = $cache;
2634 return $this->caches[$method][$type];
2638 * Registers a decorator to add to all new cache objects
2641 public function addDecorator($decorator) {
2642 if (is_string($decorator)) {
2643 $class = "HTMLPurifier_DefinitionCache_Decorator_$decorator";
2644 $decorator = new $class;
2646 $this->decorators[$decorator->name] = $decorator;
2656 * Represents a document type, contains information on which modules
2657 * need to be loaded.
2658 * @note This class is inspected by Printer_HTMLDefinition->renderDoctype.
2659 * If structure changes, please update that function.
2661 class HTMLPurifier_Doctype
2664 * Full name of doctype
2669 * List of standard modules (string identifiers or literal objects)
2670 * that this doctype uses
2672 public $modules = array();
2675 * List of modules to use for tidying up code
2677 public $tidyModules = array();
2680 * Is the language derived from XML (i.e. XHTML)?
2685 * List of aliases for this doctype
2687 public $aliases = array();
2690 * Public DTD identifier
2695 * System DTD identifier
2699 public function __construct($name = null, $xml = true, $modules = array(),
2700 $tidyModules = array(), $aliases = array(), $dtd_public = null, $dtd_system = null
2702 $this->name = $name;
2704 $this->modules = $modules;
2705 $this->tidyModules = $tidyModules;
2706 $this->aliases = $aliases;
2707 $this->dtdPublic = $dtd_public;
2708 $this->dtdSystem = $dtd_system;
2716 class HTMLPurifier_DoctypeRegistry
2720 * Hash of doctype names to doctype objects
2722 protected $doctypes;
2725 * Lookup table of aliases to real doctype names
2730 * Registers a doctype to the registry
2731 * @note Accepts a fully-formed doctype object, or the
2732 * parameters for constructing a doctype object
2733 * @param $doctype Name of doctype or literal doctype object
2734 * @param $modules Modules doctype will load
2735 * @param $tidy_modules Modules doctype will use for tidying up code
2736 * @param $aliases Alias names for doctype
2737 * @return Editable registered doctype
2739 public function register($doctype, $xml = true, $modules = array(),
2740 $tidy_modules = array(), $aliases = array(), $dtd_public = null, $dtd_system = null
2742 if (!is_array($modules)) $modules = array($modules);
2743 if (!is_array($tidy_modules)) $tidy_modules = array($tidy_modules);
2744 if (!is_array($aliases)) $aliases = array($aliases);
2745 if (!is_object($doctype)) {
2746 $doctype = new HTMLPurifier_Doctype(
2747 $doctype, $xml, $modules, $tidy_modules, $aliases, $dtd_public, $dtd_system
2750 $this->doctypes[$doctype->name] = $doctype;
2751 $name = $doctype->name;
2753 foreach ($doctype->aliases as $alias) {
2754 if (isset($this->doctypes[$alias])) continue;
2755 $this->aliases[$alias] = $name;
2757 // remove old aliases
2758 if (isset($this->aliases[$name])) unset($this->aliases[$name]);
2763 * Retrieves reference to a doctype of a certain name
2764 * @note This function resolves aliases
2765 * @note When possible, use the more fully-featured make()
2766 * @param $doctype Name of doctype
2767 * @return Editable doctype object
2769 public function get($doctype) {
2770 if (isset($this->aliases[$doctype])) $doctype = $this->aliases[$doctype];
2771 if (!isset($this->doctypes[$doctype])) {
2772 trigger_error('Doctype ' . htmlspecialchars($doctype) . ' does not exist', E_USER_ERROR);
2773 $anon = new HTMLPurifier_Doctype($doctype);
2776 return $this->doctypes[$doctype];
2780 * Creates a doctype based on a configuration object,
2781 * will perform initialization on the doctype
2782 * @note Use this function to get a copy of doctype that config
2783 * can hold on to (this is necessary in order to tell
2784 * Generator whether or not the current document is XML).
2787 public function make($config) {
2788 return clone $this->get($this->getDoctypeFromConfig($config));
2792 * Retrieves the doctype from the configuration object
2794 public function getDoctypeFromConfig($config) {
2796 $doctype = $config->get('HTML.Doctype');
2797 if (!empty($doctype)) return $doctype;
2798 $doctype = $config->get('HTML.CustomDoctype');
2799 if (!empty($doctype)) return $doctype;
2800 // backwards-compatibility
2801 if ($config->get('HTML.XHTML')) {
2802 $doctype = 'XHTML 1.0';
2804 $doctype = 'HTML 4.01';
2806 if ($config->get('HTML.Strict')) {
2807 $doctype .= ' Strict';
2809 $doctype .= ' Transitional';
2821 * Structure that stores an HTML element definition. Used by
2822 * HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
2823 * @note This class is inspected by HTMLPurifier_Printer_HTMLDefinition.
2824 * Please update that class too.
2825 * @warning If you add new properties to this class, you MUST update
2826 * the mergeIn() method.
2828 class HTMLPurifier_ElementDef
2832 * Does the definition work by itself, or is it created solely
2833 * for the purpose of merging into another definition?
2835 public $standalone = true;
2838 * Associative array of attribute name to HTMLPurifier_AttrDef
2839 * @note Before being processed by HTMLPurifier_AttrCollections
2840 * when modules are finalized during
2841 * HTMLPurifier_HTMLDefinition->setup(), this array may also
2842 * contain an array at index 0 that indicates which attribute
2843 * collections to load into the full array. It may also
2844 * contain string identifiers in lieu of HTMLPurifier_AttrDef,
2845 * see HTMLPurifier_AttrTypes on how they are expanded during
2846 * HTMLPurifier_HTMLDefinition->setup() processing.
2848 public $attr = array();
2851 * Indexed list of tag's HTMLPurifier_AttrTransform to be done before validation
2853 public $attr_transform_pre = array();
2856 * Indexed list of tag's HTMLPurifier_AttrTransform to be done after validation
2858 public $attr_transform_post = array();
2861 * HTMLPurifier_ChildDef of this tag.
2866 * Abstract string representation of internal ChildDef rules. See
2867 * HTMLPurifier_ContentSets for how this is parsed and then transformed
2868 * into an HTMLPurifier_ChildDef.
2869 * @warning This is a temporary variable that is not available after
2870 * being processed by HTMLDefinition
2872 public $content_model;
2875 * Value of $child->type, used to determine which ChildDef to use,
2876 * used in combination with $content_model.
2877 * @warning This must be lowercase
2878 * @warning This is a temporary variable that is not available after
2879 * being processed by HTMLDefinition
2881 public $content_model_type;
2886 * Does the element have a content model (#PCDATA | Inline)*? This
2887 * is important for chameleon ins and del processing in
2888 * HTMLPurifier_ChildDef_Chameleon. Dynamically set: modules don't
2889 * have to worry about this one.
2891 public $descendants_are_inline = false;
2894 * List of the names of required attributes this element has. Dynamically
2895 * populated by HTMLPurifier_HTMLDefinition::getElement
2897 public $required_attr = array();
2900 * Lookup table of tags excluded from all descendants of this tag.
2901 * @note SGML permits exclusions for all descendants, but this is
2902 * not possible with DTDs or XML Schemas. W3C has elected to
2903 * use complicated compositions of content_models to simulate
2904 * exclusion for children, but we go the simpler, SGML-style
2905 * route of flat-out exclusions, which correctly apply to
2906 * all descendants and not just children. Note that the XHTML
2907 * Modularization Abstract Modules are blithely unaware of such distinctions.
2910 public $excludes = array();
2913 * This tag is explicitly auto-closed by the following tags.
2915 public $autoclose = array();
2918 * If a foreign element is found in this element, test if it is
2919 * allowed by this sub-element; if it is, instead of closing the
2920 * current element, place it inside this element.
2925 * Whether or not this is a formatting element affected by the
2926 * "Active Formatting Elements" algorithm.
2931 * Low-level factory constructor for creating new standalone element defs
2933 public static function create($content_model, $content_model_type, $attr) {
2934 $def = new HTMLPurifier_ElementDef();
2935 $def->content_model = $content_model;
2936 $def->content_model_type = $content_model_type;
2942 * Merges the values of another element definition into this one.
2943 * Values from the new element def take precedence if a value is not mergeable.
2946 public function mergeIn($def) {
2948 // later keys takes precedence
2949 foreach($def->attr as $k => $v) {
2951 // merge in the includes
2952 // sorry, no way to override an include
2953 foreach ($v as $v2) {
2954 $this->attr[0][] = $v2;
2959 if (isset($this->attr[$k])) unset($this->attr[$k]);
2962 $this->attr[$k] = $v;
2964 $this->_mergeAssocArray($this->attr_transform_pre, $def->attr_transform_pre);
2965 $this->_mergeAssocArray($this->attr_transform_post, $def->attr_transform_post);
2966 $this->_mergeAssocArray($this->excludes, $def->excludes);
2968 if(!empty($def->content_model)) {
2969 $this->content_model =
2970 str_replace("#SUPER", $this->content_model, $def->content_model);
2971 $this->child = false;
2973 if(!empty($def->content_model_type)) {
2974 $this->content_model_type = $def->content_model_type;
2975 $this->child = false;
2977 if(!is_null($def->child)) $this->child = $def->child;
2978 if(!is_null($def->formatting)) $this->formatting = $def->formatting;
2979 if($def->descendants_are_inline) $this->descendants_are_inline = $def->descendants_are_inline;
2984 * Merges one array into another, removes values which equal false
2985 * @param $a1 Array by reference that is merged into
2986 * @param $a2 Array that merges into $a1
2988 private function _mergeAssocArray(&$a1, $a2) {
2989 foreach ($a2 as $k => $v) {
2991 if (isset($a1[$k])) unset($a1[$k]);
3005 * A UTF-8 specific character encoder that handles cleaning and transforming.
3006 * @note All functions in this class should be static.
3008 class HTMLPurifier_Encoder
3012 * Constructor throws fatal error if you attempt to instantiate class
3014 private function __construct() {
3015 trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
3019 * Error-handler that mutes errors, alternative to shut-up operator.
3021 public static function muteErrorHandler() {}
3024 * Cleans a UTF-8 string for well-formedness and SGML validity
3026 * It will parse according to UTF-8 and return a valid UTF8 string, with
3027 * non-SGML codepoints excluded.
3029 * @note Just for reference, the non-SGML code points are 0 to 31 and
3030 * 127 to 159, inclusive. However, we allow code points 9, 10
3031 * and 13, which are the tab, line feed and carriage return
3032 * respectively. 128 and above the code points map to multibyte
3033 * UTF-8 representations.
3035 * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
3036 * hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
3037 * LGPL license. Notes on what changed are inside, but in general,
3038 * the original code transformed UTF-8 text into an array of integer
3039 * Unicode codepoints. Understandably, transforming that back to
3040 * a string would be somewhat expensive, so the function was modded to
3041 * directly operate on the string. However, this discourages code
3042 * reuse, and the logic enumerated here would be useful for any
3043 * function that needs to be able to understand UTF-8 characters.
3044 * As of right now, only smart lossless character encoding converters
3045 * would need that, and I'm probably not going to implement them.
3046 * Once again, PHP 6 should solve all our problems.
3048 public static function cleanUTF8($str, $force_php = false) {
3050 // UTF-8 validity is checked since PHP 4.3.5
3051 // This is an optimization: if the string is already valid UTF-8, no
3052 // need to do PHP stuff. 99% of the time, this will be the case.
3053 // The regexp matches the XML char production, as well as excluding
3054 // non-SGML codepoints U+007F to U+009F
3055 if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
3059 $mState = 0; // cached expected number of octets after the current octet
3060 // until the beginning of the next UTF8 character sequence
3061 $mUcs4 = 0; // cached Unicode character
3062 $mBytes = 1; // cached expected number of octets in the current sequence
3064 // original code involved an $out that was an array of Unicode
3065 // codepoints. Instead of having to convert back into UTF-8, we've
3066 // decided to directly append valid UTF-8 characters onto a string
3067 // $out once they're done. $char accumulates raw bytes, while $mUcs4
3068 // turns into the Unicode code point, so there's some redundancy.
3073 $len = strlen($str);
3074 for($i = 0; $i < $len; $i++) {
3075 $in = ord($str{$i});
3076 $char .= $str[$i]; // append byte to char
3078 // When mState is zero we expect either a US-ASCII character
3079 // or a multi-octet sequence.
3080 if (0 == (0x80 & ($in))) {
3081 // US-ASCII, pass straight through.
3082 if (($in <= 31 || $in == 127) &&
3083 !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
3085 // control characters, remove
3092 } elseif (0xC0 == (0xE0 & ($in))) {
3093 // First octet of 2 octet sequence
3095 $mUcs4 = ($mUcs4 & 0x1F) << 6;
3098 } elseif (0xE0 == (0xF0 & ($in))) {
3099 // First octet of 3 octet sequence
3101 $mUcs4 = ($mUcs4 & 0x0F) << 12;
3104 } elseif (0xF0 == (0xF8 & ($in))) {
3105 // First octet of 4 octet sequence
3107 $mUcs4 = ($mUcs4 & 0x07) << 18;
3110 } elseif (0xF8 == (0xFC & ($in))) {
3111 // First octet of 5 octet sequence.
3113 // This is illegal because the encoded codepoint must be either
3115 // (a) not the shortest form or
3116 // (b) outside the Unicode range of 0-0x10FFFF.
3117 // Rather than trying to resynchronize, we will carry on
3118 // until the end of the sequence and let the later error
3119 // handling code catch it.
3121 $mUcs4 = ($mUcs4 & 0x03) << 24;
3124 } elseif (0xFC == (0xFE & ($in))) {
3125 // First octet of 6 octet sequence, see comments for 5
3128 $mUcs4 = ($mUcs4 & 1) << 30;
3132 // Current octet is neither in the US-ASCII range nor a
3133 // legal first octet of a multi-octet sequence.
3140 // When mState is non-zero, we expect a continuation of the
3141 // multi-octet sequence
3142 if (0x80 == (0xC0 & ($in))) {
3143 // Legal continuation.
3144 $shift = ($mState - 1) * 6;
3146 $tmp = ($tmp & 0x0000003F) << $shift;
3149 if (0 == --$mState) {
3150 // End of the multi-octet sequence. mUcs4 now contains
3151 // the final Unicode codepoint to be output
3153 // Check for illegal sequences and codepoints.
3155 // From Unicode 3.1, non-shortest form is illegal
3156 if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
3157 ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
3158 ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
3160 // From Unicode 3.2, surrogate characters = illegal
3161 (($mUcs4 & 0xFFFFF800) == 0xD800) ||
3162 // Codepoints outside the Unicode range are illegal
3166 } elseif (0xFEFF != $mUcs4 && // omit BOM
3167 // check for valid Char unicode codepoints
3172 (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
3173 // 7F-9F is not strictly prohibited by XML,
3174 // but it is non-SGML, and thus we don't allow it
3175 (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
3176 (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
3181 // initialize UTF8 cache (reset)
3188 // ((0xC0 & (*in) != 0x80) && (mState != 0))
3189 // Incomplete multi-octet sequence.
3190 // used to result in complete fail, but we'll reset
3202 * Translates a Unicode codepoint into its corresponding UTF-8 character.
3203 * @note Based on Feyd's function at
3204 * <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
3205 * which is in public domain.
3206 * @note While we're going to do code point parsing anyway, a good
3207 * optimization would be to refuse to translate code points that
3208 * are non-SGML characters. However, this could lead to duplication.
3209 * @note This is very similar to the unichr function in
3210 * maintenance/generate-entity-file.php (although this is superior,
3211 * due to its sanity checks).
3214 // +----------+----------+----------+----------+
3215 // | 33222222 | 22221111 | 111111 | |
3216 // | 10987654 | 32109876 | 54321098 | 76543210 | bit
3217 // +----------+----------+----------+----------+
3218 // | | | | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
3219 // | | | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
3220 // | | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
3221 // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
3222 // +----------+----------+----------+----------+
3223 // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
3224 // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
3225 // +----------+----------+----------+----------+
3227 public static function unichr($code) {
3228 if($code > 1114111 or $code < 0 or
3229 ($code >= 55296 and $code <= 57343) ) {
3230 // bits are set outside the "valid" range as defined by Unicode
3235 $x = $y = $z = $w = 0;
3237 // regular ASCII character
3240 // set up bits for UTF-8
3241 $x = ($code & 63) | 128;
3243 $y = (($code & 2047) >> 6) | 192;
3245 $y = (($code & 4032) >> 6) | 128;
3247 $z = (($code >> 12) & 15) | 224;
3249 $z = (($code >> 12) & 63) | 128;
3250 $w = (($code >> 18) & 7) | 240;
3254 // set up the actual character
3256 if($w) $ret .= chr($w);
3257 if($z) $ret .= chr($z);
3258 if($y) $ret .= chr($y);
3265 * Converts a string to UTF-8 based on configuration.
3267 public static function convertToUTF8($str, $config, $context) {
3268 $encoding = $config->get('Core.Encoding');
3269 if ($encoding === 'utf-8') return $str;
3270 static $iconv = null;
3271 if ($iconv === null) $iconv = function_exists('iconv');
3272 set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
3273 if ($iconv && !$config->get('Test.ForceNoIconv')) {
3274 $str = iconv($encoding, 'utf-8//IGNORE', $str);
3275 if ($str === false) {
3276 // $encoding is not a valid encoding
3277 restore_error_handler();
3278 trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
3281 // If the string is bjorked by Shift_JIS or a similar encoding
3282 // that doesn't support all of ASCII, convert the naughty
3283 // characters to their true byte-wise ASCII/UTF-8 equivalents.
3284 $str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
3285 restore_error_handler();
3287 } elseif ($encoding === 'iso-8859-1') {
3288 $str = utf8_encode($str);
3289 restore_error_handler();
3292 trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
3296 * Converts a string from UTF-8 based on configuration.
3297 * @note Currently, this is a lossy conversion, with inexpressible
3298 * characters being omitted.
3300 public static function convertFromUTF8($str, $config, $context) {
3301 $encoding = $config->get('Core.Encoding');
3302 if ($encoding === 'utf-8') return $str;
3303 static $iconv = null;
3304 if ($iconv === null) $iconv = function_exists('iconv');
3305 if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
3306 $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
3308 set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
3309 if ($iconv && !$config->get('Test.ForceNoIconv')) {
3310 // Undo our previous fix in convertToUTF8, otherwise iconv will barf
3311 $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
3312 if (!$escape && !empty($ascii_fix)) {
3313 $clear_fix = array();
3314 foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
3315 $str = strtr($str, $clear_fix);
3317 $str = strtr($str, array_flip($ascii_fix));
3319 $str = iconv('utf-8', $encoding . '//IGNORE', $str);
3320 restore_error_handler();
3322 } elseif ($encoding === 'iso-8859-1') {
3323 $str = utf8_decode($str);
3324 restore_error_handler();
3327 trigger_error('Encoding not supported', E_USER_ERROR);
3331 * Lossless (character-wise) conversion of HTML to ASCII
3332 * @param $str UTF-8 string to be converted to ASCII
3333 * @returns ASCII encoded string with non-ASCII character entity-ized
3334 * @warning Adapted from MediaWiki, claiming fair use: this is a common
3335 * algorithm. If you disagree with this license fudgery,
3336 * implement it yourself.
3337 * @note Uses decimal numeric entities since they are best supported.
3338 * @note This is a DUMB function: it has no concept of keeping
3339 * character entities that the projected character encoding
3340 * can allow. We could possibly implement a smart version
3341 * but that would require it to also know which Unicode
3342 * codepoints the charset supported (not an easy task).
3343 * @note Sort of with cleanUTF8() but it assumes that $str is
3346 public static function convertToASCIIDumbLossless($str) {
// Walks $str one byte at a time. ASCII bytes are copied to $result; the
// payload bits of a multi-byte sequence are accumulated in $working and
// written out as a decimal numeric entity ("&#NNN;").
// NOTE(review): the initialisation of $result/$working/$bytesleft and the
// statements that update $bytesleft are not visible in this listing.
3350 $len = strlen($str);
3351 for( $i = 0; $i < $len; $i++ ) {
3352 $bytevalue = ord( $str[$i] );
3353 if( $bytevalue <= 0x7F ) { //0xxx xxxx
// Single-byte (ASCII) character: copied through unchanged.
3354 $result .= chr( $bytevalue );
3356 } elseif( $bytevalue <= 0xBF ) { //10xx xxxx
// Continuation byte: make room for six more bits and add its payload.
3357 $working = $working << 6;
3358 $working += ($bytevalue & 0x3F);
3360 if( $bytesleft <= 0 ) {
// No continuation bytes outstanding: emit the accumulated code point.
3361 $result .= "&#" . $working . ";";
3363 } elseif( $bytevalue <= 0xDF ) { //110x xxxx
// Lead byte of a two-byte sequence: keep its five payload bits.
3364 $working = $bytevalue & 0x1F;
3366 } elseif( $bytevalue <= 0xEF ) { //1110 xxxx
// Lead byte of a three-byte sequence: keep its four payload bits.
3367 $working = $bytevalue & 0x0F;
3369 } else { //1111 0xxx
// Lead byte of a four-byte sequence: keep its three payload bits.
3370 $working = $bytevalue & 0x07;
/**
 * This expensive function tests whether or not a given character
 * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
 * fail this test, and require special processing. Variable width
 * encodings shouldn't ever fail.
 *
 * @param string $encoding Encoding name to test, as per iconv format
 * @param bool $bypass Whether or not to bypass the precompiled arrays.
 * @return Array of UTF-8 characters to their corresponding ASCII,
 *      which can be used to "undo" any overzealous iconv action.
 *      False is returned when iconv cannot convert to $encoding at all.
 */
public static function testEncodingSupportsASCII($encoding, $bypass = false) {
    static $encodings = array();
    if (!$bypass) {
        // Cheap answers first: a result cached by an earlier call, then the
        // precompiled tables for encodings known to remap ASCII bytes.
        if (isset($encodings[$encoding])) return $encodings[$encoding];
        $lenc = strtolower($encoding);
        switch ($lenc) {
            case 'shift_jis':
                return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
            case 'johab':
                return array("\xE2\x82\xA9" => '\\');
        }
        if (strpos($lenc, 'iso-8859-') === 0) return array();
    }
    $ret = array();
    set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
    if (iconv('UTF-8', $encoding, 'a') === false) {
        // Pop the handler before bailing out; returning with it still
        // installed would leave errors muted for the rest of the request
        // and unbalance the caller's own set/restore pair.
        restore_error_handler();
        return false;
    }
    for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
        $c = chr($i); // UTF-8 char
        $r = iconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion
        if (
            $r === '' ||
            // This line is needed for iconv implementations that do not
            // omit characters that do not exist in the target character set
            ($r === $c && iconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
        ) {
            // Reverse engineer: what's the UTF-8 equiv of this byte
            // sequence? This assumes that there's no variable width
            // encoding that doesn't support ASCII.
            $ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
        }
    }
    restore_error_handler();
    $encodings[$encoding] = $ret;
    return $ret;
}
3432 * Object that provides entity lookup table from entity name to character
3434 class HTMLPurifier_EntityLookup {
3437 * Assoc array of entity name to character represented.
3442 * Sets up the entity lookup table from the serialized file contents.
3443 * @note The serialized contents are versioned, but were generated
3444 * using the maintenance script generate_entity_file.php
3445 * @warning This is not in constructor to help enforce the Singleton
3447 public function setup($file = false) {
// Default location of the serialized table, relative to HTMLPURIFIER_PREFIX.
// NOTE(review): the condition guarding this default is not visible in this
// listing; presumably it applies only when no $file was passed.
3449 $file = HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser';
// The file is read in one go and unserialize()d straight into $this->table;
// no failure check on either call is visible here.
3451 $this->table = unserialize(file_get_contents($file));
3455 * Retrieves sole instance of the object.
3456 * @param Optional prototype of custom lookup table to overload with.
3458 public static function instance($prototype = false) {
3459 // no references, since PHP doesn't copy unless modified
// Function-static slot, so the stored object persists across calls.
3460 static $instance = null;
// The prototype replaces the stored instance (the guarding condition is not
// visible here - presumably "a prototype was passed").
3462 $instance = $prototype;
// Otherwise a fresh lookup object is created the first time through.
3463 } elseif (!$instance) {
3464 $instance = new HTMLPurifier_EntityLookup();
3476 // if want to implement error collecting here, we'll need to use some sort
3477 // of global data (probably trigger_error) because it's impossible to pass
3478 // $config or $context to the callback functions.
3481 * Handles referencing and dereferencing character entities
3483 class HTMLPurifier_EntityParser
3487 * Reference to entity lookup table.
3489 protected $_entity_lookup;
3492 * Callback regex string for parsing entities.
// Capture groups: 1 = hex digits after "&#x", 2 = decimal digits after "&#"
// (leading zeros skipped), 3 = entity name. The trailing ";" is optional.
3494 protected $_substituteEntitiesRegex =
3495 '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
3496 // 1. hex 2. dec 3. string (XML style)
3500 * Decimal to parsed string conversion table for special entities.
3502 protected $_special_dec2str =
3512 * Stripped entity names to decimal conversion table for special entities.
3514 protected $_special_ent2dec =
3523 * Substitutes non-special entities with their parsed equivalents. Since
3524 * running this whenever you have parsed character is t3h 5uck, we run
3525 * it before everything else.
3527 * @param $string String to have non-special entities parsed.
3528 * @returns Parsed string.
3530 public function substituteNonSpecialEntities($string) {
3531 // it will try to detect missing semicolons, but don't rely on it
// Every regex match is handed to nonSpecialEntityCallback() for replacement.
3532 return preg_replace_callback(
3533 $this->_substituteEntitiesRegex,
3534 array($this, 'nonSpecialEntityCallback'),
3540 * Callback function for substituteNonSpecialEntities() that does the work.
3542 * @param $matches PCRE matches array, with 0 the entire match, and
3543 * either index 1, 2 or 3 set with a hex value, dec value,
3544 * or string (respectively).
3545 * @returns Replacement string.
3548 protected function nonSpecialEntityCallback($matches) {
3549 // replaces all but big five
3550 $entity = $matches[0];
// Numeric entity if the character after "&" is "#"; the @ silences the
// notice for a match too short to have that offset.
3551 $is_num = (@$matches[0][1] === '#');
// Hex form if the third character is a lowercase "x".
3553 $is_hex = (@$entity[2] === 'x');
3554 $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
3556 // abort for special characters
// Code points listed in $_special_dec2str are left as the original entity text.
3557 if (isset($this->_special_dec2str[$code])) return $entity;
// Anything else is replaced by the encoder's output for that code point.
3559 return HTMLPurifier_Encoder::unichr($code);
// Named entity: special names are left untouched ...
3561 if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
// ... others are resolved through the lookup table, fetched on first use.
3562 if (!$this->_entity_lookup) {
3563 $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
3565 if (isset($this->_entity_lookup->table[$matches[3]])) {
3566 return $this->_entity_lookup->table[$matches[3]];
3574 * Substitutes only special entities with their parsed equivalents.
3576 * @notice We try to avoid calling this function because otherwise, it
3577 * would have to be called a lot (for every parsed section).
3579 * @param $string String to have non-special entities parsed.
3580 * @returns Parsed string.
3582 public function substituteSpecialEntities($string) {
// Same regex as above, but matches are handed to specialEntityCallback().
3583 return preg_replace_callback(
3584 $this->_substituteEntitiesRegex,
3585 array($this, 'specialEntityCallback'),
3590 * Callback function for substituteSpecialEntities() that does the work.
3592 * This callback has same syntax as nonSpecialEntityCallback().
3594 * @param $matches PCRE-style matches array, with 0 the entire match, and
3595 * either index 1, 2 or 3 set with a hex value, dec value,
3596 * or string (respectively).
3597 * @returns Replacement string.
3599 protected function specialEntityCallback($matches) {
3600 $entity = $matches[0];
// Same numeric/hex detection as in nonSpecialEntityCallback().
3601 $is_num = (@$matches[0][1] === '#');
3603 $is_hex = (@$entity[2] === 'x');
3604 $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
// Numeric form: a code point found in $_special_dec2str yields its table
// entry (the alternative branch of this ternary is not visible here).
3605 return isset($this->_special_dec2str[$int]) ?
3606 $this->_special_dec2str[$int] :
// Named form: a name found in $_special_ent2dec yields its table entry
// (the alternative branch of this ternary is not visible here).
3609 return isset($this->_special_ent2dec[$matches[3]]) ?
3610 $this->_special_ent2dec[$matches[3]] :
3622 * Error collection class that enables HTML Purifier to report HTML
3623 * problems back to the user
3625 class HTMLPurifier_ErrorCollector
3629 * Identifiers for the returned error array. These are purposely numeric
3630 * so list() can be used.
3638 protected $_current;
3639 protected $_stacks = array(array());
3641 protected $generator;
3644 protected $lines = array();
3646 public function __construct($context) {
// The locale is taken by reference from the context under the key 'Locale'.
3647 $this->locale =& $context->get('Locale');
3648 $this->context = $context;
// Both $_current and $errors are references to the first stack frame, so
// they refer to the same array.
3649 $this->_current =& $this->_stacks[0];
3650 $this->errors =& $this->_stacks[0];
3654 * Sends an error message to the collector for later use
3655 * @param $severity int Error severity, PHP error style (don't use E_USER_)
3656 * @param $msg string Error message text
3657 * @param $subst1 string First substitution for $msg
3658 * @param $subst2 string ...
3660 public function send($severity, $msg) {
// Arguments beyond $severity and $msg are picked up via func_get_args().
3663 if (func_num_args() > 2) {
3664 $args = func_get_args();
// Location: taken from the current token when the context has one,
// otherwise from the CurrentLine / CurrentCol context entries.
// NOTE(review): the second argument (true) to get() presumably makes a
// missing key non-fatal - confirm against HTMLPurifier_Context.
3669 $token = $this->context->get('CurrentToken', true);
3670 $line = $token ? $token->line : $this->context->get('CurrentLine', true);
3671 $col = $token ? $token->col : $this->context->get('CurrentCol', true);
3672 $attr = $this->context->get('CurrentAttr', true);
3674 // perform special substitutions, also add custom parameters
3676 if (!is_null($token)) {
3677 $args['CurrentToken'] = $token;
3679 if (!is_null($attr)) {
3680 $subst['$CurrentAttr.Name'] = $attr;
3681 if (isset($token->attr[$attr])) $subst['$CurrentAttr.Value'] = $token->attr[$attr];
// The message key is resolved through the locale, formatted with $args,
// and finally has the $CurrentAttr.* placeholders replaced.
3685 $msg = $this->locale->getMessage($msg);
3687 $msg = $this->locale->formatMessage($msg, $args);
3690 if (!empty($subst)) $msg = strtr($msg, $subst);
3692 // (numerically indexed)
3694 self::LINENO => $line,
3695 self::SEVERITY => $severity,
3696 self::MESSAGE => $msg,
3697 self::CHILDREN => array()
// The flat record is appended to the current stack frame.
3699 $this->_current[] = $error;
3702 // NEW CODE BELOW ...
3705 // Top-level errors are either:
3706 // TOKEN type, if $value is set appropriately, or
3707 // "syntax" type, if $value is null
3708 $new_struct = new HTMLPurifier_ErrorStruct();
3709 $new_struct->type = HTMLPurifier_ErrorStruct::TOKEN;
// A clone is stored, so the struct keeps its own copy of the token object.
3710 if ($token) $new_struct->value = clone $token;
// With an integer line and column the struct is filed (or reused) at
// $this->lines[$line][$col]; otherwise the shared slot $this->lines[-1]
// is used.
3711 if (is_int($line) && is_int($col)) {
3712 if (isset($this->lines[$line][$col])) {
3713 $struct = $this->lines[$line][$col];
3715 $struct = $this->lines[$line][$col] = $new_struct;
3717 // These ksorts may present a performance problem
3718 ksort($this->lines[$line], SORT_NUMERIC);
3720 if (isset($this->lines[-1])) {
3721 $struct = $this->lines[-1];
3723 $struct = $this->lines[-1] = $new_struct;
3726 ksort($this->lines, SORT_NUMERIC);
3728 // Now, check if we need to operate on a lower structure
3729 if (!empty($attr)) {
3730 $struct = $struct->getChild(HTMLPurifier_ErrorStruct::ATTR, $attr);
3731 if (!$struct->value) {
3732 $struct->value = array($attr, 'PUT VALUE HERE');
// NOTE(review): no visible line of send() assigns $cssprop, so as shown
// this branch never runs.
3735 if (!empty($cssprop)) {
3736 $struct = $struct->getChild(HTMLPurifier_ErrorStruct::CSSPROP, $cssprop);
3737 if (!$struct->value) {
3738 // if we tokenize CSS this might be a little more difficult to do
3739 $struct->value = array($cssprop, 'PUT VALUE HERE');
3743 // Ok, structs are all setup, now time to register the error
3744 $struct->addError($severity, $msg);
3748 * Retrieves raw error data for custom formatter to use
3749 * @param List of arrays in format of array(line of error,
3750 * error severity, error message,
3751 * recursive sub-errors array)
3753 public function getRaw() {
3754 return $this->errors;
3758 * Default HTML formatting implementation for error messages
3759 * @param $config Configuration array, vital for HTML output nature
3760 * @param $errors Errors array to display; used for recursion.
3762 public function getHTMLFormatted($config, $errors = null) {
// A generator is created on every call; _renderStruct() uses it to escape
// message text.
3765 $this->generator = new HTMLPurifier_Generator($config, $this->context);
3766 if ($errors === null) $errors = $this->errors;
3768 // 'At line' message needs to be removed
3770 // generation code for new structure goes here. It needs to be recursive.
// Positioned structs are rendered first; the -1 slot is skipped inside the
// loop and rendered afterwards without a line/column.
3771 foreach ($this->lines as $line => $col_array) {
3772 if ($line == -1) continue;
3773 foreach ($col_array as $col => $struct) {
3774 $this->_renderStruct($ret, $struct, $line, $col);
3777 if (isset($this->lines[-1])) {
3778 $this->_renderStruct($ret, $this->lines[-1]);
// The empty check is made on $errors (the flat list), not on $ret.
3781 if (empty($errors)) {
3782 return '<p>' . $this->locale->getMessage('ErrorCollector: No errors') . '</p>';
3784 return '<ul><li>' . implode('</li><li>', $ret) . '</li></ul>';
3789 private function _renderStruct(&$ret, $struct, $line = null, $col = null) {
// Iterative walk over $struct and its children: $stack holds structs still
// to visit, $context_stack the matching list of ancestor structs.
3790 $stack = array($struct);
3791 $context_stack = array(array());
3792 while ($current = array_pop($stack)) {
3793 $context = array_pop($context_stack);
3794 foreach ($current->errors as $error) {
3795 list($severity, $msg) = $error;
3798 // W3C uses an icon to indicate the severity of the error.
3799 $error = $this->locale->getErrorName($severity);
3800 $string .= "<span class=\"error e$severity\"><strong>$error</strong></span> ";
// Without both a line and a column the location label reads "End of Document".
3801 if (!is_null($line) && !is_null($col)) {
3802 $string .= "<em class=\"location\">Line $line, Column $col: </em> ";
3804 $string .= '<em class="location">End of Document: </em> ';
// Only the message text goes through the generator's escape().
3806 $string .= '<strong class="description">' . $this->generator->escape($msg) . '</strong> ';
3807 $string .= '</div>';
3808 // Here, have a marker for the character on the column appropriate.
3809 // Be sure to clip extremely long lines.
3810 //$string .= '<pre>';
3812 //$string .= '</pre>';
// Children are queued for the walk, one context entry per queued child.
3815 foreach ($current->children as $type => $array) {
3816 $context[] = $current;
3817 $stack = array_merge($stack, array_reverse($array, true));
3818 for ($i = count($array); $i > 0; $i--) {
3819 $context_stack[] = $context;
3832 * Records errors for particular segments of an HTML document such as tokens,
3833 * attributes or CSS properties. They can contain error structs (which apply
3834 * to components of what they represent), but their main purpose is to hold
3835 * errors applying to whatever struct is being used.
3837 class HTMLPurifier_ErrorStruct
3841 * Possible values for $children first-key. Note that top-level structures
3842 * are automatically token-level.
3849 * Type of this struct.
3854 * Value of the struct we are recording errors for. There are various
3856 * - TOKEN: Instance of HTMLPurifier_Token
3857 * - ATTR: array('attr-name', 'value')
3858 * - CSSPROP: array('prop-name', 'value')
3863 * Errors registered for this structure.
3865 public $errors = array();
3868 * Child ErrorStructs that are from this structure. For example, a TOKEN
3869 * ErrorStruct would contain ATTR ErrorStructs. This is a multi-dimensional
3870 * array in structure: [TYPE]['identifier']
3872 public $children = array();
3874 public function getChild($type, $id) {
// Get-or-create: a missing child is created, tagged with $type and stored
// under [$type][$id]; the stored child is what gets returned.
3875 if (!isset($this->children[$type][$id])) {
3876 $this->children[$type][$id] = new HTMLPurifier_ErrorStruct();
3877 $this->children[$type][$id]->type = $type;
3879 return $this->children[$type][$id];
3882 public function addError($severity, $message) {
// Errors are kept as array($severity, $message) pairs in insertion order.
3883 $this->errors[] = array($severity, $message);
3893 * Global exception class for HTML Purifier; any exceptions we throw
3896 class HTMLPurifier_Exception extends Exception
// No members are visible in this listing: the class extends PHP's Exception
// and, as shown, adds nothing of its own.
3906 * Represents a pre or post processing filter on HTML Purifier's output
3908 * Sometimes, a little ad-hoc fixing of HTML has to be done before
3909 * it gets sent through HTML Purifier: you can use filters to achieve
3910 * this effect. For instance, YouTube videos can be preserved using
3911 * this manner. You could have used a decorator for this task, but
3912 * PHP's support for them is not terribly robust, so we're going
3913 * to just loop through the filters.
3915 * Filters should be exited first in, last out. If there are three filters,
3916 * named 1, 2 and 3, the order of execution should go 1->preFilter,
3917 * 2->preFilter, 3->preFilter, purify, 3->postFilter, 2->postFilter,
3920 * @note Methods are not declared abstract as it is perfectly legitimate
3921 * for an implementation not to want anything to happen on a step
3924 class HTMLPurifier_Filter
3928 * Name of the filter for identification purposes
3933 * Pre-processor function, handles HTML before HTML Purifier
// Receives the HTML plus the config and context objects.
// NOTE(review): the method bodies are not visible in this listing.
3935 public function preFilter($html, $config, $context) {
3940 * Post-processor function, handles HTML after HTML Purifier
// Same parameter list as preFilter().
3942 public function postFilter($html, $config, $context) {
3953 * Generates HTML from tokens.
3954 * @todo Refactor interface so that configuration/context is determined
3955 * upon instantiation, no need for messy generateFromTokens() calls
3956 * @todo Make some of the more internal functions protected, and have
3957 * unit tests work around that
3959 class HTMLPurifier_Generator
3963 * Whether or not generator should produce XML output
3965 private $_xhtml = true;
3968 * :HACK: Whether or not generator should comment the insides of <script> tags
3970 private $_scriptFix = false;
3973 * Cache of HTMLDefinition during HTML output to determine whether or
3974 * not attributes should be minimized.
3979 * Cache of %Output.SortAttr
3984 * Cache of %Output.FlashCompat
3986 private $_flashCompat;
3989 * Cache of %Output.FixInnerHTML
3991 private $_innerHTMLFix;
3994 * Stack for keeping track of object information when outputting IE
3995 * compatibility code.
3997 private $_flashStack = array();
4000 * Configuration for the generator
4005 * @param $config Instance of HTMLPurifier_Config
4006 * @param $context Instance of HTMLPurifier_Context
4008 public function __construct($config, $context) {
// All Output.* switches are read once here and cached on the instance;
// the XML flag comes from the HTML definition's doctype.
4009 $this->config = $config;
4010 $this->_scriptFix = $config->get('Output.CommentScriptContents');
4011 $this->_innerHTMLFix = $config->get('Output.FixInnerHTML');
4012 $this->_sortAttr = $config->get('Output.SortAttr');
4013 $this->_flashCompat = $config->get('Output.FlashCompat');
4014 $this->_def = $config->getHTMLDefinition();
4015 $this->_xhtml = $this->_def->doctype->xml;
4019 * Generates HTML from an array of tokens.
4020 * @param $tokens Array of HTMLPurifier_Token
4021 * @param $config HTMLPurifier_Config object
4022 * @return Generated HTML
4024 public function generateFromTokens($tokens) {
// An empty (falsy) token list produces an empty string.
4025 if (!$tokens) return '';
4029 for ($i = 0, $size = count($tokens); $i < $size; $i++) {
// Script special case: only when the fix is enabled, the token is named
// 'script', and the token two positions later is an end token.
4030 if ($this->_scriptFix && $tokens[$i]->name === 'script'
4031 && $i + 2 < $size && $tokens[$i+2] instanceof HTMLPurifier_Token_End) {
4032 // script special case
4033 // the contents of the script block must be ONE token
4034 // for this to work.
// The two post-increments consume the opening tag and the content token.
4035 $html .= $this->generateFromToken($tokens[$i++]);
4036 $html .= $this->generateScriptFromToken($tokens[$i++]);
4038 $html .= $this->generateFromToken($tokens[$i]);
// Optional pretty-printing, only when the tidy extension is loaded and
// Output.TidyFormat is set.
4042 if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
4044 $tidy->parseString($html, array(
4046 'output-xhtml' => $this->_xhtml,
4047 'show-body-only' => true,
4048 'indent-spaces' => 2,
4051 $tidy->cleanRepair();
4052 $html = (string) $tidy; // explicit cast necessary
4055 // Normalize newlines to system defined value
// Output.Newline wins when set; a null value falls back to PHP_EOL. The
// replacement is skipped when the target is already "\n".
4056 if ($this->config->get('Core.NormalizeNewlines')) {
4057 $nl = $this->config->get('Output.Newline');
4058 if ($nl === null) $nl = PHP_EOL;
4059 if ($nl !== "\n") $html = str_replace("\n", $nl, $html);
4065 * Generates HTML from a single token.
4066 * @param $token HTMLPurifier_Token object.
4067 * @return Generated HTML
4069 public function generateFromToken($token) {
// Dispatch on the concrete token class; non-token input raises a warning.
4070 if (!$token instanceof HTMLPurifier_Token) {
4071 trigger_error('Cannot generate HTML from non-HTMLPurifier_Token object', E_USER_WARNING);
4074 } elseif ($token instanceof HTMLPurifier_Token_Start) {
4075 $attr = $this->generateAttributes($token->attr, $token->name);
// With FlashCompat on, each <object> start pushes a record of its
// attributes (and, later, its params) onto $_flashStack.
4076 if ($this->_flashCompat) {
4077 if ($token->name == "object") {
4078 $flash = new stdclass();
4079 $flash->attr = $token->attr;
4080 $flash->param = array();
4081 $this->_flashStack[] = $flash;
// A separating space is emitted only when there are attributes.
4084 return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
4086 } elseif ($token instanceof HTMLPurifier_Token_End) {
4088 if ($this->_flashCompat) {
4089 if ($token->name == "object" && !empty($this->_flashStack)) {
4090 // doesn't do anything for now
4093 return $_extra . '</' . $token->name . '>';
4095 } elseif ($token instanceof HTMLPurifier_Token_Empty) {
// A <param> inside a tracked <object> is recorded on the top stack entry.
// NOTE(review): 'name' and 'value' are read from $token->attr without an
// isset() check in the visible code.
4096 if ($this->_flashCompat && $token->name == "param" && !empty($this->_flashStack)) {
4097 $this->_flashStack[count($this->_flashStack)-1]->param[$token->attr['name']] = $token->attr['value'];
4099 $attr = $this->generateAttributes($token->attr, $token->name);
4100 return '<' . $token->name . ($attr ? ' ' : '') . $attr .
4101 ( $this->_xhtml ? ' /': '' ) // <br /> v. <br>
4104 } elseif ($token instanceof HTMLPurifier_Token_Text) {
// Text is escaped with ENT_NOQUOTES, so quote characters are left as-is.
4105 return $this->escape($token->data, ENT_NOQUOTES);
4107 } elseif ($token instanceof HTMLPurifier_Token_Comment) {
// Comment data is emitted between the delimiters without escaping.
4108 return '<!--' . $token->data . '-->';
4116 * Special case processor for the contents of script tags
4117 * @warning This runs into problems if there's already a literal
4118 * --> somewhere inside the script contents.
4120 public function generateScriptFromToken($token) {
// Anything that is not a text token is generated the ordinary way.
4121 if (!$token instanceof HTMLPurifier_Token_Text) return $this->generateFromToken($token);
4122 // Thanks <http://lachy.id.au/log/2005/05/script-comments>
// A trailing "//" (plus whitespace) is stripped before the trimmed data is
// wrapped in the comment/CDATA markers.
4123 $data = preg_replace('#//\s*$#', '', $token->data);
4124 return '<!--//--><![CDATA[//><!--' . "\n" . trim($data) . "\n" . '//--><!]]>';
4128 * Generates attribute declarations from attribute array.
4129 * @note This does not include the leading or trailing space.
4130 * @param $assoc_array_of_attributes Attribute array
4131 * @param $element Name of element attributes are for, used to check
4132 * attribute minimization.
4133 * @return Generate HTML fragment for insertion.
4135 public function generateAttributes($assoc_array_of_attributes, $element = false) {
// Optional sort by attribute name (ksort on the local copy of the array).
4137 if ($this->_sortAttr) ksort($assoc_array_of_attributes);
4138 foreach ($assoc_array_of_attributes as $key => $value) {
4139 if (!$this->_xhtml) {
4140 // Remove namespaced attributes
4141 if (strpos($key, ':') !== false) continue;
4142 // Check if we should minimize the attribute: val="val" -> val
4143 if ($element && !empty($this->_def->info[$element]->attr[$key]->minimized)) {
4144 $html .= $key . ' ';
4148 // Workaround for Internet Explorer innerHTML bug.
4149 // Essentially, Internet Explorer, when calculating
4150 // innerHTML, omits quotes if there are no instances of
4151 // angled brackets, quotes or spaces. However, when parsing
4152 // HTML (for example, when you assign to innerHTML), it
4153 // treats backticks as quotes. Thus,
4159 // Fortunately, all we need to do is trigger an appropriate
4160 // quoting style, which we do by adding an extra space.
4161 // This also is consistent with the W3C spec, which states
4162 // that user agents may ignore leading or trailing
4163 // whitespace (in fact, most don't, at least for attributes
4164 // like alt, but an extra space at the end is barely
4165 // noticeable). Still, we have a configuration knob for
4166 // this, since this transformation is not necessary if you
4167 // don't process user input with innerHTML or you don't plan
4168 // on supporting Internet Explorer.
4169 if ($this->_innerHTMLFix) {
4170 if (strpos($value, '`') !== false) {
4171 // check if correct quoting style would not already be
// True when $value contains none of: double quote, single quote, space,
// "<" or ">".
4173 if (strcspn($value, '"\' <>') === strlen($value)) {
// Each attribute is written as key="escaped value" followed by a space;
// the trailing space is removed by the rtrim() below.
4179 $html .= $key.'="'.$this->escape($value).'" ';
4181 return rtrim($html);
4185 * Escapes raw text data.
4186 * @todo This really ought to be protected, but until we have a facility
4187 * for properly generating HTML here w/o using tokens, it stays
4189 * @param $string String data to escape for HTML.
4190 * @param $quote Quoting style, like htmlspecialchars. ENT_NOQUOTES is
4191 * permissible for non-attribute output.
4192 * @return String escaped data.
4194 public function escape($string, $quote = null) {
4195 // Workaround for APC bug on Mac Leopard reported by sidepodcast
4196 // http://htmlpurifier.org/phorum/read.php?3,4823,4846
// The default is resolved at call time rather than in the signature.
4197 if ($quote === null) $quote = ENT_COMPAT;
4198 return htmlspecialchars($string, $quote, 'UTF-8');
4208 * Definition of the purified HTML that describes allowed children,
4209 * attributes, and many other things.
4213 * All member variables that are prefixed with info
4214 * (including the main $info array) are used by HTML Purifier internals
4215 * and should not be directly edited when customizing the HTMLDefinition.
4216 * They can usually be set via configuration directives or custom
4219 * On the other hand, member variables without the info prefix are used
4220 * internally by the HTMLDefinition and MUST NOT be used by other HTML
4221 * Purifier internals. Many of them, however, are public, and may be
4222 * edited by userspace code to tweak the behavior of HTMLDefinition.
4224 * @note This class is inspected by Printer_HTMLDefinition; please
4225 * update that class if things here change.
4227 * @warning Directives that change this object's structure must be in
4228 * the HTML or Attr namespace!
4230 class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
// Public info_* members and their initial values. As declared here the
// parent element defaults to 'div', the block wrapper to 'p',
// $info_parent_def has no initial value, and every other table starts
// out as an empty array.
4233 // FULLY-PUBLIC VARIABLES ---------------------------------------------
4236 * Associative array of element names to HTMLPurifier_ElementDef
4238 public $info = array();
4241 * Associative array of global attribute name to attribute definition.
4243 public $info_global_attr = array();
4246 * String name of parent element HTML will be going into.
4248 public $info_parent = 'div';
4251 * Definition for parent element, allows parent element to be a
4252 * tag that's not allowed inside the HTML fragment.
4254 public $info_parent_def;
4257 * String name of element used to wrap inline elements in block context
4258 * @note This is rarely used except for BLOCKQUOTEs in strict mode
4260 public $info_block_wrapper = 'p';
4263 * Associative array of deprecated tag name to HTMLPurifier_TagTransform
4265 public $info_tag_transform = array();
4268 * Indexed list of HTMLPurifier_AttrTransform to be performed before validation.
4270 public $info_attr_transform_pre = array();
4273 * Indexed list of HTMLPurifier_AttrTransform to be performed after validation.
4275 public $info_attr_transform_post = array();
4278 * Nested lookup array of content set name (Block, Inline) to
4279 * element name to whether or not it belongs in that content set.
4281 public $info_content_sets = array();
4284 * Indexed list of HTMLPurifier_Injector to be used.
4286 public $info_injector = array();
4295 // RAW CUSTOMIZATION STUFF --------------------------------------------
4298 * Adds a custom attribute to a pre-existing element
4299 * @note This is strictly convenience, and does not have a corresponding
4300 * method in HTMLPurifier_HTMLModule
4301 * @param $element_name String element name to add attribute to
4302 * @param $attr_name String name of attribute
4303 * @param $def Attribute definition, can be string or object, see
4304 * HTMLPurifier_AttrTypes for details
4306 public function addAttribute($element_name, $attr_name, $def) {
// The change is recorded on the anonymous module, not on $this->info.
4307 $module = $this->getAnonymousModule();
// An element the module does not know yet is added as a blank element;
// otherwise the module's existing entry is used.
4308 if (!isset($module->info[$element_name])) {
4309 $element = $module->addBlankElement($element_name);
4311 $element = $module->info[$element_name];
// Sets (or overwrites) the definition for $attr_name on that element.
4313 $element->attr[$attr_name] = $def;
4317 * Adds a custom element to your HTML definition
4318 * @note See HTMLPurifier_HTMLModule::addElement for detailed
4319 * parameter and return value descriptions.
4321 public function addElement($element_name, $type, $contents, $attr_collections, $attributes = array()) {
// All arguments are forwarded unchanged to the anonymous module's addElement().
// NOTE(review): what happens to $element after this call is not visible in
// this listing.
4322 $module = $this->getAnonymousModule();
4323 // assume that if the user is calling this, the element
4324 // is safe. This may not be a good idea
4325 $element = $module->addElement($element_name, $type, $contents, $attr_collections, $attributes);
4330 * Adds a blank element to your HTML definition, for overriding
4332 * @note See HTMLPurifier_HTMLModule::addBlankElement for detailed
4333 * parameter and return value descriptions.
4335 public function addBlankElement($element_name) {
// Delegates to the anonymous module's addBlankElement().
// NOTE(review): what happens to $element after this call is not visible in
// this listing.
4336 $module = $this->getAnonymousModule();
4337 $element = $module->addBlankElement($element_name);
4342 * Retrieves a reference to the anonymous module, so you can
4343 * bust out advanced features without having to make your own
4346 public function getAnonymousModule() {
// Created on first use: a plain HTMLPurifier_HTMLModule named 'Anonymous',
// kept in $this->_anonModule and returned on every call.
4347 if (!$this->_anonModule) {
4348 $this->_anonModule = new HTMLPurifier_HTMLModule();
4349 $this->_anonModule->name = 'Anonymous';
4351 return $this->_anonModule;
// Backing field for getAnonymousModule(); processModules() unsets it once
// the module has been handed to the manager.
4354 private $_anonModule;
4357 // PUBLIC BUT INTERNAL VARIABLES --------------------------------------
4359 public $type = 'HTML';
// Assigned in __construct() and unset again in doSetup().
4360 public $manager; /**< Instance of HTMLPurifier_HTMLModuleManager */
4363 * Performs low-cost, preliminary initialization.
4365 public function __construct() {
// Only the module manager is created here; nothing else is initialised in
// the visible code.
4366 $this->manager = new HTMLPurifier_HTMLModuleManager();
4369 protected function doSetup($config) {
// Order matters: setupConfigStuff() still reads $this->manager, which is
// unset immediately afterwards.
4370 $this->processModules($config);
4371 $this->setupConfigStuff($config);
4372 unset($this->manager);
4374 // cleanup some of the element definitions
// The content_model and content_model_type properties are removed from
// every element definition in $this->info.
4375 foreach ($this->info as $k => $v) {
4376 unset($this->info[$k]->content_model);
4377 unset($this->info[$k]->content_model_type);
4382 * Extract out the information from the manager
4384 protected function processModules($config) {
// A user-populated anonymous module, if any, is registered with the manager
// and then dropped from this object.
4386 if ($this->_anonModule) {
4387 // for user specific changes
4388 // this is late-loaded so we don't have to deal with PHP4
4389 // reference wonky-ness
4390 $this->manager->addModule($this->_anonModule);
4391 unset($this->_anonModule);
4394 $this->manager->setup($config);
4395 $this->doctype = $this->manager->doctype;
// Each module's transform/injector tables are merged in module order. In
// all four loops a value of false removes the key, and any other value
// sets or overwrites it.
4397 foreach ($this->manager->modules as $module) {
4398 foreach($module->info_tag_transform as $k => $v) {
4399 if ($v === false) unset($this->info_tag_transform[$k]);
4400 else $this->info_tag_transform[$k] = $v;
4402 foreach($module->info_attr_transform_pre as $k => $v) {
4403 if ($v === false) unset($this->info_attr_transform_pre[$k]);
4404 else $this->info_attr_transform_pre[$k] = $v;
4406 foreach($module->info_attr_transform_post as $k => $v) {
4407 if ($v === false) unset($this->info_attr_transform_post[$k]);
4408 else $this->info_attr_transform_post[$k] = $v;
4410 foreach ($module->info_injector as $k => $v) {
4411 if ($v === false) unset($this->info_injector[$k]);
4412 else $this->info_injector[$k] = $v;
// The element table and content-set lookup are taken from the manager.
4416 $this->info = $this->manager->getElements();
4417 $this->info_content_sets = $this->manager->contentSets->lookup;
4422 * Sets up stuff based on config. We need a better way of doing this.
// Applies configuration directives to the definition data gathered so
// far: block wrapper, parent element, allowed and forbidden elements and
// attributes, and finally the injectors.
4424 protected function setupConfigStuff($config) {
// HTML.BlockWrapper is accepted only if it is a member of the Block content set
4426 $block_wrapper = $config->get('HTML.BlockWrapper');
4427 if (isset($this->info_content_sets['Block'][$block_wrapper])) {
4428 $this->info_block_wrapper = $block_wrapper;
4430 trigger_error('Cannot use non-block element as block wrapper',
// HTML.Parent is looked up with the trusted override set to true
4434 $parent = $config->get('HTML.Parent');
4435 $def = $this->manager->getElement($parent, true);
4437 $this->info_parent = $parent;
4438 $this->info_parent_def = $def;
4440 trigger_error('Cannot use unrecognized element as parent',
4442 $this->info_parent_def = $this->manager->getElement($this->info_parent, true);
4445 // support template text
4446 $support = "(for information on implementing this, see the ".
4449 // setup allowed elements -----------------------------------------
4451 $allowed_elements = $config->get('HTML.AllowedElements');
4452 $allowed_attributes = $config->get('HTML.AllowedAttributes'); // retrieve early
// HTML.Allowed is consulted only when neither of the two directives above is an array
4454 if (!is_array($allowed_elements) && !is_array($allowed_attributes)) {
4455 $allowed = $config->get('HTML.Allowed');
4456 if (is_string($allowed)) {
4457 list($allowed_elements, $allowed_attributes) = $this->parseTinyMCEAllowedList($allowed);
4461 if (is_array($allowed_elements)) {
// drop every element that is not whitelisted; names still left in
// $allowed_elements after this loop matched no known element
4462 foreach ($this->info as $name => $d) {
4463 if(!isset($allowed_elements[$name])) unset($this->info[$name]);
4464 unset($allowed_elements[$name]);
4467 foreach ($allowed_elements as $element => $d) {
4468 $element = htmlspecialchars($element); // PHP doesn't escape errors, be careful!
4469 trigger_error("Element '$element' is not supported $support", E_USER_WARNING);
4473 // setup allowed attributes ---------------------------------------
// the mutable copy is whittled down as keys are matched, so that
// whatever remains can be reported as unsupported further down
4475 $allowed_attributes_mutable = $allowed_attributes; // by copy!
4476 if (is_array($allowed_attributes)) {
4478 // This actually doesn't do anything, since we went away from
4479 // global attributes. It's possible that userland code uses
4480 // it, but HTMLModuleManager doesn't!
4481 foreach ($this->info_global_attr as $attr => $x) {
4482 $keys = array($attr, "*@$attr", "*.$attr");
4484 foreach ($keys as $key) {
4485 if ($delete && isset($allowed_attributes[$key])) {
4488 if (isset($allowed_attributes_mutable[$key])) {
4489 unset($allowed_attributes_mutable[$key]);
4492 if ($delete) unset($this->info_global_attr[$attr]);
4495 foreach ($this->info as $tag => $info) {
4496 foreach ($info->attr as $attr => $x) {
// every spelling under which this attribute may appear in the whitelist
4497 $keys = array("$tag@$attr", $attr, "*@$attr", "$tag.$attr", "*.$attr");
4499 foreach ($keys as $key) {
4500 if ($delete && isset($allowed_attributes[$key])) {
4503 if (isset($allowed_attributes_mutable[$key])) {
4504 unset($allowed_attributes_mutable[$key]);
4508 if ($this->info[$tag]->attr[$attr]->required) {
4509 trigger_error("Required attribute '$attr' in element '$tag' was not allowed, which means '$tag' will not be allowed either", E_USER_WARNING);
4511 unset($this->info[$tag]->attr[$attr]);
// keys are split on the first '.' or '@' into at most two parts
4516 foreach ($allowed_attributes_mutable as $elattr => $d) {
4517 $bits = preg_split('/[.@]/', $elattr, 2);
4521 if ($bits[0] !== '*') {
4522 $element = htmlspecialchars($bits[0]);
4523 $attribute = htmlspecialchars($bits[1]);
4524 if (!isset($this->info[$element])) {
// NOTE(review): this call passes no error level, unlike the sibling
// messages which pass E_USER_WARNING -- confirm that is intended
4525 trigger_error("Cannot allow attribute '$attribute' if element '$element' is not allowed/supported $support");
4527 trigger_error("Attribute '$attribute' in element '$element' not supported $support",
4532 // otherwise fall through
4534 $attribute = htmlspecialchars($bits[0]);
4535 trigger_error("Global attribute '$attribute' is not ".
4536 "supported in any elements $support",
4544 // setup forbidden elements ---------------------------------------
4546 $forbidden_elements = $config->get('HTML.ForbiddenElements');
4547 $forbidden_attributes = $config->get('HTML.ForbiddenAttributes');
4549 foreach ($this->info as $tag => $info) {
4550 if (isset($forbidden_elements[$tag])) {
4551 unset($this->info[$tag]);
4554 foreach ($info->attr as $attr => $x) {
// a forbidden attribute may be written as tag@attr, *@attr or bare attr
4556 isset($forbidden_attributes["$tag@$attr"]) ||
4557 isset($forbidden_attributes["*@$attr"]) ||
4558 isset($forbidden_attributes[$attr])
4560 unset($this->info[$tag]->attr[$attr]);
4562 } // this segment might get removed eventually
4563 elseif (isset($forbidden_attributes["$tag.$attr"])) {
4564 // $tag.$attr are not user supplied, so no worries!
4565 trigger_error("Error with $tag.$attr: tag.attr syntax not supported for HTML.ForbiddenAttributes; use tag@attr instead", E_USER_WARNING);
// warn about keys written with the unsupported *.attr syntax
4569 foreach ($forbidden_attributes as $key => $v) {
4570 if (strlen($key) < 2) continue;
4571 if ($key[0] != '*') continue;
4572 if ($key[1] == '.') {
4573 trigger_error("Error with $key: *.attr syntax not supported for HTML.ForbiddenAttributes; use attr instead", E_USER_WARNING);
4577 // setup injectors -----------------------------------------------------
4578 foreach ($this->info_injector as $i => $injector) {
// any result other than false from checkNeeded() removes the injector
4579 if ($injector->checkNeeded($config) !== false) {
4580 // remove injector that does not have it's required
4581 // elements/attributes present, and is thus not needed.
4582 unset($this->info_injector[$i]);
/**
 * Parses a TinyMCE-flavored Allowed Elements and Attributes list into
 * separate lists for processing. Format is element[attr1|attr2],element2...
 * Entries are separated by commas or line breaks; spaces and tabs are
 * stripped before parsing.
 * @warning Although it's largely drawn from TinyMCE's implementation,
 *      it is different, and you'll probably have to modify your lists
 * @param $list String list to parse
 * @return array($allowed_elements, $allowed_attributes); both are lookup
 *      arrays with true values, attribute keys have the form
 *      "element.attr"
 * @todo Give this its own class, probably static interface
 */
public function parseTinyMCEAllowedList($list) {

    $list = str_replace(array(' ', "\t"), '', $list);

    $elements = array();
    $attributes = array();

    $chunks = preg_split('/(,|[\n\r]+)/', $list);
    foreach ($chunks as $chunk) {
        if (empty($chunk)) continue;
        if (!strpos($chunk, '[')) {
            // no attribute list: the whole chunk names the element
            $element = $chunk;
            $attr = false;
        } else {
            list($element, $attr) = explode('[', $chunk);
        }
        // '*' only carries attributes, it is not an element itself
        if ($element !== '*') $elements[$element] = true;
        if (!$attr) continue;
        // remove the trailing ] -- but only when it is really there, so
        // that an unterminated list such as "a[href" does not lose the
        // last letter of its final attribute name
        if (substr($attr, -1) === ']') {
            $attr = substr($attr, 0, -1);
        }
        foreach (explode('|', $attr) as $key) {
            // "a[]" or "a[href|]" would otherwise register the bogus
            // attribute key "a."
            if ($key === '') continue;
            $attributes["$element.$key"] = true;
        }
    }

    return array($elements, $attributes);

}
4634 * Represents an XHTML 1.1 module, with information on elements, tags
4636 * @note Even though this is technically XHTML 1.1, it is also used for
4637 * regular HTML parsing. We are using modulization as a convenient
4638 * way to represent the internals of HTMLDefinition, and our
4639 * implementation is by no means conforming and does not directly
4640 * use the normative DTDs or XML schemas.
4641 * @note The public variables in a module should almost directly
4642 * correspond to the variables in HTMLPurifier_HTMLDefinition.
4643 * However, the prefix info carries no special meaning in these
4644 * objects (include it anyway if that's the correspondence though).
4645 * @todo Consider making some member functions protected
4648 class HTMLPurifier_HTMLModule
4651 // -- Overloadable ----------------------------------------------------
4654 * Short unique string identifier of the module
4659 * Informally, a list of elements this module changes. Not used in
4660 * any significant way.
4662 public $elements = array();
4665 * Associative array of element names to element definitions.
4666 * Some definitions may be incomplete, to be merged in later
4667 * with the full definition.
4669 public $info = array();
4672 * Associative array of content set names to content set additions.
4673 * This is commonly used to, say, add an A element to the Inline
4674 * content set. This corresponds to an internal variable $content_sets
4675 * and NOT info_content_sets member variable of HTMLDefinition.
4677 public $content_sets = array();
4680 * Associative array of attribute collection names to attribute
4681 * collection additions. More rarely used for adding attributes to
4682 * the global collections. Example is the StyleAttribute module adding
4683 * the style attribute to the Core. Corresponds to HTMLDefinition's
4684 * attr_collections->info, since the object's data is only info,
4685 * with extra behavior associated with it.
4687 public $attr_collections = array();
4690 * Associative array of deprecated tag name to HTMLPurifier_TagTransform
4692 public $info_tag_transform = array();
4695 * List of HTMLPurifier_AttrTransform to be performed before validation.
4697 public $info_attr_transform_pre = array();
4700 * List of HTMLPurifier_AttrTransform to be performed after validation.
4702 public $info_attr_transform_post = array();
4705 * List of HTMLPurifier_Injector to be performed during well-formedness fixing.
4706 * An injector will only be invoked if all of its prerequisites are met;
4707 * if an injector fails setup, there will be no error; it will simply be
4708 * silently disabled.
4710 public $info_injector = array();
4713 * Boolean flag that indicates whether or not getChildDef is implemented.
4714 * For optimization reasons: may save a call to a function. Be sure
4715 * to set it if you do implement getChildDef(), otherwise it will have
4718 public $defines_child_def = false;
4721 * Boolean flag whether or not this module is safe. If it is not safe, all
4722 * of its members are unsafe. Modules are safe by default (this might be
4723 * slightly dangerous, but it doesn't make much sense to force HTML Purifier,
4724 * which is based off of safe HTML, to explicitly say, "This is safe," even
4725 * though there are modules which are "unsafe")
4727 * @note Previously, safety could be applied at an element level granularity.
4728 * We've removed this ability, so in order to add "unsafe" elements
4729 * or attributes, a dedicated module with this property set to false
4732 public $safe = true;
/**
 * Hook for retrieving a proper HTMLPurifier_ChildDef subclass based on
 * the content_model and content_model_type member variables of an
 * HTMLPurifier_ElementDef. There is a similar function in
 * HTMLPurifier_HTMLDefinition. This base implementation always
 * returns false.
 * @param $def HTMLPurifier_ElementDef instance
 * @return HTMLPurifier_ChildDef subclass, or false
 */
public function getChildDef($def) {
    return false;
}
4744 // -- Convenience -----------------------------------------------------
/**
 * Convenience function that sets up a new element
 * @param $element Name of element to add
 * @param $type What content set should element be registered to?
 *              Set as false to skip this step.
 * @param $contents Allowed children in form of:
 *              "$content_model_type: $content_model"
 *              A non-string value is assigned directly as the child
 *              definition of the new element.
 * @param $attr_includes What attribute collections to register to
 *              element?
 * @param $attr What unique attributes does the element define?
 * @note See ElementDef for in-depth descriptions of these parameters.
 * @return Created element definition object, so you
 *         can set advanced parameters
 */
public function addElement($element, $type, $contents, $attr_includes = array(), $attr = array()) {
    $this->elements[] = $element;

    // split the contents string into model type and model
    $parsed = $this->parseContents($contents);
    $content_model_type = $parsed[0];
    $content_model = $parsed[1];

    // attribute inclusions end up in $attr[0]
    $this->mergeInAttrIncludes($attr, $attr_includes);

    // register with a content set unless the caller opted out
    if ($type) {
        $this->addElementToContentSet($element, $type);
    }

    $def = HTMLPurifier_ElementDef::create(
        $content_model, $content_model_type, $attr
    );
    $this->info[$element] = $def;

    // literal object $contents means direct child manipulation
    if (!is_string($contents)) {
        $this->info[$element]->child = $contents;
    }
    return $this->info[$element];
}
/**
 * Convenience function that creates a totally blank, non-standalone
 * element. If the module already defines the element, a notice is
 * raised and the existing definition is returned untouched.
 * @param $element Name of element to create
 * @return Created element
 */
public function addBlankElement($element) {
    if (isset($this->info[$element])) {
        trigger_error("Definition for $element already exists in module, cannot redefine");
        return $this->info[$element];
    }
    $this->elements[] = $element;
    $this->info[$element] = new HTMLPurifier_ElementDef();
    $this->info[$element]->standalone = false;
    return $this->info[$element];
}
/**
 * Convenience function that registers an element to a content set.
 * Members of a set are stored as a single string joined by ' | '.
 * @param $element Element to register
 * @param $type Name of the content set (warning: case sensitive)
 */
public function addElementToContentSet($element, $type) {
    if (isset($this->content_sets[$type])) {
        // set already has members: append behind a separator
        $this->content_sets[$type] .= ' | ' . $element;
    } else {
        // first member of the set
        $this->content_sets[$type] = '' . $element;
    }
}
4807 * Convenience function that transforms single-string contents
4808 * into separate content model and content model type
4809 * @param $contents Allowed children in form of:
4810 * "$content_model_type: $content_model"
4811 * @note If contents is an object, an array of two nulls will be
4812 * returned, and the callee needs to take the original $contents
4813 * and use it directly.
// Splits a "$content_model_type: $content_model" string into
// array($content_model_type, $content_model).
4815 public function parseContents($contents) {
// anything that is not a string is answered with two nulls
4816 if (!is_string($contents)) return array(null, null); // defer
4817 switch ($contents) {
4818 // check for shorthand content model forms
4820 return array('empty', '');
4822 return array('optional', 'Inline | #PCDATA');
4824 return array('optional', 'Flow | #PCDATA');
// general form: the part before the colon is the type (trimmed and
// lower-cased), the part after it is the model (trimmed)
4826 list($content_model_type, $content_model) = explode(':', $contents);
4827 $content_model_type = strtolower(trim($content_model_type));
4828 $content_model = trim($content_model);
4829 return array($content_model_type, $content_model);
/**
 * Convenience function that merges a list of attribute includes into
 * an attribute array. The includes are stored under key 0 of $attr.
 * @param $attr Reference to attr array to modify
 * @param $attr_includes Array of includes / string include to merge in
 */
public function mergeInAttrIncludes(&$attr, $attr_includes) {
    if (is_array($attr_includes)) {
        $attr[0] = $attr_includes;
        return;
    }
    // a single include is wrapped in an array, an empty value becomes
    // an empty list
    $attr[0] = empty($attr_includes) ? array() : array($attr_includes);
}
/**
 * Convenience function that generates a lookup table with boolean
 * true value.
 * @param $list List of values to turn into a lookup
 * @note When the first argument is a string, all arguments passed to
 *       the call are used as the list instead
 * @return Lookup array equivalent of list
 */
public function makeLookup($list) {
    if (is_string($list)) {
        $list = func_get_args();
    }
    $lookup = array();
    foreach ($list as $item) {
        // null entries are left out of the lookup
        if ($item !== null) {
            $lookup[$item] = true;
        }
    }
    return $lookup;
}
/**
 * Lazy load construction of the module after determining whether
 * or not it's needed, and also when a finalized configuration object
 * is available. This base implementation does nothing.
 * @param $config Instance of HTMLPurifier_Config
 */
public function setup($config) {
}
4878 class HTMLPurifier_HTMLModuleManager
4882 * Instance of HTMLPurifier_DoctypeRegistry
4887 * Instance of current doctype
4892 * Instance of HTMLPurifier_AttrTypes
4897 * Active instances of modules for the specified doctype are
4898 * indexed, by name, in this array.
4900 public $modules = array();
4903 * Array of recognized HTMLPurifier_Module instances, indexed by
4904 * module's class name. This array is usually lazy loaded, but a
4905 * user can overload a module by pre-emptively registering it.
4907 public $registeredModules = array();
4910 * List of extra modules that were added by the user using addModule().
4911 * These get unconditionally merged into the current doctype, whatever
4914 public $userModules = array();
4917 * Associative array of element name to list of modules that have
4918 * definitions for the element; this array is dynamically filled.
4920 public $elementLookup = array();
4922 /** List of prefixes we should use for registering small names */
4923 public $prefixes = array('HTMLPurifier_HTMLModule_');
4925 public $contentSets; /**< Instance of HTMLPurifier_ContentSets */
4926 public $attrCollections; /**< Instance of HTMLPurifier_AttrCollections */
4928 /** If set to true, unsafe elements and attributes will be allowed */
4929 public $trusted = false;
// Creates the attribute type and doctype registries, then registers the
// built-in doctypes together with their module lists.
4931 public function __construct() {
4933 // editable internal objects
4934 $this->attrTypes = new HTMLPurifier_AttrTypes();
4935 $this->doctypes = new HTMLPurifier_DoctypeRegistry();
4937 // setup basic modules
4939 'CommonAttributes', 'Text', 'Hypertext', 'List',
4940 'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
4943 'Scripting', 'Object', 'Forms',
4944 // Sorta legacy, but present in strict:
4947 $transitional = array('Legacy', 'Target');
4948 $xml = array('XMLCommonAttributes');
4949 $non_xml = array('NonXMLCommonAttributes');
4951 // setup basic doctypes
// each register() call passes a doctype name, a boolean, the merged
// module list, a list of Tidy modules and the DTD identifiers
4952 $this->doctypes->register(
4953 'HTML 4.01 Transitional', false,
4954 array_merge($common, $transitional, $non_xml),
4955 array('Tidy_Transitional', 'Tidy_Proprietary'),
4957 '-//W3C//DTD HTML 4.01 Transitional//EN',
4958 'http://www.w3.org/TR/html4/loose.dtd'
4961 $this->doctypes->register(
4962 'HTML 4.01 Strict', false,
4963 array_merge($common, $non_xml),
4964 array('Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
4966 '-//W3C//DTD HTML 4.01//EN',
4967 'http://www.w3.org/TR/html4/strict.dtd'
4970 $this->doctypes->register(
4971 'XHTML 1.0 Transitional', true,
4972 array_merge($common, $transitional, $xml, $non_xml),
4973 array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Name'),
4975 '-//W3C//DTD XHTML 1.0 Transitional//EN',
4976 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
4979 $this->doctypes->register(
4980 'XHTML 1.0 Strict', true,
4981 array_merge($common, $xml, $non_xml),
// NOTE(review): 'Tidy_Strict' is listed twice in the next array (and
// again in the last register() call below) -- presumably redundant, confirm
4982 array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
4984 '-//W3C//DTD XHTML 1.0 Strict//EN',
4985 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
4988 $this->doctypes->register(
4990 array_merge($common, $xml, array('Ruby')),
4991 array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict', 'Tidy_Name'), // Tidy_XHTML1_1
4993 '-//W3C//DTD XHTML 1.1//EN',
4994 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
5000 * Registers a module to the recognized module list, useful for
5001 * overloading pre-existing modules.
5002 * @param $module Mixed: string module name, with or without
5003 * HTMLPurifier_HTMLModule prefix, or instance of
5004 * subclass of HTMLPurifier_HTMLModule.
5005 * @param $overload Boolean whether or not to overload previous modules.
5006 * If this is not set, and you do overload a module,
5007 * HTML Purifier will complain with a warning.
5008 * @note This function will not call autoload, you must instantiate
5009 * (and thus invoke) autoload outside the method.
5010 * @note If a string is passed as a module name, different variants
5011 * will be tested in this order:
5012 * - Check for HTMLPurifier_HTMLModule_$name
5013 * - Check all prefixes with $name in order they were added
5014 * - Check for literal object name
5015 * - Throw fatal error
5016 * If your object name collides with an internal class, specify
5017 * your module manually. All modules must have been included
5018 * externally: registerModule will not perform inclusions for you!
// Adds $module to $this->registeredModules, indexed by the module's
// ->name. A string is first resolved to a class and instantiated.
5020 public function registerModule($module, $overload = false) {
5021 if (is_string($module)) {
5022 // attempt to load the module
5023 $original_module = $module;
// try the name behind every known prefix, in the order of $this->prefixes
5025 foreach ($this->prefixes as $prefix) {
5026 $module = $prefix . $original_module;
5027 if (class_exists($module)) {
// the unprefixed string is tried as a literal class name as well
5033 $module = $original_module;
5034 if (!class_exists($module)) {
5035 trigger_error($original_module . ' module does not exist',
5040 $module = new $module();
// a module with an empty name cannot be indexed
5042 if (empty($module->name)) {
5043 trigger_error('Module instance of ' . get_class($module) . ' must have name');
// re-using an already registered name without $overload raises a warning
5046 if (!$overload && isset($this->registeredModules[$module->name])) {
5047 trigger_error('Overloading ' . $module->name . ' without explicit overload parameter', E_USER_WARNING);
5049 $this->registeredModules[$module->name] = $module;
/**
 * Adds a module to the current doctype by first registering it,
 * and then tacking it on to the list of user modules.
 * @param $module Module name, or module instance (recorded by its name)
 */
public function addModule($module) {
    $this->registerModule($module);
    if (is_object($module)) {
        $this->userModules[] = $module->name;
    } else {
        $this->userModules[] = $module;
    }
}
/**
 * Adds a class prefix that registerModule() will use to resolve a
 * string name to a concrete class. The prefix is appended to the end
 * of the list of known prefixes.
 * @param $prefix Class name prefix
 */
public function addPrefix($prefix) {
    $this->prefixes[] = $prefix;
}
5071 * Performs processing on modules, after being called you may
5072 * use getElement() and getElements()
5073 * @param $config Instance of HTMLPurifier_Config
// Builds the active module collection for the doctype selected by
// $config and prepares the lookup structures derived from it.
5075 public function setup($config) {
5077 $this->trusted = $config->get('HTML.Trusted');
5080 $this->doctype = $this->doctypes->make($config);
5081 $modules = $this->doctype->modules;
5083 // take out the default modules that aren't allowed
5084 $lookup = $config->get('HTML.AllowedModules');
5085 $special_cases = $config->get('HTML.CoreModules');
// filtering only happens when HTML.AllowedModules is an array; names
// found in HTML.CoreModules are never taken out
5087 if (is_array($lookup)) {
5088 foreach ($modules as $k => $m) {
5089 if (isset($special_cases[$m])) continue;
5090 if (!isset($lookup[$m])) unset($modules[$k]);
// optional modules switched on by individual config directives
5095 if ($config->get('HTML.Proprietary')) {
5096 $modules[] = 'Proprietary';
5098 if ($config->get('HTML.SafeObject')) {
5099 $modules[] = 'SafeObject';
5101 if ($config->get('HTML.SafeEmbed')) {
5102 $modules[] = 'SafeEmbed';
5104 if ($config->get('HTML.Nofollow')) {
5105 $modules[] = 'Nofollow';
5108 // merge in custom modules
5109 $modules = array_merge($modules, $this->userModules);
// activate each module, then run its own setup()
5111 foreach ($modules as $module) {
5112 $this->processModule($module);
5113 $this->modules[$module]->setup($config);
5116 foreach ($this->doctype->tidyModules as $module) {
5117 $this->processModule($module);
5118 $this->modules[$module]->setup($config);
5121 // prepare any injectors
5122 foreach ($this->modules as $module) {
5124 foreach ($module->info_injector as $i => $injector) {
// entries that are not objects are treated as injector short names
5125 if (!is_object($injector)) {
5126 $class = "HTMLPurifier_Injector_$injector";
5127 $injector = new $class;
// injectors are re-indexed by their ->name
5129 $n[$injector->name] = $injector;
5131 $module->info_injector = $n;
5134 // setup lookup table based on all valid modules
// elementLookup maps an element name to the names of all modules defining it
5135 foreach ($this->modules as $module) {
5136 foreach ($module->info as $name => $def) {
5137 if (!isset($this->elementLookup[$name])) {
5138 $this->elementLookup[$name] = array();
5140 $this->elementLookup[$name][] = $module->name;
5144 // note the different choice
5145 $this->contentSets = new HTMLPurifier_ContentSets(
5146 // content set assembly deals with all possible modules,
5147 // not just ones deemed to be "safe"
5150 $this->attrCollections = new HTMLPurifier_AttrCollections(
5152 // there is no way to directly disable a global attribute,
5153 // but using AllowedAttributes or simply not including
5154 // the module in your custom doctype should be sufficient
/**
 * Takes a module and adds it to the active module collection,
 * registering it if necessary.
 * @param $module Module name as used in the registered module list, or
 *                a module instance. An instance is always (re-)registered
 *                and then activated under its name.
 */
public function processModule($module) {
    if (is_object($module)) {
        // An object cannot serve as an array key, so register the
        // instance and carry on with the name it is stored under.
        $this->registerModule($module);
        $module = $module->name;
    } elseif (!isset($this->registeredModules[$module])) {
        $this->registerModule($module);
    }
    $this->modules[$module] = $this->registeredModules[$module];
}
/**
 * Retrieves merged element definitions.
 * Modules flagged as unsafe are passed over unless the manager is in
 * trusted mode.
 * @return Array of HTMLPurifier_ElementDef
 */
public function getElements() {

    $elements = array();
    foreach ($this->modules as $module) {
        if ($this->trusted || $module->safe) {
            foreach ($module->info as $name => $v) {
                // the first module to mention a name triggers the merge
                if (!isset($elements[$name])) {
                    $elements[$name] = $this->getElement($name);
                }
            }
        }
    }

    // remove dud elements, this happens when an element that
    // appeared to be safe actually wasn't
    foreach (array_keys($elements, false, true) as $dud) {
        unset($elements[$dud]);
    }

    return $elements;

}
5196 * Retrieves a single merged element definition
5197 * @param $name Name of element
5198 * @param $trusted Boolean trusted overriding parameter: set to true
5199 * if you want the full version of an element
5200 * @return Merged HTMLPurifier_ElementDef
5201 * @note You may notice that modules are getting iterated over twice (once
5202 * in getElements() and once here). This
// Builds the merged definition of element $name out of every active
// module that is listed for it in $this->elementLookup.
5205 public function getElement($name, $trusted = null) {
5207 if (!isset($this->elementLookup[$name])) {
5211 // setup global state variables
// a null $trusted means: use the manager-wide setting
5213 if ($trusted === null) $trusted = $this->trusted;
5215 // iterate through each module that has registered itself to this
5217 foreach($this->elementLookup[$name] as $module_name) {
5219 $module = $this->modules[$module_name];
5221 // refuse to create/merge from a module that is deemed unsafe--
5222 // pretend the module doesn't exist--when trusted mode is not on.
5223 if (!$trusted && !$module->safe) {
5227 // clone is used because, ideally speaking, the original
5228 // definition should not be modified. Usually, this will
5229 // make no difference, but for consistency's sake
5230 $new_def = clone $module->info[$name];
5232 if (!$def && $new_def->standalone) {
5235 // This will occur even if $new_def is standalone. In practice,
5236 // this will usually result in a full replacement.
5237 $def->mergeIn($new_def);
5240 // non-standalone definitions that don't have a standalone
5241 // to merge into could be deferred to the end
5245 // attribute value expansions
5246 $this->attrCollections->performInclusions($def->attr);
5247 $this->attrCollections->expandIdentifiers($def->attr, $this->attrTypes);
5249 // descendants_are_inline, for ChildDef_Chameleon
// applies when the content model string mentions 'Inline'; del and ins
// are exempt
5250 if (is_string($def->content_model) &&
5251 strpos($def->content_model, 'Inline') !== false) {
5252 if ($name != 'del' && $name != 'ins') {
5253 // this is for you, ins/del
5254 $def->descendants_are_inline = true;
5258 $this->contentSets->generateChildDef($def, $module);
5261 // This can occur if there is a blank definition, but no base to
5263 if (!$def) return false;
5265 // add information on required attributes
// the names of all attributes flagged ->required are collected
5266 foreach ($def->attr as $attr_name => $attr_def) {
5267 if ($attr_def->required) {
5268 $def->required_attr[] = $attr_name;
/**
 * Component of HTMLPurifier_AttrContext that accumulates IDs to prevent dupes
 * @note In Slashdot-speak, dupe means duplicate.
 * @note The default constructor does not accept $config or $context objects:
 *       users must use the static build() factory method to perform
 *       initialization.
 */
class HTMLPurifier_IDAccumulator
{

    /**
     * Lookup table of IDs we've accumulated.
     * Keys are the IDs, values are always true.
     */
    public $ids = array();

    /**
     * Builds an IDAccumulator, also initializing the default blacklist
     * from the Attr.IDBlacklist directive.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context (not used here)
     * @return Fully initialized HTMLPurifier_IDAccumulator
     */
    public static function build($config, $context) {
        $id_accumulator = new HTMLPurifier_IDAccumulator();
        $id_accumulator->load($config->get('Attr.IDBlacklist'));
        return $id_accumulator;
    }

    /**
     * Add an ID to the lookup table.
     * @param $id ID to be added.
     * @return Bool status, true if success, false if there's a dupe
     */
    public function add($id) {
        if (isset($this->ids[$id])) return false;
        return $this->ids[$id] = true;
    }

    /**
     * Load a list of IDs into the lookup table
     * @param $array_of_ids Array of IDs to load. A value that cannot be
     *        iterated (for example null) is ignored.
     * @note This function doesn't care about duplicates
     */
    public function load($array_of_ids) {
        // foreach over a non-iterable raises a warning, so bail out
        // quietly: there is simply nothing to load
        if (!is_array($array_of_ids) && !($array_of_ids instanceof Traversable)) {
            return;
        }
        foreach ($array_of_ids as $id) {
            $this->ids[$id] = true;
        }
    }

}
5337 * Injects tokens into the document while parsing for well-formedness.
5338 * This enables "formatter-like" functionality such as auto-paragraphing,
5339 * smiley-ification and linkification to take place.
5341 * A note on how handlers create changes; this is done by assigning a new
5342 * value to the $token reference. These values can take a variety of forms and
5343 * are best described in HTMLPurifier_Strategy_MakeWellFormed->processToken()
5346 * @todo Allow injectors to request a re-run on their output. This
5347 * would help if an operation is recursive.
5349 abstract class HTMLPurifier_Injector
5353 * Advisory name of injector, this is for friendly error messages
5358 * Instance of HTMLPurifier_HTMLDefinition
5360 protected $htmlDefinition;
5363 * Reference to CurrentNesting variable in Context. This is an array
5364 * list of tokens that we are currently "inside"
5366 protected $currentNesting;
5369 * Reference to InputTokens variable in Context. This is an array
5370 * list of the input tokens that are being processed.
5372 protected $inputTokens;
5375 * Reference to InputIndex variable in Context. This is an integer
5376 * array index for $this->inputTokens that indicates what token
5377 * is currently being processed.
5379 protected $inputIndex;
5382 * Array of elements and attributes this injector creates and therefore
5383 * need to be allowed by the definition. Takes form of
5384 * array('element' => array('attr', 'attr2'), 'element2')
5386 public $needed = array();
5389 * Index of inputTokens to rewind to.
5391 protected $rewind = false;
/**
 * Rewind to a spot to re-perform processing. This is useful if you
 * deleted a node, and now need to see if this change affected any
 * earlier nodes. Rewinding does not affect other injectors, and can
 * result in infinite loops if not used carefully.
 * The index is only recorded here; it is handed out by getRewind().
 * @param $index Index of inputTokens to rewind to
 * @warning HTML Purifier will prevent you from fast-forwarding with this
 *          function.
 */
public function rewind($index) {
    $this->rewind = $index;
}
/**
 * Retrieves rewind, and then unsets it.
 * @return The recorded rewind index; the stored value is reset to
 *         false before returning
 */
public function getRewind() {
    $index = $this->rewind;
    $this->rewind = false;
    return $index;
}
/**
 * Prepares the injector by giving it the config and context objects:
 * this allows references to important variables to be made within
 * the injector. This function also checks if the HTML environment
 * will work with the Injector (see checkNeeded()).
 * @param $config Instance of HTMLPurifier_Config
 * @param $context Instance of HTMLPurifier_Context
 * @return Boolean false if success, string of missing needed element/attribute if failure
 */
public function prepare($config, $context) {
    $this->htmlDefinition = $config->getHTMLDefinition();

    // Even though this might fail, some unit tests ignore this and
    // still test checkNeeded, so be careful. Maybe get rid of that
    // dependency.
    $missing = $this->checkNeeded($config);
    if ($missing !== false) {
        return $missing;
    }

    // bound by reference to the corresponding context variables
    $this->currentNesting =& $context->get('CurrentNesting');
    $this->inputTokens =& $context->get('InputTokens');
    $this->inputIndex =& $context->get('InputIndex');
    return false;
}
/**
 * This function checks if the HTML environment
 * will work with the Injector: if p tags are not allowed, the
 * Auto-Paragraphing injector should not be enabled.
 * Entries of $this->needed with an integer key name an element only;
 * entries with a string key name an element plus a list of attributes.
 * @param $config Instance of HTMLPurifier_Config
 * @return Boolean false if success, string of missing needed element/attribute if failure
 */
public function checkNeeded($config) {
    $def = $config->getHTMLDefinition();
    foreach ($this->needed as $element => $attributes) {
        if (is_int($element)) {
            // list-style entry: the value is the element name
            $element = $attributes;
        }
        if (!isset($def->info[$element])) {
            return $element;
        }
        if (is_array($attributes)) {
            foreach ($attributes as $name) {
                if (!isset($def->info[$element]->attr[$name])) {
                    return "$element.$name";
                }
            }
        }
    }
    return false;
}
/**
 * Tests if the context node allows a certain element
 * @param $name Name of element to test for
 * @return True if element is allowed, false if it is not
 */
public function allowsElement($name) {
    if (empty($this->currentNesting)) {
        // not inside any element: the configured parent decides
        $parent = $this->htmlDefinition->info_parent_def;
    } else {
        // look at the innermost open element by popping it off the
        // nesting stack and pushing it straight back
        $parent_token = array_pop($this->currentNesting);
        $this->currentNesting[] = $parent_token;
        $parent = $this->htmlDefinition->info[$parent_token->name];
    }

    $permitted = isset($parent->child->elements[$name]) && !isset($parent->excludes[$name]);
    if (!$permitted) {
        return false;
    }

    // check for exclusion by any of the remaining ancestors
    $depth = count($this->currentNesting) - 2;
    while ($depth >= 0) {
        $node = $this->currentNesting[$depth];
        $def = $this->htmlDefinition->info[$node->name];
        if (isset($def->excludes[$name])) {
            return false;
        }
        $depth--;
    }
    return true;
}
5483 * Iterator function, which starts with the next token and continues until
5484 * you reach the end of the input tokens.
5485 * @warning Please prevent previous references from interfering with this
5486 * functions by setting $i = null beforehand!
5487 * @param &$i Current integer index variable for inputTokens
5488 * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
// Iterator step over $this->inputTokens; both arguments are written
// through their references.
5490 protected function forward(&$i, &$current) {
// a null index starts the iteration right after the current input index
5491 if ($i === null) $i = $this->inputIndex + 1;
// false is returned once no token exists at $i
5493 if (!isset($this->inputTokens[$i])) return false;
5494 $current = $this->inputTokens[$i];
5499 * Similar to _forward, but accepts a third parameter $nesting (which
5500 * should be initialized at 0) and stops when we hit the end tag
5501 * for the node $this->inputIndex starts in.
// Steps forward through forward() while tracking start/end tag depth in the
// by-reference counter $nesting.
5503 protected function forwardUntilEndToken(&$i, &$current, &$nesting) {
5504 $result = $this->forward($i, $current);
5505 if (!$result) return false;
// A null counter passed by the caller is treated as depth 0.
5506 if ($nesting === null) $nesting = 0;
5507 if ($current instanceof HTMLPurifier_Token_Start) $nesting++;
5508 elseif ($current instanceof HTMLPurifier_Token_End) {
// An end tag seen at depth 0 (or below) terminates the iteration.
5509 if ($nesting <= 0) return false;
5516 * Iterator function, starts with the previous token and continues until
5517 * you reach the beginning of input tokens.
5518 * @warning Please prevent previous references from interfering with this
5519 * function by setting $i = null beforehand!
5520 * @param &$i Current integer index variable for inputTokens
5521 * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
// Iterator step towards the beginning of $this->inputTokens. Both arguments
// are references that this method writes to.
5523 protected function backward(&$i, &$current) {
// First call ($i === null): start at the token right before $this->inputIndex.
5524 if ($i === null) $i = $this->inputIndex - 1;
// A negative index means the start of the input has been passed.
5526 if ($i < 0) return false;
5527 $current = $this->inputTokens[$i];
5532 * Initializes the iterator at the current position. Use in a do {} while;
5533 * loop to force the forward() and backward() functions to start at the
5535 * @warning Please prevent previous references from interfering with this
5536 * function by setting $i = null beforehand!
5537 * @param &$i Current integer index variable for inputTokens
5538 * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
// Positions the iterator on the current token: a null $i becomes
// $this->inputIndex, and $current receives the token stored at index $i.
5540 protected function current(&$i, &$current) {
5541 if ($i === null) $i = $this->inputIndex;
5542 $current = $this->inputTokens[$i];
5546 * Handler that is called when a text token is processed
// Empty body: this implementation does nothing with the text token, which
// is received by reference.
5548 public function handleText(&$token) {}
5551 * Handler that is called when a start or empty token is processed
// Empty body: this implementation does nothing with the start/empty token,
// which is received by reference.
5553 public function handleElement(&$token) {}
5556 * Handler that is called when an end token is processed
// Forwards the end token (received by reference) to notifyEnd().
5558 public function handleEnd(&$token) {
5559 $this->notifyEnd($token);
5563 * Notifier that is called when an end token is processed
5564 * @note This differs from handlers in that the token is read-only
// Empty body; unlike the handle*() methods the token is passed by value.
5567 public function notifyEnd($token) {}
5577 * Represents a language and defines localizable string formatting and
5578 * other functions, as well as the localized messages for HTML Purifier.
5580 class HTMLPurifier_Language
5584 * ISO 639 language code of language. Prefers shortest possible version
5586 public $code = 'en';
5589 * Fallback language code
5591 public $fallback = false;
5594 * Array of localizable messages
5596 public $messages = array();
5599 * Array of localizable error codes
5601 public $errorNames = array();
5604 * True if no message file was found for this language, so English
5605 * is being used instead. Check this if you'd like to notify the
5606 * user that they've used a non-supported language.
5608 public $error = false;
5611 * Has the language object been loaded yet?
5612 * @todo Make it private, fix usage in HTMLPurifier_LanguageTest
5614 public $_loaded = false;
5617 * Instances of HTMLPurifier_Config and HTMLPurifier_Context
5619 protected $config, $context;
// Stores the given configuration and context objects on the instance.
5621 public function __construct($config, $context) {
5622 $this->config = $config;
5623 $this->context = $context;
5627 * Loads language object with necessary info from factory cache
5628 * @note This is a lazy loader
// Populates this object from the language factory's cache; does nothing
// when $this->_loaded is already set.
5630 public function load() {
5631 if ($this->_loaded) return;
5632 $factory = HTMLPurifier_LanguageFactory::instance();
5633 $factory->loadLanguage($this->code);
// Copy every property named in $factory->keys from the cache entry of this
// language code onto the object.
5634 foreach ($factory->keys as $key) {
5635 $this->$key = $factory->cache[$this->code][$key];
5637 $this->_loaded = true;
5641 * Retrieves a localised message.
5642 * @param $key string identifier of message
5643 * @return string localised message
// Looks up $key in $this->messages, loading the language data first if
// necessary. An unknown key is echoed back wrapped in square brackets.
5645 public function getMessage($key) {
5646 if (!$this->_loaded) $this->load();
5647 if (!isset($this->messages[$key])) return "[$key]";
5648 return $this->messages[$key];
5652 * Retrieves a localised error name.
5653 * @param $int integer error number, corresponding to PHP's error
5655 * @return string localised message
// Looks up $int in $this->errorNames, loading the language data first if
// necessary. An unknown number yields the placeholder "[Error: <number>]".
5657 public function getErrorName($int) {
5658 if (!$this->_loaded) $this->load();
5659 if (!isset($this->errorNames[$int])) return "[Error: $int]";
5660 return $this->errorNames[$int];
5664 * Converts an array list into a string readable representation
// Builds a human-readable list from $array using two localised separators.
5666 public function listify($array) {
// Separators are themselves localisable messages.
5667 $sep = $this->getMessage('Item separator');
5668 $sep_last = $this->getMessage('Item separator last');
// Index-based walk; the count is evaluated once.
5670 for ($i = 0, $c = count($array); $i < $c; $i++) {
// This branch covers items that still have a successor ($i + 1 < $c).
5672 } elseif ($i + 1 < $c) {
5683 * Formats a localised message with passed parameters
5684 * @param $key string identifier of message
5685 * @param $args Parameters to substitute in
5686 * @return string localised message
5687 * @todo Implement conditionals? Right now, some messages make
5688 * reference to line numbers, but those aren't always available
// Fetches the raw message for $key and substitutes "$<index>..." style
// placeholders built from $args via strtr(). An unknown key is returned as
// "[key]".
5690 public function formatMessage($key, $args = array()) {
5691 if (!$this->_loaded) $this->load();
5692 if (!isset($this->messages[$key])) return "[$key]";
5693 $raw = $this->messages[$key];
// Each argument contributes one or more placeholder => replacement pairs.
5696 foreach ($args as $i => $value) {
5697 if (is_object($value)) {
5698 if ($value instanceof HTMLPurifier_Token) {
5699 // factor this out some time
// The generator is fetched from the context only when first needed.
5700 if (!$generator) $generator = $this->context->get('Generator');
5701 if (isset($value->name)) $subst['$'.$i.'.Name'] = $value->name;
5702 if (isset($value->data)) $subst['$'.$i.'.Data'] = $value->data;
// Chained assignment: .Compact starts out equal to .Serialized.
5703 $subst['$'.$i.'.Compact'] =
5704 $subst['$'.$i.'.Serialized'] = $generator->generateFromToken($value);
5705 // a more complex algorithm for compact representation
5706 // could be introduced for all types of tokens. This
5707 // may need to be factored out into a dedicated class
// For tokens with attributes, .Compact is regenerated from a clone whose
// attribute array has been emptied.
5708 if (!empty($value->attr)) {
5709 $stripped_token = clone $value;
5710 $stripped_token->attr = array();
5711 $subst['$'.$i.'.Compact'] = $generator->generateFromToken($stripped_token);
// A falsy line number is rendered as the literal string 'unknown'.
5713 $subst['$'.$i.'.Line'] = $value->line ? $value->line : 'unknown';
5716 } elseif (is_array($value)) {
5717 $keys = array_keys($value);
// True only when the keys are exactly 0..n-1 in order, i.e. a plain list.
5718 if (array_keys($keys) === $keys) {
5720 $subst['$'.$i] = $this->listify($value);
5722 // associative array
5723 // no $i implementation yet, sorry
5724 $subst['$'.$i.'.Keys'] = $this->listify($keys);
5725 $subst['$'.$i.'.Values'] = $this->listify(array_values($value));
// Any other value is substituted as is.
5729 $subst['$' . $i] = $value;
5731 return strtr($raw, $subst);
5741 * Class responsible for generating HTMLPurifier_Language objects, managing
5742 * caching and fallbacks.
5743 * @note Thanks to MediaWiki for the general logic, although this version
5744 * has been entirely rewritten
5745 * @todo Serialized cache for languages
5747 class HTMLPurifier_LanguageFactory
5751 * Cache of language code information used to load HTMLPurifier_Language objects
5752 * Structure is: $factory->cache[$language_code][$key] = $value
5758 * Valid keys in the HTMLPurifier_Language object. Designates which
5759 * variables to slurp out of a message file.
5762 public $keys = array('fallback', 'messages', 'errorNames');
5765 * Instance of HTMLPurifier_AttrDef_Lang to validate language codes
5766 * @value object HTMLPurifier_AttrDef_Lang
5768 protected $validator;
5771 * Cached copy of dirname(__FILE__), directory of current file without
5773 * @value string filename
5778 * Keys whose contents are a hash map and can be merged
5779 * @value array lookup
5781 protected $mergeable_keys_map = array('messages' => true, 'errorNames' => true);
5784 * Keys whose contents are a list and can be merged
5785 * @value array lookup
5787 protected $mergeable_keys_list = array();
/**
 * Retrieve sole instance of the factory.
 *
 * The instance is kept in a function-static variable. A new factory is
 * created, and setup() called on it, whenever no instance is held.
 *
 * @param $prototype Optional prototype to overload sole instance with,
 *                   or bool true to reset to default factory.
 * @return The factory instance currently held.
 */
public static function instance($prototype = null) {
    static $instance = null;
    if ($prototype === true) {
        // Explicit reset request. The former `$prototype == true` test lived
        // in an elseif that was only reached when $prototype was null, so it
        // could never fire and boolean true was stored as the "instance".
        $instance = null;
    } elseif ($prototype !== null) {
        $instance = $prototype;
    }
    if ($instance === null) {
        $instance = new HTMLPurifier_LanguageFactory();
        $instance->setup();
    }
    return $instance;
}
5806 * Sets up the singleton, much like a constructor
5807 * @note Prevents people from getting this outside of the singleton
// Initialises the language-code validator and the base directory
// (HTMLPURIFIER_PREFIX . '/HTMLPurifier') used to locate language files.
5809 public function setup() {
5810 $this->validator = new HTMLPurifier_AttrDef_Lang();
5811 $this->dir = HTMLPURIFIER_PREFIX . '/HTMLPurifier';
5815 * Creates a language object, handles class fallbacks
5816 * @param $config Instance of HTMLPurifier_Config
5817 * @param $context Instance of HTMLPurifier_Context
5818 * @param $code Code to override configuration with. Private parameter.
// Builds the language object for the configured (or explicitly passed)
// language code, recursing into the fallback language when no dedicated
// class is available for the requested code.
5820 public function create($config, $context, $code = false) {
5822 // validate language code
// No explicit code: read %Core.Language from the configuration.
5823 if ($code === false) {
5824 $code = $this->validator->validate(
5825 $config->get('Core.Language'), $config, $context
5828 $code = $this->validator->validate($code, $config, $context);
5830 if ($code === false) $code = 'en'; // malformed code becomes English
5832 $pcode = str_replace('-', '_', $code); // make valid PHP classname
5833 static $depth = 0; // recursion protection
// English is served by the base HTMLPurifier_Language class.
5835 if ($code == 'en') {
5836 $lang = new HTMLPurifier_Language($config, $context);
5838 $class = 'HTMLPurifier_Language_' . $pcode;
5839 $file = $this->dir . '/Language/classes/' . $code . '.php';
// class_exists() is called with autoloading disabled (second argument false).
5840 if (file_exists($file) || class_exists($class, false)) {
5841 $lang = new $class($config, $context);
// No dedicated class: use the language's declared fallback, or English
// when it declares none.
5844 $raw_fallback = $this->getFallbackFor($code);
5845 $fallback = $raw_fallback ? $raw_fallback : 'en';
5847 $lang = $this->create($config, $context, $fallback);
// Flag the object when English was substituted for lack of any fallback.
5848 if (!$raw_fallback) {
5849 $lang->error = true;
// The object always reports the code that was requested.
5855 $lang->code = $code;
5862 * Returns the fallback language for language
5863 * @note Loads the original language into cache
5864 * @param $code string language code
// Ensures $code is loaded into the cache, then returns its 'fallback' entry.
5866 public function getFallbackFor($code) {
5867 $this->loadLanguage($code);
5868 return $this->cache[$code]['fallback'];
/**
 * Loads language into the cache, handles message file and fallbacks.
 *
 * When a message file for $code exists it is included and the variables
 * named in $this->keys are collected from it. The result is merged with the
 * recursively loaded fallback language and stored in $this->cache[$code].
 * Calling this for an already cached code does nothing.
 *
 * @param $code string language code
 */
public function loadLanguage($code) {
    // Codes whose fallback chain is being resolved right now. The variable
    // is static and therefore shared by all calls, so each entry is removed
    // again once its fallback has been loaded; otherwise a later load of
    // the same code into a fresh cache would be reported as circular.
    static $languages_seen = array();

    // abort if we've already loaded it
    if (isset($this->cache[$code])) return;

    // generate filename
    $filename = $this->dir . '/Language/messages/' . $code . '.php';

    // default fallback : may be overwritten by the ensuing include
    $fallback = ($code != 'en') ? 'en' : false;

    // load primary localisation
    if (!file_exists($filename)) {
        // skip the include: will rely solely on fallback
        $filename = $this->dir . '/Language/messages/en.php';
        $cache = array();
    } else {
        // the message file defines the variables listed in $this->keys
        include $filename;
        $cache = compact($this->keys);
    }

    // load fallback localisation
    if (!empty($fallback)) {

        // infinite recursion guard
        if (isset($languages_seen[$code])) {
            trigger_error('Circular fallback reference in language ' .
                $code, E_USER_ERROR);
            // only reached when an error handler lets execution continue
            $fallback = 'en';
        }
        // The guard above reads $languages_seen; this line used to write
        // to a differently named variable, so the guard never triggered.
        $languages_seen[$code] = true;

        // load the fallback recursively
        $this->loadLanguage($fallback);
        $fallback_cache = $this->cache[$fallback];

        // resolution of this code's fallback chain is finished
        unset($languages_seen[$code]);

        // merge fallback with current language
        foreach ($this->keys as $key) {
            if (isset($cache[$key]) && isset($fallback_cache[$key])) {
                if (isset($this->mergeable_keys_map[$key])) {
                    // hash maps: entries of the current language win
                    $cache[$key] = $cache[$key] + $fallback_cache[$key];
                } elseif (isset($this->mergeable_keys_list[$key])) {
                    $cache[$key] = array_merge($fallback_cache[$key], $cache[$key]);
                }
            } else {
                $cache[$key] = $fallback_cache[$key];
            }
        }

    }

    // save to cache for later retrieval
    $this->cache[$code] = $cache;
}
5940 * Represents a measurable length, with a string numeric magnitude
5941 * and a unit. This object is immutable.
5943 class HTMLPurifier_Length
5947 * String numeric magnitude.
5952 * String unit. False is permitted if $n = 0.
5957 * Whether or not this length is valid. Null if not calculated yet.
5962 * Lookup array of units recognized by CSS 2.1
5964 protected static $allowedUnits = array(
5965 'em' => true, 'ex' => true, 'px' => true, 'in' => true,
5966 'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true
5970 * @param number $n Magnitude
5971 * @param string $u Unit
// Stores the magnitude as a string; the unit is stored as a string unless
// it is boolean false, which is kept as false.
5973 public function __construct($n = '0', $u = false) {
5974 $this->n = (string) $n;
5975 $this->unit = $u !== false ? (string) $u : false;
5979 * @param string $s Unit string, like '2em' or '3.4in'
5980 * @warning Does not perform validation.
// Splits a string such as '2em' into magnitude and unit and wraps them in a
// new HTMLPurifier_Length; an existing Length instance is returned as is.
5982 static public function make($s) {
5983 if ($s instanceof HTMLPurifier_Length) return $s;
// The leading run of digits, '.', '+' and '-' is the magnitude; whatever
// follows is the unit.
5984 $n_length = strspn($s, '1234567890.+-');
5985 $n = substr($s, 0, $n_length);
5986 $unit = substr($s, $n_length);
// An empty unit is represented as boolean false.
5987 if ($unit === '') $unit = false;
5988 return new HTMLPurifier_Length($n, $unit);
/**
 * Validates the number and unit.
 *
 * Normalises a signed zero to '0', lower-cases the unit and replaces the
 * magnitude with the value returned by the CSS number validator.
 *
 * @return bool True when magnitude and unit are acceptable, false otherwise.
 */
protected function validate() {
    // Special case: signed zero, and a unit-less zero is always valid
    if ($this->n === '+0' || $this->n === '-0') $this->n = '0';
    if ($this->n === '0' && $this->unit === false) return true;
    // A missing unit is stored as boolean false. Cast it before handing it
    // to ctype_lower()/strtolower(): non-string ctype arguments are
    // deprecated as of PHP 8.1. As before, a missing unit ends up as ''.
    $unit = (string) $this->unit;
    if (!ctype_lower($unit)) $unit = strtolower($unit);
    $this->unit = $unit;
    if (!isset(HTMLPurifier_Length::$allowedUnits[$this->unit])) return false;
    // Hack: reuse the CSS number validator for the magnitude
    $def = new HTMLPurifier_AttrDef_CSS_Number();
    $result = $def->validate($this->n, false, false);
    if ($result === false) return false;
    $this->n = $result;
    return true;
}
6009 * Returns string representation of number.
// Returns magnitude and unit concatenated, or false when the length does
// not pass isValid().
6011 public function toString() {
6012 if (!$this->isValid()) return false;
6013 return $this->n . $this->unit;
6017 * Retrieves string numeric magnitude.
// Plain accessor for $this->n.
6019 public function getN() {return $this->n;}
6022 * Retrieves string unit.
// Plain accessor for $this->unit.
6024 public function getUnit() {return $this->unit;}
6027 * Returns true if this length unit is valid.
// Runs validate() the first time it is called (while $this->isValid is
// still null) and returns the remembered result afterwards.
6029 public function isValid() {
6030 if ($this->isValid === null) $this->isValid = $this->validate();
6031 return $this->isValid;
6035 * Compares two lengths, and returns 1 if greater, -1 if less and 0 if equal.
6036 * @warning If both values are too large or small, this calculation will
// Compares this length with $l, converting $l to this length's unit first
// when the units differ. Returns false when $l is false or the conversion
// fails.
6039 public function compareTo($l) {
6040 if ($l === false) return false;
6041 if ($l->unit !== $this->unit) {
6042 $converter = new HTMLPurifier_UnitConverter();
6043 $l = $converter->convert($l, $this->unit);
6044 if ($l === false) return false;
// NOTE(review): this yields the raw arithmetic difference of the two
// magnitudes, so only its sign is meaningful; it is not clamped to the
// 1 / -1 / 0 values mentioned in the doc comment.
6046 return $this->n - $l->n;
6056 * Forgivingly lexes HTML (SGML-style) markup into tokens.
6058 * A lexer parses a string of SGML-style markup and converts them into
6059 * corresponding tokens. It doesn't check for well-formedness, although its
6060 * internal mechanism may make this automatic (such as the case of
6061 * HTMLPurifier_Lexer_DOMLex). There are several implementations to choose
6064 * A lexer is HTML-oriented: it might work with XML, but it's not
6065 * recommended, as we adhere to a subset of the specification for optimization
6066 * reasons. This might change in the future. Also, most tokenizers are not
6067 * expected to handle DTDs or PIs.
6069 * This class should not be directly instantiated, but you may use create() to
6070 * retrieve a default copy of the lexer. Being a supertype, this class
6071 * does not actually define any implementation, but offers commonly used
6072 * convenience functions for subclasses.
6074 * @note The unit tests will instantiate this class for testing purposes, as
6075 * many of the utility functions require a class to be instantiated.
6076 * This means that, even though this class is not runnable, it will
6077 * not be declared abstract.
6082 * We use tokens rather than create a DOM representation because DOM would:
6085 * -# Require more processing and memory to create,
6086 * -# Is not streamable, and
6087 * -# Has the entire document structure (html and body not needed).
6090 * However, DOM is helpful in that it makes it easy to move around nodes
6091 * without a lot of lookaheads to see when a tag is closed. This is a
6092 * limitation of the token system and some workarounds would be nice.
6094 class HTMLPurifier_Lexer
6098 * Whether or not this lexer implements line-number/column-number tracking.
6099 * If it does, set to true.
6101 public $tracksLineNumbers = false;
6103 // -- STATIC ----------------------------------------------------------
6106 * Retrieves or sets the default Lexer as a Prototype Factory.
6108 * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
6109 * a few exceptions involving special features that only DirectLex
6112 * @note The behavior of this class has changed, rather than accepting
6113 * a prototype object, it now accepts a configuration object.
6114 * To specify your own prototype, set %Core.LexerImpl to it.
6115 * This change in behavior de-singletonizes the lexer object.
6117 * @param $config Instance of HTMLPurifier_Config
6118 * @return Concrete lexer.
// Chooses and instantiates a lexer based on %Core.LexerImpl, falling back
// to auto-detection when that setting is null.
6120 public static function create($config) {
// Legacy call style: something other than a config object was passed.
6122 if (!($config instanceof HTMLPurifier_Config)) {
6124 trigger_error("Passing a prototype to
6125 HTMLPurifier_Lexer::create() is deprecated, please instead
6126 use %Core.LexerImpl", E_USER_WARNING);
6128 $lexer = $config->get('Core.LexerImpl');
6132 $config->get('Core.MaintainLineNumbers') ||
6133 $config->get('Core.CollectErrors');
// %Core.LexerImpl may hold an object instead of a name.
6136 if (is_object($lexer)) {
6140 if (is_null($lexer)) { do {
6141 // auto-detection algorithm
// Line tracking was requested, so DirectLex is selected.
6143 if ($needs_tracking) {
6144 $lexer = 'DirectLex';
6149 class_exists('DOMDocument') &&
6150 method_exists('DOMDocument', 'loadHTML') &&
6151 !extension_loaded('domxml')
6153 // check for DOM support, because while it's part of the
6154 // core, it can be disabled compile time. Also, the PECL
6155 // domxml extension overrides the default DOM, and is evil
6156 // and nasty and we shan't bother to support it
6159 $lexer = 'DirectLex';
6162 } while(0); } // do..while so we can break
6164 // instantiate recognized string names
6167 $inst = new HTMLPurifier_Lexer_DOMLex();
6170 $inst = new HTMLPurifier_Lexer_DirectLex();
6173 $inst = new HTMLPurifier_Lexer_PH5P();
// The unrecognised name is HTML-escaped before being put into the message.
6176 throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
6180 if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated');
6182 // once PHP DOM implements native line numbers, or we
6183 // hack out something using XSLT, remove this stipulation
6184 if ($needs_tracking && !$inst->tracksLineNumbers) {
6185 throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
6192 // -- CONVENIENCE MEMBERS ---------------------------------------------
// Creates the entity parser used by parseData() and normalize().
6194 public function __construct() {
6195 $this->_entity_parser = new HTMLPurifier_EntityParser();
6199 * Most common entity to raw value conversion table for special entities.
6201 protected $_special_entity2str =
/**
 * Parses special entities into the proper characters.
 *
 * This function will translate escaped versions of the special characters
 * into the correct ones.
 *
 * @warning
 * You should be able to treat the output of this function as
 * completely parsed, but that's only because all other entities should
 * have been handled previously in substituteNonSpecialEntities()
 *
 * @param $string String character data to be parsed.
 * @returns Parsed character data.
 */
public function parseData($string) {

    // following functions require at least one character
    if ($string === '') return '';

    // subtracts amps that cannot possibly be escaped
    $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
        ($string[strlen($string)-1] === '&' ? 1 : 0);

    if (!$num_amp) return $string; // abort if no entities
    // Count the *escaped* ampersands only. Counting every '&' here made the
    // comparison below always succeed, so the entity-parser fallback at the
    // end of this method could never be reached.
    $num_esc_amp = substr_count($string, '&amp;');
    $string = strtr($string, $this->_special_entity2str);

    // code duplication for sake of optimization, see above
    $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
        ($string[strlen($string)-1] === '&' ? 1 : 0);

    // every remaining candidate is accounted for by a decoded '&amp;'
    if ($num_amp_2 <= $num_esc_amp) return $string;

    // hmm... now we have some uncommon entities. Use the callback.
    $string = $this->_entity_parser->substituteSpecialEntities($string);
    return $string;
}
6251 * Lexes an HTML string into tokens.
6253 * @param $string String HTML.
6254 * @return HTMLPurifier_Token array representation of HTML.
// Placeholder implementation: raises an E_USER_ERROR stating that the
// abstract class was called.
6256 public function tokenizeHTML($string, $config, $context) {
6257 trigger_error('Call to abstract class', E_USER_ERROR);
6261 * Translates CDATA sections into regular sections (through escaping).
6263 * @param $string HTML string to process.
6264 * @returns HTML with CDATA sections escaped.
// Passes the inside of every <![CDATA[ ... ]]> section (non-greedy, with
// "." also matching newlines) to CDATACallback() for replacement.
6266 protected static function escapeCDATA($string) {
6267 return preg_replace_callback(
6268 '/<!\[CDATA\[(.+?)\]\]>/s',
6269 array('HTMLPurifier_Lexer', 'CDATACallback'),
6275 * Special CDATA case that is especially convoluted for <script>
// Same replacement as escapeCDATA(), but for CDATA sections wrapped in the
// "<!--//--><![CDATA[//><!-- ... //--><!]]>" comment construct.
6277 protected static function escapeCommentedCDATA($string) {
6278 return preg_replace_callback(
6279 '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
6280 array('HTMLPurifier_Lexer', 'CDATACallback'),
6286 * Special Internet Explorer conditional comments should be removed.
// Regex replacement over "<!--[if ...]> ... <![endif]-->" blocks
// (case-insensitive, non-greedy, "." also matching newlines).
6288 protected static function removeIEConditional($string) {
6289 return preg_replace(
6290 '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
6297 * Callback function for escapeCDATA() that does the work.
6299 * @warning This method is declared protected; it exists to serve as the
6300 * preg_replace_callback() target and should not be called directly.
6301 * @param $matches PCRE matches array, with index 0 the entire match
6302 * and 1 the inside of the CDATA section.
6303 * @returns Escaped internals of the CDATA section.
// Returns capture group 1 of the match with HTML special characters
// escaped (ENT_COMPAT, UTF-8).
6305 protected static function CDATACallback($matches) {
6306 // not exactly sure why the character set is needed, but whatever
6307 return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
6311 * Takes a piece of HTML and normalizes it by converting entities, fixing
6312 * encoding, extracting bits, and other good stuff.
6313 * @todo Consider making protected
// Pre-processes raw HTML before tokenisation. Each step below is either
// unconditional or switched by the configuration directive it reads.
6315 public function normalize($html, $config, $context) {
6317 // normalize newlines to \n
6318 if ($config->get('Core.NormalizeNewlines')) {
// \r\n must be replaced before the lone \r, or it would become two newlines.
6319 $html = str_replace("\r\n", "\n", $html);
6320 $html = str_replace("\r", "\n", $html);
6323 if ($config->get('HTML.Trusted')) {
6324 // escape convoluted CDATA
6325 $html = $this->escapeCommentedCDATA($html);
6329 $html = $this->escapeCDATA($html);
6331 $html = $this->removeIEConditional($html);
6333 // extract body from document if applicable
6334 if ($config->get('Core.ConvertDocumentToFragment')) {
// The error collector is only fetched when error collection is enabled.
6336 if ($config->get('Core.CollectErrors')) {
6337 $e =& $context->get('ErrorCollector');
6339 $new_html = $this->extractBody($html);
// Report the extraction only when it actually changed the markup.
6340 if ($e && $new_html != $html) {
6341 $e->send(E_WARNING, 'Lexer: Extracted body');
6346 // expand entities that aren't the big five
6347 $html = $this->_entity_parser->substituteNonSpecialEntities($html);
6349 // clean into wellformed UTF-8 string for an SGML context: this has
6350 // to be done after entity expansion because the entities sometimes
6351 // represent non-SGML characters (horror, horror!)
6352 $html = HTMLPurifier_Encoder::cleanUTF8($html);
6354 // if processing instructions are to be removed, remove them now
6355 if ($config->get('Core.RemoveProcessingInstructions')) {
6356 $html = preg_replace('#<\?.+?\?>#s', '', $html);
6363 * Takes a string of HTML (fragment or document) and returns the content
6364 * @todo Consider making protected
// Matches the content between <body ...> and </body> (case-insensitive,
// greedy, "." also matching newlines) and captures it in $matches[1].
6366 public function extractBody($html) {
6368 $result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
6383 * Class that handles operations involving percent-encoding in URIs.
6386 * Be careful when reusing instances of PercentEncoder. The object
6387 * you use for normalize() SHOULD NOT be used for encode(), or
6390 class HTMLPurifier_PercentEncoder
6394 * Reserved characters to preserve when using encode().
6396 protected $preserve = array();
6399 * String of characters that should be preserved while using encode().
// Builds $this->preserve, a lookup keyed by byte value (ord()) of the
// characters that encode() leaves untouched.
6401 public function __construct($preserve = false) {
6402 // unreserved letters, ought to const-ify
6403 for ($i = 48; $i <= 57; $i++) $this->preserve[$i] = true; // digits
6404 for ($i = 65; $i <= 90; $i++) $this->preserve[$i] = true; // upper-case
6405 for ($i = 97; $i <= 122; $i++) $this->preserve[$i] = true; // lower-case
6406 $this->preserve[45] = true; // Dash -
6407 $this->preserve[46] = true; // Period .
6408 $this->preserve[95] = true; // Underscore _
6409 $this->preserve[126]= true; // Tilde ~
6411 // extra letters not to escape
6412 if ($preserve !== false) {
// Every byte of the $preserve string is added to the lookup.
6413 for ($i = 0, $c = strlen($preserve); $i < $c; $i++) {
6414 $this->preserve[ord($preserve[$i])] = true;
6420 * Our replacement for urlencode, it encodes all non-reserved characters,
6421 * as well as any extra characters that were instructed to be preserved.
6423 * Assumes that the string has already been normalized, making any
6424 * and all percent escape sequences valid. Percents will not be
6425 * re-escaped, regardless of their status in $preserve
6426 * @param $string String to be encoded
6427 * @return Encoded string.
// Walks the string byte by byte and percent-encodes every byte that is
// neither '%' nor present in $this->preserve.
6429 public function encode($string) {
6431 for ($i = 0, $c = strlen($string); $i < $c; $i++) {
// $int is assigned inside the condition and reused on the next line.
6432 if ($string[$i] !== '%' && !isset($this->preserve[$int = ord($string[$i])]) ) {
// Two-digit upper-case hexadecimal escape.
6433 $ret .= '%' . sprintf('%02X', $int);
6435 $ret .= $string[$i];
6442 * Fix up percent-encoding by decoding unreserved characters and normalizing.
6443 * @warning This function is affected by $preserve, even though the
6444 * usual desired behavior is for this not to preserve those
6445 * characters. Be careful when reusing instances of PercentEncoder!
6446 * @param $string String to normalize
// Re-writes the percent escapes of $string: malformed ones get their '%'
// escaped as %25, escapes of preserved characters are decoded, and the
// remaining ones are upper-cased.
6448 public function normalize($string) {
6449 if ($string == '') return '';
// Everything before the first '%' is copied verbatim; each later part
// starts right after a '%'.
6450 $parts = explode('%', $string);
6451 $ret = array_shift($parts);
6452 foreach ($parts as $part) {
6453 $length = strlen($part);
6455 $ret .= '%25' . $part;
// First two characters are the candidate hex pair, the rest is plain text.
6458 $encoding = substr($part, 0, 2);
6459 $text = substr($part, 2);
// Not a hex pair: keep the text but escape the stray percent sign.
6460 if (!ctype_xdigit($encoding)) {
6461 $ret .= '%25' . $part;
6464 $int = hexdec($encoding);
// Escapes of preserved characters are replaced by the character itself.
6465 if (isset($this->preserve[$int])) {
6466 $ret .= chr($int) . $text;
6469 $encoding = strtoupper($encoding);
6470 $ret .= '%' . $encoding . $text;
6482 * Generic property list implementation
6484 class HTMLPurifier_PropertyList
6487 * Internal data-structure for properties
6489 protected $data = array();
// Stores the optional parent property list.
6498 public function __construct($parent = null) {
6499 $this->parent = $parent;
6503 * Recursively retrieves the value for a key
// Returns the value stored under $name in this list, otherwise asks the
// parent list; throws HTMLPurifier_Exception when neither holds the key.
6505 public function get($name) {
6506 if ($this->has($name)) return $this->data[$name];
6507 // possible performance bottleneck, convert to iterative if necessary
6508 if ($this->parent) return $this->parent->get($name);
6509 throw new HTMLPurifier_Exception("Key '$name' not found");
6513 * Sets the value of a key, for this plist
// Writes $value under $name in this list's own data array only.
6515 public function set($name, $value) {
6516 $this->data[$name] = $value;
6520 * Returns true if a given key exists
// Checks this list's own data only (the parent is not consulted);
// array_key_exists() also reports keys whose value is null.
6522 public function has($name) {
6523 return array_key_exists($name, $this->data);
/**
 * Resets a value to the value of its parent, usually the default. If
 * no value is specified, the entire plist is reset.
 *
 * @param $name Key to remove from this list, or null (the default) to
 *              clear every key of this list.
 */
public function reset($name = null) {
    // Strict test: with the former loose `== null`, keys such as '' or 0
    // were mistaken for "no key given" and wiped the whole list.
    if ($name === null) {
        $this->data = array();
    } else {
        unset($this->data[$name]);
    }
}
6536 * Squashes this property list and all of its property lists into a single
6537 * array, and returns the array. This value is cached by default.
6538 * @param $force If true, ignores the cache and regenerates the array.
// Flattens this list and its chain of parents into one array and remembers
// the result in $this->cache.
6540 public function squash($force = false) {
// A cached result is reused unless the caller forces regeneration.
6541 if ($this->cache !== null && !$force) return $this->cache;
6542 if ($this->parent) {
// This list's own entries are merged last, so they override the parent's.
6543 return $this->cache = array_merge($this->parent->squash($force), $this->data);
6545 return $this->cache = $this->data;
6550 * Returns the parent plist.
// Plain accessor for $this->parent.
6552 public function getParent() {
6553 return $this->parent;
6557 * Sets the parent plist.
// Replaces $this->parent with the given property list.
6559 public function setParent($plist) {
6560 $this->parent = $plist;
6569 * Property list iterator. Do not instantiate this class directly.
6571 class HTMLPurifier_PropertyListIterator extends FilterIterator
6578 * @param $data Array of data to iterate over
6579 * @param $filter Optional prefix to only allow values of
// Wraps $iterator and remembers the key prefix together with its length.
6581 public function __construct(Iterator $iterator, $filter = null) {
6582 parent::__construct($iterator);
// NOTE(review): with the default $filter = null this passes null to
// strlen(); confirm how that behaves on the PHP versions in use.
6583 $this->l = strlen($filter);
6584 $this->filter = $filter;
// Filter callback: compares the first $this->l characters of the current
// key with the stored prefix.
6587 public function accept() {
6588 $key = $this->getInnerIterator()->key();
// Condition is true when the key does not start with the prefix.
6589 if( strncmp($key, $this->filter, $this->l) !== 0 ) {
6602 * Supertype for classes that define a strategy for modifying/purifying tokens.
6604 * While HTMLPurifier's core purpose is fixing HTML into something proper,
6605 * strategies provide plug points for extra configuration or even extra
6606 * features, such as custom tags, custom parsing of text, etc.
6610 abstract class HTMLPurifier_Strategy
6614 * Executes the strategy on the tokens.
6616 * @param $tokens Array of HTMLPurifier_Token objects to be operated on.
6617 * @param $config Configuration options
6618 * @returns Processed array of token objects.
// Abstract: no implementation here; concrete strategies must define it.
6620 abstract public function execute($tokens, $config, $context);
6629 * This is in almost every respect equivalent to an array except
6630 * that it keeps track of which keys were accessed.
6632 * @warning For the sake of backwards compatibility with early versions
6633 * of PHP 5, you must not use the $hash[$key] syntax; if you do
6634 * our version of offsetGet is never called.
6636 class HTMLPurifier_StringHash extends ArrayObject
6638 protected $accessed = array();
6641 * Retrieves a value, and logs the access.
// Records $index in $this->accessed, then delegates to ArrayObject.
6643 public function offsetGet($index) {
6644 $this->accessed[$index] = true;
6645 return parent::offsetGet($index);
6649 * Returns a lookup array of all array indexes that have been accessed.
6650 * @return Array in form array($index => true).
// Plain accessor for the $this->accessed lookup.
6652 public function getAccessed() {
6653 return $this->accessed;
6657 * Resets the access array.
// Empties the $this->accessed lookup.
6659 public function resetAccessed() {
6660 $this->accessed = array();
6669 * Parses string hash files. File format is as such:
6678 * Which would output something similar to:
6681 * 'ID' => 'DefaultKeyValue',
6683 * 'KEY2' => 'Value2',
6684 * 'MULTILINE-KEY' => "Multiline\nvalue.\n",
6687 * We use this as an easy to use file-format for configuration schema
6688 * files, but the class itself is usage agnostic.
6690 * You can use ---- to forcibly terminate parsing of a single string-hash;
6691 * this marker is used in multi string-hashes to delimit boundaries.
6693 class HTMLPurifier_StringHashParser
6696 public $default = 'ID';
6699 * Parses a file that contains a single string-hash.
// Opens $file for reading and parses one string-hash from it. Returns
// false when the file does not exist or cannot be opened.
6701 public function parseFile($file) {
6702 if (!file_exists($file)) return false;
6703 $fh = fopen($file, 'r');
6704 if (!$fh) return false;
6705 $ret = $this->parseHandle($fh);
6711 * Parses a file that contains multiple string-hashes delimited by '----'
// Opens $file for reading and collects one parsed string-hash per
// parseHandle() call until end of file. Returns false when the file does
// not exist or cannot be opened.
6713 public function parseMultiFile($file) {
6714 if (!file_exists($file)) return false;
6716 $fh = fopen($file, 'r');
6717 if (!$fh) return false;
6718 while (!feof($fh)) {
6719 $ret[] = $this->parseHandle($fh);
6726 * Internal parser that accepts a file handle.
6727 * @note While it's possible to simulate in-memory parsing by using
6728 * custom stream wrappers, if such a use-case arises we should
6729 * factor out the file handle into its own class.
6730 * @param $fh File handle with pointer at start of valid string-hash
// Line-oriented parser for a single string-hash. $state holds the key that
// subsequent lines belong to; $ret collects key => value pairs.
6733 protected function parseHandle($fh) {
6739 if ($line === false) break;
// Line endings are stripped before any marker is recognised.
6740 $line = rtrim($line, "\n\r");
// Blank lines are skipped while no key is active.
6741 if (!$state && $line === '') continue;
// '----' ends this string-hash.
6742 if ($line === '----') break;
// Lines starting with '--#' are tested before the generic '--' marker.
6743 if (strncmp('--#', $line, 3) === 0) {
6746 } elseif (strncmp('--', $line, 2) === 0) {
6747 // Multiline declaration
// The key is the line with surrounding dashes and spaces removed.
6748 $state = trim($line, '- ');
6749 if (!isset($ret[$state])) $ret[$state] = '';
6751 } elseif (!$state) {
6753 if (strpos($line, ':') !== false) {
6754 // Single-line declaration
// Split at the first colon only; the value part is trimmed.
6755 list($state, $line) = explode(':', $line, 2);
6756 $line = trim($line);
6758 // Use default declaration
6759 $state = $this->default;
6763 $ret[$state] = $line;
// Lines of a multiline value are accumulated, each terminated by "\n".
6767 $ret[$state] .= "$line\n";
6769 } while (!feof($fh));
/**
 * Defines a mutation of an obsolete tag into a valid tag.
 */
abstract class HTMLPurifier_TagTransform
{

    /**
     * Tag name to transform the tag to.
     */
    public $transform_to;

    /**
     * Transforms the obsolete tag into the valid tag.
     * @param $tag Tag to be transformed.
     * @param $config Mandatory HTMLPurifier_Config object
     * @param $context Mandatory HTMLPurifier_Context object
     */
    abstract public function transform($tag, $config, $context);

    /**
     * Prepends CSS properties to the style attribute, creating the
     * attribute if it doesn't exist.
     * @warning Copied over from AttrTransform, be sure to keep in sync
     * @param $attr Attribute array to process (passed by reference)
     * @param $css CSS to prepend
     */
    protected function prependCSS(&$attr, $css)
    {
        $existing = '';
        if (isset($attr['style'])) {
            $existing = $attr['style'];
        }
        $attr['style'] = $css . $existing;
    }

}
/**
 * Abstract base token class that all others inherit from.
 */
class HTMLPurifier_Token
{

    /** Line number node was on in source document. Null if unknown. */
    public $line;

    /** Column of line node was on in source document. Null if unknown. */
    public $col;

    /**
     * Lookup array of processing that this token is exempt from.
     * Currently, valid values are "ValidateAttributes" and
     * "MakeWellFormed_TagClosedError"
     */
    public $armor = array();

    /**
     * Used during MakeWellFormed.
     */
    public $skip;
    public $rewind;
    public $carryover;

    /**
     * Backwards-compatibility accessor for the removed "type" property.
     * Reading it raises an E_USER_NOTICE and yields a short type name
     * derived from the token's class; any other undefined property
     * reads as null.
     * @param $n Name of the property being read
     */
    public function __get($n)
    {
        if ($n !== 'type') {
            return null;
        }
        trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE);
        $legacy_names = array(
            'HTMLPurifier_Token_Start'   => 'start',
            'HTMLPurifier_Token_Empty'   => 'empty',
            'HTMLPurifier_Token_End'     => 'end',
            'HTMLPurifier_Token_Text'    => 'text',
            'HTMLPurifier_Token_Comment' => 'comment',
        );
        $class = get_class($this);
        if (isset($legacy_names[$class])) {
            return $legacy_names[$class];
        }
        return null;
    }

    /**
     * Sets the position of the token in the source document.
     * @param $l Line number, or null if unknown
     * @param $c Column number, or null if unknown
     */
    public function position($l = null, $c = null)
    {
        $this->col  = $c;
        $this->line = $l;
    }

    /**
     * Convenience function for DirectLex settings line/col position.
     * A column of exactly -1 bumps the stored line number by one.
     * @param $l Line number
     * @param $c Column number
     */
    public function rawPosition($l, $c)
    {
        $this->line = ($c === -1) ? $l + 1 : $l;
        $this->col  = $c;
    }

}
/**
 * Factory for token generation.
 *
 * @note Doing some benchmarking indicates that the new operator is much
 *       slower than the clone operator (even discounting the cost of the
 *       constructor).  This class is for that optimization.
 *       Other than that, there's not much point as we don't
 *       maintain parallel HTMLPurifier_Token hierarchies (the main reason why
 *       you'd want to use an abstract factory).
 * @todo Port DirectLex to use this
 */
class HTMLPurifier_TokenFactory
{

    /**
     * Prototypes that will be cloned ("p" stands for prototype).
     */
    private $p_start, $p_end, $p_empty, $p_text, $p_comment;

    /**
     * Generates blank prototypes for cloning.
     */
    public function __construct()
    {
        $this->p_start   = new HTMLPurifier_Token_Start('', array());
        $this->p_end     = new HTMLPurifier_Token_End('');
        $this->p_empty   = new HTMLPurifier_Token_Empty('', array());
        $this->p_text    = new HTMLPurifier_Token_Text('');
        $this->p_comment = new HTMLPurifier_Token_Comment('');
    }

    /**
     * Creates a HTMLPurifier_Token_Start.
     * @param $name Tag name
     * @param $attr Associative array of attributes
     * @return Generated HTMLPurifier_Token_Start
     */
    public function createStart($name, $attr = array())
    {
        // clone the blank prototype, then re-run its constructor in place
        $token = clone $this->p_start;
        $token->__construct($name, $attr);
        return $token;
    }

    /**
     * Creates a HTMLPurifier_Token_End.
     * @param $name Tag name
     * @return Generated HTMLPurifier_Token_End
     */
    public function createEnd($name)
    {
        $token = clone $this->p_end;
        $token->__construct($name);
        return $token;
    }

    /**
     * Creates a HTMLPurifier_Token_Empty.
     * @param $name Tag name
     * @param $attr Associative array of attributes
     * @return Generated HTMLPurifier_Token_Empty
     */
    public function createEmpty($name, $attr = array())
    {
        $token = clone $this->p_empty;
        $token->__construct($name, $attr);
        return $token;
    }

    /**
     * Creates a HTMLPurifier_Token_Text.
     * @param $data Data of text token
     * @return Generated HTMLPurifier_Token_Text
     */
    public function createText($data)
    {
        $token = clone $this->p_text;
        $token->__construct($data);
        return $token;
    }

    /**
     * Creates a HTMLPurifier_Token_Comment.
     * @param $data Data of comment token
     * @return Generated HTMLPurifier_Token_Comment
     */
    public function createComment($data)
    {
        $token = clone $this->p_comment;
        $token->__construct($data);
        return $token;
    }

}
/**
 * HTML Purifier's internal representation of a URI.
 * @note
 *      Internal data-structures are completely escaped. If the data needs
 *      to be used in a non-URI context (which is very unlikely), be sure
 *      to decode it first. The URI may not necessarily be well-formed until
 *      validate() is called.
 */
class HTMLPurifier_URI
{

    public $scheme, $userinfo, $host, $port, $path, $query, $fragment;

    /**
     * @note Automatically normalizes scheme and port: a non-null scheme
     *       is lower-cased and a non-null port is cast to integer.
     */
    public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment) {
        $this->scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme);
        $this->userinfo = $userinfo;
        $this->host = $host;
        $this->port = is_null($port) ? $port : (int) $port;
        $this->path = $path;
        $this->query = $query;
        $this->fragment = $fragment;
    }

    /**
     * Retrieves a scheme object corresponding to the URI's scheme/default
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return Scheme object appropriate for validating this URI, or false
     *         if no scheme object could be obtained
     */
    public function getSchemeObj($config, $context) {
        $registry = HTMLPurifier_URISchemeRegistry::instance();
        if ($this->scheme !== null) {
            $scheme_obj = $registry->getScheme($this->scheme, $config, $context);
            if (!$scheme_obj) return false; // invalid scheme, clean it out
        } else {
            // no scheme: retrieve the default one
            $def = $config->getDefinition('URI');
            $scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context);
            if (!$scheme_obj) {
                // something funky happened to the default scheme object
                trigger_error(
                    'Default scheme object "' . $def->defaultScheme . '" was not readable',
                    E_USER_WARNING
                );
                return false;
            }
        }
        return $scheme_obj;
    }

    /**
     * Generic validation method applicable for all schemes. May modify
     * this URI in order to get it into a compliant form.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return True if validation/filtering succeeds, false if failure
     */
    public function validate($config, $context) {

        // ABNF definitions from RFC 3986
        $chars_sub_delims = '!$&\'()*+,;=';
        $chars_pchar = $chars_sub_delims . ':@';

        // validate host
        if (!is_null($this->host)) {
            $host_def = new HTMLPurifier_AttrDef_URI_Host();
            $this->host = $host_def->validate($this->host, $config, $context);
            if ($this->host === false) $this->host = null;
        }

        // validate scheme
        // NOTE: It's not appropriate to check whether or not this
        // scheme is in our registry, since a URIFilter may convert a
        // URI that we don't allow into one we do.  So instead, we just
        // check if the scheme can be dropped because there is no host
        // and it is our default scheme.
        //
        // The host test is parenthesized on purpose: && binds tighter
        // than ||, so without the parentheses a scheme-less URI with a
        // blank host would also enter this branch and trigger a
        // pointless definition lookup.
        if (!is_null($this->scheme) && (is_null($this->host) || $this->host === '')) {
            // support for relative paths is pretty abysmal when the
            // scheme is present, so axe it when possible
            $def = $config->getDefinition('URI');
            if ($def->defaultScheme === $this->scheme) {
                $this->scheme = null;
            }
        }

        // validate username
        if (!is_null($this->userinfo)) {
            $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
            $this->userinfo = $encoder->encode($this->userinfo);
        }

        // validate port
        if (!is_null($this->port)) {
            if ($this->port < 1 || $this->port > 65535) $this->port = null;
        }

        // validate path
        $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
        if (!is_null($this->host)) { // this catches $this->host === ''
            // path-abempty (hier and relative)
            // http://www.example.com/my/path
            // //www.example.com/my/path (looks odd, but works, and
            //                            recognized by most browsers)
            // (this set is valid or invalid on a scheme by scheme
            // basis, so we'll deal with it later)
            // file:///my/path
            // ///my/path
            $this->path = $segments_encoder->encode($this->path);
        } elseif ($this->path !== '') {
            if ($this->path[0] === '/') {
                // path-absolute (hier and relative)
                // http:/my/path
                // /my/path
                if (strlen($this->path) >= 2 && $this->path[1] === '/') {
                    // This could happen if both the host gets stripped
                    // out
                    // http://my/path
                    // //my/path
                    $this->path = '';
                } else {
                    $this->path = $segments_encoder->encode($this->path);
                }
            } elseif (!is_null($this->scheme)) {
                // path-rootless (hier)
                // http:my/path
                // Short circuit evaluation means we don't need to check nz
                $this->path = $segments_encoder->encode($this->path);
            } else {
                // path-noscheme (relative)
                // my/path
                // (once again, not checking nz)
                // The first segment uses an encoder that does not
                // preserve ':', so it cannot be mistaken for a scheme.
                $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
                $c = strpos($this->path, '/');
                if ($c !== false) {
                    $this->path =
                        $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
                        $segments_encoder->encode(substr($this->path, $c));
                } else {
                    $this->path = $segment_nc_encoder->encode($this->path);
                }
            }
        } else {
            // path-empty (hier and relative)
            $this->path = ''; // just to be safe
        }

        // qf = query and fragment
        $qf_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/?');

        if (!is_null($this->query)) {
            $this->query = $qf_encoder->encode($this->query);
        }

        if (!is_null($this->fragment)) {
            $this->fragment = $qf_encoder->encode($this->fragment);
        }

        return true;

    }

    /**
     * Convert URI back to string
     * @return String URI appropriate for output
     */
    public function toString() {
        // reconstruct authority
        $authority = null;
        // there is a rendering difference between a null authority
        // (http:foo-bar) and an empty string authority
        // (http:///foo-bar).
        if (!is_null($this->host)) {
            $authority = '';
            if (!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
            $authority .= $this->host;
            if (!is_null($this->port))     $authority .= ':' . $this->port;
        }

        // Reconstruct the result
        // One might wonder about parsing quirks from browsers after
        // this reconstruction.  Unfortunately, parsing behavior depends
        // on what *scheme* was employed (file:///foo is handled *very*
        // differently than http:///foo), so unfortunately we have to
        // defer to the schemes to do the right thing.
        $result = '';
        if (!is_null($this->scheme))    $result .= $this->scheme . ':';
        if (!is_null($authority))       $result .= '//' . $authority;
        $result .= $this->path;
        if (!is_null($this->query))     $result .= '?' . $this->query;
        if (!is_null($this->fragment))  $result .= '#' . $this->fragment;

        return $result;
    }

}
class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
{

    public $type = 'URI';

    /** Active filters run by filter(), keyed by filter name. */
    protected $filters = array();

    /** Active filters run by postFilter(), keyed by filter name. */
    protected $postFilters = array();

    /** Filters known to this definition but not yet activated, keyed by name. */
    protected $registeredFilters = array();

    /**
     * HTMLPurifier_URI object of the base specified at %URI.Base
     */
    public $base;

    /**
     * String host to consider "home" base, derived off of $base
     */
    public $host;

    /**
     * Name of default scheme based on %URI.DefaultScheme and %URI.Base
     */
    public $defaultScheme;

    /**
     * Registers the built-in URI filters.
     */
    public function __construct()
    {
        $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal());
        $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources());
        $this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist());
        $this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute());
        $this->registerFilter(new HTMLPurifier_URIFilter_Munge());
    }

    /**
     * Makes a filter available for activation during setup.
     * @param $filter Filter object; its $name is used as the registry key
     */
    public function registerFilter($filter)
    {
        $this->registeredFilters[$filter->name] = $filter;
    }

    /**
     * Prepares a filter and activates it, as a post-filter if its $post
     * flag is set and as a regular filter otherwise. A filter whose
     * prepare() returns exactly false is not activated.
     * @param $filter Filter object
     * @param $config Instance of HTMLPurifier_Config
     */
    public function addFilter($filter, $config)
    {
        // null is ok, for backwards compat
        if ($filter->prepare($config) === false) {
            return;
        }
        if ($filter->post) {
            $this->postFilters[$filter->name] = $filter;
            return;
        }
        $this->filters[$filter->name] = $filter;
    }

    protected function doSetup($config)
    {
        $this->setupMemberVariables($config);
        $this->setupFilters($config);
    }

    /**
     * Activates every registered filter whose 'URI.<name>' directive is
     * neither false nor null, then discards the registration list.
     */
    protected function setupFilters($config)
    {
        foreach ($this->registeredFilters as $name => $filter) {
            $setting = $config->get('URI.' . $name);
            if ($setting === false || $setting === null) {
                continue;
            }
            $this->addFilter($filter, $config);
        }
        unset($this->registeredFilters);
    }

    /**
     * Populates $host, $base and $defaultScheme from the URI.Host,
     * URI.Base and URI.DefaultScheme directives. Values taken from the
     * parsed base URI win over URI.DefaultScheme, and fill in $host only
     * when URI.Host is null.
     */
    protected function setupMemberVariables($config)
    {
        $this->host = $config->get('URI.Host');
        $base_uri = $config->get('URI.Base');
        if (!is_null($base_uri)) {
            $parser = new HTMLPurifier_URIParser();
            $this->base = $parser->parse($base_uri);
            $this->defaultScheme = $this->base->scheme;
            if (is_null($this->host)) {
                $this->host = $this->base->host;
            }
        }
        if (is_null($this->defaultScheme)) {
            $this->defaultScheme = $config->get('URI.DefaultScheme');
        }
    }

    /**
     * Runs the regular filters in order.
     * @return False as soon as one filter returns a falsy value, true otherwise
     */
    public function filter(&$uri, $config, $context)
    {
        return $this->runFilters($this->filters, $uri, $config, $context);
    }

    /**
     * Runs the post-filters in order.
     * @return False as soon as one filter returns a falsy value, true otherwise
     */
    public function postFilter(&$uri, $config, $context)
    {
        return $this->runFilters($this->postFilters, $uri, $config, $context);
    }

    /**
     * Applies each filter in $chain to $uri, stopping at the first one
     * that returns a falsy value.
     */
    private function runFilters($chain, &$uri, $config, $context)
    {
        foreach ($chain as $f) {
            if (!$f->filter($uri, $config, $context)) {
                return false;
            }
        }
        return true;
    }

}
/**
 * Chainable filters for custom URI processing.
 *
 * These filters can perform custom actions on a URI filter object,
 * including transformation or blacklisting.
 *
 * @warning This filter is called before scheme object validation occurs.
 *          Make sure, if you require a specific scheme object, you
 *          check that it exists. This allows filters to convert
 *          proprietary URI schemes into regular ones.
 */
abstract class HTMLPurifier_URIFilter
{

    /**
     * Unique identifier of filter
     */
    public $name;

    /**
     * True if this filter should be run after scheme validation.
     */
    public $post = false;

    /**
     * Performs initialization for the filter
     * @param $config Instance of HTMLPurifier_Config
     * @return True in this base implementation
     */
    public function prepare($config)
    {
        return true;
    }

    /**
     * Filter a URI object
     * @param $uri Reference to URI object variable
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return bool Whether or not to continue processing: false indicates
     *         URL is no good, true indicates continue processing. Note that
     *         all changes are committed directly on the URI object
     */
    abstract public function filter(&$uri, $config, $context);

}
/**
 * Parses a URI into the components and fragment identifier as specified
 * by RFC 3986.
 */
class HTMLPurifier_URIParser
{

    /**
     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
     */
    protected $percentEncoder;

    public function __construct()
    {
        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
    }

    /**
     * Parses a URI.
     * @param $uri string URI to parse
     * @return HTMLPurifier_URI representation of URI. This representation has
     *         not been validated yet and may not conform to RFC. False is
     *         returned if the URI regexp does not match.
     */
    public function parse($uri)
    {
        $uri = $this->percentEncoder->normalize($uri);

        // Regexp is as per Appendix B.
        // Note that ["<>] are an addition to the RFC's recommended
        // characters, because they represent external delimiters.
        $uri_pattern = '!'.
            '(([^:/?#"<>]+):)?'. // 2. Scheme
            '(//([^/?#"<>]*))?'. // 4. Authority
            '([^?#"<>]*)'.       // 5. Path
            '(\?([^#"<>]*))?'.   // 7. Query
            '(#([^"<>]*))?'.     // 8. Fragment
            '!';

        $parts = array();
        if (!preg_match($uri_pattern, $uri, $parts)) {
            return false; // *really* invalid URI
        }

        // separate out parts; an absent component is null, except the
        // path, which is always present and may be empty
        $scheme    = !empty($parts[1]) ? $parts[2] : null;
        $authority = !empty($parts[3]) ? $parts[4] : null;
        $path      = $parts[5];
        $query     = !empty($parts[6]) ? $parts[7] : null;
        $fragment  = !empty($parts[8]) ? $parts[9] : null;

        // further parse authority into userinfo, host and port
        $userinfo = null;
        $host     = null;
        $port     = null;
        if ($authority !== null) {
            $authority_pattern = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
            $auth = array();
            preg_match($authority_pattern, $authority, $auth);
            if (!empty($auth[1])) {
                $userinfo = $auth[2];
            }
            // an authority without a usable host yields the empty
            // string rather than null
            $host = !empty($auth[3]) ? $auth[3] : '';
            if (!empty($auth[4])) {
                $port = (int) $auth[5];
            }
        }

        return new HTMLPurifier_URI(
            $scheme, $userinfo, $host, $port, $path, $query, $fragment);
    }

}
/**
 * Validator for the components of a URI for a specific scheme
 */
abstract class HTMLPurifier_URIScheme
{

    /**
     * Scheme's default port (integer). If an explicit port number is
     * specified that coincides with the default port, it will be
     * elided.
     */
    public $default_port = null;

    /**
     * Whether or not URIs of this scheme are locatable by a browser
     * http and ftp are accessible, while mailto and news are not.
     */
    public $browsable = false;

    /**
     * Whether or not the URI always uses <hier_part>, resolves edge cases
     * with making relative URIs absolute
     */
    public $hierarchical = false;

    /**
     * Whether or not the URI may omit a hostname when the scheme is
     * explicitly specified, ala file:///path/to/file. As of writing,
     * 'file' is the only scheme that browsers support this properly.
     */
    public $may_omit_host = false;

    /**
     * Validates the components of a URI for a specific scheme.
     * @param $uri Reference to a HTMLPurifier_URI object
     * @param $config HTMLPurifier_Config object
     * @param $context HTMLPurifier_Context object
     * @return Bool success or failure
     */
    public abstract function doValidate(&$uri, $config, $context);

    /**
     * Public interface for validating components of a URI.  Performs a
     * bunch of default actions. Don't overload this method.
     * @param $uri Reference to a HTMLPurifier_URI object
     * @param $config HTMLPurifier_Config object
     * @param $context HTMLPurifier_Context object
     * @return Bool success or failure
     */
    public function validate(&$uri, $config, $context)
    {
        // drop a port that merely restates the scheme default
        if ($this->default_port == $uri->port) {
            $uri->port = null;
        }

        // kludge: browsers do funny things when the scheme but not the
        // authority is set
        $no_scheme = is_null($uri->scheme);
        $host_problem =
            // if the scheme is present, a missing host is always in error
            // (unless this scheme may omit its host)
            (!$this->may_omit_host && !$no_scheme && ($uri->host === '' || is_null($uri->host))) ||
            // if the scheme is not present, a *blank* host is in error,
            // since this translates into '///path' which most browsers
            // interpret as being 'http://path'.
            ($no_scheme && $uri->host === '');

        if ($host_problem) {
            if ($no_scheme && substr($uri->path, 0, 2) != '//') {
                // safe to simply forget the blank host
                $uri->host = null;
            } else {
                // Either a scheme is present, or the URI is '////path'
                // and we cannot nullify the host to preserve semantics.
                // See if we can manually insert a hostname.
                $host = $config->get('URI.Host');
                if (is_null($host)) {
                    // we can't do anything sensible, reject the URL.
                    return false;
                }
                $uri->host = $host;
            }
        }

        return $this->doValidate($uri, $config, $context);
    }

}
/**
 * Registry for retrieving specific URI scheme validator objects.
 */
class HTMLPurifier_URISchemeRegistry
{

    /**
     * Retrieve sole instance of the registry.
     * @param $prototype Optional prototype to overload sole instance with,
     *      or bool true to reset to default registry.
     * @note Pass a registry object $prototype with a compatible interface and
     *       the function will copy it and return it all further times.
     * @return The current registry instance
     */
    public static function instance($prototype = null) {
        static $instance = null;
        if ($prototype === true) {
            // Explicit reset. This must be tested before the generic
            // prototype branch, otherwise the boolean itself would be
            // stored as the "registry".
            $instance = new HTMLPurifier_URISchemeRegistry();
        } elseif ($prototype !== null) {
            $instance = $prototype;
        } elseif ($instance === null) {
            // first use: lazily create the default registry
            $instance = new HTMLPurifier_URISchemeRegistry();
        }
        return $instance;
    }

    /**
     * Cache of retrieved schemes.
     */
    protected $schemes = array();

    /**
     * Retrieves a scheme validator object
     * @param $scheme String scheme name like http or mailto
     * @param $config HTMLPurifier_Config object
     * @param $context HTMLPurifier_Context object
     * @return Scheme validator object, or null if the scheme is not
     *         allowed or no validator class exists for it
     */
    public function getScheme($scheme, $config, $context) {
        if (!$config) $config = HTMLPurifier_Config::createDefault();

        // important, otherwise attacker could include arbitrary file
        $allowed_schemes = $config->get('URI.AllowedSchemes');
        if (!$config->get('URI.OverrideAllowedSchemes') &&
            !isset($allowed_schemes[$scheme])
        ) {
            return;
        }

        // previously instantiated or manually registered scheme
        if (isset($this->schemes[$scheme])) return $this->schemes[$scheme];
        // never build a class name from a scheme that is not whitelisted
        if (!isset($allowed_schemes[$scheme])) return;

        $class = 'HTMLPurifier_URIScheme_' . $scheme;
        if (!class_exists($class)) return;
        $this->schemes[$scheme] = new $class();
        return $this->schemes[$scheme];
    }

    /**
     * Registers a custom scheme to the cache, bypassing reflection.
     * @param $scheme Scheme name
     * @param $scheme_obj HTMLPurifier_URIScheme object
     */
    public function register($scheme, $scheme_obj) {
        $this->schemes[$scheme] = $scheme_obj;
    }

}
/**
 * Class for converting between different unit-lengths as specified by
 * CSS.
 */
class HTMLPurifier_UnitConverter
{

    const ENGLISH = 1;
    const METRIC = 2;
    const DIGITAL = 3;

    /**
     * Units information array. Units are grouped into measuring systems
     * (English, Metric), and are assigned an integer representing
     * the conversion factor between that unit and the smallest unit in
     * the system. Numeric indexes are actually magical constants that
     * encode conversion data from one system to the next, with a O(n^2)
     * constraint on memory (this is generally not a problem, since
     * the number of measuring systems is small.)
     */
    protected static $units = array(
        self::ENGLISH => array(
            'px' => 3, // This is as per CSS 2.1 and Firefox. Your mileage may vary
            'pt' => 4,
            'pc' => 48,
            'in' => 288,
            self::METRIC => array('pt', '0.352777778', 'mm'),
        ),
        self::METRIC => array(
            'mm' => 1,
            'cm' => 10,
            self::ENGLISH => array('mm', '2.83464567', 'pt'),
        ),
    );

    /**
     * Minimum bcmath precision for output.
     */
    protected $outputPrecision;

    /**
     * Bcmath precision for internal calculations.
     */
    protected $internalPrecision;

    /**
     * Whether or not BCMath is available
     */
    private $bcmath;

    /**
     * @param $output_precision Minimum number of significant figures in output
     * @param $internal_precision Scale used for intermediate calculations
     * @param $force_no_bcmath If true, use native float arithmetic even
     *        when the bcmath functions exist
     */
    public function __construct($output_precision = 4, $internal_precision = 10, $force_no_bcmath = false)
    {
        $this->outputPrecision   = $output_precision;
        $this->internalPrecision = $internal_precision;
        $this->bcmath = !$force_no_bcmath && function_exists('bcmul');
    }

    /**
     * Converts a length object of one unit into another unit.
     * @param HTMLPurifier_Length $length
     *      Instance of HTMLPurifier_Length to convert. You must validate()
     *      it before passing it here!
     * @param string $to_unit
     *      Unit to convert to.
     * @return Converted HTMLPurifier_Length, or false if the length is
     *      invalid or either unit is not in the units table
     * @note
     *      About precision: This conversion function pays very special
     *      attention to the incoming precision of values and attempts
     *      to maintain a number of significant figure. Results are
     *      fairly accurate up to nine digits. Some caveats:
     *          - If a number is zero-padded as a result of this significant
     *            figure tracking, the zeroes will be eliminated.
     *          - If a number contains less than four sigfigs ($outputPrecision)
     *            and this causes some decimals to be excluded, those
     *            decimals will be added on.
     */
    public function convert($length, $to_unit)
    {
        if (!$length->isValid()) {
            return false;
        }

        $value = $length->getN();
        $unit  = $length->getUnit();

        if ($value === '0' || $unit === false) {
            return new HTMLPurifier_Length('0', false);
        }

        // Locate the measuring system of the source and target units
        $system = false;
        $target_system = false;
        foreach (self::$units as $id => $table) {
            if (isset($table[$unit])) {
                $system = $id;
            }
            if (isset($table[$to_unit])) {
                $target_system = $id;
            }
        }
        if (!$system || !$target_system) {
            return false;
        }

        // Some calculations about the initial precision of the number;
        // this will be useful when we need to do final rounding.
        $sigfigs = $this->getSigFigs($value);
        if ($sigfigs < $this->outputPrecision) {
            $sigfigs = $this->outputPrecision;
        }

        // BCMath's internal precision deals only with decimals. Use
        // our default if the initial number has no decimals, or increase
        // it by how ever many decimals, thus, the number of guard digits
        // will always be greater than or equal to internalPrecision.
        $magnitude = (int) floor(log(abs($value), 10));
        $scale = $this->internalPrecision;
        if ($magnitude < 0) {
            $scale = $this->internalPrecision - $magnitude;
        }

        // At most two passes: one inside the source system, and, after a
        // system shift, one inside the target system.
        for ($pass = 0; $pass < 2; $pass++) {

            $same_system = ($target_system === $system);

            // Determine what unit IN THIS SYSTEM we need to convert to:
            // the requested unit, or the smallest unit pending a system shift
            if ($same_system) {
                $step_unit = $to_unit;
            } else {
                $step_unit = self::$units[$system][$target_system][0];
            }

            // Do the conversion if necessary
            if ($step_unit !== $unit) {
                $factor = $this->div(self::$units[$system][$unit], self::$units[$system][$step_unit], $scale);
                $value = $this->mul($value, $factor, $scale);
                $unit = $step_unit;
            }

            // Output was zero, so bail out early. Shouldn't ever happen.
            if ($value === '') {
                $value = '0';
                $unit = $to_unit;
                break;
            }

            // It was a simple conversion, so bail out
            if ($same_system) {
                break;
            }

            if ($pass !== 0) {
                // Conversion failed! Apparently, the system we forwarded
                // to didn't have this unit. This should never happen!
                return false;
            }

            // Pre-condition: $pass == 0

            // Perform conversion to next system of units
            $bridge = self::$units[$system][$target_system];
            $value  = $this->mul($value, $bridge[1], $scale);
            $unit   = $bridge[2];
            $system = $target_system;

            // One more loop around to convert the unit in the new system.
        }

        // Post-condition: $unit == $to_unit
        if ($unit !== $to_unit) {
            return false;
        }

        $value = $this->round($value, $sigfigs);
        // strip trailing zeroes after a decimal point, then a dangling point
        if (strpos($value, '.') !== false) {
            $value = rtrim($value, '0');
        }
        $value = rtrim($value, '.');

        return new HTMLPurifier_Length($value, $unit);
    }

    /**
     * Returns the number of significant figures in a string number.
     * @param string $n Decimal number
     * @return int number of sigfigs
     */
    public function getSigFigs($n)
    {
        $digits = ltrim($n, '0+-');
        $point = strpos($digits, '.'); // decimal position
        if ($point === false) {
            // integer: trailing zeroes are not significant
            return strlen(rtrim($digits, '0'));
        }
        $count = strlen(ltrim($digits, '0.'));
        // when digits precede the point, the point itself was counted
        return ($point !== 0) ? $count - 1 : $count;
    }

    /**
     * Adds two numbers, using arbitrary precision when available.
     */
    private function add($s1, $s2, $scale)
    {
        if ($this->bcmath) {
            return bcadd($s1, $s2, $scale);
        }
        return $this->scale($s1 + $s2, $scale);
    }

    /**
     * Multiples two numbers, using arbitrary precision when available.
     */
    private function mul($s1, $s2, $scale)
    {
        if ($this->bcmath) {
            return bcmul($s1, $s2, $scale);
        }
        return $this->scale($s1 * $s2, $scale);
    }

    /**
     * Divides two numbers, using arbitrary precision when available.
     */
    private function div($s1, $s2, $scale)
    {
        if ($this->bcmath) {
            return bcdiv($s1, $s2, $scale);
        }
        return $this->scale($s1 / $s2, $scale);
    }

    /**
     * Rounds a number according to the number of sigfigs it should have,
     * using arbitrary precision when available.
     */
    private function round($n, $sigfigs)
    {
        $new_log = (int) floor(log(abs($n), 10)); // Number of digits left of decimal - 1
        $places  = $sigfigs - $new_log - 1;       // Number of decimal places needed
        $sign    = $n < 0 ? '-' : '';             // Negative sign

        if (!$this->bcmath) {
            return $this->scale(round($n, $places), $places + 1);
        }

        if ($places >= 0) {
            // add half a unit in the last kept place, then truncate
            $n = bcadd($n, $sign . '0.' . str_repeat('0', $places) . '5', $places + 1);
            $n = bcdiv($n, '1', $places);
        } else {
            // This algorithm partially depends on the standardized
            // form of numbers that comes out of bcmath.
            $n = bcadd($n, $sign . '5' . str_repeat('0', $new_log - $sigfigs), 0);
            $n = substr($n, 0, $sigfigs + strlen($sign)) . str_repeat('0', $new_log - $sigfigs + 1);
        }
        return $n;
    }

    /**
     * Scales a float to $scale digits right of decimal point, like BCMath.
     */
    private function scale($r, $scale)
    {
        if ($scale >= 0) {
            return sprintf('%.' . $scale . 'f', (float) $r);
        }
        // The f sprintf type doesn't support negative numbers, so we
        // need to cludge things manually. First get the string.
        $r = sprintf('%.0f', (float) $r);
        // Due to floating point precision loss, $r will more than likely
        // look something like 4652999999999.9234. We grab one more digit
        // than we need to precise from $r and then use that to round
        // appropriately.
        $precise = (string) round(substr($r, 0, strlen($r) + $scale), -1);
        // Now we return it, truncating the zero that was rounded off.
        return substr($precise, 0, -1) . str_repeat('0', -$scale + 1);
    }

}
/**
 * Parses string representations into their corresponding native PHP
 * variable type. The base implementation does a simple type-check.
 */
class HTMLPurifier_VarParser
{

    const STRING    = 1;
    const ISTRING   = 2;
    const TEXT      = 3;
    const ITEXT     = 4;
    const INT       = 5;
    const FLOAT     = 6;
    const BOOL      = 7;
    const LOOKUP    = 8;
    const ALIST     = 9;
    const HASH      = 10;
    const MIXED     = 11;

    /**
     * Lookup table of allowed types. Mainly for backwards compatibility, but
     * also convenient for transforming string type names to the integer constants.
     */
    static public $types = array(
        'string'    => self::STRING,
        'istring'   => self::ISTRING,
        'text'      => self::TEXT,
        'itext'     => self::ITEXT,
        'int'       => self::INT,
        'float'     => self::FLOAT,
        'bool'      => self::BOOL,
        'lookup'    => self::LOOKUP,
        'list'      => self::ALIST,
        'hash'      => self::HASH,
        'mixed'     => self::MIXED
    );

    /**
     * Lookup table of types that are string, and can have aliases or
     * allowed value lists.
     */
    static public $stringTypes = array(
        self::STRING    => true,
        self::ISTRING   => true,
        self::TEXT      => true,
        self::ITEXT     => true,
    );

    /**
     * Validate a variable according to type. Throws
     * HTMLPurifier_VarParserException if invalid.
     * It may return NULL as a valid type if $allow_null is true.
     *
     * @param $var Variable to validate
     * @param $type Type of variable, see HTMLPurifier_VarParser->types
     * @param $allow_null Whether or not to permit null as a value
     * @return Validated and type-coerced variable
     */
    final public function parse($var, $type, $allow_null = false)
    {
        // translate a type name into its integer constant
        if (is_string($type)) {
            if (!isset(HTMLPurifier_VarParser::$types[$type])) {
                throw new HTMLPurifier_VarParserException("Invalid type '$type'");
            }
            $type = HTMLPurifier_VarParser::$types[$type];
        }

        $var = $this->parseImplementation($var, $type, $allow_null);
        if ($allow_null && $var === null) {
            return null;
        }

        // These are basic checks, to make sure nothing horribly wrong
        // happened in our implementations. A case that accepts the value
        // returns it; anything else falls out to the generic error.
        switch ($type) {
            case self::STRING:
            case self::ISTRING:
            case self::TEXT:
            case self::ITEXT:
                if (is_string($var)) {
                    // case-insensitive variants are normalized to lower case
                    if ($type == self::ISTRING || $type == self::ITEXT) {
                        $var = strtolower($var);
                    }
                    return $var;
                }
                break;
            case self::INT:
                if (is_int($var)) {
                    return $var;
                }
                break;
            case self::FLOAT:
                if (is_float($var)) {
                    return $var;
                }
                break;
            case self::BOOL:
                if (is_bool($var)) {
                    return $var;
                }
                break;
            case self::LOOKUP:
            case self::ALIST:
            case self::HASH:
                if (is_array($var)) {
                    if ($type === self::LOOKUP) {
                        foreach ($var as $flag) {
                            if ($flag !== true) {
                                $this->error('Lookup table contains value other than true');
                            }
                        }
                    } elseif ($type === self::ALIST) {
                        // a list must be indexed 0..n-1 in order
                        $keys = array_keys($var);
                        if (array_keys($keys) !== $keys) {
                            $this->error('Indices for list are not uniform');
                        }
                    }
                    return $var;
                }
                break;
            case self::MIXED:
                return $var;
            default:
                $this->errorInconsistent(get_class($this), $type);
        }
        $this->errorGeneric($var, $type);
    }

    /**
     * Actually implements the parsing. Base implementation is to not
     * do anything to $var. Subclasses should overload this!
     */
    protected function parseImplementation($var, $type, $allow_null)
    {
        return $var;
    }

    /**
     * Throws an exception.
     */
    protected function error($msg)
    {
        throw new HTMLPurifier_VarParserException($msg);
    }

    /**
     * Throws an inconsistency exception.
     * @note This should not ever be called. It would be called if we
     *       extend the allowed values of HTMLPurifier_VarParser without
     *       updating subclasses.
     */
    protected function errorInconsistent($class, $type)
    {
        throw new HTMLPurifier_Exception("Inconsistency in $class: ".HTMLPurifier_VarParser::getTypeName($type)." not implemented");
    }

    /**
     * Generic error for if a type didn't work.
     */
    protected function errorGeneric($var, $type)
    {
        $actual = gettype($var);
        $this->error("Expected type ".HTMLPurifier_VarParser::getTypeName($type).", got $actual");
    }

    /**
     * Maps a type constant back to its string name.
     * @param $type Integer type constant
     * @return Name from $types, or 'unknown' if the constant is not listed
     */
    static public function getTypeName($type)
    {
        static $lookup;
        if (!$lookup) {
            // Lazy load the alternative lookup table
            $lookup = array_flip(HTMLPurifier_VarParser::$types);
        }
        return isset($lookup[$type]) ? $lookup[$type] : 'unknown';
    }

}
/**
 * Exception type for HTMLPurifier_VarParser
 */
class HTMLPurifier_VarParserException extends HTMLPurifier_Exception
{
    // Intentionally empty: a distinct type so callers can catch
    // variable-parsing failures separately.
}
7967 * Validates the HTML attribute style, otherwise known as CSS.
7968 * @note We don't implement the whole CSS specification, so it might be
7969 * difficult to reuse this component in the context of validating
7970 * actual stylesheet declarations.
7971 * @note If we were really serious about validating the CSS, we would
7972 * tokenize the styles and then parse the tokens. Obviously, we
7973 * are not doing that. Doing that could seriously harm performance,
7974 * but would make these components a lot more viable for a CSS
7975 * filtering solution.
class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
{

    /**
     * Filters the contents of a style attribute one declaration at a time.
     *
     * @param $css Raw value of the style attribute
     * @param $config Configuration object; supplies the CSS definition
     * @param $context Context object; receives 'CurrentCSSProperty'
     * @return Rebuilt "prop:value;" list, or false if no declaration survived
     */
    public function validate($css, $config, $context)
    {
        $css = $this->parseCDATA($css);
        $definition = $config->getCSSDefinition();

        // Splitting on semicolons breaks the spec: a semicolon inside an
        // escape or a URI would be cut in two (see
        // HTMLPurifier_AttrDef_CSSURI). It is flaky but fast, and such
        // input is rare.
        $accepted = array();

        // Name of the current CSS property being validated.
        // NOTE(review): presumably register() binds $property by reference,
        // so the assignments below are visible through the context - confirm.
        $property = false;
        $context->register('CurrentCSSProperty', $property);

        foreach (explode(';', $css) as $declaration) {
            if (!$declaration) {
                continue;
            }
            // A colon at offset 0 (empty property name) is skipped exactly
            // like a missing colon.
            if (!strpos($declaration, ':')) {
                continue;
            }
            list($property, $value) = explode(':', $declaration, 2);
            $property = trim($property);
            $value = trim($value);

            if (!isset($definition->info[$property])) {
                // Unknown as written: retry in lowercase, unless the name
                // is already lowercase.
                if (ctype_lower($property)) {
                    continue;
                }
                $property = strtolower($property);
                if (!isset($definition->info[$property])) {
                    continue;
                }
            }

            // 'inherit' works for everything (but only on the base
            // property), so the property's own validator is bypassed.
            if (strtolower(trim($value)) === 'inherit') {
                $result = 'inherit';
            } else {
                $result = $definition->info[$property]->validate($value, $config, $context);
            }
            if ($result === false) {
                continue;
            }
            $accepted[$property] = $result;
        }

        $context->destroy('CurrentCSSProperty');

        // Output is assembled only after all declarations are processed;
        // keying by property name is what removes duplicates (last wins).
        $new_declarations = '';
        foreach ($accepted as $prop => $value) {
            $new_declarations .= "$prop:$value;";
        }

        return $new_declarations ? $new_declarations : false;
    }

}
8054 // Enum = Enumerated
8056 * Validates a keyword against a list of valid values.
8057 * @warning The case-insensitive compare of this function uses PHP's
8058 * built-in strtolower and ctype_lower functions, which may
8059 * cause problems with international comparisons
class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
{

    /**
     * Lookup table of valid values; the keywords are the array keys.
     * @todo Make protected
     */
    public $valid_values = array();

    /**
     * Bool indicating whether or not enumeration is case sensitive.
     * @note In general this is always case insensitive.
     */
    protected $case_sensitive = false; // values according to W3C spec

    /**
     * @param $valid_values List of valid values
     * @param $case_sensitive Bool indicating whether or not case sensitive
     */
    public function __construct($valid_values = array(), $case_sensitive = false)
    {
        // Flipped so that membership becomes an isset() check on the key.
        $this->valid_values = array_flip($valid_values);
        $this->case_sensitive = $case_sensitive;
    }

    /**
     * @param $string Candidate keyword
     * @return The trimmed (and, if case-insensitive, lowercased) keyword,
     *         or false if it is not in the list
     */
    public function validate($string, $config, $context)
    {
        $keyword = trim($string);
        if (!$this->case_sensitive) {
            // we may want to do full case-insensitive libraries
            $keyword = strtolower($keyword);
        }
        if (!isset($this->valid_values[$keyword])) {
            return false;
        }
        return $keyword;
    }

    /**
     * @param $string In form of comma-delimited list of case-insensitive
     *      valid values. Example: "foo,bar,baz". Prepend "s:" to make
     *      case sensitive
     */
    public function make($string)
    {
        // The "s:" marker only counts when something follows it.
        $sensitive = strlen($string) > 2 && substr($string, 0, 2) === 's:';
        if ($sensitive) {
            $string = substr($string, 2);
        }
        return new HTMLPurifier_AttrDef_Enum(explode(',', $string), $sensitive);
    }

}
8121 * Validates an integer.
8122 * @note While this class was modeled off the CSS definition, no currently
8123 * allowed CSS uses this type. The properties that do are: widows,
8124 * orphans, z-index, counter-increment, counter-reset. Some of the
8125 * HTML attributes, however, find use for a non-negative version of this.
class HTMLPurifier_AttrDef_Integer extends HTMLPurifier_AttrDef
{

    /**
     * Bool indicating whether or not negative values are allowed
     */
    protected $negative = true;

    /**
     * Bool indicating whether or not zero is allowed
     */
    protected $zero = true;

    /**
     * Bool indicating whether or not positive values are allowed
     */
    protected $positive = true;

    /**
     * @param $negative Bool indicating whether or not negative values are allowed
     * @param $zero Bool indicating whether or not zero is allowed
     * @param $positive Bool indicating whether or not positive values are allowed
     */
    public function __construct($negative = true, $zero = true, $positive = true)
    {
        $this->negative = $negative;
        $this->zero = $zero;
        $this->positive = $positive;
    }

    /**
     * @param $integer Candidate value
     * @return The integer as a string ("+" removed, "-0" reduced to "0"),
     *         or false if it is malformed or outside the permitted signs
     */
    public function validate($integer, $config, $context)
    {
        $integer = $this->parseCDATA($integer);
        if ($integer === '') {
            return false;
        }

        // A plain typecast is not used: certain fringe inputs must be
        // rejected rather than coerced to an integer.

        // Separate the digits from a leading sign. A sign that is not
        // permitted stays in $digits and fails the digit test below.
        $lead = $integer[0];
        if ($this->negative && $lead === '-') {
            $digits = substr($integer, 1);
            if ($digits === '0') {
                $integer = '0'; // rm minus sign for zero
            }
        } elseif ($this->positive && $lead === '+') {
            $integer = substr($integer, 1); // rm unnecessary plus
            $digits = $integer;
        } else {
            $digits = $integer;
        }

        // test if it's numeric
        if (!ctype_digit($digits)) {
            return false;
        }

        // perform scope tests
        if (!$this->zero && $integer == 0) {
            return false;
        }
        if (!$this->positive && $integer > 0) {
            return false;
        }
        if (!$this->negative && $integer < 0) {
            return false;
        }

        return $integer;
    }

}
8195 * Validates the HTML attribute lang, effectively a language code.
8196 * @note Built according to RFC 3066, which obsoleted RFC 1766
class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
{

    /**
     * Validates a language code subtag by subtag.
     *
     * Validation stops at the first malformed subtag after the primary one
     * and returns what was accepted so far; a malformed primary subtag
     * rejects the whole value.
     *
     * @param $string Candidate language code
     * @return Lowercased language code (possibly truncated), or false
     */
    public function validate($string, $config, $context)
    {
        $string = trim($string);
        if (!$string) {
            return false;
        }

        $subtags = explode('-', $string);
        $num_subtags = count($subtags);

        // Primary subtag: the single letters "x" or "i", or a two/three
        // letter alphabetic code. Anything else is rejected.
        $primary = $subtags[0];
        $length = strlen($primary);
        if ($length == 1) {
            if ($primary != 'x' && $primary != 'i') {
                return false;
            }
        } elseif ($length == 2 || $length == 3) {
            if (!ctype_alpha($primary)) {
                return false;
            }
            $primary = strtolower($primary);
        } else {
            return false;
        }

        $new_string = $primary;
        if ($num_subtags == 1) {
            return $new_string;
        }

        // Second subtag: 2-8 alphanumerics, or the single letter "x".
        $second = $subtags[1];
        $length = strlen($second);
        if ($length == 0 || ($length == 1 && $second != 'x') || $length > 8 || !ctype_alnum($second)) {
            return $new_string;
        }
        $new_string .= '-' . strtolower($second);
        if ($num_subtags == 2) {
            return $new_string;
        }

        // All other subtags (index 2 and up): 1-8 alphanumerics.
        foreach (array_slice($subtags, 2) as $subtag) {
            $length = strlen($subtag);
            if ($length == 0 || $length > 8 || !ctype_alnum($subtag)) {
                return $new_string;
            }
            $new_string .= '-' . strtolower($subtag);
        }

        return $new_string;
    }

}
8269 * Decorator that, depending on a token, switches between two definitions.
class HTMLPurifier_AttrDef_Switch
{

    protected $tag;
    protected $withTag, $withoutTag;

    /**
     * @param string $tag Tag name to switch upon
     * @param HTMLPurifier_AttrDef $with_tag Call if token matches tag
     * @param HTMLPurifier_AttrDef $without_tag Call if token doesn't match, or there is no token
     */
    public function __construct($tag, $with_tag, $without_tag)
    {
        $this->tag = $tag;
        $this->withTag = $with_tag;
        $this->withoutTag = $without_tag;
    }

    /**
     * Delegates to one of the two wrapped definitions, chosen by comparing
     * the name of the context's 'CurrentToken' with the configured tag.
     */
    public function validate($string, $config, $context)
    {
        $token = $context->get('CurrentToken', true);
        $matches = $token && $token->name === $this->tag;
        $delegate = $matches ? $this->withTag : $this->withoutTag;
        return $delegate->validate($string, $config, $context);
    }

}
8304 * Validates arbitrary text according to the HTML spec.
class HTMLPurifier_AttrDef_Text extends HTMLPurifier_AttrDef
{

    /**
     * Accepts any text; the only processing applied is parseCDATA().
     */
    public function validate($string, $config, $context)
    {
        $text = $this->parseCDATA($string);
        return $text;
    }

}
8320 * Validates a URI as defined by RFC 3986.
8321 * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
{

    protected $parser;
    protected $embedsResource;

    /**
     * @param $embeds_resource Does the URI here result in an extra HTTP request?
     */
    public function __construct($embeds_resource = false)
    {
        $this->parser = new HTMLPurifier_URIParser();
        $this->embedsResource = (bool) $embeds_resource;
    }

    /**
     * @param $string Any truthy value yields a resource-embedding validator
     */
    public function make($string)
    {
        return new HTMLPurifier_AttrDef_URI((bool) $string);
    }

    /**
     * Parses the URI and runs it through generic validation, the URI
     * definition's filters, scheme-specific validation and the post-filters.
     *
     * @return The URI re-serialised as a string, or false if URIs are
     *         disabled, parsing fails or any stage rejects the URI
     */
    public function validate($uri, $config, $context)
    {
        if ($config->get('URI.Disable')) {
            return false;
        }

        $uri = $this->parseCDATA($uri);

        // parse the URI
        $uri = $this->parser->parse($uri);
        if ($uri === false) {
            return false;
        }

        // add embedded flag to context for validators
        $context->register('EmbeddedURI', $this->embedsResource);

        // Each stage runs only if every earlier stage passed. The context
        // entry is always destroyed afterwards.

        // generic validation
        $ok = (bool) $uri->validate($config, $context);

        if ($ok) {
            // chained filtering
            $uri_def = $config->getDefinition('URI');
            $ok = (bool) $uri_def->filter($uri, $config, $context);
        }

        if ($ok) {
            // scheme-specific validation; an embedded resource additionally
            // requires the scheme to be browsable
            $scheme_obj = $uri->getSchemeObj($config, $context);
            $ok = $scheme_obj && !($this->embedsResource && !$scheme_obj->browsable);
        }

        if ($ok) {
            $ok = (bool) $scheme_obj->validate($uri, $config, $context);
        }

        if ($ok) {
            // Post chained filtering
            $ok = (bool) $uri_def->postFilter($uri, $config, $context);
        }

        $context->destroy('EmbeddedURI');
        if (!$ok) {
            return false;
        }

        // back to string
        return $uri->toString();
    }

}
8398 * Validates a number as defined by the CSS spec.
class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
{

    /**
     * Bool indicating whether negative values are rejected.
     */
    protected $non_negative = false;

    /**
     * @param $non_negative Bool indicating whether negatives are forbidden
     */
    public function __construct($non_negative = false)
    {
        $this->non_negative = $non_negative;
    }

    /**
     * Validates and normalises a CSS number: a leading "+" is removed,
     * leading integer zeros and trailing fractional zeros are stripped.
     *
     * @warning Some contexts do not pass $config, $context. These
     *          variables should not be used without checking HTMLPurifier_Length
     * @return Normalised number as a string, or false if malformed
     */
    public function validate($number, $config, $context)
    {
        $number = $this->parseCDATA($number);

        if ($number === '') {
            return false;
        }
        if ($number === '0') {
            return '0';
        }

        // Peel off the sign; only a minus is remembered for the output.
        $sign = '';
        $lead = $number[0];
        if ($lead === '-') {
            if ($this->non_negative) {
                return false;
            }
            $sign = '-';
            $number = substr($number, 1);
        } elseif ($lead === '+') {
            $number = substr($number, 1);
        }

        // Pure integer.
        if (ctype_digit($number)) {
            $number = ltrim($number, '0');
            return $number ? $sign . $number : '0';
        }

        // Period is the only non-numeric character allowed
        if (strpos($number, '.') === false) {
            return false;
        }

        list($left, $right) = explode('.', $number, 2);

        // A lone "." is invalid; either side may otherwise be empty.
        if ($left === '' && $right === '') {
            return false;
        }
        if ($left !== '' && !ctype_digit($left)) {
            return false;
        }

        $left = ltrim($left, '0');
        $right = rtrim($right, '0');

        if ($right === '') {
            // The fraction was all zeros: the value is an integer.
            return $left ? $sign . $left : '0';
        }
        if (!ctype_digit($right)) {
            return false;
        }

        return $sign . $left . '.' . $right;
    }

}
class HTMLPurifier_AttrDef_CSS_AlphaValue extends HTMLPurifier_AttrDef_CSS_Number
{

    public function __construct()
    {
        // opacity is non-negative, but negatives are accepted by the
        // parent and clamped in validate() instead of being rejected
        parent::__construct(false);
    }

    /**
     * Validates a number and clamps it to the range 0..1.
     *
     * @return '0' or '1' when out of range, otherwise the parent's result;
     *         false if the parent rejects the value
     */
    public function validate($number, $config, $context)
    {
        $result = parent::validate($number, $config, $context);
        if ($result === false) {
            return $result;
        }
        $float = (float) $result;
        if ($float < 0.0) {
            return '0';
        }
        if ($float > 1.0) {
            return '1';
        }
        return $result;
    }

}
8490 * Validates shorthand CSS property background.
8491 * @warning Does not support url tokens that have internal spaces.
class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
{

    /**
     * Local copy of component validators.
     * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
     */
    protected $info;

    /**
     * @param $config Configuration object; the component validators are
     *                copied from its CSS definition
     */
    public function __construct($config)
    {
        $def = $config->getCSSDefinition();
        $this->info['background-color'] = $def->info['background-color'];
        $this->info['background-image'] = $def->info['background-image'];
        $this->info['background-repeat'] = $def->info['background-repeat'];
        $this->info['background-attachment'] = $def->info['background-attachment'];
        $this->info['background-position'] = $def->info['background-position'];
    }

    /**
     * Validates the background shorthand by offering each space-separated
     * component to the colour, image, repeat and attachment validators in
     * turn. Whatever none of them accepts is collected and validated as a
     * background-position value.
     *
     * @param $string Candidate shorthand value
     * @return Space-joined accepted components, or false if none was accepted
     */
    public function validate($string, $config, $context)
    {
        // regular pre-processing
        $string = $this->parseCDATA($string);
        if ($string === '') {
            return false;
        }

        // munge rgb() decl if necessary
        $string = $this->mungeRgb($string);

        // assumes URI doesn't have spaces in it
        $bits = explode(' ', $string); // bits to process

        $caught = array(
            'color'      => false,
            'image'      => false,
            'repeat'     => false,
            'attachment' => false,
            'position'   => false,
        );

        $i = 0; // number of catches

        foreach ($bits as $bit) {
            if ($bit === '') {
                continue;
            }
            // Components are matched in lowercase, with one exception: the
            // body of a url(...) token keeps its case, because lowercasing
            // a case-sensitive URL would point it at a different resource.
            $lower = strtolower($bit);
            foreach ($caught as $key => $status) {
                if ($key != 'position') {
                    if ($status !== false) {
                        continue;
                    }
                    if ($key == 'image' && strncmp($lower, 'url(', 4) === 0) {
                        // normalise only the function name
                        $candidate = 'url(' . substr($bit, 4);
                    } else {
                        $candidate = $lower;
                    }
                    $r = $this->info['background-' . $key]->validate($candidate, $config, $context);
                } else {
                    // position pieces are validated together after the loop
                    $r = $lower;
                }
                if ($r === false) {
                    continue;
                }
                if ($key == 'position') {
                    if ($caught[$key] === false) {
                        $caught[$key] = '';
                    }
                    $caught[$key] .= $r . ' ';
                } else {
                    $caught[$key] = $r;
                }
                $i++;
                break;
            }
        }

        if (!$i) {
            return false;
        }
        if ($caught['position'] !== false) {
            $caught['position'] = $this->info['background-position']->
                validate($caught['position'], $config, $context);
        }

        $ret = array();
        foreach ($caught as $value) {
            if ($value === false) {
                continue;
            }
            $ret[] = $value;
        }

        if (empty($ret)) {
            return false;
        }
        return implode(' ', $ret);
    }

}
8578 [ // adjective and number must be in correct order, even if
8579 // you could switch them without introducing ambiguity.
8580 // some browsers support that syntax
8582 <percentage> | <length> | left | center | right
8585 <percentage> | <length> | top | center | bottom
8588 [ // this signifies that the vertical and horizontal adjectives
8589 // can be arbitrarily ordered, however, there can only be two,
8590 // one of each, or none at all
8592 left | center | right
8595 top | center | bottom
8599 center, (none) = 50%
8600 bottom, right = 100%
8604 keyword + length/percentage must be ordered correctly, as per W3C
8606 Internet Explorer and Opera, however, support arbitrary ordering. We
8609 Minor issue though, not strictly necessary.
8612 // control freaks may appreciate the ability to convert these to
8613 // percentages or something, but it's not necessary
8616 * Validates the value of background-position.
class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef
{

    protected $length;
    protected $percentage;

    public function __construct()
    {
        $this->length = new HTMLPurifier_AttrDef_CSS_Length();
        $this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage();
    }

    /**
     * Sorts each space-separated piece into keywords or measures, then
     * emits at most one horizontal and one vertical component.
     *
     * @param $string Candidate background-position value
     * @return Normalised position, or false if nothing valid was found
     */
    public function validate($string, $config, $context)
    {
        $string = $this->parseCDATA($string);
        $bits = explode(' ', $string);

        // keyword => axis; 'c' (center) is assigned to an axis below
        static $axis_of = array(
            'top'    => 'v',
            'bottom' => 'v',
            'left'   => 'h',
            'right'  => 'h',
            'center' => 'c'
        );

        $keywords = array(
            'h'  => false, // left, right
            'v'  => false, // top, bottom
            'ch' => false, // center (first word)
            'cv' => false, // center (second word)
        );
        $measures = array();

        $i = 0; // number of recognised pieces so far

        foreach ($bits as $bit) {
            if ($bit === '') {
                continue;
            }

            // test for keyword
            $lbit = strtolower($bit);
            if (isset($axis_of[$lbit])) {
                $status = $axis_of[$lbit];
                if ($status == 'c') {
                    // a leading "center" is horizontal, a later one vertical
                    $status = ($i == 0) ? 'ch' : 'cv';
                }
                $keywords[$status] = $lbit;
                $i++;
            }

            // test for length
            $r = $this->length->validate($bit, $config, $context);
            if ($r !== false) {
                $measures[] = $r;
                $i++;
            }

            // test for percentage
            $r = $this->percentage->validate($bit, $config, $context);
            if ($r !== false) {
                $measures[] = $r;
                $i++;
            }
        }

        if (!$i) {
            return false; // no valid values were caught
        }

        $ret = array();

        // first (horizontal) component
        if ($keywords['h']) {
            $ret[] = $keywords['h'];
        } elseif ($keywords['ch']) {
            $ret[] = $keywords['ch'];
            $keywords['cv'] = false; // prevent re-use: center = center center
        } elseif (count($measures)) {
            $ret[] = array_shift($measures);
        }

        // second (vertical) component
        if ($keywords['v']) {
            $ret[] = $keywords['v'];
        } elseif ($keywords['cv']) {
            $ret[] = $keywords['cv'];
        } elseif (count($measures)) {
            $ret[] = array_shift($measures);
        }

        if (empty($ret)) {
            return false;
        }
        return implode(' ', $ret);
    }

}
8712 * Validates the border property as defined by CSS.
class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
{

    /**
     * Local copy of properties this property is shorthand for.
     */
    protected $info = array();

    /**
     * @param $config Configuration object; the width, style and colour
     *                validators are copied from its CSS definition
     */
    public function __construct($config)
    {
        $def = $config->getCSSDefinition();
        $this->info['border-width'] = $def->info['border-width'];
        $this->info['border-style'] = $def->info['border-style'];
        $this->info['border-top-color'] = $def->info['border-top-color'];
    }

    /**
     * Validates the border shorthand: each space-separated piece is offered
     * to the component validators that have not matched yet, and pieces
     * that none of them accepts are dropped.
     *
     * @param $string Candidate shorthand value
     * @return Space-joined accepted components, or false if no piece was
     *         accepted. Returning false instead of an empty string keeps
     *         callers that test "=== false" from emitting an empty value.
     */
    public function validate($string, $config, $context)
    {
        $string = $this->parseCDATA($string);
        $string = $this->mungeRgb($string);
        $bits = explode(' ', $string);
        $done = array(); // segments we've finished
        $ret = ''; // return value
        foreach ($bits as $bit) {
            foreach ($this->info as $propname => $validator) {
                if (isset($done[$propname])) {
                    continue;
                }
                $r = $validator->validate($bit, $config, $context);
                if ($r !== false) {
                    $ret .= $r . ' ';
                    $done[$propname] = true;
                    break;
                }
            }
        }
        $ret = rtrim($ret);
        return $ret === '' ? false : $ret;
    }

}
8756 * Validates Color as defined by CSS.
class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
{

    /**
     * Validates a colour given as a keyword, an rgb() triad or a
     * hexadecimal value (with or without the leading '#').
     *
     * @param $color Candidate colour
     * @return The keyword's mapped value, a normalised "rgb(r,g,b)" string
     *         or a '#'-prefixed hex colour; false if malformed
     */
    public function validate($color, $config, $context)
    {
        // NOTE(review): the keyword table is cached in a function static,
        // so it is read from the first $config seen and then reused for
        // every later call - confirm this is acceptable when several
        // configurations are in use.
        static $colors = null;
        if ($colors === null) {
            $colors = $config->get('Core.ColorKeywords');
        }

        $color = trim($color);
        if ($color === '') {
            return false;
        }

        $lower = strtolower($color);
        if (isset($colors[$lower])) {
            return $colors[$lower];
        }

        // The triad is cut out at fixed offset 4, so "rgb(" has to be the
        // very start of the value. Testing for it anywhere in the string
        // let values such as "x rgb(1,2,3)" through with a garbage triad.
        if (strpos($color, 'rgb(') === 0) {
            // rgb literal handling
            $length = strlen($color);
            // the first ')' must also be the last character
            if (strpos($color, ')') !== $length - 1) {
                return false;
            }
            $triad = substr($color, 4, $length - 4 - 1);
            $parts = explode(',', $triad);
            if (count($parts) !== 3) {
                return false;
            }
            $type = false; // to ensure that they're all the same type
            $new_parts = array();
            foreach ($parts as $part) {
                $part = trim($part);
                if ($part === '') {
                    return false;
                }
                $length = strlen($part);
                if ($part[$length - 1] === '%') {
                    // handle percents
                    if (!$type) {
                        $type = 'percentage';
                    } elseif ($type !== 'percentage') {
                        return false;
                    }
                    $num = (float) substr($part, 0, $length - 1);
                    if ($num < 0) {
                        $num = 0;
                    }
                    if ($num > 100) {
                        $num = 100;
                    }
                    $new_parts[] = "$num%";
                } else {
                    // handle integers
                    if (!$type) {
                        $type = 'integer';
                    } elseif ($type !== 'integer') {
                        return false;
                    }
                    $num = (int) $part;
                    if ($num < 0) {
                        $num = 0;
                    }
                    if ($num > 255) {
                        $num = 255;
                    }
                    $new_parts[] = (string) $num;
                }
            }
            $new_triad = implode(',', $new_parts);
            $color = "rgb($new_triad)";
        } else {
            // hexadecimal handling
            if ($color[0] === '#') {
                $hex = substr($color, 1);
            } else {
                $hex = $color;
                $color = '#' . $color;
            }
            $length = strlen($hex);
            if ($length !== 3 && $length !== 6) {
                return false;
            }
            if (!ctype_xdigit($hex)) {
                return false;
            }
        }

        return $color;
    }

}
8835 * Allows multiple validators to attempt to validate attribute.
8837 * Composite is just what it sounds like: a composite of many validators.
8838 * This means that multiple HTMLPurifier_AttrDef objects will have a whack
8839 * at the string. If one of them passes, that's what is returned. This is
8840 * especially useful for CSS values, which often are a choice between
8841 * an enumerated set of predefined values or a flexible data type.
class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef
{

    /**
     * List of HTMLPurifier_AttrDef objects that may process strings
     * @todo Make protected
     */
    public $defs;

    /**
     * @param $defs List of HTMLPurifier_AttrDef objects
     */
    public function __construct($defs)
    {
        $this->defs = $defs;
    }

    /**
     * Tries each definition in order and returns the first result that is
     * not false; returns false when every definition rejects the string.
     */
    public function validate($string, $config, $context)
    {
        foreach ($this->defs as $def) {
            $result = $def->validate($string, $config, $context);
            if ($result !== false) {
                return $result;
            }
        }
        return false;
    }

}
8874 * Decorator which enables CSS properties to be disabled for specific elements.
class HTMLPurifier_AttrDef_CSS_DenyElementDecorator extends HTMLPurifier_AttrDef
{
    public $def, $element;

    /**
     * @param $def Definition to wrap
     * @param $element Element to deny
     */
    public function __construct($def, $element)
    {
        $this->def = $def;
        $this->element = $element;
    }

    /**
     * Rejects the value when the context's CurrentToken is the denied
     * element; otherwise defers to the wrapped definition.
     */
    public function validate($string, $config, $context)
    {
        $token = $context->get('CurrentToken', true);
        $denied = $token && $token->name == $this->element;
        if ($denied) {
            return false;
        }
        return $this->def->validate($string, $config, $context);
    }
}
8903 * Microsoft's proprietary filter: CSS property
8904 * @note Currently supports the alpha filter. In the future, this will
8905 * probably need an extensible framework
class HTMLPurifier_AttrDef_CSS_Filter extends HTMLPurifier_AttrDef
{

    protected $intValidator;

    public function __construct()
    {
        $this->intValidator = new HTMLPurifier_AttrDef_Integer();
    }

    /**
     * Validates a filter value. Only "none" and a single alpha filter with
     * an opacity parameter are supported; other parameters are dropped.
     *
     * @param $value Candidate filter value
     * @return 'none', or the rebuilt "function(opacity=N)" string with N
     *         clamped to 0..100; false if the function is not an alpha filter
     */
    public function validate($value, $config, $context)
    {
        $value = $this->parseCDATA($value);
        if ($value === 'none') {
            return $value;
        }
        // if we looped this we could support multiple filters
        $function_length = strcspn($value, '(');
        $function = trim(substr($value, 0, $function_length));
        if ($function !== 'alpha' &&
            $function !== 'Alpha' &&
            $function !== 'progid:DXImageTransform.Microsoft.Alpha'
        ) {
            return false;
        }
        $cursor = $function_length + 1;
        $parameters_length = strcspn($value, ')', $cursor);
        $parameters = substr($value, $cursor, $parameters_length);
        $params = explode(',', $parameters);
        $ret_params = array();
        $lookup = array(); // keys already emitted; the first occurrence wins
        foreach ($params as $param) {
            $pair = explode('=', $param);
            if (count($pair) < 2) {
                // No '=' at all (e.g. "alpha()" or "alpha(opacity)"). Such
                // a parameter can never validate, so skip it here rather
                // than read a missing array element.
                continue;
            }
            $key = trim($pair[0]);
            $setting = trim($pair[1]);
            if (isset($lookup[$key])) {
                continue;
            }
            if ($key !== 'opacity') {
                continue;
            }
            $setting = $this->intValidator->validate($setting, $config, $context);
            if ($setting === false) {
                continue;
            }
            $int = (int) $setting;
            if ($int > 100) {
                $setting = '100';
            }
            if ($int < 0) {
                $setting = '0';
            }
            $ret_params[] = "$key=$setting";
            $lookup[$key] = true;
        }
        $ret_parameters = implode(',', $ret_params);
        $ret_function = "$function($ret_parameters)";
        return $ret_function;
    }

}
8958 * Validates shorthand CSS property font.
class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef
{

    /**
     * Local copy of component validators.
     *
     * @note If we moved specific CSS property definitions to their own
     *       classes instead of having them be assembled at run time by
     *       CSSDefinition, this wouldn't be necessary. We'd instantiate
     *       our own copies.
     */
    protected $info = array();

    /**
     * @param $config Configuration object; the component validators are
     *                copied from its CSS definition
     */
    public function __construct($config)
    {
        $def = $config->getCSSDefinition();
        foreach (array(
            'font-style', 'font-variant', 'font-weight',
            'font-size', 'line-height', 'font-family',
        ) as $name) {
            $this->info[$name] = $def->info[$name];
        }
    }

    /**
     * Validates the font shorthand in three stages: optional
     * style/variant/weight keywords, then font-size with an optional
     * "/line-height", then the font-family list (everything that remains).
     *
     * @param $string Candidate shorthand value
     * @return Normalised shorthand, a lowercased system font keyword, or
     *         false if size or family is missing or invalid
     */
    public function validate($string, $config, $context)
    {
        static $system_fonts = array(
            'caption' => true,
            'icon' => true,
            'menu' => true,
            'message-box' => true,
            'small-caption' => true,
            'status-bar' => true
        );

        // regular pre-processing
        $string = $this->parseCDATA($string);
        if ($string === '') {
            return false;
        }

        // check if it's one of the keywords
        $lowercase_string = strtolower($string);
        if (isset($system_fonts[$lowercase_string])) {
            return $lowercase_string;
        }

        $bits = explode(' ', $string); // bits to process
        $stage = 0; // this indicates what we're looking for
        $caught = array(); // which stage 0 properties have we caught?
        $stage_1 = array('font-style', 'font-variant', 'font-weight');
        $final = ''; // output

        for ($i = 0, $size = count($bits); $i < $size; $i++) {
            if ($bits[$i] === '') {
                continue;
            }

            // Stage 0: attempting to catch font-style, font-variant or
            // font-weight. A bit that matches none of them is handed
            // straight on to the font-size logic below.
            if ($stage == 0) {
                $r = false;
                foreach ($stage_1 as $validator_name) {
                    if (isset($caught[$validator_name])) {
                        continue;
                    }
                    $r = $this->info[$validator_name]->validate(
                        $bits[$i], $config, $context
                    );
                    if ($r !== false) {
                        $final .= $r . ' ';
                        $caught[$validator_name] = true;
                        break;
                    }
                }
                // all three caught, continue on
                if (count($caught) >= 3) {
                    $stage = 1;
                }
                if ($r !== false) {
                    continue;
                }
            }

            // Stage 1 (or an unmatched stage 0 bit): attempting to catch
            // font-size and perhaps line-height.
            if ($stage != 2) {
                $found_slash = false;
                if (strpos($bits[$i], '/') !== false) {
                    list($font_size, $line_height) = explode('/', $bits[$i]);
                    if ($line_height === '') {
                        // ooh, there's a space after the slash!
                        $line_height = false;
                        $found_slash = true;
                    }
                } else {
                    $font_size = $bits[$i];
                    $line_height = false;
                }
                $r = $this->info['font-size']->validate(
                    $font_size, $config, $context
                );
                if ($r === false) {
                    return false;
                }
                $final .= $r;

                // attempt to catch line-height
                if ($line_height === false) {
                    // we need to scroll forward
                    for ($j = $i + 1; $j < $size; $j++) {
                        if ($bits[$j] === '') {
                            continue;
                        }
                        if ($bits[$j] === '/') {
                            if ($found_slash) {
                                return false; // a second slash is invalid
                            }
                            $found_slash = true;
                            continue;
                        }
                        $line_height = $bits[$j];
                        break;
                    }
                } else {
                    // slash already found
                    $found_slash = true;
                    $j = $i;
                }
                if ($found_slash) {
                    // skip over the bits used up by the line-height
                    $i = $j;
                    $r = $this->info['line-height']->validate(
                        $line_height, $config, $context
                    );
                    if ($r !== false) {
                        $final .= '/' . $r;
                    }
                }
                $final .= ' ';
                $stage = 2;
                continue;
            }

            // Stage 2: attempting to catch font-family, which takes every
            // remaining bit.
            $font_family = implode(' ', array_slice($bits, $i, $size - $i));
            $r = $this->info['font-family']->validate(
                $font_family, $config, $context
            );
            if ($r === false) {
                return false;
            }
            $final .= $r . ' ';
            // processing completed successfully
            return rtrim($final);
        }

        // ran out of bits before a font-family was seen
        return false;
    }

}
9108 * Validates a font family list according to CSS spec
9110 class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
9113 protected $mask = null;
9115 public function __construct() {
9117 for ($c = 'a'; $c <= 'z'; $c++) $this->mask .= $c;
9118 for ($c = 'A'; $c <= 'Z'; $c++) $this->mask .= $c;
9119 for ($c = '0'; $c <= '9'; $c++) $this->mask .= $c; // cast-y, but should be fine
9120 // special bytes used by UTF-8
9121 for ($i = 0x80; $i <= 0xFF; $i++) {
9122 // We don't bother excluding invalid bytes in this range,
9123 // because the our restriction of well-formed UTF-8 will
9124 // prevent these from ever occurring.
9125 $this->mask .= chr($i);
9129 PHP's internal strcspn implementation is
9130 O(length of string * length of mask), making it inefficient
9131 for large masks. However, it's still faster than
9136 if (*spanp == c || p == s1_end) {
9139 } while (spanp++ < (s2_end - 1));
9143 // possible optimization: invert the mask.
9146 public function validate($string, $config, $context) {
9147 static $generic_names = array(
9149 'sans-serif' => true,
9150 'monospace' => true,
9154 $allowed_fonts = $config->get('CSS.AllowedFonts');
9156 // assume that no font names contain commas in them
9157 $fonts = explode(',', $string);
9159 foreach($fonts as $font) {
9160 $font = trim($font);
9161 if ($font === '') continue;
9162 // match a generic name
9163 if (isset($generic_names[$font])) {
9164 if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
9165 $final .= $font . ', ';
9169 // match a quoted name
9170 if ($font[0] === '"' || $font[0] === "'") {
9171 $length = strlen($font);
9172 if ($length <= 2) continue;
9174 if ($font[$length - 1] !== $quote) continue;
9175 $font = substr($font, 1, $length - 2);
9178 $font = $this->expandCSSEscape($font);
9180 // $font is a pure representation of the font name
9182 if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
9186 if (ctype_alnum($font) && $font !== '') {
9187 // very simple font, allow it in unharmed
9188 $final .= $font . ', ';
9192 // bugger out on whitespace. form feed (0C) really
9193 // shouldn't show up regardless
9194 $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);
9196 // Here, there are various classes of characters which need
9197 // to be treated differently:
9198 // - Alphanumeric characters are essentially safe. We
9199 // handled these above.
9200 // - Spaces require quoting, though most parsers will do
9201 // the right thing if there aren't any characters that
9202 // can be misinterpreted
9203 // - Dashes rarely occur, but they fairly unproblematic
9204 // for parsing/rendering purposes.
9205 // The above characters cover the majority of Western font
9207 // - Arbitrary Unicode characters not in ASCII. Because
9208 // most parsers give little thought to Unicode, treatment
9209 // of these codepoints is basically uniform, even for
9210 // punctuation-like codepoints. These characters can
9211 // show up in non-Western pages and are supported by most
9212 // major browsers, for example: "MS 明朝" is a
9213 // legitimate font-name
9214 // <http://ja.wikipedia.org/wiki/MS_明朝>. See
9215 // the CSS3 spec for more examples:
9216 // <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
9217 // You can see live samples of these on the Internet:
9218 // <http://www.google.co.jp/search?q=font-family+MS+明朝|ゴシック>
9219 // However, most of these fonts have ASCII equivalents:
9220 // for example, 'MS Mincho', and it's considered
9221 // professional to use ASCII font names instead of
9222 // Unicode font names. Thanks Takeshi Terada for
9223 // providing this information.
9224 // The following characters, to my knowledge, have not been
9225 // used to name font names.
9226 // - Single quote. While theoretically you might find a
9227 // font name that has a single quote in its name (serving
9228 // as an apostrophe, e.g. Dave's Scribble), I haven't
9229 // been able to find any actual examples of this.
9230 // Internet Explorer's cssText translation (which I
9231 // believe is invoked by innerHTML) normalizes any
9232 // quoting to single quotes, and fails to escape single
9233 // quotes. (Note that this is not IE's behavior for all
9234 // CSS properties, just some sort of special casing for
9235 // font-family). So a single quote *cannot* be used
9236 // safely in the font-family context if there will be an
9237 // innerHTML/cssText translation. Note that Firefox 3.x
9239 // - Double quote. In IE, these get normalized to
9240 // single-quotes, no matter what the encoding. (Fun
9241 // fact, in IE8, the 'content' CSS property gained
9242 // support, where they special cased to preserve encoded
9243 // double quotes, but still translate unadorned double
9244 // quotes into single quotes.) So, because their
9245 // fixpoint behavior is identical to single quotes, they
9246 // cannot be allowed either. Firefox 3.x displays
9247 // single-quote style behavior.
9248 // - Backslashes are reduced by one (so \\ -> \) every
9249 // iteration, so they cannot be used safely. This shows
9250 // up in IE7, IE8 and FF3
9251 // - Semicolons, commas and backticks are handled properly.
9252 // - The rest of the ASCII punctuation is handled properly.
9253 // We haven't checked what browsers do to unadorned
9254 // versions, but this is not important as long as the
9255 // browser doesn't /remove/ surrounding quotes (as IE does
9258 // With these results in hand, we conclude that there are
9259 // various levels of safety:
9260 // - Paranoid: alphanumeric, spaces and dashes(?)
9261 // - International: Paranoid + non-ASCII Unicode
9262 // - Edgy: Everything except quotes, backslashes
9263 // - NoJS: Standards compliance, e.g. sod IE. Note that
9264 // with some judicious character escaping (since certain
9265 // types of escaping doesn't work) this is theoretically
9266 // OK as long as innerHTML/cssText is not called.
9267 // We believe that international is a reasonable default
9268 // (that we will implement now), and once we do more
9269 // extensive research, we may feel comfortable with dropping
9272 // Edgy: alphanumeric, spaces, dashes and Unicode. Use of
9273 // str(c)spn assumes that the string was already well formed
9274 // Unicode (which of course it is).
9275 if (strspn($font, $this->mask) !== strlen($font)) {
9280 // In the absence of innerHTML/cssText, these ugly
9281 // transforms don't pose a security risk (as \\ and \"
9282 // might--these escapes are not supported by most browsers).
9283 // We could try to be clever and use single-quote wrapping
9284 // when there is a double quote present, but I have chosen
9285 // not to implement that. (NOTE: you can reduce the amount
9286 // of escapes by one depending on what quoting style you use)
9287 // $font = str_replace('\\', '\\5C ', $font);
9288 // $font = str_replace('"', '\\22 ', $font);
9289 // $font = str_replace("'", '\\27 ', $font);
9291 // font possibly with spaces, requires quoting
9292 $final .= "'$font', ";
9294 $final = rtrim($final, ', ');
9295 if ($final === '') return false;
9306 * Decorator which enables !important to be used in CSS values.
9308 class HTMLPurifier_AttrDef_CSS_ImportantDecorator extends HTMLPurifier_AttrDef
9310 public $def, $allow;
9313 * @param $def Definition to wrap
9314 * @param $allow Whether or not to allow !important
9316 public function __construct($def, $allow = false) {
9318 $this->allow = $allow;
9321 * Intercepts and removes !important if necessary
9323 public function validate($string, $config, $context) {
9324 // test for ! and important tokens
9325 $string = trim($string);
9326 $is_important = false;
9327 // :TODO: optimization: test directly for !important and ! important
9328 if (strlen($string) >= 9 && substr($string, -9) === 'important') {
9329 $temp = rtrim(substr($string, 0, -9));
9330 // use a temp, because we might want to restore important
9331 if (strlen($temp) >= 1 && substr($temp, -1) === '!') {
9332 $string = rtrim(substr($temp, 0, -1));
9333 $is_important = true;
9336 $string = $this->def->validate($string, $config, $context);
9337 if ($this->allow && $is_important) $string .= ' !important';
9347 * Represents a Length as defined by CSS.
9349 class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
9352 protected $min, $max;
9355 * @param HTMLPurifier_Length $min Minimum length, or null for no bound. String is also acceptable.
9356 * @param HTMLPurifier_Length $max Maximum length, or null for no bound. String is also acceptable.
9358 public function __construct($min = null, $max = null) {
9359 $this->min = $min !== null ? HTMLPurifier_Length::make($min) : null;
9360 $this->max = $max !== null ? HTMLPurifier_Length::make($max) : null;
9363 public function validate($string, $config, $context) {
9364 $string = $this->parseCDATA($string);
9367 if ($string === '') return false;
9368 if ($string === '0') return '0';
9369 if (strlen($string) === 1) return false;
9371 $length = HTMLPurifier_Length::make($string);
9372 if (!$length->isValid()) return false;
9375 $c = $length->compareTo($this->min);
9376 if ($c === false) return false;
9377 if ($c < 0) return false;
9380 $c = $length->compareTo($this->max);
9381 if ($c === false) return false;
9382 if ($c > 0) return false;
9385 return $length->toString();
9395 * Validates shorthand CSS property list-style.
9396 * @warning Does not support url tokens that have internal spaces.
9398 class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef
9402 * Local copy of component validators.
9403 * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
9407 public function __construct($config) {
9408 $def = $config->getCSSDefinition();
9409 $this->info['list-style-type'] = $def->info['list-style-type'];
9410 $this->info['list-style-position'] = $def->info['list-style-position'];
9411 $this->info['list-style-image'] = $def->info['list-style-image'];
9414 public function validate($string, $config, $context) {
9416 // regular pre-processing
9417 $string = $this->parseCDATA($string);
9418 if ($string === '') return false;
9420 // assumes URI doesn't have spaces in it
9421 $bits = explode(' ', strtolower($string)); // bits to process
9424 $caught['type'] = false;
9425 $caught['position'] = false;
9426 $caught['image'] = false;
9428 $i = 0; // number of catches
9431 foreach ($bits as $bit) {
9432 if ($i >= 3) return; // optimization bit
9433 if ($bit === '') continue;
9434 foreach ($caught as $key => $status) {
9435 if ($status !== false) continue;
9436 $r = $this->info['list-style-' . $key]->validate($bit, $config, $context);
9437 if ($r === false) continue;
9438 if ($r === 'none') {
9439 if ($none) continue;
9441 if ($key == 'image') continue;
9449 if (!$i) return false;
9454 if ($caught['type']) $ret[] = $caught['type'];
9457 if ($caught['image']) $ret[] = $caught['image'];
9459 // construct position
9460 if ($caught['position']) $ret[] = $caught['position'];
9462 if (empty($ret)) return false;
9463 return implode(' ', $ret);
9474 * Framework class for strings that involve multiple values.
9476 * Certain CSS properties such as border-width and margin allow multiple
9477 * lengths to be specified. This class can take a vanilla border-width
9478 * definition and multiply it, usually into a max of four.
9480 * @note Even though the CSS specification isn't clear about it, inherit
9481 * can only be used alone: it will never manifest as part of a multi
9482 * shorthand declaration. Thus, this class does not allow inherit.
9484 class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
9488 * Instance of component definition to defer validation to.
9489 * @todo Make protected
9494 * Max number of values allowed.
9495 * @todo Make protected
9500 * @param $single HTMLPurifier_AttrDef to multiply
9501 * @param $max Max number of values allowed (usually four)
9503 public function __construct($single, $max = 4) {
9504 $this->single = $single;
9508 public function validate($string, $config, $context) {
9509 $string = $this->parseCDATA($string);
9510 if ($string === '') return false;
9511 $parts = explode(' ', $string); // parseCDATA replaced \r, \t and \n
9512 $length = count($parts);
9514 for ($i = 0, $num = 0; $i < $length && $num < $this->max; $i++) {
9515 if (ctype_space($parts[$i])) continue;
9516 $result = $this->single->validate($parts[$i], $config, $context);
9517 if ($result !== false) {
9518 $final .= $result . ' ';
9522 if ($final === '') return false;
9523 return rtrim($final);
9533 * Validates a Percentage as defined by the CSS spec.
9535 class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
9539 * Instance of HTMLPurifier_AttrDef_CSS_Number to defer number validation
9541 protected $number_def;
9544 * @param $non_negative Bool indicating whether to forbid negative values
9546 public function __construct($non_negative = false) {
9547 $this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
9550 public function validate($string, $config, $context) {
9552 $string = $this->parseCDATA($string);
9554 if ($string === '') return false;
9555 $length = strlen($string);
9556 if ($length === 1) return false;
9557 if ($string[$length - 1] !== '%') return false;
9559 $number = substr($string, 0, $length - 1);
9560 $number = $this->number_def->validate($number, $config, $context);
9562 if ($number === false) return false;
9574 * Validates the value for the CSS property text-decoration
9575 * @note This class could be generalized into a version that acts sort of
9576 * like Enum except you can compound the allowed values.
9578 class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
9581 public function validate($string, $config, $context) {
9583 static $allowed_values = array(
9584 'line-through' => true,
9586 'underline' => true,
9589 $string = strtolower($this->parseCDATA($string));
9591 if ($string === 'none') return $string;
9593 $parts = explode(' ', $string);
9595 foreach ($parts as $part) {
9596 if (isset($allowed_values[$part])) {
9597 $final .= $part . ' ';
9600 $final = rtrim($final);
9601 if ($final === '') return false;
9613 * Validates a URI in CSS syntax, which uses url('http://example.com')
9614 * @note While theoretically speaking a URI in a CSS document could
9615 * be non-embedded, as of CSS2 there is no such usage so we're
9616 * generalizing it. This may need to be changed in the future.
9617 * @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
9618 * the separator, you cannot put a literal semicolon in
9619 * the URI. Try percent encoding it, in that case.
9621 class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
9624 public function __construct() {
9625 parent::__construct(true); // always embedded
9628 public function validate($uri_string, $config, $context) {
9629 // parse the URI out of the string and then pass it onto
9630 // the parent object
9632 $uri_string = $this->parseCDATA($uri_string);
9633 if (strpos($uri_string, 'url(') !== 0) return false;
9634 $uri_string = substr($uri_string, 4);
9635 $new_length = strlen($uri_string) - 1;
9636 if ($uri_string[$new_length] != ')') return false;
9637 $uri = trim(substr($uri_string, 0, $new_length));
9639 if (!empty($uri) && ($uri[0] == "'" || $uri[0] == '"')) {
9641 $new_length = strlen($uri) - 1;
9642 if ($uri[$new_length] !== $quote) return false;
9643 $uri = substr($uri, 1, $new_length - 1);
9646 $uri = $this->expandCSSEscape($uri);
9648 $result = parent::validate($uri, $config, $context);
9650 if ($result === false) return false;
9652 // extra sanity check; should have been done by URI
9653 $result = str_replace(array('"', "\\", "\n", "\x0c", "\r"), "", $result);
9655 // suspicious characters are ()'; we're going to percent encode
9657 $result = str_replace(array('(', ')', "'"), array('%28', '%29', '%27'), $result);
9659 // there's an extra bug where ampersands lose their escaping on
9660 // an innerHTML cycle, so a very unlucky query parameter could
9661 // then change the meaning of the URL. Unfortunately, there's
9662 // not much we can do about that...
9664 return "url(\"$result\")";
9675 * Validates a boolean attribute
9677 class HTMLPurifier_AttrDef_HTML_Bool extends HTMLPurifier_AttrDef
9681 public $minimized = true;
9683 public function __construct($name = false) {$this->name = $name;}
9685 public function validate($string, $config, $context) {
9686 if (empty($string)) return false;
9691 * @param $string Name of attribute
9693 public function make($string) {
9694 return new HTMLPurifier_AttrDef_HTML_Bool($string);
9704 * Validates contents based on NMTOKENS attribute type.
9706 class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
9709 public function validate($string, $config, $context) {
9711 $string = trim($string);
9713 // early abort: '' and '0' (strings that convert to false) are invalid
9714 if (!$string) return false;
9716 $tokens = $this->split($string, $config, $context);
9717 $tokens = $this->filter($tokens, $config, $context);
9718 if (empty($tokens)) return false;
9719 return implode(' ', $tokens);
9724 * Splits a space separated list of tokens into its constituent parts.
9726 protected function split($string, $config, $context) {
9728 // do the preg_match, capture all subpatterns for reformulation
9730 // we don't support U+00A1 and up codepoints or
9731 // escaping because I don't know how to do that with regexps
9732 // and plus it would complicate optimization efforts (you never
9733 // see that anyway).
9734 $pattern = '/(?:(?<=\s)|\A)'. // look behind for space or string start
9735 '((?:--|-?[A-Za-z_])[A-Za-z_\-0-9]*)'.
9736 '(?:(?=\s)|\z)/'; // look ahead for space or string end
9737 preg_match_all($pattern, $string, $matches);
9742 * Template method for removing certain tokens based on arbitrary criteria.
9743 * @note If we wanted to be really functional, we'd do an array_filter
9744 * with a callback. But... we're not.
9746 protected function filter($tokens, $config, $context) {
9757 * Implements special behavior for class attribute (normally NMTOKENS)
9759 class HTMLPurifier_AttrDef_HTML_Class extends HTMLPurifier_AttrDef_HTML_Nmtokens
9761 protected function split($string, $config, $context) {
9762 // really, this twiddle should be lazy loaded
9763 $name = $config->getDefinition('HTML')->doctype->name;
9764 if ($name == "XHTML 1.1" || $name == "XHTML 2.0") {
9765 return parent::split($string, $config, $context);
9767 return preg_split('/\s+/', $string);
9770 protected function filter($tokens, $config, $context) {
9771 $allowed = $config->get('Attr.AllowedClasses');
9772 $forbidden = $config->get('Attr.ForbiddenClasses');
9774 foreach ($tokens as $token) {
9776 ($allowed === null || isset($allowed[$token])) &&
9777 !isset($forbidden[$token]) &&
9778 // We need this O(n) check because of PHP's array
9779 // implementation that casts -0 to 0.
9780 !in_array($token, $ret, true)
9792 * Validates a color according to the HTML spec.
9794 class HTMLPurifier_AttrDef_HTML_Color extends HTMLPurifier_AttrDef
9797 public function validate($string, $config, $context) {
9799 static $colors = null;
9800 if ($colors === null) $colors = $config->get('Core.ColorKeywords');
9802 $string = trim($string);
9804 if (empty($string)) return false;
9805 if (isset($colors[$string])) return $colors[$string];
9806 if ($string[0] === '#') $hex = substr($string, 1);
9807 else $hex = $string;
9809 $length = strlen($hex);
9810 if ($length !== 3 && $length !== 6) return false;
9811 if (!ctype_xdigit($hex)) return false;
9812 if ($length === 3) $hex = $hex[0].$hex[0].$hex[1].$hex[1].$hex[2].$hex[2];
9825 * Special-case enum attribute definition that lazy loads allowed frame targets
9827 class HTMLPurifier_AttrDef_HTML_FrameTarget extends HTMLPurifier_AttrDef_Enum
9830 public $valid_values = false; // uninitialized value
9831 protected $case_sensitive = false;
9833 public function __construct() {}
9835 public function validate($string, $config, $context) {
9836 if ($this->valid_values === false) $this->valid_values = $config->get('Attr.AllowedFrameTargets');
9837 return parent::validate($string, $config, $context);
9847 * Validates the HTML attribute ID.
9848 * @warning Even though this is the id processor, it
9849 * will ignore the directive %Attr.IDBlacklist, since it will only
9850 * go according to the ID accumulator. Since the accumulator is
9851 * automatically generated, it will have already absorbed the
9852 * blacklist. If you're hacking around, make sure you use load()!
9855 class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
9858 // ref functionality disabled, since we also have to verify
9859 // whether or not the ID it refers to exists
9861 public function validate($id, $config, $context) {
9863 if (!$config->get('Attr.EnableID')) return false;
9865 $id = trim($id); // trim it first
9867 if ($id === '') return false;
9869 $prefix = $config->get('Attr.IDPrefix');
9870 if ($prefix !== '') {
9871 $prefix .= $config->get('Attr.IDPrefixLocal');
9872 // prevent re-appending the prefix
9873 if (strpos($id, $prefix) !== 0) $id = $prefix . $id;
9874 } elseif ($config->get('Attr.IDPrefixLocal') !== '') {
9875 trigger_error('%Attr.IDPrefixLocal cannot be used unless '.
9876 '%Attr.IDPrefix is set', E_USER_WARNING);
9879 //if (!$this->ref) {
9880 $id_accumulator =& $context->get('IDAccumulator');
9881 if (isset($id_accumulator->ids[$id])) return false;
9884 // we purposely avoid using regex, hopefully this is faster
9886 if (ctype_alpha($id)) {
9889 if (!ctype_alpha(@$id[0])) return false;
9890 $trim = trim( // primitive style of regexps, I suppose
9894 $result = ($trim === '');
9897 $regexp = $config->get('Attr.IDBlacklistRegexp');
9898 if ($regexp && preg_match($regexp, $id)) {
9902 if (/*!$this->ref && */$result) $id_accumulator->add($id);
9904 // if no change was made to the ID, return the result
9905 // else, return the new id if stripping whitespace made it
9906 // valid, or return false.
9907 return $result ? $id : false;
9918 * Validates an integer representation of pixels according to the HTML spec.
9920 class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
9925 public function __construct($max = null) {
9929 public function validate($string, $config, $context) {
9931 $string = trim($string);
9932 if ($string === '0') return $string;
9933 if ($string === '') return false;
9934 $length = strlen($string);
9935 if (substr($string, $length - 2) == 'px') {
9936 $string = substr($string, 0, $length - 2);
9938 if (!is_numeric($string)) return false;
9939 $int = (int) $string;
9941 if ($int < 0) return '0';
9943 // upper-bound value, extremely high values can
9944 // crash operating systems, see <http://ha.ckers.org/imagecrash.html>
9945 // WARNING, above link WILL crash you if you're using Windows
9947 if ($this->max !== null && $int > $this->max) return (string) $this->max;
9949 return (string) $int;
9953 public function make($string) {
9954 if ($string === '') $max = null;
9955 else $max = (int) $string;
9956 $class = get_class($this);
9957 return new $class($max);
9967 * Validates the HTML type length (not to be confused with CSS's length).
9969 * This accepts integer pixels or percentages as lengths for certain
9973 class HTMLPurifier_AttrDef_HTML_Length extends HTMLPurifier_AttrDef_HTML_Pixels
9976 public function validate($string, $config, $context) {
9978 $string = trim($string);
9979 if ($string === '') return false;
9981 $parent_result = parent::validate($string, $config, $context);
9982 if ($parent_result !== false) return $parent_result;
9984 $length = strlen($string);
9985 $last_char = $string[$length - 1];
9987 if ($last_char !== '%') return false;
9989 $points = substr($string, 0, $length - 1);
9991 if (!is_numeric($points)) return false;
9993 $points = (int) $points;
9995 if ($points < 0) return '0%';
9996 if ($points > 100) return '100%';
9998 return ((string) $points) . '%';
10009 * Validates a rel/rev link attribute against a directive of allowed values
10010 * @note We cannot use Enum because link types allow multiple
10012 * @note Assumes link types are ASCII text
10014 class HTMLPurifier_AttrDef_HTML_LinkTypes extends HTMLPurifier_AttrDef
10017 /** Name of config attribute to pull. */
10020 public function __construct($name) {
10021 $configLookup = array(
10022 'rel' => 'AllowedRel',
10023 'rev' => 'AllowedRev'
10025 if (!isset($configLookup[$name])) {
10026 trigger_error('Unrecognized attribute name for link '.
10027 'relationship.', E_USER_ERROR);
10030 $this->name = $configLookup[$name];
10033 public function validate($string, $config, $context) {
10035 $allowed = $config->get('Attr.' . $this->name);
10036 if (empty($allowed)) return false;
10038 $string = $this->parseCDATA($string);
10039 $parts = explode(' ', $string);
10041 // lookup to prevent duplicates
10042 $ret_lookup = array();
10043 foreach ($parts as $part) {
10044 $part = strtolower(trim($part));
10045 if (!isset($allowed[$part])) continue;
10046 $ret_lookup[$part] = true;
10049 if (empty($ret_lookup)) return false;
10050 $string = implode(' ', array_keys($ret_lookup));
10063 * Validates a MultiLength as defined by the HTML spec.
10065 * A multilength is either an integer (pixel count), a percentage, or
10066 * a relative number.
10068 class HTMLPurifier_AttrDef_HTML_MultiLength extends HTMLPurifier_AttrDef_HTML_Length
10071 public function validate($string, $config, $context) {
10073 $string = trim($string);
10074 if ($string === '') return false;
10076 $parent_result = parent::validate($string, $config, $context);
10077 if ($parent_result !== false) return $parent_result;
10079 $length = strlen($string);
10080 $last_char = $string[$length - 1];
10082 if ($last_char !== '*') return false;
10084 $int = substr($string, 0, $length - 1);
10086 if ($int == '') return '*';
10087 if (!is_numeric($int)) return false;
10091 if ($int < 0) return false;
10092 if ($int == 0) return '0';
10093 if ($int == 1) return '*';
10094 return ((string) $int) . '*';
10104 abstract class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
10108 * Unpacks a mailbox into its display-name and address
10110 function unpack($string) {
10111 // needs to be implemented
10116 // sub-implementations
10123 * Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
10125 class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
10129 * Instance of HTMLPurifier_AttrDef_URI_IPv4 sub-validator
10134 * Instance of HTMLPurifier_AttrDef_URI_IPv6 sub-validator
10138 public function __construct() {
10139 $this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
10140 $this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
10143 public function validate($string, $config, $context) {
10144 $length = strlen($string);
10145 // empty hostname is OK; it's usually semantically equivalent:
10146 // the default host as defined by a URI scheme is used:
10148 // If the URI scheme defines a default for host, then that
10149 // default applies when the host subcomponent is undefined
10150 // or when the registered name is empty (zero length).
10151 if ($string === '') return '';
10152 if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') {
10154 $ip = substr($string, 1, $length - 2);
10155 $valid = $this->ipv6->validate($ip, $config, $context);
10156 if ($valid === false) return false;
10157 return '['. $valid . ']';
10160 // need to do checks on unusual encodings too
10161 $ipv4 = $this->ipv4->validate($string, $config, $context);
10162 if ($ipv4 !== false) return $ipv4;
10164 // A regular domain name.
10166 // This breaks I18N domain names, but we don't have proper IRI support,
10167 // so force users to insert Punycode. If there's complaining we'll
10168 // try to fix things into an international friendly form.
10170 // The productions describing this are:
10171 $a = '[a-z]'; // alpha
10172 $an = '[a-z0-9]'; // alphanum
10173 $and = '[a-z0-9-]'; // alphanum | "-"
10174 // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
10175 $domainlabel = "$an($and*$an)?";
10176 // toplabel = alpha | alpha *( alphanum | "-" ) alphanum
10177 $toplabel = "$a($and*$an)?";
10178 // hostname = *( domainlabel "." ) toplabel [ "." ]
10179 $match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string);
10180 if (!$match) return false;
10192 * Validates an IPv4 address
10193 * @author Feyd @ forums.devnetwork.net (public domain)
10195 class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef
10199 * IPv4 regex, protected so that IPv6 can reuse it
10203 public function validate($aIP, $config, $context) {
10205 if (!$this->ip4) $this->_loadRegex();
10207 if (preg_match('#^' . $this->ip4 . '$#s', $aIP))
10217 * Lazy load function to prevent regex from being stuffed in
10220 protected function _loadRegex() {
10221 $oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255
10222 $this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})";
10232 * Validates an IPv6 address.
10233 * @author Feyd @ forums.devnetwork.net (public domain)
10234 * @note This function requires brackets to have been removed from address
10237 class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4
10240 public function validate($aIP, $config, $context) {
10242 if (!$this->ip4) $this->_loadRegex();
10246 $hex = '[0-9a-fA-F]';
10247 $blk = '(?:' . $hex . '{1,4})';
10248 $pre = '(?:/(?:12[0-8]|1[0-1][0-9]|[1-9][0-9]|[0-9]))'; // /0 - /128
10251 if (strpos($aIP, '/') !== false)
10253 if (preg_match('#' . $pre . '$#s', $aIP, $find))
10255 $aIP = substr($aIP, 0, 0-strlen($find[0]));
10264 // IPv4-compatibility check
10265 if (preg_match('#(?<=:'.')' . $this->ip4 . '$#s', $aIP, $find))
10267 $aIP = substr($aIP, 0, 0-strlen($find[0]));
10268 $ip = explode('.', $find[0]);
10269 $ip = array_map('dechex', $ip);
10270 $aIP .= $ip[0] . $ip[1] . ':' . $ip[2] . $ip[3];
10274 // compression check
10275 $aIP = explode('::', $aIP);
10283 list($first, $second) = $aIP;
10284 $first = explode(':', $first);
10285 $second = explode(':', $second);
10287 if (count($first) + count($second) > 8)
10292 while(count($first) < 8)
10294 array_push($first, '0');
10297 array_splice($first, 8 - count($second), 8, $second);
10299 unset($first,$second);
10303 $aIP = explode(':', $aIP[0]);
10312 // All the pieces should be 16-bit hex strings. Are they?
10313 foreach ($aIP as $piece)
10315 if (!preg_match('#^[0-9a-fA-F]{4}$#s', sprintf('%04s', $piece)))
10332 * Primitive email validation class based on the regexp found at
10333 * http://www.regular-expressions.info/email.html
10335 class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_URI_Email
10338 public function validate($string, $config, $context) {
10339 // no support for named mailboxes i.e. "Bob <bob@example.com>"
10340 // that needs more percent encoding to be done
10341 if ($string == '') return false;
10342 $string = trim($string);
10343 $result = preg_match('/^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4}$/i', $string);
10344 return $result ? $string : false;
10354 * Pre-transform that changes proprietary background attribute to CSS.
10356 class HTMLPurifier_AttrTransform_Background extends HTMLPurifier_AttrTransform {
10358 public function transform($attr, $config, $context) {
10360 if (!isset($attr['background'])) return $attr;
10362 $background = $this->confiscateAttr($attr, 'background');
10363 // some validation should happen here
10365 $this->prependCSS($attr, "background-image:url($background);");
10377 // this MUST be placed in post, as it assumes that any value in dir is valid
10380 * Post-transform that ensures that bdo tags have the dir attribute set.
10382 class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform
10385 public function transform($attr, $config, $context) {
10386 if (isset($attr['dir'])) return $attr;
10387 $attr['dir'] = $config->get('Attr.DefaultTextDir');
10398 * Pre-transform that changes deprecated bgcolor attribute to CSS.
10400 class HTMLPurifier_AttrTransform_BgColor extends HTMLPurifier_AttrTransform {
10402 public function transform($attr, $config, $context) {
10404 if (!isset($attr['bgcolor'])) return $attr;
10406 $bgcolor = $this->confiscateAttr($attr, 'bgcolor');
10407 // some validation should happen here
10409 $this->prependCSS($attr, "background-color:$bgcolor;");
10422 * Pre-transform that converts a boolean attribute to fixed CSS
10424 class HTMLPurifier_AttrTransform_BoolToCSS extends HTMLPurifier_AttrTransform {
10427 * Name of boolean attribute that is the trigger
10432 * CSS declarations to add to style, needs trailing semicolon
10437 * @param $attr string attribute name to convert from
10438 * @param $css string CSS declarations to add to style (needs semicolon)
10440 public function __construct($attr, $css) {
10441 $this->attr = $attr;
10445 public function transform($attr, $config, $context) {
10446 if (!isset($attr[$this->attr])) return $attr;
10447 unset($attr[$this->attr]);
10448 $this->prependCSS($attr, $this->css);
10459 * Pre-transform that changes deprecated border attribute to CSS.
10461 class HTMLPurifier_AttrTransform_Border extends HTMLPurifier_AttrTransform {
10463 public function transform($attr, $config, $context) {
10464 if (!isset($attr['border'])) return $attr;
10465 $border_width = $this->confiscateAttr($attr, 'border');
10466 // some validation should happen here
10467 $this->prependCSS($attr, "border:{$border_width}px solid;");
10478 * Generic pre-transform that converts an attribute with a fixed number of
10479 * values (enumerated) to CSS.
10481 class HTMLPurifier_AttrTransform_EnumToCSS extends HTMLPurifier_AttrTransform {
10484 * Name of attribute to transform from
10489 * Lookup array of attribute values to CSS
10491 protected $enumToCSS = array();
10494 * Case sensitivity of the matching
10495 * @warning Currently can only be guaranteed to work with ASCII
10498 protected $caseSensitive = false;
10501 * @param $attr String attribute name to transform from
10502 * @param $enum_to_css Lookup array of attribute values to CSS
10503 * @param $case_sensitive Boolean case sensitivity indicator, default false
10505 public function __construct($attr, $enum_to_css, $case_sensitive = false) {
10506 $this->attr = $attr;
10507 $this->enumToCSS = $enum_to_css;
10508 $this->caseSensitive = (bool) $case_sensitive;
10511 public function transform($attr, $config, $context) {
10513 if (!isset($attr[$this->attr])) return $attr;
10515 $value = trim($attr[$this->attr]);
10516 unset($attr[$this->attr]);
10518 if (!$this->caseSensitive) $value = strtolower($value);
10520 if (!isset($this->enumToCSS[$value])) {
10524 $this->prependCSS($attr, $this->enumToCSS[$value]);
10536 // must be called POST validation
10539 * Transform that supplies default values for the src and alt attributes
10540 * in img tags, as well as prevents the img tag from being removed
10541 * because of a missing alt attribute. This needs to be registered as both
10542 * a pre and post attribute transform.
10544 class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
10547 public function transform($attr, $config, $context) {
10550 if (!isset($attr['src'])) {
10551 if ($config->get('Core.RemoveInvalidImg')) return $attr;
10552 $attr['src'] = $config->get('Attr.DefaultInvalidImage');
10556 if (!isset($attr['alt'])) {
10558 $alt = $config->get('Attr.DefaultImageAlt');
10559 if ($alt === null) {
10560 // truncate if the alt is too long
10561 $attr['alt'] = substr(basename($attr['src']),0,40);
10563 $attr['alt'] = $alt;
10566 $attr['alt'] = $config->get('Attr.DefaultInvalidImageAlt');
10581 * Pre-transform that changes deprecated hspace and vspace attributes to CSS
10583 class HTMLPurifier_AttrTransform_ImgSpace extends HTMLPurifier_AttrTransform {
10586 protected $css = array(
10587 'hspace' => array('left', 'right'),
10588 'vspace' => array('top', 'bottom')
10591 public function __construct($attr) {
10592 $this->attr = $attr;
10593 if (!isset($this->css[$attr])) {
10594 trigger_error(htmlspecialchars($attr) . ' is not valid space attribute');
10598 public function transform($attr, $config, $context) {
10600 if (!isset($attr[$this->attr])) return $attr;
10602 $width = $this->confiscateAttr($attr, $this->attr);
10603 // some validation could happen here
10605 if (!isset($this->css[$this->attr])) return $attr;
10608 foreach ($this->css[$this->attr] as $suffix) {
10609 $property = "margin-$suffix";
10610 $style .= "$property:{$width}px;";
10613 $this->prependCSS($attr, $style);
10626 * Performs miscellaneous cross attribute validation and filtering for
10627 * input elements. This is meant to be a post-transform.
10629 class HTMLPurifier_AttrTransform_Input extends HTMLPurifier_AttrTransform {
10633 public function __construct() {
10634 $this->pixels = new HTMLPurifier_AttrDef_HTML_Pixels();
10637 public function transform($attr, $config, $context) {
10638 if (!isset($attr['type'])) $t = 'text';
10639 else $t = strtolower($attr['type']);
10640 if (isset($attr['checked']) && $t !== 'radio' && $t !== 'checkbox') {
10641 unset($attr['checked']);
10643 if (isset($attr['maxlength']) && $t !== 'text' && $t !== 'password') {
10644 unset($attr['maxlength']);
10646 if (isset($attr['size']) && $t !== 'text' && $t !== 'password') {
10647 $result = $this->pixels->validate($attr['size'], $config, $context);
10648 if ($result === false) unset($attr['size']);
10649 else $attr['size'] = $result;
10651 if (isset($attr['src']) && $t !== 'image') {
10652 unset($attr['src']);
10654 if (!isset($attr['value']) && ($t === 'radio' || $t === 'checkbox')) {
10655 $attr['value'] = '';
10667 * Post-transform that copies lang's value to xml:lang (and vice-versa)
10668 * @note Theoretically speaking, this could be a pre-transform, but putting
10669 * it in post is more efficient.
10671 class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform
10674 public function transform($attr, $config, $context) {
10676 $lang = isset($attr['lang']) ? $attr['lang'] : false;
10677 $xml_lang = isset($attr['xml:lang']) ? $attr['xml:lang'] : false;
10679 if ($lang !== false && $xml_lang === false) {
10680 $attr['xml:lang'] = $lang;
10681 } elseif ($xml_lang !== false) {
10682 $attr['lang'] = $xml_lang;
/**
 * Moves a width/height style length attribute into an equivalent CSS
 * declaration on the element.
 */
class HTMLPurifier_AttrTransform_Length extends HTMLPurifier_AttrTransform
{

    /**
     * Name of the attribute to convert.
     */
    protected $name;

    /**
     * Name of the CSS property that receives the value.
     */
    protected $cssName;

    /**
     * @param $name Attribute name to transform
     * @param $css_name CSS property name; falls back to $name when empty
     */
    public function __construct($name, $css_name = null) {
        $this->name = $name;
        if ($css_name) {
            $this->cssName = $css_name;
        } else {
            $this->cssName = $name;
        }
    }

    /**
     * @param $attr Assoc array of attributes
     * @param $config Unused
     * @param $context Unused
     * @return Processed attribute array
     */
    public function transform($attr, $config, $context) {
        $source = $this->name;
        if (!isset($attr[$source])) {
            return $attr;
        }
        // confiscateAttr() and prependCSS() come from the parent class;
        // presumably they pull the attribute out of $attr and prepend the
        // declaration to its style attribute -- confirm against the parent
        $value = $this->confiscateAttr($attr, $source);
        if (ctype_digit($value)) {
            // purely numeric lengths are pixel counts
            $value .= 'px';
        }
        $this->prependCSS($attr, $this->cssName . ":$value;");
        return $attr;
    }

}
/**
 * Pre-transform that turns the deprecated name attribute into an id when
 * the element has no id of its own.
 */
class HTMLPurifier_AttrTransform_Name extends HTMLPurifier_AttrTransform
{

    /**
     * @param $attr Assoc array of attributes
     * @param $config Config object; HTML.Attr.Name.UseCDATA disables the transform
     * @param $context Unused
     * @return Processed attribute array
     */
    public function transform($attr, $config, $context) {
        // Under the relaxed definition of name, the attribute is left alone
        if ($config->get('HTML.Attr.Name.UseCDATA') || !isset($attr['name'])) {
            return $attr;
        }
        // confiscateAttr() comes from the parent class; presumably it removes
        // name from $attr and hands back its value -- confirm against parent
        $former_name = $this->confiscateAttr($attr, 'name');
        if (!isset($attr['id'])) {
            // an existing id always wins; only fill the gap
            $attr['id'] = $former_name;
        }
        return $attr;
    }

}
/**
 * Post-transform that performs validation to the name attribute; if
 * it is present with an equivalent id attribute, it is passed through;
 * otherwise validation is performed.
 */
class HTMLPurifier_AttrTransform_NameSync extends HTMLPurifier_AttrTransform
{

    /**
     * ID validator applied to name values that have no matching id.
     * Declared explicitly (it used to be created as a dynamic property in
     * the constructor, which PHP 8.2 deprecates); kept public so that its
     * visibility is unchanged.
     */
    public $idDef;

    public function __construct() {
        $this->idDef = new HTMLPurifier_AttrDef_HTML_ID();
    }

    /**
     * @param $attr Assoc array of attributes
     * @param $config Config object, forwarded to the ID validator
     * @param $context Context object, forwarded to the ID validator
     * @return Processed attribute array
     */
    public function transform($attr, $config, $context) {
        if (!isset($attr['name'])) return $attr;
        $name = $attr['name'];
        // a name that mirrors the element's id is passed through untouched
        if (isset($attr['id']) && $attr['id'] === $name) return $attr;
        $result = $this->idDef->validate($name, $config, $context);
        if ($result === false) unset($attr['name']);
        else $attr['name'] = $result;
        return $attr;
    }

}
// must be called POST validation

/**
 * Adds rel="nofollow" to all outbound links.  This transform is
 * only attached if Attr.Nofollow is TRUE.
 */
class HTMLPurifier_AttrTransform_Nofollow extends HTMLPurifier_AttrTransform
{
    /**
     * URI parser used to split href into its components.
     */
    private $parser;

    public function __construct() {
        $this->parser = new HTMLPurifier_URIParser();
    }

    /**
     * @param $attr Assoc array of attributes
     * @param $config Config object, forwarded to the scheme lookup
     * @param $context Context object, forwarded to the scheme lookup
     * @return Processed attribute array; rel gains a nofollow token when
     *         href has a host and a browsable scheme
     */
    public function transform($attr, $config, $context) {

        if (!isset($attr['href'])) {
            return $attr;
        }

        // XXX Kind of inefficient
        $url = $this->parser->parse($attr['href']);
        if (!is_object($url)) {
            // the parser is defined elsewhere; guard against it signalling
            // failure with a non-object instead of calling a method on it
            return $attr;
        }
        $scheme = $url->getSchemeObj($config, $context);

        if (is_null($url->host) || $scheme === false || !$scheme->browsable) {
            return $attr;
        }

        if (!isset($attr['rel']) || trim($attr['rel']) === '') {
            $attr['rel'] = 'nofollow';
        } elseif (!preg_match('/(?:^|\s)nofollow(?:\s|$)/i', $attr['rel'])) {
            // only append when the token is not already present, so that
            // repeated purification does not pile up duplicates
            $attr['rel'] .= ' nofollow';
        }

        return $attr;

    }

}
/**
 * Forces the attributes of an embed element to fixed values: script access
 * is disabled, networking is restricted to "internal", and the type is set
 * to Flash.
 */
class HTMLPurifier_AttrTransform_SafeEmbed extends HTMLPurifier_AttrTransform
{
    public $name = "SafeEmbed";

    /**
     * @param $attr Assoc array of attributes
     * @param $config Unused
     * @param $context Unused
     * @return Attribute array with the forced values written in
     */
    public function transform($attr, $config, $context) {
        // whatever the markup said, these three are overwritten
        $forced = array(
            'allowscriptaccess' => 'never',
            'allownetworking'   => 'internal',
            'type'              => 'application/x-shockwave-flash',
        );
        foreach ($forced as $key => $value) {
            $attr[$key] = $value;
        }
        return $attr;
    }
}
/**
 * Writes default type for all objects. Currently only supports flash.
 */
class HTMLPurifier_AttrTransform_SafeObject extends HTMLPurifier_AttrTransform
{
    public $name = "SafeObject";

    /**
     * Fills in the Flash MIME type when the object declares no type.
     * Visibility is now spelled out, matching the sibling transforms (the
     * method was implicitly public before).
     *
     * @param $attr Assoc array of attributes
     * @param $config Unused
     * @param $context Unused
     * @return Processed attribute array
     */
    public function transform($attr, $config, $context) {
        if (!isset($attr['type'])) $attr['type'] = 'application/x-shockwave-flash';
        return $attr;
    }
}
/**
 * Validates name/value pairs in param tags to be used in safe objects. This
 * will only allow name values it recognizes, and pre-fill certain attributes
 * with required values.
 *
 * @note
 *      This class only supports Flash. In the future, Quicktime support
 *      may be added.
 *
 * @warning
 *      This class expects an injector to add the necessary parameters tags.
 */
class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform
{
    public $name = "SafeParam";

    /**
     * Validator for the movie/src URI (constructed in "embedded" mode).
     */
    private $uri;

    /**
     * Validator for the wmode parameter. Declared explicitly (it used to be
     * created as a dynamic property, which PHP 8.2 deprecates); kept public
     * so that its visibility is unchanged.
     */
    public $wmode;

    public function __construct() {
        $this->uri = new HTMLPurifier_AttrDef_URI(true); // embedded
        $this->wmode = new HTMLPurifier_AttrDef_Enum(array('window', 'opaque', 'transparent'));
    }

    /**
     * @param $attr Assoc array of the param element's attributes
     * @param $config Config object; HTML.FlashAllowFullScreen is consulted
     * @param $context Context object, forwarded to the validators
     * @return Processed attribute array; unrecognised names come back with
     *         both name and value set to null
     */
    public function transform($attr, $config, $context) {
        // Read both attributes defensively: either may be absent, and an
        // undefined index is a warning on current PHP. A missing value is
        // treated as the empty string, which every branch below rejects or
        // normalises exactly as it did the missing key.
        $name  = isset($attr['name'])  ? $attr['name']  : null;
        $value = isset($attr['value']) ? $attr['value'] : '';

        // If we add support for other objects, we'll need to alter the
        // transforms.
        switch ($name) {
            // application/x-shockwave-flash
            // Keep this synchronized with Injector/SafeObject.php
            case 'allowScriptAccess':
                $attr['value'] = 'never';
                break;
            case 'allowNetworking':
                $attr['value'] = 'internal';
                break;
            case 'allowFullScreen':
                if ($config->get('HTML.FlashAllowFullScreen')) {
                    $attr['value'] = ($value == 'true') ? 'true' : 'false';
                } else {
                    $attr['value'] = 'false';
                }
                break;
            case 'wmode':
                $attr['value'] = $this->wmode->validate($value, $config, $context);
                break;
            case 'movie':
            case 'src':
                $attr['name'] = "movie";
                $attr['value'] = $this->uri->validate($value, $config, $context);
                break;
            case 'flashvars':
                // we're going to allow arbitrary inputs to the SWF, on
                // the reasoning that it could only hack the SWF, not us.
                break;
            // add other cases to support other param name/value pairs
            default:
                $attr['name'] = $attr['value'] = null;
        }
        return $attr;
    }
}
/**
 * Supplies the required type attribute for <script> when it is missing.
 */
class HTMLPurifier_AttrTransform_ScriptRequired extends HTMLPurifier_AttrTransform
{
    /**
     * @param $attr Assoc array of attributes
     * @param $config Unused
     * @param $context Unused
     * @return Attribute array, with type defaulted to text/javascript
     */
    public function transform($attr, $config, $context) {
        if (isset($attr['type'])) {
            // an explicit type is left as-is
            return $attr;
        }
        $attr['type'] = 'text/javascript';
        return $attr;
    }
}
/**
 * Fills in default cols/rows for <textarea> when either is missing.
 */
class HTMLPurifier_AttrTransform_Textarea extends HTMLPurifier_AttrTransform
{

    /**
     * @param $attr Assoc array of attributes
     * @param $config Unused
     * @param $context Unused
     * @return Attribute array with cols and rows guaranteed to be set
     */
    public function transform($attr, $config, $context) {
        // Calculated from Firefox
        $defaults = array('cols' => '22', 'rows' => '3');
        foreach ($defaults as $dimension => $fallback) {
            if (!isset($attr[$dimension])) {
                $attr[$dimension] = $fallback;
            }
        }
        return $attr;
    }

}
/**
 * Child definition that switches between two definitions depending on
 * whether the element sits in an inline or a block context.
 *
 * The del and ins tags are notable because they allow different types of
 * elements depending on whether or not they're in a block or inline context.
 * Chameleon makes that possible by holding one definition per context.
 * While this is somewhat generalized, it is specifically intended for
 * those two tags.
 */
class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
{

    /**
     * Definition object used in inline context. Usually stricter.
     */
    public $inline;

    /**
     * Definition object used in block context.
     */
    public $block;

    public $type = 'chameleon';

    /**
     * @param $inline List of elements to allow when inline.
     * @param $block List of elements to allow when block.
     */
    public function __construct($inline, $block) {
        $this->inline = new HTMLPurifier_ChildDef_Optional($inline);
        $this->block  = new HTMLPurifier_ChildDef_Optional($block);
        // the advertised element list is taken from the block definition
        $this->elements = $this->block->elements;
    }

    /**
     * Delegates to the block definition only when the context's IsInline
     * variable is exactly false; every other value selects the inline one.
     *
     * @param $tokens_of_children Array of child tokens
     * @param $config Config object
     * @param $context Context object providing IsInline
     * @return Whatever the selected definition returns
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        $delegate = ($context->get('IsInline') === false)
            ? $this->block
            : $this->inline;
        return $delegate->validateChildren($tokens_of_children, $config, $context);
    }
}
/**
 * Custom validation class, accepts DTD child definitions
 *
 * @warning Currently this class is an all or nothing proposition, that is,
 *          it will only give a bool return value.
 */
class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
{
    public $type = 'custom';
    public $allow_empty = false;
    /**
     * Allowed child pattern as defined by the DTD
     */
    public $dtd_regex;
    /**
     * PCRE regex derived from $dtd_regex
     * @private
     */
    private $_pcre_regex;
    /**
     * @param $dtd_regex Allowed child pattern from the DTD
     */
    public function __construct($dtd_regex) {
        $this->dtd_regex = $dtd_regex;
        $this->_compileRegex();
    }
    /**
     * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
     * and records every element name it mentions in $this->elements.
     */
    protected function _compileRegex() {
        $raw = str_replace(' ', '', $this->dtd_regex);
        // Wrap the pattern in parentheses unless it already starts with one.
        // Uses bracket offset syntax (the curly-brace form $raw{0} is a parse
        // error on PHP 8) and never reads offset 0 of an empty string.
        if ($raw === '' || $raw[0] != '(') {
            $raw = "($raw)";
        }
        $el = '[#a-zA-Z0-9_.-]+';
        $reg = $raw;

        // COMPLICATED! AND MIGHT BE BUGGY! I HAVE NO CLUE WHAT I'M
        // DOING! Seriously: if there's problems, please report them.

        // collect all elements into the $elements array
        preg_match_all("/$el/", $reg, $matches);
        foreach ($matches[0] as $match) {
            $this->elements[$match] = true;
        }

        // setup all elements as parentheticals with leading commas
        $reg = preg_replace("/$el/", '(,\\0)', $reg);

        // remove commas when they were not solicited
        $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg);

        // remove all non-paranthetical commas: they are handled by first regex
        $reg = preg_replace("/,\(/", '(', $reg);

        $this->_pcre_regex = $reg;
    }
    /**
     * Checks the direct children against the compiled pattern.
     *
     * @param $tokens_of_children Array of child tokens
     * @param $config Unused
     * @param $context Unused
     * @return True when the comma-joined list of direct child names matches
     *         the pattern, false otherwise
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        $list_of_children = '';
        $nesting = 0; // depth into the nest
        foreach ($tokens_of_children as $token) {
            if (!empty($token->is_whitespace)) continue;

            $is_child = ($nesting == 0); // direct

            if ($token instanceof HTMLPurifier_Token_Start) {
                $nesting++;
            } elseif ($token instanceof HTMLPurifier_Token_End) {
                $nesting--;
            }

            if ($is_child) {
                $list_of_children .= $token->name . ',';
            }
        }
        // add leading comma to deal with stray comma declarations
        $list_of_children = ',' . rtrim($list_of_children, ',');
        $okay =
            preg_match(
                '/^,?'.$this->_pcre_regex.'$/',
                $list_of_children
            );

        return (bool) $okay;
    }
}
/**
 * Child definition for elements that may not contain anything.
 * @warning validateChildren() in this class is actually never called, because
 *          empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed
 *          before child definitions are parsed in earnest by
 *          HTMLPurifier_Strategy_FixNesting.
 */
class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
{
    public $allow_empty = true;
    public $type = 'empty';

    public function __construct() {}

    /**
     * @param $tokens_of_children Ignored
     * @param $config Ignored
     * @param $context Ignored
     * @return Always an empty array, i.e. every child is discarded
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        $no_children = array();
        return $no_children;
    }
}
/**
 * Child definition that permits a fixed set of elements and insists on at
 * least one non-whitespace child.
 */
class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
{
    /**
     * Lookup table of allowed elements.
     * @public
     */
    public $elements = array();
    /**
     * Set by validateChildren() when the children it was last given
     * consisted solely of whitespace.
     */
    protected $whitespace = false;
    /**
     * @param $elements List of allowed element names (lowercase): either a
     *        '|'-separated string, a plain list, or a ready-made lookup.
     */
    public function __construct($elements) {
        if (is_string($elements)) {
            $elements = explode('|', str_replace(' ', '', $elements));
        }
        $keys = array_keys($elements);
        if ($keys == array_keys($keys)) {
            // sequential keys: a plain list, convert it into name => true
            $lookup = array();
            foreach (array_flip($elements) as $element => $position) {
                if (empty($element)) continue; // remove blank
                $lookup[$element] = true;
            }
            $elements = $lookup;
        }
        $this->elements = $elements;
    }
    public $allow_empty = false;
    public $type = 'required';
    /**
     * @param $tokens_of_children Array of child tokens
     * @param $config Config object; Core.EscapeInvalidChildren is consulted
     * @param $context Context object, handed to the generator
     * @return false when nothing usable remains (no tokens, nothing kept, or
     *         whitespace only), true when the children are fine as given,
     *         otherwise the filtered array of tokens
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        // Flag for subclasses
        $this->whitespace = false;

        // no tokens at all: the parent node is to be deleted
        if (empty($tokens_of_children)) {
            return false;
        }

        $kept            = array(); // the new set of children
        $depth           = 0;       // current depth into the nest
        $dropping        = false;   // inside a disallowed direct child?
        // with #PCDATA allowed, rejected markup can survive as text
        // instead of being silently dropped
        $pcdata_ok       = isset($this->elements['#PCDATA']);
        $only_whitespace = true;
        $escape_invalid  = $config->get('Core.EscapeInvalidChildren');
        $generator       = new HTMLPurifier_Generator($config, $context);

        foreach ($tokens_of_children as $token) {
            if (!empty($token->is_whitespace)) {
                $kept[] = $token;
                continue;
            }
            $only_whitespace = false;

            // the depth before this token decides whether it is a direct child
            $is_direct = ($depth == 0);
            if ($token instanceof HTMLPurifier_Token_Start) {
                $depth++;
            } elseif ($token instanceof HTMLPurifier_Token_End) {
                $depth--;
            }

            if ($is_direct) {
                // every direct child re-decides whether its subtree is dropped
                $dropping = !isset($this->elements[$token->name]);
            }

            if (!$dropping || ($pcdata_ok && $token instanceof HTMLPurifier_Token_Text)) {
                $kept[] = $token;
            } elseif ($pcdata_ok && $escape_invalid) {
                // keep the rejected token as escaped text
                $kept[] = new HTMLPurifier_Token_Text(
                    $generator->generateFromToken($token)
                );
            }
            // anything else is dropped silently
        }

        if (empty($kept)) {
            return false;
        }
        if ($only_whitespace) {
            $this->whitespace = true;
            return false;
        }
        if ($tokens_of_children == $kept) {
            return true;
        }
        return $kept;
    }
}
/**
 * Child definition that permits a set of elements and, unlike its parent,
 * also accepts having no children at all.
 * @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required,
 *       really, one shouldn't inherit from the other.  The only altered
 *       behavior is that a false coming back from the parent is replaced,
 *       so this method never returns false.
 */
class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
{
    public $allow_empty = true;
    public $type = 'optional';

    /**
     * @param $tokens_of_children Array of child tokens
     * @param $config Config object
     * @param $context Context object
     * @return true or an array of tokens, never false
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        $verdict = parent::validateChildren($tokens_of_children, $config, $context);
        if ($verdict !== false) {
            return $verdict;
        }
        // we assume that $tokens_of_children is not modified
        if (empty($tokens_of_children)) {
            // nothing was passed in, and nothing is fine here
            return true;
        }
        if ($this->whitespace) {
            // whitespace-only content is handed back unchanged
            return $tokens_of_children;
        }
        return array();
    }
}
/**
 * Takes the contents of blockquote when in strict and reformats for
 * validation: runs of inline content are wrapped in the definition's
 * block wrapper element.
 */
class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required
{
    /** Element lookup this definition really allows. */
    protected $real_elements;
    /** Wider lookup (Flow content plus #PCDATA) shown to the parent class. */
    protected $fake_elements;
    public $allow_empty = true;
    public $type = 'strictblockquote';
    /** Whether the two lookups above have been set up yet. */
    protected $init = false;

    /**
     * @note We don't want MakeWellFormed to auto-close inline elements since
     *       they might be allowed.
     */
    public function getAllowedElements($config) {
        $this->init($config);
        return $this->fake_elements;
    }

    /**
     * @param $tokens_of_children Array of child tokens
     * @param $config Config object
     * @param $context Context object
     * @return Array of tokens with inline runs wrapped; empty array when the
     *         parent validation rejects the children outright
     */
    public function validateChildren($tokens_of_children, $config, $context) {

        $this->init($config);

        // trick the parent class into thinking it allows more
        $this->elements = $this->fake_elements;
        $validated = parent::validateChildren($tokens_of_children, $config, $context);
        $this->elements = $this->real_elements;

        if ($validated === false) {
            return array();
        }
        if ($validated === true) {
            $validated = $tokens_of_children;
        }

        $wrapper_name = $config->getHTMLDefinition()->info_block_wrapper;
        $wrap_open    = new HTMLPurifier_Token_Start($wrapper_name);
        $wrap_close   = new HTMLPurifier_Token_End($wrapper_name);

        $wrapping = false;   // currently inside an opened wrapper?
        $level    = 0;       // nesting depth below the blockquote
        $output   = array();

        // assuming that there are no comment tokens
        foreach ($validated as $token) {
            // wrappers are only opened or closed between top-level nodes
            if (!$level) {
                $is_text = $token instanceof HTMLPurifier_Token_Text;
                if (!$wrapping) {
                    // non-whitespace text, or a node that is not really
                    // allowed here, starts an inline run
                    $starts_inline =
                        ($is_text && !$token->is_whitespace) ||
                        (!$is_text && !isset($this->elements[$token->name]));
                    if ($starts_inline) {
                        $wrapping = true;
                        $output[] = $wrap_open;
                    }
                } elseif (
                    ($token instanceof HTMLPurifier_Token_Start ||
                     $token instanceof HTMLPurifier_Token_Empty) &&
                    isset($this->elements[$token->name])
                ) {
                    // a really-allowed element ends the inline run
                    $output[] = $wrap_close;
                    $wrapping = false;
                }
            }
            $output[] = $token;
            if ($token instanceof HTMLPurifier_Token_Start) $level++;
            if ($token instanceof HTMLPurifier_Token_End)   $level--;
        }
        if ($wrapping) {
            $output[] = $wrap_close;
        }
        return $output;
    }

    /**
     * One-time setup of the real and fake element lookups.
     */
    private function init($config) {
        if ($this->init) {
            return;
        }
        $html_def = $config->getHTMLDefinition();
        // allow all inline elements
        $this->real_elements = $this->elements;
        $this->fake_elements = $html_def->info_content_sets['Flow'];
        $this->fake_elements['#PCDATA'] = true;
        $this->init = true;
    }
}
/**
 * Definition for tables
 */
class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
{
    public $allow_empty = false;
    public $type = 'table';
    public $elements = array('tr' => true, 'tbody' => true, 'thead' => true,
        'tfoot' => true, 'caption' => true, 'colgroup' => true, 'col' => true);
    public function __construct() {}

    /**
     * Regroups the direct children of a table into the order
     * caption, col/colgroup, thead, tfoot, body content.
     *
     * @param $tokens_of_children Array of child tokens
     * @param $config Unused
     * @param $context Unused
     * @return false when there are no tokens or no tr/tbody content, true
     *         when the children are already in the required order, otherwise
     *         the reordered array of tokens
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        if (empty($tokens_of_children)) return false;

        // this ensures that the loop gets run one last time before closing
        // up. It's a little bit of a hack, but it works! Just make sure you
        // get rid of the token later.
        $tokens_of_children[] = false;

        // only one of these elements is allowed in a table
        $caption = false;
        $thead   = false;
        $tfoot   = false;

        // as many of these as you want
        $cols    = array();
        $content = array();

        $nesting = 0; // current depth so we can determine nodes
        $is_collecting = false; // are we globbing together tokens to package
                                // into one of the collectors?
        $collection = array(); // collected nodes
        $tag_index = 0; // the first node might be whitespace,
                        // so this tells us where the start tag is

        foreach ($tokens_of_children as $token) {
            $is_child = ($nesting == 0);

            if ($token === false) {
                // terminating sequence started
            } elseif ($token instanceof HTMLPurifier_Token_Start) {
                $nesting++;
            } elseif ($token instanceof HTMLPurifier_Token_End) {
                $nesting--;
            }

            // handle node collection
            if ($is_collecting) {
                if ($is_child) {
                    // okay, let's stash the tokens away
                    // first token tells us the type of the collection
                    switch ($collection[$tag_index]->name) {
                        case 'tr':
                        case 'tbody':
                            $content[] = $collection;
                            break;
                        case 'caption':
                            // a second caption is discarded
                            if ($caption !== false) break;
                            $caption = $collection;
                            break;
                        case 'thead':
                        case 'tfoot':
                            // access the appropriate variable, $thead or $tfoot
                            $var = $collection[$tag_index]->name;
                            if ($$var === false) {
                                $$var = $collection;
                            } else {
                                // transmutate the first and last entries into
                                // tbody tags, and then put into content
                                $collection[$tag_index]->name = 'tbody';
                                $collection[count($collection)-1]->name = 'tbody';
                                $content[] = $collection;
                            }
                            break;
                        case 'colgroup':
                            $cols[] = $collection;
                            break;
                    }
                    $collection = array();
                    $is_collecting = false;
                    $tag_index = 0;
                } else {
                    // add the node to the collection
                    $collection[] = $token;
                }
            }

            // terminate
            if ($token === false) break;

            if ($is_child) {
                // determine what we're dealing with
                if ($token->name == 'col') {
                    // the only empty tag in the possie, we can handle it
                    // immediately
                    $cols[] = array_merge($collection, array($token));
                    $collection = array();
                    $tag_index = 0;
                    continue;
                }
                // Nothing follows this switch inside the loop body, so each
                // case ends with "break". The previous "continue" behaved
                // identically (inside a switch it acts as break) but raises
                // a warning on PHP 7.3 and later.
                switch ($token->name) {
                    case 'caption':
                    case 'colgroup':
                    case 'thead':
                    case 'tfoot':
                    case 'tbody':
                    case 'tr':
                        $is_collecting = true;
                        $collection[] = $token;
                        break;
                    default:
                        if (!empty($token->is_whitespace)) {
                            $collection[] = $token;
                            $tag_index++;
                        }
                        break;
                }
            }
        }

        if (empty($content)) return false;

        $ret = array();
        if ($caption !== false) $ret = array_merge($ret, $caption);
        // $cols is always an array, so it needs no "was it set" test
        foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
        if ($thead !== false)   $ret = array_merge($ret, $thead);
        if ($tfoot !== false)   $ret = array_merge($ret, $tfoot);
        foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
        if (!empty($collection) && $is_collecting == false){
            // grab the trailing space
            $ret = array_merge($ret, $collection);
        }

        array_pop($tokens_of_children); // remove phantom token

        return ($ret === $tokens_of_children) ? true : $ret;

    }
}
/**
 * Base decorator for definition caches: every cache operation is forwarded
 * unchanged to the wrapped cache object.
 */
class HTMLPurifier_DefinitionCache_Decorator extends HTMLPurifier_DefinitionCache
{

    /**
     * Cache object we are decorating
     */
    public $cache;

    public function __construct() {}

    /**
     * Lazy decorator function: produces a fresh decorator (via copy())
     * bound to the given cache and carrying the cache's type.
     * @param $cache Reference to cache object to decorate
     * @return The new decorator instance
     */
    public function decorate(&$cache) {
        $wrapper = $this->copy();
        // reference is necessary for mocks in PHP 4
        $wrapper->cache =& $cache;
        $wrapper->type  = $cache->type;
        return $wrapper;
    }

    /**
     * Cross-compatible clone substitute
     */
    public function copy() {
        $duplicate = new HTMLPurifier_DefinitionCache_Decorator();
        return $duplicate;
    }

    /** Forwards to the wrapped cache's add(). */
    public function add($def, $config) {
        $inner = $this->cache;
        return $inner->add($def, $config);
    }

    /** Forwards to the wrapped cache's set(). */
    public function set($def, $config) {
        $inner = $this->cache;
        return $inner->set($def, $config);
    }

    /** Forwards to the wrapped cache's replace(). */
    public function replace($def, $config) {
        $inner = $this->cache;
        return $inner->replace($def, $config);
    }

    /** Forwards to the wrapped cache's get(). */
    public function get($config) {
        $inner = $this->cache;
        return $inner->get($config);
    }

    /** Forwards to the wrapped cache's remove(). */
    public function remove($config) {
        $inner = $this->cache;
        return $inner->remove($config);
    }

    /** Forwards to the wrapped cache's flush(). */
    public function flush($config) {
        $inner = $this->cache;
        return $inner->flush($config);
    }

    /** Forwards to the wrapped cache's cleanup(). */
    public function cleanup($config) {
        $inner = $this->cache;
        return $inner->cleanup($config);
    }

}
/**
 * Null cache object to use when no caching is on: stores nothing, and
 * every operation reports false.
 */
class HTMLPurifier_DefinitionCache_Null extends HTMLPurifier_DefinitionCache
{

    /** @return Always false */
    public function add($def, $config) { return false; }

    /** @return Always false */
    public function set($def, $config) { return false; }

    /** @return Always false */
    public function replace($def, $config) { return false; }

    /** @return Always false */
    public function remove($config) { return false; }

    /** @return Always false */
    public function get($config) { return false; }

    /** @return Always false */
    public function flush($config) { return false; }

    /** @return Always false */
    public function cleanup($config) { return false; }

}
/**
 * Definition cache that stores each definition as a serialized ".ser" file
 * in a per-type directory below the configured base directory.
 */
class HTMLPurifier_DefinitionCache_Serializer extends
      HTMLPurifier_DefinitionCache
{

    /**
     * Stores the definition only if no cache file exists for it yet.
     * @return Bytes written, false on failure or when the file already
     *         exists, null when the definition type check fails
     */
    public function add($def, $config) {
        if (!$this->checkDefType($def)) return;
        $file = $this->generateFilePath($config);
        if (file_exists($file)) return false;
        if (!$this->_prepareDir($config)) return false;
        return $this->_write($file, serialize($def), $config);
    }

    /**
     * Stores the definition, overwriting any existing cache file.
     * @return Bytes written, false on failure, null when the definition
     *         type check fails
     */
    public function set($def, $config) {
        if (!$this->checkDefType($def)) return;
        $file = $this->generateFilePath($config);
        if (!$this->_prepareDir($config)) return false;
        return $this->_write($file, serialize($def), $config);
    }

    /**
     * Stores the definition only if a cache file already exists for it.
     * @return Bytes written, false on failure or when there is no file to
     *         replace, null when the definition type check fails
     */
    public function replace($def, $config) {
        if (!$this->checkDefType($def)) return;
        $file = $this->generateFilePath($config);
        if (!file_exists($file)) return false;
        if (!$this->_prepareDir($config)) return false;
        return $this->_write($file, serialize($def), $config);
    }

    /**
     * @return The unserialized definition, or false when no file exists
     * @note The file contents are passed to unserialize(); the cache
     *       directory must therefore not be writable by untrusted parties.
     */
    public function get($config) {
        $file = $this->generateFilePath($config);
        if (!file_exists($file)) return false;
        return unserialize(file_get_contents($file));
    }

    /**
     * @return Result of unlink(), or false when no file exists
     */
    public function remove($config) {
        $file = $this->generateFilePath($config);
        if (!file_exists($file)) return false;
        return unlink($file);
    }

    /**
     * Deletes every non-dotfile in this cache's directory.
     * @return True once the directory has been walked, false when it could
     *         not be prepared or opened
     */
    public function flush($config) {
        if (!$this->_prepareDir($config)) return false;
        $dir = $this->generateDirectoryPath($config);
        $dh  = opendir($dir);
        if ($dh === false) return false;
        while (false !== ($filename = readdir($dh))) {
            if (empty($filename)) continue;
            if ($filename[0] === '.') continue;
            unlink($dir . '/' . $filename);
        }
        // release the handle; it used to be left open
        closedir($dh);
        return true;
    }

    /**
     * Deletes the cache files whose key isOld() reports as outdated.
     * @return True once the directory has been walked, false when it could
     *         not be prepared or opened
     */
    public function cleanup($config) {
        if (!$this->_prepareDir($config)) return false;
        $dir = $this->generateDirectoryPath($config);
        $dh  = opendir($dir);
        if ($dh === false) return false;
        while (false !== ($filename = readdir($dh))) {
            if (empty($filename)) continue;
            if ($filename[0] === '.') continue;
            // strip the four-character ".ser" extension to recover the key
            $key = substr($filename, 0, strlen($filename) - 4);
            if ($this->isOld($key, $config)) unlink($dir . '/' . $filename);
        }
        // release the handle; it used to be left open
        closedir($dh);
        return true;
    }

    /**
     * Generates the file path to the serial file corresponding to
     * the configuration and definition name
     * @todo Make protected
     */
    public function generateFilePath($config) {
        $key = $this->generateKey($config);
        return $this->generateDirectoryPath($config) . '/' . $key . '.ser';
    }

    /**
     * Generates the path to the directory contain this cache's serial files
     * @note No trailing slash
     * @todo Make protected
     */
    public function generateDirectoryPath($config) {
        $base = $this->generateBaseDirectoryPath($config);
        return $base . '/' . $this->type;
    }

    /**
     * Generates path to base directory that contains all definition type
     * serials: Cache.SerializerPath when configured, otherwise a directory
     * below HTMLPURIFIER_PREFIX.
     * @todo Make protected
     */
    public function generateBaseDirectoryPath($config) {
        $base = $config->get('Cache.SerializerPath');
        $base = is_null($base) ? HTMLPURIFIER_PREFIX . '/HTMLPurifier/DefinitionCache/Serializer' : $base;
        return $base;
    }

    /**
     * Convenience wrapper function for file_put_contents
     * @param $file File name to write to
     * @param $data Data to write into file
     * @param $config Config object
     * @return Number of bytes written if success, or false if failure.
     */
    private function _write($file, $data, $config) {
        $result = file_put_contents($file, $data);
        if ($result !== false) {
            // set permissions of the new file (no execute)
            $chmod = $config->get('Cache.SerializerPermissions');
            if (!$chmod) {
                $chmod = 0644; // invalid config or simpletest
            }
            $chmod = $chmod & 0666;
            chmod($file, $chmod);
        }
        return $result;
    }

    /**
     * Prepares the directory that this type stores the serials in
     * @param $config Config object
     * @return True if successful
     */
    private function _prepareDir($config) {
        $directory = $this->generateDirectoryPath($config);
        $chmod = $config->get('Cache.SerializerPermissions');
        if (!$chmod) {
            $chmod = 0755; // invalid config or simpletest
        }
        if (!is_dir($directory)) {
            $base = $this->generateBaseDirectoryPath($config);
            if (!is_dir($base)) {
                trigger_error('Base directory '.$base.' does not exist,
                    please create or change using %Cache.SerializerPath',
                    E_USER_WARNING);
                return false;
            } elseif (!$this->_testPermissions($base, $chmod)) {
                return false;
            }
            // clear the umask so that mkdir applies $chmod exactly, and
            // always restore it afterwards
            $old = umask(0000);
            $created = mkdir($directory, $chmod);
            umask($old);
            // is_dir() is re-checked in case something else created the
            // directory in the meantime
            if (!$created && !is_dir($directory)) {
                trigger_error('Could not create directory '.$directory,
                    E_USER_WARNING);
                return false;
            }
        } elseif (!$this->_testPermissions($directory, $chmod)) {
            return false;
        }
        return true;
    }

    /**
     * Tests permissions on a directory and throws out friendly
     * error messages and attempts to chmod it itself if possible
     * @param $dir Directory path
     * @param $chmod Permissions
     * @return True if directory writable
     */
    private function _testPermissions($dir, $chmod) {
        // early abort, if it is writable, everything is hunky-dory
        if (is_writable($dir)) return true;
        if (!is_dir($dir)) {
            // generally, you'll want to handle this beforehand
            // so a more specific error message can be given
            trigger_error('Directory '.$dir.' does not exist',
                E_USER_WARNING);
            return false;
        }
        if (function_exists('posix_getuid')) {
            // POSIX system, we can give more specific advice
            if (fileowner($dir) === posix_getuid()) {
                // we can chmod it ourselves
                $chmod = $chmod | 0700;
                if (chmod($dir, $chmod)) return true;
            } elseif (filegroup($dir) === posix_getgid()) {
                $chmod = $chmod | 0070;
            } else {
                // PHP's probably running as nobody, so we'll
                // need to give global permissions
                $chmod = $chmod | 0777;
            }
            trigger_error('Directory '.$dir.' not writable, '.
                'please chmod to ' . decoct($chmod),
                E_USER_WARNING);
        } else {
            // generic error message
            trigger_error('Directory '.$dir.' not writable, '.
                'please alter file permissions',
                E_USER_WARNING);
        }
        return false;
    }

}
/**
 * Definition cache decorator that runs the decorated cache's cleanup()
 * whenever an operation comes back with a falsy result.
 */
class HTMLPurifier_DefinitionCache_Decorator_Cleanup extends
      HTMLPurifier_DefinitionCache_Decorator
{

    public $name = 'Cleanup';

    public function copy() {
        return new HTMLPurifier_DefinitionCache_Decorator_Cleanup();
    }

    public function add($def, $config) {
        $outcome = parent::add($def, $config);
        if ($outcome) {
            return $outcome;
        }
        parent::cleanup($config);
        return $outcome;
    }

    public function set($def, $config) {
        $outcome = parent::set($def, $config);
        if ($outcome) {
            return $outcome;
        }
        parent::cleanup($config);
        return $outcome;
    }

    public function replace($def, $config) {
        $outcome = parent::replace($def, $config);
        if ($outcome) {
            return $outcome;
        }
        parent::cleanup($config);
        return $outcome;
    }

    public function get($config) {
        $definition = parent::get($config);
        if ($definition) {
            return $definition;
        }
        // a miss triggers a cleanup pass before the miss is reported
        parent::cleanup($config);
        return $definition;
    }

}
/**
 * Definition cache decorator that additionally keeps definitions in a
 * PHP array, so repeated retrievals are answered from memory; good for
 * unit tests or circumstances where there are lots of configuration
 * objects floating around.
 */
class HTMLPurifier_DefinitionCache_Decorator_Memory extends
      HTMLPurifier_DefinitionCache_Decorator
{

    /**
     * Definitions held in memory, indexed by generateKey($config).
     */
    protected $definitions;
    public $name = 'Memory';

    public function copy() {
        return new HTMLPurifier_DefinitionCache_Decorator_Memory();
    }

    public function add($def, $config) {
        $stored = parent::add($def, $config);
        if (!$stored) {
            return $stored;
        }
        // only definitions the decorated cache accepted are remembered
        $this->definitions[$this->generateKey($config)] = $def;
        return $stored;
    }

    public function set($def, $config) {
        $stored = parent::set($def, $config);
        if (!$stored) {
            return $stored;
        }
        $this->definitions[$this->generateKey($config)] = $def;
        return $stored;
    }

    public function replace($def, $config) {
        $stored = parent::replace($def, $config);
        if (!$stored) {
            return $stored;
        }
        $this->definitions[$this->generateKey($config)] = $def;
        return $stored;
    }

    public function get($config) {
        $slot = $this->generateKey($config);
        if (!isset($this->definitions[$slot])) {
            // not in memory yet: ask the decorated cache and keep the answer
            $this->definitions[$slot] = parent::get($config);
        }
        return $this->definitions[$slot];
    }

}
/**
 * XHTML 1.1 Bi-directional Text Module, defines elements that
 * declare directionality of content. Text Extension Module.
 */
class HTMLPurifier_HTMLModule_Bdo extends HTMLPurifier_HTMLModule
{

    public $name = 'Bdo';
    public $attr_collections = array(
        'I18N' => array('dir' => false)
    );

    /**
     * Registers the bdo element and gives the I18N collection its dir
     * attribute definition.
     * @param $config Unused
     */
    public function setup($config) {
        $dir_def = 'Enum#ltr,rtl';

        // The Abstract Module specification has the attribute
        // inclusions wrong for bdo: bdo allows Lang
        $collections = array('Core', 'Lang');
        $attributes  = array(
            'dir' => 'Enum#ltr,rtl', // required
        );

        $bdo = $this->addElement('bdo', 'Inline', 'Inline', $collections, $attributes);
        $bdo->attr_transform_post['required-dir'] = new HTMLPurifier_AttrTransform_BdoDir();

        $this->attr_collections['I18N']['dir'] = $dir_def;
    }

}
/**
 * Module that declares the shared attribute collections Core, Lang, I18N
 * and Common. It defines no elements, only the $attr_collections data.
 */
class HTMLPurifier_HTMLModule_CommonAttributes extends HTMLPurifier_HTMLModule
{
    public $name = 'CommonAttributes';

    /**
     * In each collection, the entry under index 0 lists other collections
     * by name; the remaining entries map attribute names to definitions.
     */
    public $attr_collections = array(
        'Core' => array(
            0 => array('Style'),
            // 'xml:space' => false,
            'class' => 'Class',
            'id'    => 'ID',
            'title' => 'CDATA',
        ),
        'Lang' => array(),
        'I18N' => array(
            0 => array('Lang'), // proprietary, for xml:lang/lang
        ),
        'Common' => array(
            0 => array('Core', 'I18N'),
        ),
    );

}
/**
 * XHTML 1.1 Edit Module, defines editing-related elements. Text Extension
 * Module.
 */
class HTMLPurifier_HTMLModule_Edit extends HTMLPurifier_HTMLModule
{

    public $name = 'Edit';

    /**
     * Registers del and ins with identical content model and attributes.
     * @param $config Unused
     */
    public function setup($config) {
        $content_model = 'Chameleon: #PCDATA | Inline ! #PCDATA | Flow';
        $attributes = array(
            'cite' => 'URI',
            // 'datetime' => 'Datetime', // not implemented
        );
        foreach (array('del', 'ins') as $element) {
            $this->addElement($element, 'Inline', $content_model, 'Common', $attributes);
        }
    }

    // HTML 4.01 specifies that ins/del must not contain block
    // elements when used in an inline context, chameleon is
    // a complicated workaround to achieve this effect

    // Inline context ! Block context (exclamation mark is
    // separator, see getChildDef for parsing)

    public $defines_child_def = true;

    /**
     * @param $def Element definition; content_model_type and content_model
     *        are read from it
     * @return A Chameleon child definition built from the two halves of the
     *         content model, or false for any other content model type
     */
    public function getChildDef($def) {
        if ($def->content_model_type != 'chameleon') {
            return false;
        }
        $halves = explode('!', $def->content_model);
        $inline_model = $halves[0];
        $block_model  = $halves[1];
        return new HTMLPurifier_ChildDef_Chameleon($inline_model, $block_model);
    }

}
11972 * XHTML 1.1 Forms module, defines all form-related elements found in HTML 4.
11974 class HTMLPurifier_HTMLModule_Forms extends HTMLPurifier_HTMLModule
11976 public $name = 'Forms';
11977 public $safe = false;
11979 public $content_sets = array(
11981 'Inline' => 'Formctrl',
11984 public function setup($config) {
11985 $form = $this->addElement('form', 'Form',
11986 'Required: Heading | List | Block | fieldset', 'Common', array(
11987 'accept' => 'ContentTypes',
11988 'accept-charset' => 'Charsets',
11989 'action*' => 'URI',
11990 'method' => 'Enum#get,post',
11991 // really ContentType, but these two are the only ones used today
11992 'enctype' => 'Enum#application/x-www-form-urlencoded,multipart/form-data',
11994 $form->excludes = array('form' => true);
11996 $input = $this->addElement('input', 'Formctrl', 'Empty', 'Common', array(
11997 'accept' => 'ContentTypes',
11998 'accesskey' => 'Character',
12000 'checked' => 'Bool#checked',
12001 'disabled' => 'Bool#disabled',
12002 'maxlength' => 'Number',
12004 'readonly' => 'Bool#readonly',
12005 'size' => 'Number',
12006 'src' => 'URI#embeds',
12007 'tabindex' => 'Number',
12008 'type' => 'Enum#text,password,checkbox,button,radio,submit,reset,file,hidden,image',
12009 'value' => 'CDATA',
12011 $input->attr_transform_post[] = new HTMLPurifier_AttrTransform_Input();
12013 $this->addElement('select', 'Formctrl', 'Required: optgroup | option', 'Common', array(
12014 'disabled' => 'Bool#disabled',
12015 'multiple' => 'Bool#multiple',
12017 'size' => 'Number',
12018 'tabindex' => 'Number',
12021 $this->addElement('option', false, 'Optional: #PCDATA', 'Common', array(
12022 'disabled' => 'Bool#disabled',
12024 'selected' => 'Bool#selected',
12025 'value' => 'CDATA',
12027 // It's illegal for there to be more than one selected, but not
12028 // be multiple. Also, no selected means undefined behavior. This might
12029 // be difficult to implement; perhaps an injector, or a context variable.
12031 $textarea = $this->addElement('textarea', 'Formctrl', 'Optional: #PCDATA', 'Common', array(
12032 'accesskey' => 'Character',
12033 'cols*' => 'Number',
12034 'disabled' => 'Bool#disabled',
12036 'readonly' => 'Bool#readonly',
12037 'rows*' => 'Number',
12038 'tabindex' => 'Number',
12040 $textarea->attr_transform_pre[] = new HTMLPurifier_AttrTransform_Textarea();
12042 $button = $this->addElement('button', 'Formctrl', 'Optional: #PCDATA | Heading | List | Block | Inline', 'Common', array(
12043 'accesskey' => 'Character',
12044 'disabled' => 'Bool#disabled',
12046 'tabindex' => 'Number',
12047 'type' => 'Enum#button,submit,reset',
12048 'value' => 'CDATA',
12051 // For exclusions, ideally we'd specify content sets, not literal elements
12052 $button->excludes = $this->makeLookup(
12053 'form', 'fieldset', // Form
12054 'input', 'select', 'textarea', 'label', 'button', // Formctrl
12055 'a' // as per HTML 4.01 spec, this is omitted by modularization
12058 // Extra exclusion: img usemap="" is not permitted within this element.
12059 // We'll omit this for now, since we don't have any good way of
12060 // indicating it yet.
12062 // This is HIGHLY user-unfriendly; we need a custom child-def for this
12063 $this->addElement('fieldset', 'Form', 'Custom: (#WS?,legend,(Flow|#PCDATA)*)', 'Common');
12065 $label = $this->addElement('label', 'Formctrl', 'Optional: #PCDATA | Inline', 'Common', array(
12066 'accesskey' => 'Character',
12067 // 'for' => 'IDREF', // IDREF not implemented, cannot allow
12069 $label->excludes = array('label' => true);
12071 $this->addElement('legend', false, 'Optional: #PCDATA | Inline', 'Common', array(
12072 'accesskey' => 'Character',
12075 $this->addElement('optgroup', false, 'Required: option', 'Common', array(
12076 'disabled' => 'Bool#disabled',
12077 'label*' => 'Text',
12080 // Don't forget an injector for <isindex>. This one's a little complex
12081 // because it maps to multiple elements.
12091 * XHTML 1.1 Hypertext Module, defines hypertext links. Core Module.
12093 class HTMLPurifier_HTMLModule_Hypertext extends HTMLPurifier_HTMLModule
12096 public $name = 'Hypertext';
12098 public function setup($config) {
12099 $a = $this->addElement(
12100 'a', 'Inline', 'Inline', 'Common',
12102 // 'accesskey' => 'Character',
12103 // 'charset' => 'Charset',
12105 // 'hreflang' => 'LanguageCode',
12106 'rel' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rel'),
12107 'rev' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rev'),
12108 // 'tabindex' => 'Number',
12109 // 'type' => 'ContentType',
12112 $a->formatting = true;
12113 $a->excludes = array('a' => true);
12123 * XHTML 1.1 Image Module provides basic image embedding.
12124 * @note There is specialized code for removing empty images in
12125 * HTMLPurifier_Strategy_RemoveForeignElements
12127 class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
12130 public $name = 'Image';
12132 public function setup($config) {
12133 $max = $config->get('HTML.MaxImgLength');
12134 $img = $this->addElement(
12135 'img', 'Inline', 'Empty', 'Common',
12138 // According to the spec, it's Length, but percents can
12139 // be abused, so we allow only Pixels.
12140 'height' => 'Pixels#' . $max,
12141 'width' => 'Pixels#' . $max,
12142 'longdesc' => 'URI',
12143 'src*' => new HTMLPurifier_AttrDef_URI(true), // embedded
12146 if ($max === null || $config->get('HTML.Trusted')) {
12147 $img->attr['height'] =
12148 $img->attr['width'] = 'Length';
12151 // kind of strange, but splitting things up would be inefficient
12152 $img->attr_transform_pre[] =
12153 $img->attr_transform_post[] =
12154 new HTMLPurifier_AttrTransform_ImgRequired();
12164 * XHTML 1.1 Legacy module defines elements that were previously
12167 * @note Not all legacy elements have been implemented yet, which
12168 * is a bit of a reverse problem as compared to browsers! In
12169 * addition, this legacy module may implement a bit more than
12170 * mandated by XHTML 1.1.
12172 * This module can be used in combination with TransformToStrict in order
12173 * to transform as many deprecated elements as possible, but retain
12174 * questionably deprecated elements that do not have good alternatives
12175 * as well as transform elements that don't have an implementation.
12176 * See docs/ref-strictness.txt for more details.
12179 class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule
12182 public $name = 'Legacy';
12184 public function setup($config) {
12186 $this->addElement('basefont', 'Inline', 'Empty', false, array(
12187 'color' => 'Color',
12188 'face' => 'Text', // extremely broad, we should
12189 'size' => 'Text', // tighten it
12192 $this->addElement('center', 'Block', 'Flow', 'Common');
12193 $this->addElement('dir', 'Block', 'Required: li', 'Common', array(
12194 'compact' => 'Bool#compact'
12196 $this->addElement('font', 'Inline', 'Inline', array('Core', 'I18N'), array(
12197 'color' => 'Color',
12198 'face' => 'Text', // extremely broad, we should
12199 'size' => 'Text', // tighten it
12201 $this->addElement('menu', 'Block', 'Required: li', 'Common', array(
12202 'compact' => 'Bool#compact'
12205 $s = $this->addElement('s', 'Inline', 'Inline', 'Common');
12206 $s->formatting = true;
12208 $strike = $this->addElement('strike', 'Inline', 'Inline', 'Common');
12209 $strike->formatting = true;
12211 $u = $this->addElement('u', 'Inline', 'Inline', 'Common');
12212 $u->formatting = true;
12214 // setup modifications to old elements
12216 $align = 'Enum#left,right,center,justify';
12218 $address = $this->addBlankElement('address');
12219 $address->content_model = 'Inline | #PCDATA | p';
12220 $address->content_model_type = 'optional';
12221 $address->child = false;
12223 $blockquote = $this->addBlankElement('blockquote');
12224 $blockquote->content_model = 'Flow | #PCDATA';
12225 $blockquote->content_model_type = 'optional';
12226 $blockquote->child = false;
12228 $br = $this->addBlankElement('br');
12229 $br->attr['clear'] = 'Enum#left,all,right,none';
12231 $caption = $this->addBlankElement('caption');
12232 $caption->attr['align'] = 'Enum#top,bottom,left,right';
12234 $div = $this->addBlankElement('div');
12235 $div->attr['align'] = $align;
12237 $dl = $this->addBlankElement('dl');
12238 $dl->attr['compact'] = 'Bool#compact';
12240 for ($i = 1; $i <= 6; $i++) {
12241 $h = $this->addBlankElement("h$i");
12242 $h->attr['align'] = $align;
12245 $hr = $this->addBlankElement('hr');
12246 $hr->attr['align'] = $align;
12247 $hr->attr['noshade'] = 'Bool#noshade';
12248 $hr->attr['size'] = 'Pixels';
12249 $hr->attr['width'] = 'Length';
12251 $img = $this->addBlankElement('img');
12252 $img->attr['align'] = 'Enum#top,middle,bottom,left,right';
12253 $img->attr['border'] = 'Pixels';
12254 $img->attr['hspace'] = 'Pixels';
12255 $img->attr['vspace'] = 'Pixels';
12257 // figure out this integer business
12259 $li = $this->addBlankElement('li');
12260 $li->attr['value'] = new HTMLPurifier_AttrDef_Integer();
12261 $li->attr['type'] = 'Enum#s:1,i,I,a,A,disc,square,circle';
12263 $ol = $this->addBlankElement('ol');
12264 $ol->attr['compact'] = 'Bool#compact';
12265 $ol->attr['start'] = new HTMLPurifier_AttrDef_Integer();
12266 $ol->attr['type'] = 'Enum#s:1,i,I,a,A';
12268 $p = $this->addBlankElement('p');
12269 $p->attr['align'] = $align;
12271 $pre = $this->addBlankElement('pre');
12272 $pre->attr['width'] = 'Number';
12276 $table = $this->addBlankElement('table');
12277 $table->attr['align'] = 'Enum#left,center,right';
12278 $table->attr['bgcolor'] = 'Color';
12280 $tr = $this->addBlankElement('tr');
12281 $tr->attr['bgcolor'] = 'Color';
12283 $th = $this->addBlankElement('th');
12284 $th->attr['bgcolor'] = 'Color';
12285 $th->attr['height'] = 'Length';
12286 $th->attr['nowrap'] = 'Bool#nowrap';
12287 $th->attr['width'] = 'Length';
12289 $td = $this->addBlankElement('td');
12290 $td->attr['bgcolor'] = 'Color';
12291 $td->attr['height'] = 'Length';
12292 $td->attr['nowrap'] = 'Bool#nowrap';
12293 $td->attr['width'] = 'Length';
12295 $ul = $this->addBlankElement('ul');
12296 $ul->attr['compact'] = 'Bool#compact';
12297 $ul->attr['type'] = 'Enum#square,disc,circle';
12308 * XHTML 1.1 List Module, defines list-oriented elements. Core Module.
12310 class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
12313 public $name = 'List';
12315 // According to the abstract schema, the List content set is a fully formed
12316 // one or more expr, but it invariably occurs in an optional declaration
12317 // so we're not going to do that subtlety. It might cause trouble
12318 // if a user defines "List" and expects that multiple lists are
12319 // allowed to be specified, but then again, that's not very intuitive.
12320 // Furthermore, the actual XML Schema may disagree. Regardless,
12321 // we don't have support for such nested expressions without using
12322 // the incredibly inefficient and draconic Custom ChildDef.
12324 public $content_sets = array('Flow' => 'List');
12326 public function setup($config) {
12327 $ol = $this->addElement('ol', 'List', 'Required: li', 'Common');
12329 $ul = $this->addElement('ul', 'List', 'Required: li', 'Common');
12331 $this->addElement('dl', 'List', 'Required: dt | dd', 'Common');
12333 $this->addElement('li', false, 'Flow', 'Common');
12335 $this->addElement('dd', false, 'Flow', 'Common');
12336 $this->addElement('dt', false, 'Inline', 'Common');
12345 class HTMLPurifier_HTMLModule_Name extends HTMLPurifier_HTMLModule
12348 public $name = 'Name';
12350 public function setup($config) {
12351 $elements = array('a', 'applet', 'form', 'frame', 'iframe', 'img', 'map');
12352 foreach ($elements as $name) {
12353 $element = $this->addBlankElement($name);
12354 $element->attr['name'] = 'CDATA';
12355 if (!$config->get('HTML.Attr.Name.UseCDATA')) {
12356 $element->attr_transform_post['NameSync'] = new HTMLPurifier_AttrTransform_NameSync();
12368 * Module adds the nofollow attribute transformation to a tags. It
12369 * is enabled by HTML.Nofollow
12371 class HTMLPurifier_HTMLModule_Nofollow extends HTMLPurifier_HTMLModule
12374 public $name = 'Nofollow';
12376 public function setup($config) {
12377 $a = $this->addBlankElement('a');
12378 $a->attr_transform_post[] = new HTMLPurifier_AttrTransform_Nofollow();
12387 class HTMLPurifier_HTMLModule_NonXMLCommonAttributes extends HTMLPurifier_HTMLModule
12389 public $name = 'NonXMLCommonAttributes';
12391 public $attr_collections = array(
12393 'lang' => 'LanguageCode',
12403 * XHTML 1.1 Object Module, defines elements for generic object inclusion
12404 * @warning Users will commonly use <embed> to cater to legacy browsers: this
12405 * module does not allow this sort of behavior
12407 class HTMLPurifier_HTMLModule_Object extends HTMLPurifier_HTMLModule
12410 public $name = 'Object';
12411 public $safe = false;
12413 public function setup($config) {
12415 $this->addElement('object', 'Inline', 'Optional: #PCDATA | Flow | param', 'Common',
12417 'archive' => 'URI',
12418 'classid' => 'URI',
12419 'codebase' => 'URI',
12420 'codetype' => 'Text',
12422 'declare' => 'Bool#declare',
12423 'height' => 'Length',
12425 'standby' => 'Text',
12426 'tabindex' => 'Number',
12427 'type' => 'ContentType',
12428 'width' => 'Length'
12432 $this->addElement('param', false, 'Empty', false,
12438 'valuetype' => 'Enum#data,ref,object'
12451 * XHTML 1.1 Presentation Module, defines simple presentation-related
12452 * markup. Text Extension Module.
12453 * @note The official XML Schema and DTD specs further divide this into
12455 * - Block Presentation (hr)
12456 * - Inline Presentation (b, big, i, small, sub, sup, tt)
12457 * We have chosen not to heed this distinction, as content_sets
12458 * provides satisfactory disambiguation.
12460 class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule
12463 public $name = 'Presentation';
12465 public function setup($config) {
12466 $this->addElement('hr', 'Block', 'Empty', 'Common');
12467 $this->addElement('sub', 'Inline', 'Inline', 'Common');
12468 $this->addElement('sup', 'Inline', 'Inline', 'Common');
12469 $b = $this->addElement('b', 'Inline', 'Inline', 'Common');
12470 $b->formatting = true;
12471 $big = $this->addElement('big', 'Inline', 'Inline', 'Common');
12472 $big->formatting = true;
12473 $i = $this->addElement('i', 'Inline', 'Inline', 'Common');
12474 $i->formatting = true;
12475 $small = $this->addElement('small', 'Inline', 'Inline', 'Common');
12476 $small->formatting = true;
12477 $tt = $this->addElement('tt', 'Inline', 'Inline', 'Common');
12478 $tt->formatting = true;
12488 * Module defines proprietary tags and attributes in HTML.
12489 * @warning If this module is enabled, standards-compliance is off!
12491 class HTMLPurifier_HTMLModule_Proprietary extends HTMLPurifier_HTMLModule
12494 public $name = 'Proprietary';
12496 public function setup($config) {
12498 $this->addElement('marquee', 'Inline', 'Flow', 'Common',
12500 'direction' => 'Enum#left,right,up,down',
12501 'behavior' => 'Enum#alternate',
12502 'width' => 'Length',
12503 'height' => 'Length',
12504 'scrolldelay' => 'Number',
12505 'scrollamount' => 'Number',
12506 'loop' => 'Number',
12507 'bgcolor' => 'Color',
12508 'hspace' => 'Pixels',
12509 'vspace' => 'Pixels',
12522 * XHTML 1.1 Ruby Annotation Module, defines elements that indicate
12523 * short runs of text alongside base text for annotation or pronunciation.
12525 class HTMLPurifier_HTMLModule_Ruby extends HTMLPurifier_HTMLModule
12528 public $name = 'Ruby';
12530 public function setup($config) {
12531 $this->addElement('ruby', 'Inline',
12532 'Custom: ((rb, (rt | (rp, rt, rp))) | (rbc, rtc, rtc?))',
12534 $this->addElement('rbc', false, 'Required: rb', 'Common');
12535 $this->addElement('rtc', false, 'Required: rt', 'Common');
12536 $rb = $this->addElement('rb', false, 'Inline', 'Common');
12537 $rb->excludes = array('ruby' => true);
12538 $rt = $this->addElement('rt', false, 'Inline', 'Common', array('rbspan' => 'Number'));
12539 $rt->excludes = array('ruby' => true);
12540 $this->addElement('rp', false, 'Optional: #PCDATA', 'Common');
12550 * A "safe" embed module. See SafeObject. This is a proprietary element.
12552 class HTMLPurifier_HTMLModule_SafeEmbed extends HTMLPurifier_HTMLModule
12555 public $name = 'SafeEmbed';
12557 public function setup($config) {
12559 $max = $config->get('HTML.MaxImgLength');
12560 $embed = $this->addElement(
12561 'embed', 'Inline', 'Empty', 'Common',
12563 'src*' => 'URI#embedded',
12564 'type' => 'Enum#application/x-shockwave-flash',
12565 'width' => 'Pixels#' . $max,
12566 'height' => 'Pixels#' . $max,
12567 'allowscriptaccess' => 'Enum#never',
12568 'allownetworking' => 'Enum#internal',
12569 'flashvars' => 'Text',
12570 'wmode' => 'Enum#window,transparent,opaque',
12574 $embed->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeEmbed();
12585 * A "safe" object module. In theory, objects permitted by this module will
12586 * be safe, and untrusted users can be allowed to embed arbitrary flash objects
12587 * (maybe other types too, but only Flash is supported as of right now).
12588 * Highly experimental.
12590 class HTMLPurifier_HTMLModule_SafeObject extends HTMLPurifier_HTMLModule
12593 public $name = 'SafeObject';
12595 public function setup($config) {
12597 // These definitions are not intrinsically safe: the attribute transforms
12598 // are a vital part of ensuring safety.
12600 $max = $config->get('HTML.MaxImgLength');
12601 $object = $this->addElement(
12604 'Optional: param | Flow | #PCDATA',
12607 // While technically not required by the spec, we're forcing
12608 // it to this value.
12609 'type' => 'Enum#application/x-shockwave-flash',
12610 'width' => 'Pixels#' . $max,
12611 'height' => 'Pixels#' . $max,
12612 'data' => 'URI#embedded',
12613 'codebase' => new HTMLPurifier_AttrDef_Enum(array(
12614 'http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,40,0')),
12617 $object->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeObject();
12619 $param = $this->addElement('param', false, 'Empty', false,
12626 $param->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeParam();
12627 $this->info_injector[] = 'SafeObject';
12639 WARNING: THIS MODULE IS EXTREMELY DANGEROUS AS IT ENABLES INLINE SCRIPTING
12640 INSIDE HTML PURIFIER DOCUMENTS. USE ONLY WITH TRUSTED USER INPUT!!!
12645 * XHTML 1.1 Scripting module, defines elements that are used to contain
12646 * information pertaining to executable scripts or the lack of support
12647 * for executable scripts.
12648 * @note This module does not contain inline scripting elements
12650 class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule
12652 public $name = 'Scripting';
12653 public $elements = array('script', 'noscript');
12654 public $content_sets = array('Block' => 'script | noscript', 'Inline' => 'script | noscript');
12655 public $safe = false;
12657 public function setup($config) {
12658 // TODO: create custom child-definition for noscript that
12659 // auto-wraps stray #PCDATA in a similar manner to
12660 // blockquote's custom definition (we would use it but
12661 // blockquote's contents are optional while noscript's contents
12664 // TODO: convert this to new syntax, main problem is getting
12665 // both content sets working
12667 // In theory, this could be safe, but I don't see any reason to
12669 $this->info['noscript'] = new HTMLPurifier_ElementDef();
12670 $this->info['noscript']->attr = array( 0 => array('Common') );
12671 $this->info['noscript']->content_model = 'Heading | List | Block';
12672 $this->info['noscript']->content_model_type = 'required';
12674 $this->info['script'] = new HTMLPurifier_ElementDef();
12675 $this->info['script']->attr = array(
12676 'defer' => new HTMLPurifier_AttrDef_Enum(array('defer')),
12677 'src' => new HTMLPurifier_AttrDef_URI(true),
12678 'type' => new HTMLPurifier_AttrDef_Enum(array('text/javascript'))
12680 $this->info['script']->content_model = '#PCDATA';
12681 $this->info['script']->content_model_type = 'optional';
12682 $this->info['script']->attr_transform_pre['type'] =
12683 $this->info['script']->attr_transform_post['type'] =
12684 new HTMLPurifier_AttrTransform_ScriptRequired();
12693 * XHTML 1.1 Style Attribute Module, defines the 'style' attribute and adds it to the Core attribute collection.
12696 class HTMLPurifier_HTMLModule_StyleAttribute extends HTMLPurifier_HTMLModule
12699 public $name = 'StyleAttribute';
12700 public $attr_collections = array(
12701 // The inclusion routine differs from the Abstract Modules but
12702 // is in line with the DTD and XML Schemas.
12703 'Style' => array('style' => false), // see constructor
12704 'Core' => array(0 => array('Style'))
12707 public function setup($config) {
12708 $this->attr_collections['Style']['style'] = new HTMLPurifier_AttrDef_CSS();
12718 * XHTML 1.1 Tables Module, fully defines accessible table elements.
12720 class HTMLPurifier_HTMLModule_Tables extends HTMLPurifier_HTMLModule
12723 public $name = 'Tables';
12725 public function setup($config) {
12727 $this->addElement('caption', false, 'Inline', 'Common');
12729 $this->addElement('table', 'Block',
12730 new HTMLPurifier_ChildDef_Table(), 'Common',
12732 'border' => 'Pixels',
12733 'cellpadding' => 'Length',
12734 'cellspacing' => 'Length',
12735 'frame' => 'Enum#void,above,below,hsides,lhs,rhs,vsides,box,border',
12736 'rules' => 'Enum#none,groups,rows,cols,all',
12737 'summary' => 'Text',
12738 'width' => 'Length'
12742 // common attributes
12743 $cell_align = array(
12744 'align' => 'Enum#left,center,right,justify,char',
12745 'charoff' => 'Length',
12746 'valign' => 'Enum#top,middle,bottom,baseline',
12749 $cell_t = array_merge(
12752 'colspan' => 'Number',
12753 'rowspan' => 'Number',
12757 $this->addElement('td', false, 'Flow', 'Common', $cell_t);
12758 $this->addElement('th', false, 'Flow', 'Common', $cell_t);
12760 $this->addElement('tr', false, 'Required: td | th', 'Common', $cell_align);
12762 $cell_col = array_merge(
12764 'span' => 'Number',
12765 'width' => 'MultiLength',
12769 $this->addElement('col', false, 'Empty', 'Common', $cell_col);
12770 $this->addElement('colgroup', false, 'Optional: col', 'Common', $cell_col);
12772 $this->addElement('tbody', false, 'Required: tr', 'Common', $cell_align);
12773 $this->addElement('thead', false, 'Required: tr', 'Common', $cell_align);
12774 $this->addElement('tfoot', false, 'Required: tr', 'Common', $cell_align);
12785 * XHTML 1.1 Target Module, defines target attribute in link elements.
12787 class HTMLPurifier_HTMLModule_Target extends HTMLPurifier_HTMLModule
12790 public $name = 'Target';
12792 public function setup($config) {
12793 $elements = array('a');
12794 foreach ($elements as $name) {
12795 $e = $this->addBlankElement($name);
12797 'target' => new HTMLPurifier_AttrDef_HTML_FrameTarget()
12809 * XHTML 1.1 Text Module, defines basic text containers. Core Module.
12810 * @note In the normative XML Schema specification, this module
12811 * is further abstracted into the following modules:
12812 * - Block Phrasal (address, blockquote, pre, h1, h2, h3, h4, h5, h6)
12813 * - Block Structural (div, p)
12814 * - Inline Phrasal (abbr, acronym, cite, code, dfn, em, kbd, q, samp, strong, var)
12815 * - Inline Structural (br, span)
12816 * This module, functionally, does not distinguish between these
12817 * sub-modules, but the code is internally structured to reflect
12818 * these distinctions.
12820 class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule
12823 public $name = 'Text';
12824 public $content_sets = array(
12825 'Flow' => 'Heading | Block | Inline'
12828 public function setup($config) {
12830 // Inline Phrasal -------------------------------------------------
12831 $this->addElement('abbr', 'Inline', 'Inline', 'Common');
12832 $this->addElement('acronym', 'Inline', 'Inline', 'Common');
12833 $this->addElement('cite', 'Inline', 'Inline', 'Common');
12834 $this->addElement('dfn', 'Inline', 'Inline', 'Common');
12835 $this->addElement('kbd', 'Inline', 'Inline', 'Common');
12836 $this->addElement('q', 'Inline', 'Inline', 'Common', array('cite' => 'URI'));
12837 $this->addElement('samp', 'Inline', 'Inline', 'Common');
12838 $this->addElement('var', 'Inline', 'Inline', 'Common');
12840 $em = $this->addElement('em', 'Inline', 'Inline', 'Common');
12841 $em->formatting = true;
12843 $strong = $this->addElement('strong', 'Inline', 'Inline', 'Common');
12844 $strong->formatting = true;
12846 $code = $this->addElement('code', 'Inline', 'Inline', 'Common');
12847 $code->formatting = true;
12849 // Inline Structural ----------------------------------------------
12850 $this->addElement('span', 'Inline', 'Inline', 'Common');
12851 $this->addElement('br', 'Inline', 'Empty', 'Core');
12853 // Block Phrasal --------------------------------------------------
12854 $this->addElement('address', 'Block', 'Inline', 'Common');
12855 $this->addElement('blockquote', 'Block', 'Optional: Heading | Block | List', 'Common', array('cite' => 'URI') );
12856 $pre = $this->addElement('pre', 'Block', 'Inline', 'Common');
12857 $pre->excludes = $this->makeLookup(
12858 'img', 'big', 'small', 'object', 'applet', 'font', 'basefont' );
12859 $this->addElement('h1', 'Heading', 'Inline', 'Common');
12860 $this->addElement('h2', 'Heading', 'Inline', 'Common');
12861 $this->addElement('h3', 'Heading', 'Inline', 'Common');
12862 $this->addElement('h4', 'Heading', 'Inline', 'Common');
12863 $this->addElement('h5', 'Heading', 'Inline', 'Common');
12864 $this->addElement('h6', 'Heading', 'Inline', 'Common');
12866 // Block Structural -----------------------------------------------
12867 $p = $this->addElement('p', 'Block', 'Inline', 'Common');
12868 $p->autoclose = array_flip(array("address", "blockquote", "center", "dir", "div", "dl", "fieldset", "ol", "p", "ul"));
12870 $this->addElement('div', 'Block', 'Flow', 'Common');
12881 * Abstract class for a set of proprietary modules that clean up (tidy)
12882 * poorly written HTML.
12883 * @todo Figure out how to protect some of these methods/properties
12885 class HTMLPurifier_HTMLModule_Tidy extends HTMLPurifier_HTMLModule
12889 * List of supported levels. Index zero is a special case "no fixes"
12892 public $levels = array(0 => 'none', 'light', 'medium', 'heavy');
12895 * Default level to place all fixes in. Disabled by default
12897 public $defaultLevel = null;
12900 * Lists of fixes used by getFixesForLevel(). Format is:
12901 * HTMLModule_Tidy->fixesForLevel[$level] = array('fix-1', 'fix-2');
12903 public $fixesForLevel = array(
12904 'light' => array(),
12905 'medium' => array(),
12910 * Lazy load constructs the module by determining the necessary
12911 * fixes to create and then delegating to the populate() function.
12912 * @todo Wildcard matching and error reporting when an added or
12913 * subtracted fix has no effect.
12915 public function setup($config) {
12917 // create fixes, initialize fixesForLevel
12918 $fixes = $this->makeFixes();
12919 $this->makeFixesForLevel($fixes);
12921 // figure out which fixes to use
12922 $level = $config->get('HTML.TidyLevel');
12923 $fixes_lookup = $this->getFixesForLevel($level);
12925 // get custom fix declarations: these need namespace processing
12926 $add_fixes = $config->get('HTML.TidyAdd');
12927 $remove_fixes = $config->get('HTML.TidyRemove');
12929 foreach ($fixes as $name => $fix) {
12930 // needs to be refactored a little to implement globbing
12932 isset($remove_fixes[$name]) ||
12933 (!isset($add_fixes[$name]) && !isset($fixes_lookup[$name]))
12935 unset($fixes[$name]);
12939 // populate this module with necessary fixes
12940 $this->populate($fixes);
12945 * Retrieves all fixes per a level, returning fixes for that specific
12946 * level as well as all levels below it.
12947 * @param $level String level identifier, see $levels for valid values
12948 * @return Lookup table of fixes
12950 public function getFixesForLevel($level) {
12951 if ($level == $this->levels[0]) {
12954 $activated_levels = array();
12955 for ($i = 1, $c = count($this->levels); $i < $c; $i++) {
12956 $activated_levels[] = $this->levels[$i];
12957 if ($this->levels[$i] == $level) break;
12961 'Tidy level ' . htmlspecialchars($level) . ' not recognized',
12967 foreach ($activated_levels as $level) {
12968 foreach ($this->fixesForLevel[$level] as $fix) {
12976 * Dynamically populates the $fixesForLevel member variable using
12977 * the fixes array. It may be custom overloaded, used in conjunction
12978 * with $defaultLevel, or not used at all.
12980 public function makeFixesForLevel($fixes) {
12981 if (!isset($this->defaultLevel)) return;
12982 if (!isset($this->fixesForLevel[$this->defaultLevel])) {
12984 'Default level ' . $this->defaultLevel . ' does not exist',
12989 $this->fixesForLevel[$this->defaultLevel] = array_keys($fixes);
12993 * Populates the module with transforms and other special-case code
12994 * based on a list of fixes passed to it
12995 * @param $fixes Lookup table of fixes to activate, keyed by fix name
12997 public function populate($fixes) {
12998 foreach ($fixes as $name => $fix) {
12999 // determine what the fix is for
13000 list($type, $params) = $this->getFixType($name);
13002 case 'attr_transform_pre':
13003 case 'attr_transform_post':
13004 $attr = $params['attr'];
13005 if (isset($params['element'])) {
13006 $element = $params['element'];
13007 if (empty($this->info[$element])) {
13008 $e = $this->addBlankElement($element);
13010 $e = $this->info[$element];
13013 $type = "info_$type";
13016 // PHP does some weird parsing when I do
13017 // $e->$type[$attr], so I have to assign a ref.
13021 case 'tag_transform':
13022 $this->info_tag_transform[$params['element']] = $fix;
13025 case 'content_model_type':
13026 $element = $params['element'];
13027 if (empty($this->info[$element])) {
13028 $e = $this->addBlankElement($element);
13030 $e = $this->info[$element];
13035 trigger_error("Fix type $type not supported", E_USER_ERROR);
13042 * Parses a fix name and determines what kind of fix it is, as well
13043 * as other information defined by the fix
13044 * @param $name String name of fix
13045 * @return array(string $fix_type, array $fix_parameters)
13046 * @note $fix_parameters is type dependent, see populate() for usage
13047 * of these parameters
13049 public function getFixType($name) {
13051 $property = $attr = null;
13052 if (strpos($name, '#') !== false) list($name, $property) = explode('#', $name);
13053 if (strpos($name, '@') !== false) list($name, $attr) = explode('@', $name);
13055 // figure out the parameters
13057 if ($name !== '') $params['element'] = $name;
13058 if (!is_null($attr)) $params['attr'] = $attr;
13060 // special case: attribute transform
13061 if (!is_null($attr)) {
13062 if (is_null($property)) $property = 'pre';
13063 $type = 'attr_transform_' . $property;
13064 return array($type, $params);
13067 // special case: tag transform
13068 if (is_null($property)) {
13069 return array('tag_transform', $params);
13072 return array($property, $params);
13077 * Defines all fixes the module will perform in a compact
13078 * associative array of fix name to fix implementation.
13080 public function makeFixes() {}
13088 class HTMLPurifier_HTMLModule_XMLCommonAttributes extends HTMLPurifier_HTMLModule
13090 public $name = 'XMLCommonAttributes';
13092 public $attr_collections = array(
13094 'xml:lang' => 'LanguageCode',
13104 * Name is deprecated, but allowed in strict doctypes, so only the 'heavy' level transforms it.
13106 class HTMLPurifier_HTMLModule_Tidy_Name extends HTMLPurifier_HTMLModule_Tidy
13108 public $name = 'Tidy_Name';
13109 public $defaultLevel = 'heavy';
13110 public function makeFixes() {
13114 // @name for img, a -----------------------------------------------
13115 // Technically, it's allowed even on strict, so we allow authors to use
13116 // it. However, it's deprecated in future versions of XHTML.
13118 $r['a@name'] = new HTMLPurifier_AttrTransform_Name();
13128 class HTMLPurifier_HTMLModule_Tidy_Proprietary extends HTMLPurifier_HTMLModule_Tidy
13131 public $name = 'Tidy_Proprietary';
13132 public $defaultLevel = 'light';
13134 public function makeFixes() {
13136 $r['table@background'] = new HTMLPurifier_AttrTransform_Background();
13137 $r['td@background'] = new HTMLPurifier_AttrTransform_Background();
13138 $r['th@background'] = new HTMLPurifier_AttrTransform_Background();
13139 $r['tr@background'] = new HTMLPurifier_AttrTransform_Background();
13140 $r['thead@background'] = new HTMLPurifier_AttrTransform_Background();
13141 $r['tfoot@background'] = new HTMLPurifier_AttrTransform_Background();
13142 $r['tbody@background'] = new HTMLPurifier_AttrTransform_Background();
13143 $r['table@height'] = new HTMLPurifier_AttrTransform_Length('height');
/**
 * Base set of Tidy fixes: maps deprecated tags and presentational
 * attributes onto tag/attribute transform objects.
 */
class HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 extends HTMLPurifier_HTMLModule_Tidy
{

    /**
     * Builds the fix table for this module.
     * @returns Associative array keyed by tag name ("font") or by
     *          "element@attribute" ("img@align"), whose values are the
     *          transform objects handling that tag or attribute.
     */
    public function makeFixes() {

        $fixes = array();

        // == deprecated tag transforms ===================================

        $underline   = 'text-decoration:underline;';
        $lineThrough = 'text-decoration:line-through;';

        $fixes['font']   = new HTMLPurifier_TagTransform_Font();
        $fixes['menu']   = new HTMLPurifier_TagTransform_Simple('ul');
        $fixes['dir']    = new HTMLPurifier_TagTransform_Simple('ul');
        $fixes['center'] = new HTMLPurifier_TagTransform_Simple('div', 'text-align:center;');
        $fixes['u']      = new HTMLPurifier_TagTransform_Simple('span', $underline);
        $fixes['s']      = new HTMLPurifier_TagTransform_Simple('span', $lineThrough);
        $fixes['strike'] = new HTMLPurifier_TagTransform_Simple('span', $lineThrough);

        // == deprecated attribute transforms =============================

        // @align for caption ---------------------------------------------
        // We're following IE's behavior, not Firefox's, due to the fact
        // that no one supports caption-side:right, W3C included (with
        // CSS 2.1). This is a slightly unreasonable attribute!
        $captionAlign = array(
            'left'   => 'text-align:left;',
            'right'  => 'text-align:right;',
            'top'    => 'caption-side:top;',
            'bottom' => 'caption-side:bottom;' // not supported by IE
        );
        $fixes['caption@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $captionAlign);

        // @align for img -------------------------------------------------
        $imgAlign = array(
            'left'   => 'float:left;',
            'right'  => 'float:right;',
            'top'    => 'vertical-align:top;',
            'middle' => 'vertical-align:middle;',
            'bottom' => 'vertical-align:baseline;',
        );
        $fixes['img@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $imgAlign);

        // @align for table -----------------------------------------------
        $tableAlign = array(
            'left'   => 'float:left;',
            'center' => 'margin-left:auto;margin-right:auto;',
            'right'  => 'float:right;'
        );
        $fixes['table@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $tableAlign);

        // @align for hr --------------------------------------------------
        // We use both text-align and margin because these work for
        // different browsers (IE and Firefox, respectively) and the
        // melange makes for a pretty cross-compatible solution.
        $hrAlign = array(
            'left'   => 'margin-left:0;margin-right:auto;text-align:left;',
            'center' => 'margin-left:auto;margin-right:auto;text-align:center;',
            'right'  => 'margin-left:auto;margin-right:0;text-align:right;'
        );
        $fixes['hr@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $hrAlign);

        // @align for h1, h2, h3, h4, h5, h6, p, div ----------------------
        $align_lookup = array();
        foreach (array('left', 'right', 'center', 'justify') as $v) {
            $align_lookup[$v] = "text-align:$v;";
        }
        // One transform instance is shared by all of these keys.
        $blockAlign = new HTMLPurifier_AttrTransform_EnumToCSS('align', $align_lookup);
        $fixes['h1@align'] =
        $fixes['h2@align'] =
        $fixes['h3@align'] =
        $fixes['h4@align'] =
        $fixes['h5@align'] =
        $fixes['h6@align'] =
        $fixes['p@align']  =
        $fixes['div@align'] = $blockAlign;

        // @bgcolor for table, td, th -------------------------------------
        $bgColor = new HTMLPurifier_AttrTransform_BgColor();
        $fixes['table@bgcolor'] =
        $fixes['td@bgcolor'] =
        $fixes['th@bgcolor'] = $bgColor;

        // @border for img ------------------------------------------------
        $fixes['img@border'] = new HTMLPurifier_AttrTransform_Border();

        // @clear for br --------------------------------------------------
        $brClear = array(
            'left'  => 'clear:left;',
            'right' => 'clear:right;',
            'all'   => 'clear:both;',
            'none'  => 'clear:none;',
        );
        $fixes['br@clear'] = new HTMLPurifier_AttrTransform_EnumToCSS('clear', $brClear);

        // @height for td, th ---------------------------------------------
        $cellHeight = new HTMLPurifier_AttrTransform_Length('height');
        $fixes['td@height'] =
        $fixes['th@height'] = $cellHeight;

        // @hspace for img ------------------------------------------------
        $fixes['img@hspace'] = new HTMLPurifier_AttrTransform_ImgSpace('hspace');

        // @noshade for hr ------------------------------------------------
        // This transformation is not precise but often good enough;
        // different browsers use different styles to designate noshade.
        $fixes['hr@noshade'] = new HTMLPurifier_AttrTransform_BoolToCSS(
            'noshade',
            'color:#808080;background-color:#808080;border:0;'
        );

        // @nowrap for td, th ---------------------------------------------
        $cellNoWrap = new HTMLPurifier_AttrTransform_BoolToCSS(
            'nowrap',
            'white-space:nowrap;'
        );
        $fixes['td@nowrap'] =
        $fixes['th@nowrap'] = $cellNoWrap;

        // @size for hr ---------------------------------------------------
        $fixes['hr@size'] = new HTMLPurifier_AttrTransform_Length('size', 'height');

        // @type for li, ol, ul -------------------------------------------
        $ul_types = array(
            'disc'   => 'list-style-type:disc;',
            'square' => 'list-style-type:square;',
            'circle' => 'list-style-type:circle;'
        );
        $ol_types = array(
            '1' => 'list-style-type:decimal;',
            'i' => 'list-style-type:lower-roman;',
            'I' => 'list-style-type:upper-roman;',
            'a' => 'list-style-type:lower-alpha;',
            'A' => 'list-style-type:upper-alpha;'
        );
        // li accepts the union of both sets.
        $li_types = $ul_types + $ol_types;

        $fixes['ul@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ul_types);
        $fixes['ol@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ol_types, true);
        $fixes['li@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $li_types, true);

        // @vspace for img ------------------------------------------------
        $fixes['img@vspace'] = new HTMLPurifier_AttrTransform_ImgSpace('vspace');

        // @width for hr, td, th ------------------------------------------
        $width = new HTMLPurifier_AttrTransform_Length('width');
        $fixes['td@width'] =
        $fixes['th@width'] =
        $fixes['hr@width'] = $width;

        return $fixes;

    }

}
/**
 * Tidy module that extends the XHTMLAndHTML4 fixes with a
 * "strictblockquote" content model for blockquote.
 */
class HTMLPurifier_HTMLModule_Tidy_Strict extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
{

    public $name = 'Tidy_Strict';
    public $defaultLevel = 'light';

    /** Signals that this module provides getChildDef(). */
    public $defines_child_def = true;

    /**
     * Adds the blockquote content-model override to the inherited fixes.
     * @returns Associative array of fixes.
     */
    public function makeFixes() {
        $fixes = parent::makeFixes();
        $fixes['blockquote#content_model_type'] = 'strictblockquote';
        return $fixes;
    }

    /**
     * Supplies the child definition for the 'strictblockquote' content
     * model type; every other type is delegated to the parent.
     * @param $def Element definition carrying content_model_type and
     *             content_model.
     * @returns Child definition object for the element.
     */
    public function getChildDef($def) {
        if ($def->content_model_type == 'strictblockquote') {
            return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model);
        }
        return parent::getChildDef($def);
    }

}
/**
 * Tidy module that reuses the XHTMLAndHTML4 fixes unchanged, under its own
 * name and with a default level of 'heavy'.
 */
class HTMLPurifier_HTMLModule_Tidy_Transitional extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
{

    public $name = 'Tidy_Transitional';

    public $defaultLevel = 'heavy';

}
/**
 * Tidy module that registers the Lang attribute transform for the global
 * "@lang" fix.
 */
class HTMLPurifier_HTMLModule_Tidy_XHTML extends HTMLPurifier_HTMLModule_Tidy
{

    public $name = 'Tidy_XHTML';
    public $defaultLevel = 'medium';

    /**
     * Builds the fix table for this module.
     * @returns Associative array containing the single '@lang' fix.
     */
    public function makeFixes() {
        return array(
            '@lang' => new HTMLPurifier_AttrTransform_Lang(),
        );
    }

}
/**
 * Injector that auto paragraphs text in the root node based on
 * double-spacing.
 * @todo Ensure all states are unit tested, including variations as well.
 * @todo Make a graph of the flow control for this Injector.
 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{

    public $name = 'AutoParagraph';
    public $needed = array('p');

    /**
     * Creates a <p> start token with the MakeWellFormed_TagClosedError
     * armor flag set.
     */
    private function _pStart() {
        $start = new HTMLPurifier_Token_Start('p');
        $start->armor['MakeWellFormed_TagClosedError'] = true;
        return $start;
    }

    /**
     * Handles a text token, replacing it by reference with an array of
     * tokens when paragraph tags have to be inserted around or inside it.
     * @param $token Text token; may be replaced with an array of tokens.
     */
    public function handleText(&$token) {
        $text = $token->data;

        if (!$this->allowsElement('p')) {
            // The parent does not allow <p>. Is the parent itself a <p>?
            $inParagraph =
                !empty($this->currentNesting) &&
                $this->currentNesting[count($this->currentNesting)-1]->name == 'p';
            if ($inParagraph) {
                // State 3.1: ...<p>PAR1
                // State 3.2: ...<p>PAR1\n\nPAR2
                $token = array();
                $this->_splitText($text, $token);
            }
            // Otherwise abort:
            // State 4.1: ...<b>PAR1
            // State 4.2: ...<b>PAR1\n\nPAR2
            return;
        }

        if (!empty($this->currentNesting) && strpos($text, "\n\n") === false) {
            // State 2: <div>PAR1... (similar to 1.4)
            // We're in an element that allows paragraph tags, but we're not
            // sure if we're going to need them.
            if ($this->_pLookAhead()) {
                // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                // Note: This will always be the first child, since any
                // previous inline element would have triggered this very
                // same routine, and found the double newline. One possible
                // exception would be a comment.
                $token = array($this->_pStart(), $token);
            }
            // State 2.2.1: <div>PAR1<div>
            // State 2.2.2: <div>PAR1<b>PAR1</b></div>
            return;
        }

        // Either we are in the anonymous root node, or the text has a
        // double newline; in the latter case the treatment is the same
        // whether or not we are inside the document.
        $i = $nesting = null;
        $hasFollowing = $this->forwardUntilEndToken($i, $current, $nesting);
        if (!$hasFollowing && $token->is_whitespace) {
            // State 1.1: ... ^ (whitespace, then document end)
            // This is a degenerate case
            return;
        }
        if (!$token->is_whitespace || $this->_isInline($current)) {
            // State 1.2: PAR1
            // State 1.3: PAR1\n\nPAR2
            // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
            $token = array($this->_pStart());
            $this->_splitText($text, $token);
        }
        // State 1.5: \n<hr />
    }

    /**
     * Handles an element token, prepending a <p> start token and/or a
     * double-newline text token when needed.
     * @param $token Element token; may be replaced with an array of tokens.
     */
    public function handleElement(&$token) {
        // We don't have to check if we're already in a <p> tag for block
        // tokens, because the tag would have been autoclosed by MakeWellFormed.
        if (!$this->allowsElement('p')) {
            // State 2.2: <ul><li>
            // State 2.4: <p><b>
            return;
        }

        if (!empty($this->currentNesting)) {
            if (!$this->_isInline($token)) {
                // State 2.3: ...<div>
                return;
            }

            // State 1: <div>...<b>
            // Check if this token is adjacent to the parent token
            // (seek backwards until token isn't whitespace)
            $i = null;
            $this->backward($i, $prev);

            if ($prev instanceof HTMLPurifier_Token_Start) {
                // State 1.2.1: <div><b>
                // Lookahead to see if <p> is needed.
                // State 1.3.1: <div><b>PAR1\n\nPAR2           (needed)
                // State 1.3.2: <div><b>PAR1</b></div>         (not needed)
                // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                $needsParagraph = $this->_pLookAhead();
            } else {
                // Token wasn't adjacent.
                // State 1.1.4: <div><p>PAR1</p>\n\n<b>        (needed)
                // Quite frankly, this should be handled by splitText
                // State 1.1.1: <div><p>PAR1</p><b>            (not needed)
                // State 1.1.2: <div><br /><b>
                // State 1.1.3: <div>PAR<b>
                $needsParagraph =
                    $prev instanceof HTMLPurifier_Token_Text &&
                    substr($prev->data, -2) === "\n\n";
            }

            if ($needsParagraph) {
                $token = array($this->_pStart(), $token);
            }
            return;
        }

        // We are in the root node.
        if ($this->_isInline($token)) {
            // State 3.1: <b>
            // This is where the {p} tag is inserted, not reflected in
            // inputTokens yet, however.
            $token = array($this->_pStart(), $token);
        }
        // State 3.2: <div>

        $i = null;
        if ($this->backward($i, $prev) && !($prev instanceof HTMLPurifier_Token_Text)) {
            // State 3.1.1: ...</p>{p}<b>
            // State 3.2.1: ...</p><div>
            if (!is_array($token)) {
                $token = array($token);
            }
            array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
        }
        // State 3.1.2: ...</p>\n\n{p}<b>
        // State 3.2.2: ...</p>\n\n<div>
        // Note: PAR<ELEM> cannot occur because PAR would have been
        // wrapped in <p> tags.
    }

    /**
     * Splits up a text in paragraph tokens and appends them
     * to the result stream that will replace the original
     * @param $data String text data that will be processed
     *    into paragraphs
     * @param $result Reference to array of tokens that the
     *    tags will be appended onto
     */
    private function _splitText($data, &$result) {
        $pieces = explode("\n\n", $data);
        $total  = count($pieces);

        if ($total == 1) {
            // There were no double-newlines, abort quickly. In theory this
            // should never happen.
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }

        $paragraphs  = array(); // without empty paragraphs
        $needs_start = false;
        $needs_end   = false;
        $last        = $total - 1;

        foreach ($pieces as $index => $piece) {
            if (trim($piece) !== '') {
                $paragraphs[] = $piece;
                continue;
            }
            if ($index == 0) {
                // Double newline at the front
                if (empty($result)) {
                    // The empty result indicates that the AutoParagraph
                    // injector did not add any start paragraph tokens.
                    // This means that we have been in a paragraph for
                    // a while, and the newline means we should start a new one.
                    $result[] = new HTMLPurifier_Token_End('p');
                    $result[] = new HTMLPurifier_Token_Text("\n\n");
                    // However, the start token should only be added if
                    // there is more processing to be done (i.e. there are
                    // real paragraphs in here). If there are none, the
                    // next start paragraph tag will be handled by the
                    // next call to the injector
                    $needs_start = true;
                } else {
                    // We just started a new paragraph!
                    // Reinstate a double-newline for presentation's sake, since
                    // it was in the source code.
                    array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                }
            } elseif ($index == $last) {
                // Double newline at the end
                // There should be a trailing </p> when we're finally done.
                $needs_end = true;
            }
        }

        // Check if this was just a giant blob of whitespace. Move this earlier,
        // perhaps?
        if (empty($paragraphs)) {
            return;
        }

        // Add the start tag indicated by \n\n at the beginning of $data
        if ($needs_start) {
            $result[] = $this->_pStart();
        }

        // Append the paragraphs onto the result
        foreach ($paragraphs as $paragraph) {
            $result[] = new HTMLPurifier_Token_Text($paragraph);
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
            $result[] = $this->_pStart();
        }

        // Remove trailing start token; Injector will handle this later if
        // it was indeed needed. This prevents from needing to do a lookahead,
        // at the cost of a lookbehind later.
        array_pop($result);

        // If there is no need for an end tag, remove all of it and let
        // MakeWellFormed close it later.
        if (!$needs_end) {
            array_pop($result); // removes \n\n
            array_pop($result); // removes </p>
        }
    }

    /**
     * Returns true if passed token is inline (and, ergo, allowed in
     * paragraph tags)
     */
    private function _isInline($token) {
        $allowedInParagraph = $this->htmlDefinition->info['p']->child->elements;
        return isset($allowedInParagraph[$token->name]);
    }

    /**
     * Looks ahead in the token list and determines whether or not we need
     * to insert a <p> tag.
     */
    private function _pLookAhead() {
        $this->current($i, $current);
        $nesting = ($current instanceof HTMLPurifier_Token_Start) ? 1 : 0;
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {
            $verdict = $this->_checkNeedsP($current);
            if ($verdict !== null) {
                return $verdict;
            }
        }
        return false;
    }

    /**
     * Determines if a particular token requires an earlier inline token
     * to get a paragraph. This should be used with forwardUntilEndToken.
     * @returns true if a paragraph is needed, false if it is not, null if
     *          this token does not decide the matter.
     */
    private function _checkNeedsP($current) {
        if ($current instanceof HTMLPurifier_Token_Start) {
            if (!$this->_isInline($current)) {
                // <div>PAR1<div>
                // Terminate early, since we hit a block element
                return false;
            }
            return null;
        }
        if ($current instanceof HTMLPurifier_Token_Text) {
            if (strpos($current->data, "\n\n") !== false) {
                // <div>PAR1<b>PAR1\n\nPAR2
                return true;
            }
            // <div>PAR1<b>PAR1...
        }
        return null;
    }

}
/**
 * Injector that displays the URL of an anchor instead of linking to it, in
 * addition to showing the text of the link.
 */
class HTMLPurifier_Injector_DisplayLinkURI extends HTMLPurifier_Injector
{

    public $name = 'DisplayLinkURI';
    public $needed = array('a');

    /**
     * Intentionally empty; the work happens in handleEnd().
     */
    public function handleElement(&$token) {
    }

    /**
     * If the matching start token has an href, removes it from the start
     * token and appends " (url)" as a text token after this end token.
     * @param $token End token; replaced with array(end token, text token)
     *               when an href was present.
     */
    public function handleEnd(&$token) {
        $start = $token->start;
        if (!isset($start->attr['href'])) {
            // nothing to display
            return;
        }
        $url = $start->attr['href'];
        unset($start->attr['href']);
        $token = array($token, new HTMLPurifier_Token_Text(" ($url)"));
    }

}
/**
 * Injector that converts http, https and ftp text URLs to actual links.
 */
class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector
{

    public $name = 'Linkify';
    public $needed = array('a' => array('href'));

    /**
     * Splits a text token around URLs and replaces it by reference with an
     * array of text tokens and <a href="..."> start/text/end token triples.
     * The token is left untouched when <a> is not allowed here, when the
     * text contains no "://", or when the split fails.
     * @param $token Text token; may be replaced with an array of tokens.
     */
    public function handleText(&$token) {
        if (!$this->allowsElement('a')) return;

        if (strpos($token->data, '://') === false) {
            // our really quick heuristic failed, abort
            // this may not work so well if we want to match things like
            // "google.com", but then again, most people don't
            return;
        }

        // there is/are URL(s). Let's split the string:
        // Note: this regex is extremely permissive
        $bits = preg_split('#((?:https?|ftp)://[^\s\'"<>()]+)#S', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);

        // preg_split() returns false on a PCRE error; keep the original
        // token rather than counting/indexing a boolean.
        if ($bits === false) return;

        $token = array();

        // With PREG_SPLIT_DELIM_CAPTURE the pieces alternate between plain
        // text (even offsets) and captured URLs (odd offsets).
        // $i = index
        // $c = count
        // $l = is link
        for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) {
            if (!$l) {
                if ($bits[$i] === '') continue;
                $token[] = new HTMLPurifier_Token_Text($bits[$i]);
            } else {
                $token[] = new HTMLPurifier_Token_Start('a', array('href' => $bits[$i]));
                $token[] = new HTMLPurifier_Token_Text($bits[$i]);
                $token[] = new HTMLPurifier_Token_End('a');
            }
        }
    }

}
/**
 * Injector that converts configuration directive syntax %Namespace.Directive
 * to links
 */
class HTMLPurifier_Injector_PurifierLinkify extends HTMLPurifier_Injector
{

    public $name = 'PurifierLinkify';
    /** URL template read from AutoFormat.PurifierLinkify.DocURL; "%s" is replaced by the directive name. */
    public $docURL;
    public $needed = array('a' => array('href'));

    /**
     * Reads the documentation URL template from the configuration, then
     * defers to the parent implementation.
     */
    public function prepare($config, $context) {
        $this->docURL = $config->get('AutoFormat.PurifierLinkify.DocURL');
        return parent::prepare($config, $context);
    }

    /**
     * Splits a text token around %Namespace.Directive references and
     * replaces it by reference with an array of text tokens and link
     * start/text/end token triples. The token is left untouched when <a>
     * is not allowed here, when the text contains no "%", or when the
     * split fails.
     * @param $token Text token; may be replaced with an array of tokens.
     */
    public function handleText(&$token) {
        if (!$this->allowsElement('a')) return;
        if (strpos($token->data, '%') === false) return;

        $bits = preg_split('#%([a-z0-9]+\.[a-z0-9]+)#Si', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);

        // preg_split() returns false on a PCRE error; keep the original
        // token rather than counting/indexing a boolean.
        if ($bits === false) return;

        $token = array();

        // With PREG_SPLIT_DELIM_CAPTURE the pieces alternate between plain
        // text (even offsets) and captured directive names (odd offsets).
        // $i = index
        // $c = count
        // $l = is link
        for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) {
            if (!$l) {
                if ($bits[$i] === '') continue;
                $token[] = new HTMLPurifier_Token_Text($bits[$i]);
            } else {
                $token[] = new HTMLPurifier_Token_Start('a',
                    array('href' => str_replace('%s', $bits[$i], $this->docURL)));
                $token[] = new HTMLPurifier_Token_Text('%' . $bits[$i]);
                $token[] = new HTMLPurifier_Token_End('a');
            }
        }
    }

}
/**
 * Injector that removes start/end tag pairs which enclose nothing but
 * ignorable text.
 */
class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
{

    private $context, $config, $attrValidator, $removeNbsp, $removeNbspExceptions;

    /**
     * Caches the config/context and the two RemoveNbsp directives, and
     * creates the attribute validator used by handleElement().
     */
    public function prepare($config, $context) {
        parent::prepare($config, $context);
        $this->config  = $config;
        $this->context = $context;
        $this->attrValidator = new HTMLPurifier_AttrValidator();
        $this->removeNbsp = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp');
        $this->removeNbspExceptions = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp.Exceptions');
    }

    /**
     * Checks whether a start token is followed (ignoring whitespace and,
     * optionally, non-breaking spaces) by its own end token or by nothing.
     * If so, and the element is not a colgroup and has no id/name after
     * attribute validation, $token is set to the number of tokens to
     * remove and the stream is rewound.
     * @param $token Element token; may be replaced with an integer count.
     */
    public function handleElement(&$token) {
        if (!($token instanceof HTMLPurifier_Token_Start)) return;

        // Find the first following token that is not ignorable text.
        $next  = false;
        $total = count($this->inputTokens);
        for ($i = $this->inputIndex + 1; $i < $total; $i++) {
            $next = $this->inputTokens[$i];
            if (!($next instanceof HTMLPurifier_Token_Text)) break;
            if ($next->is_whitespace) continue;
            $nbspIgnorable = $this->removeNbsp && !isset($this->removeNbspExceptions[$token->name]);
            if (!$nbspIgnorable) break;
            // "\xC2\xA0" is the UTF-8 encoding of a non-breaking space.
            $plain = str_replace("\xC2\xA0", "", $next->data);
            if ($plain === '' || ctype_space($plain)) continue;
            break;
        }

        $closedImmediately =
            $next instanceof HTMLPurifier_Token_End && $next->name == $token->name;
        if ($next && !$closedImmediately) return;

        if ($token->name == 'colgroup') return;

        $this->attrValidator->validateToken($token, $this->config, $this->context);
        $token->armor['ValidateAttributes'] = true;
        if (isset($token->attr['id']) || isset($token->attr['name'])) return;

        // Number of tokens spanned, from the start token through $i.
        $token = $i - $this->inputIndex + 1;

        // Walk back over preceding whitespace text tokens.
        $b = $this->inputIndex - 1;
        while ($b > 0) {
            $prev = $this->inputTokens[$b];
            if (!($prev instanceof HTMLPurifier_Token_Text && $prev->is_whitespace)) break;
            $b--;
        }

        // This is safe because we removed the token that triggered this.
        $this->rewind($b - 1);
    }

}
/**
 * Injector that removes spans with no attributes
 */
class HTMLPurifier_Injector_RemoveSpansWithoutAttributes extends HTMLPurifier_Injector
{
    public $name = 'RemoveSpansWithoutAttributes';
    public $needed = array('span');

    private $attrValidator;

    /**
     * Used by AttrValidator
     */
    private $config;
    private $context;

    /**
     * Creates the attribute validator and caches config/context for it,
     * then defers to the parent implementation.
     */
    public function prepare($config, $context) {
        $this->attrValidator = new HTMLPurifier_AttrValidator();
        $this->config = $config;
        $this->context = $context;
        return parent::prepare($config, $context);
    }

    /**
     * Removes a <span> start token that has no attributes left after
     * validation, and flags its matching end token for removal by
     * handleEnd().
     * @param $token Element token; set to false to delete it.
     */
    public function handleElement(&$token) {
        if ($token->name !== 'span' || !$token instanceof HTMLPurifier_Token_Start) {
            return;
        }

        // We need to validate the attributes now since this doesn't normally
        // happen until after MakeWellFormed. If all the attributes are removed
        // the span needs to be removed too.
        $this->attrValidator->validateToken($token, $this->config, $this->context);
        $token->armor['ValidateAttributes'] = true;

        if (!empty($token->attr)) {
            return;
        }

        // Advance to the token on which forwardUntilEndToken() stops; all
        // three variables are filled in by reference.
        $i = null;
        $current = null;
        $nesting = 0;
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {}

        if ($current instanceof HTMLPurifier_Token_End && $current->name === 'span') {
            // Mark closing span tag for deletion
            $current->markForDeletion = true;
            // Delete open span tag
            $token = false;
        }
    }

    /**
     * Deletes end tokens previously flagged by handleElement().
     * @param $token End token; set to false to delete it.
     */
    public function handleEnd(&$token) {
        // Most end tokens never had the flag set, so test it with empty()
        // instead of reading a property that may not exist.
        if (!empty($token->markForDeletion)) {
            $token = false;
        }
    }
}
/**
 * Adds important param elements to inside of object in order to make
 * things safe.
 */
class HTMLPurifier_Injector_SafeObject extends HTMLPurifier_Injector
{
    public $name = 'SafeObject';
    public $needed = array('object', 'param');

    /** Stack of currently open <object> start tokens. */
    protected $objectStack = array();
    /** Parallel stack of per-object arrays recording seen param names. */
    protected $paramStack  = array();

    // Keep this synchronized with AttrTransform/SafeParam.php
    protected $addParam = array(
        'allowScriptAccess' => 'never',
        'allowNetworking' => 'internal',
    );
    protected $allowedParam = array(
        'wmode' => true,
        'movie' => true,
        'flashvars' => true,
        'src' => true,
        'allowFullScreen' => true, // if omitted, assume to be 'false'
    );

    public function prepare($config, $context) {
        parent::prepare($config, $context);
    }

    /**
     * For <object>: pushes the token onto the stacks and appends one
     * <param> per entry of $addParam directly after it.
     * For <param>: keeps it only when it sits directly inside an <object>
     * and its name is acceptable; otherwise deletes it.
     * @param $token Element token; may be replaced with an array of tokens
     *               or set to false to delete it.
     */
    public function handleElement(&$token) {
        if ($token->name == 'object') {
            $this->objectStack[] = $token;
            $this->paramStack[] = array();
            $new = array($token);
            foreach ($this->addParam as $name => $value) {
                $new[] = new HTMLPurifier_Token_Empty('param', array('name' => $name, 'value' => $value));
            }
            $token = $new;
        } elseif ($token->name == 'param') {
            $nest = count($this->currentNesting) - 1;
            if ($nest >= 0 && $this->currentNesting[$nest]->name === 'object') {
                $i = count($this->objectStack) - 1;
                if (!isset($token->attr['name'])) {
                    $token = false;
                    return;
                }
                $n = $token->attr['name'];
                // We need this fix because YouTube doesn't supply a data
                // attribute, which we need if a type is specified. This is
                // *very* Flash specific.
                // A param without a value attribute has nothing to copy, so
                // it is skipped instead of reading an undefined index.
                if (!isset($this->objectStack[$i]->attr['data']) &&
                    ($n == 'movie' || $n == 'src') &&
                    isset($token->attr['value'])) {
                    $this->objectStack[$i]->attr['data'] = $token->attr['value'];
                }
                // Check if the parameter is the correct value but has not
                // already been added
                // NOTE(review): this compares the param *name* with the
                // required *value*, so with the default $addParam it is
                // never true and user-supplied copies of those params are
                // dropped further down. Left unchanged on purpose: the
                // required params are already injected after <object>, so
                // comparing the value here would presumably emit
                // duplicates -- confirm the intent before changing it.
                if (
                    !isset($this->paramStack[$i][$n]) &&
                    isset($this->addParam[$n]) &&
                    $token->attr['name'] === $this->addParam[$n]
                ) {
                    // keep token, and add to param stack
                    $this->paramStack[$i][$n] = true;
                } elseif (isset($this->allowedParam[$n])) {
                    // keep token, don't do anything to it
                    // (could possibly check for duplicates here)
                } else {
                    $token = false;
                }
            } else {
                // not directly inside an object, DENY!
                $token = false;
            }
        }
    }

    /**
     * Pops the object/param stacks when an </object> is reached.
     */
    public function handleEnd(&$token) {
        // This is the WRONG way of handling the object and param stacks;
        // we should be inserting them directly on the relevant object tokens
        // so that the global stack handling handles it.
        if ($token->name == 'object') {
            array_pop($this->objectStack);
            array_pop($this->paramStack);
        }
    }

}
14037 * Parser that uses PHP 5's DOM extension (part of the core).
14039 * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
14040 * It gives us a forgiving HTML parser, which we use to transform the HTML
14041 * into a DOM, and then into the tokens. It is blazingly fast (for large
14042 * documents, it performs twenty times faster than
14043 * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
14045 * @note Any empty elements will have empty tokens associated with them, even if
14046 * this is prohibited by the spec. This cannot be fixed until the spec
14049 * @note PHP's DOM extension does not actually parse any entities, we use
14050 * our own function to do that.
14052 * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
14053 * If this is a huge problem, due to the fact that HTML is hand
14054 * edited and you are unable to get a parser cache that caches the
14055 * the output of HTML Purifier while keeping the original HTML lying
14056 * around, you may want to run Tidy on the resulting output or use
14057 * HTMLPurifier_DirectLex
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /** Token factory used to create every token emitted by this lexer. */
    private $factory;

    public function __construct() {
        // setup the factory
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Tokenizes an HTML fragment by loading it into a DOMDocument and
     * walking the resulting tree.
     * @param $html    String HTML fragment.
     * @param $config  Configuration object; Core.AggressivelyFixLt is read.
     * @param $context Context object, passed on to normalize()/wrapHTML().
     * @returns Array-list of tokens.
     */
    public function tokenizeHTML($html, $config, $context) {

        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            // 1. protect ampersands inside comments
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // 2. escape every "<" that is not followed by a letter, "!" or
            //    "/", repeating until nothing changes
            do {
                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            // 3. undo both substitutions inside comments
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // silence libxml's warnings about malformed markup while loading
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $tokens = array();
        $this->tokenizeDOM(
            $doc->getElementsByTagName('html')->item(0)-> // <html>
                  getElementsByTagName('body')->item(0)-> //   <body>
                  getElementsByTagName('div')->item(0)    //     <div>
            , $tokens);
        return $tokens;
    }

    /**
     * Iterative function that tokenizes a node, putting it into an accumulator.
     * To iterate is human, to recurse divine - L. Peter Deutsch
     * @param $node     DOMNode to be tokenized.
     * @param $tokens   Array-list of already tokenized tokens.
     * @returns Tokens of node appended to previously passed tokens.
     */
    protected function tokenizeDOM($node, &$tokens) {

        $level = 0;
        $nodes = array($level => array($node));
        $closingNodes = array();
        do {
            while (!empty($nodes[$level])) {
                $node = array_shift($nodes[$level]); // FIFO
                $collect = $level > 0 ? true : false;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    $level++;
                    $nodes[$level] = array();
                    foreach ($node->childNodes as $childNode) {
                        array_push($nodes[$level], $childNode);
                    }
                }
            }
            $level--;
            if ($level && isset($closingNodes[$level])) {
                while($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /**
     * @param $node     DOMNode to be tokenized.
     * @param $tokens   Array-list of already tokenized tokens.
     * @param $collect  Says whether or not start and close are collected,
     *                  set to false at first recursion because it's the
     *                  implicit DIV tag you're dealing with.
     * @returns bool if the token needs an endtoken
     */
    protected function createStartNode($node, &$tokens, $collect) {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return false;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    $data = substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    } else {
                        // Highly suspicious! Not sure what to do...
                    }
                }
            }
            $tokens[] = $this->factory->createText($this->parseData($data));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this code is only invoked for comments in script/style in versions
            // of libxml pre-2.6.28 (regular comments, of course, are still
            // handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        } elseif (
            // not-well tested: there may be other nodes we have to grab
            $node->nodeType !== XML_ELEMENT_NODE
        ) {
            return false;
        }

        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
            }
            return false;
        } else {
            if ($collect) {
                $tokens[] = $this->factory->createStart(
                    $tag_name = $node->tagName, // somehow, it gets dropped
                    $attr
                );
            }
            return true;
        }
    }

    /**
     * Appends the end token for an element node.
     * @param $node   DOMNode whose end tag is emitted.
     * @param $tokens Array-list of already tokenized tokens.
     */
    protected function createEndNode($node, &$tokens) {
        $tokens[] = $this->factory->createEnd($node->tagName);
    }


    /**
     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
     *
     * @param $node_map DOMNamedNodeMap of DOMAttr objects.
     * @returns Associative array of attributes.
     */
    protected function transformAttrToAssoc($node_map) {
        // NamedNodeMap is not documented very well, so we're using
        // undocumented features, namely, the fact that it implements
        // Iterator and has a ->length attribute
        if ($node_map->length === 0) return array();
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * An error handler that mutes all errors
     */
    public function muteErrorHandler($errno, $errstr) {}

    /**
     * Callback function for undoing escaping of stray angled brackets
     * in comments
     * @param $matches Regex match: [1] is the comment body, [2] its
     *                 terminator ("-->" or empty at end of input).
     */
    public function callbackUndoCommentSubst($matches) {
        return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
    }

    /**
     * Callback function that entity-izes ampersands in comments so that
     * callbackUndoCommentSubst doesn't clobber them
     * @param $matches Regex match: [1] is the comment body, [2] its
     *                 terminator ("-->" or empty at end of input).
     */
    public function callbackArmorCommentEntities($matches) {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in the necessary HTML
     * @param $html    String HTML fragment.
     * @param $config  Configuration object; the HTML definition's doctype
     *                 identifiers are read from it.
     * @param $context Context object (unused here).
     * @returns String full HTML document with the fragment inside a <div>.
     */
    protected function wrapHTML($html, $config, $context) {
        $def = $config->getDefinition('HTML');
        $ret = '';

        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // No protection if $html contains a stray </div>!
        $ret .= '</head><body><div>'.$html.'</div></body></html>';
        return $ret;
    }

}
14281 * Our in-house implementation of a parser.
14283 * A pure PHP parser, DirectLex has absolutely no dependencies, making
14284 * it a reasonably good default for PHP4. Written with efficiency in mind,
14285 * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
14286 * pales in comparison to HTMLPurifier_Lexer_DOMLex.
14288 * @todo Reread XML spec and document differences.
14290 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
14293 public $tracksLineNumbers = true;
14296 * Whitespace characters for str(c)spn.
14298 protected $_whitespace = "\x20\x09\x0D\x0A";
14301 * Callback function for script CDATA fudge
14302 * @param $matches, in form of array(opening tag, contents, closing tag)
14304 protected function scriptCallback($matches) {
14305 return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
14308 public function tokenizeHTML($html, $config, $context) {
14310 // special normalization for script tags without any armor
14311 // our "armor" heuristic is a < sign any number of whitespaces after
14312 // the first script tag
14313 if ($config->get('HTML.Trusted')) {
14314 $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
14315 array($this, 'scriptCallback'), $html);
14318 $html = $this->normalize($html, $config, $context);
14320 $cursor = 0; // our location in the text
14321 $inside_tag = false; // whether or not we're parsing the inside of a tag
14322 $array = array(); // result array
14324 // This is also treated to mean maintain *column* numbers too
14325 $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
14327 if ($maintain_line_numbers === null) {
14328 // automatically determine line numbering by checking
14329 // if error collection is on
14330 $maintain_line_numbers = $config->get('Core.CollectErrors');
14333 if ($maintain_line_numbers) {
14336 $length = strlen($html);
14338 $current_line = false;
14339 $current_col = false;
14342 $context->register('CurrentLine', $current_line);
14343 $context->register('CurrentCol', $current_col);
14345 // how often to manually recalculate. This will ALWAYS be right,
14346 // but it's pretty wasteful. Set to 0 to turn off
14347 $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
14350 if ($config->get('Core.CollectErrors')) {
14351 $e =& $context->get('ErrorCollector');
14354 // for testing synchronization
14359 // $cursor is either at the start of a token, or inside of
14360 // a tag (i.e. there was a < immediately before it), as indicated
14363 if ($maintain_line_numbers) {
14365 // $rcursor, however, is always at the start of a token.
14366 $rcursor = $cursor - (int) $inside_tag;
14368 // Column number is cheap, so we calculate it every round.
14369 // We're interested at the *end* of the newline string, so
14370 // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
14371 // from our "rcursor" position.
14372 $nl_pos = strrpos($html, $nl, $rcursor - $length);
14373 $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
14375 // recalculate lines
14377 $synchronize_interval && // synchronization is on
14378 $cursor > 0 && // cursor is further than zero
14379 $loops % $synchronize_interval === 0 // time to synchronize!
14381 $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
14386 $position_next_lt = strpos($html, '<', $cursor);
14387 $position_next_gt = strpos($html, '>', $cursor);
14389 // triggers on "<b>asdf</b>" but not "asdf <b></b>"
14390 // special case to set up context
14391 if ($position_next_lt === $cursor) {
14392 $inside_tag = true;
14396 if (!$inside_tag && $position_next_lt !== false) {
14397 // We are not inside tag and there still is another tag to parse
14399 HTMLPurifier_Token_Text(
14402 $html, $cursor, $position_next_lt - $cursor
14406 if ($maintain_line_numbers) {
14407 $token->rawPosition($current_line, $current_col);
14408 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
14411 $cursor = $position_next_lt + 1;
14412 $inside_tag = true;
14414 } elseif (!$inside_tag) {
14415 // We are not inside tag but there are no more tags
14416 // If we're already at the end, break
14417 if ($cursor === strlen($html)) break;
14418 // Create Text of rest of string
14420 HTMLPurifier_Token_Text(
14427 if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
14430 } elseif ($inside_tag && $position_next_gt !== false) {
14431 // We are in tag and it is well formed
14432 // Grab the internals of the tag
14433 $strlen_segment = $position_next_gt - $cursor;
14435 if ($strlen_segment < 1) {
14436 // there's nothing to process!
14437 $token = new HTMLPurifier_Token_Text('<');
14442 $segment = substr($html, $cursor, $strlen_segment);
14444 if ($segment === false) {
14445 // somehow, we attempted to access beyond the end of
14446 // the string, defense-in-depth, reported by Nate Abele
14450 // Check if it's a comment
14452 substr($segment, 0, 3) === '!--'
14454 // re-determine segment length, looking for -->
14455 $position_comment_end = strpos($html, '-->', $cursor);
14456 if ($position_comment_end === false) {
14457 // uh oh, we have a comment that extends to
14458 // infinity. Can't be helped: set comment
14459 // end position to end of string
14460 if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
14461 $position_comment_end = strlen($html);
14466 $strlen_segment = $position_comment_end - $cursor;
14467 $segment = substr($html, $cursor, $strlen_segment);
14469 HTMLPurifier_Token_Comment(
14471 $segment, 3, $strlen_segment - 3
14474 if ($maintain_line_numbers) {
14475 $token->rawPosition($current_line, $current_col);
14476 $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
14479 $cursor = $end ? $position_comment_end : $position_comment_end + 3;
14480 $inside_tag = false;
14484 // Check if it's an end tag
14485 $is_end_tag = (strpos($segment,'/') === 0);
14487 $type = substr($segment, 1);
14488 $token = new HTMLPurifier_Token_End($type);
14489 if ($maintain_line_numbers) {
14490 $token->rawPosition($current_line, $current_col);
14491 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
14494 $inside_tag = false;
14495 $cursor = $position_next_gt + 1;
14499 // Check leading character is alnum, if not, we may
14500 // have accidently grabbed an emoticon. Translate into
14501 // text and go our merry way
14502 if (!ctype_alpha($segment[0])) {
14503 // XML: $segment[0] !== '_' && $segment[0] !== ':'
14504 if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
14505 $token = new HTMLPurifier_Token_Text('<');
14506 if ($maintain_line_numbers) {
14507 $token->rawPosition($current_line, $current_col);
14508 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
14511 $inside_tag = false;
14515 // Check if it is explicitly self closing, if so, remove
14516 // trailing slash. Remember, we could have a tag like <br>, so
14517 // any later token processing scripts must convert improperly
14518 // classified EmptyTags from StartTags.
14519 $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
14520 if ($is_self_closing) {
14522 $segment = substr($segment, 0, $strlen_segment);
14525 // Check if there are any attributes
14526 $position_first_space = strcspn($segment, $this->_whitespace);
14528 if ($position_first_space >= $strlen_segment) {
14529 if ($is_self_closing) {
14530 $token = new HTMLPurifier_Token_Empty($segment);
14532 $token = new HTMLPurifier_Token_Start($segment);
14534 if ($maintain_line_numbers) {
14535 $token->rawPosition($current_line, $current_col);
14536 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
14539 $inside_tag = false;
14540 $cursor = $position_next_gt + 1;
14544 // Grab out all the data
14545 $type = substr($segment, 0, $position_first_space);
14546 $attribute_string =
14549 $segment, $position_first_space
14552 if ($attribute_string) {
14553 $attr = $this->parseAttributeString(
14555 , $config, $context
14561 if ($is_self_closing) {
14562 $token = new HTMLPurifier_Token_Empty($type, $attr);
14564 $token = new HTMLPurifier_Token_Start($type, $attr);
14566 if ($maintain_line_numbers) {
14567 $token->rawPosition($current_line, $current_col);
14568 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
14571 $cursor = $position_next_gt + 1;
14572 $inside_tag = false;
14575 // inside tag, but there's no ending > sign
14576 if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
14578 HTMLPurifier_Token_Text(
14581 substr($html, $cursor)
14584 if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
14585 // no cursor scroll? Hmm...
14592 $context->destroy('CurrentLine');
14593 $context->destroy('CurrentCol');
/**
 * PHP 5.0.x compatible substr_count that implements offset and length.
 * On PHP older than 5.1 the haystack is sliced with substr() first;
 * otherwise offset and length are handed straight to substr_count().
 * @param $haystack String to search in.
 * @param $needle   Substring to count.
 * @param $offset   Position in $haystack at which counting starts.
 * @param $length   Number of bytes of $haystack to inspect.
 * @return Number of occurrences of $needle inside the selected window.
 */
protected function substrCount($haystack, $needle, $offset, $length) {
    // the version probe runs once; its result is cached across calls
    static $legacy = null;
    if (is_null($legacy)) {
        $legacy = version_compare(PHP_VERSION, '5.1', '<');
    }
    if (!$legacy) {
        return substr_count($haystack, $needle, $offset, $length);
    }
    $window = substr($haystack, $offset, $length);
    return substr_count($window, $needle);
}
14614 * Takes the inside of an HTML tag and makes an assoc array of attributes.
14616 * @param $string Inside of tag excluding name.
14617 * @return Assoc array of attributes.
14619 public function parseAttributeString($string, $config, $context) {
// Parses the text that follows a tag name into array(name => value).
// An attribute without a value maps to its own name (see the
// $array[$key] = $key assignments below); values go through parseData().
// In the lines visible here $config and $context are only consulted for
// Core.CollectErrors and the ErrorCollector.
// NOTE(review): the return of the general (loop) path is not visible in
// this listing; presumably $array is returned - confirm.
14620 $string = (string) $string; // quick typecast
14622 if ($string == '') return array(); // no attributes
14625 if ($config->get('Core.CollectErrors')) {
14626 $e =& $context->get('ErrorCollector');
14629 // let's see if we can abort as quickly as possible
14630 // one equal sign, no spaces => one attribute
14631 $num_equal = substr_count($string, '=');
// NOTE(review): strpos() returns 0 for a leading space, which the
// !$has_space tests below treat the same as "no space"; only the space
// character is probed, not the other characters of $this->_whitespace.
14632 $has_space = strpos($string, ' ');
14633 if ($num_equal === 0 && !$has_space) {
14635 return array($string => $string);
14636 } elseif ($num_equal === 1 && !$has_space) {
14637 // only one attribute
14638 list($key, $quoted_value) = explode('=', $string);
14639 $quoted_value = trim($quoted_value);
14641 if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
14644 if (!$quoted_value) return array($key => '');
14645 $first_char = @$quoted_value[0];
14646 $last_char = @$quoted_value[strlen($quoted_value)-1];
14648 $same_quote = ($first_char == $last_char);
14649 $open_quote = ($first_char == '"' || $first_char == "'");
14651 if ( $same_quote && $open_quote) {
// well behaved: strip the matching quote from both ends
14653 $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
14655 // not well behaved
14657 if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
14658 $value = substr($quoted_value, 1);
14660 $value = $quoted_value;
14663 if ($value === false) $value = '';
14664 return array($key => $this->parseData($value));
14667 // setup loop environment
14668 $array = array(); // return assoc array of attributes
14669 $cursor = 0; // current position in string (moves forward)
14670 $size = strlen($string); // size of the string (stays the same)
14672 // if we have unquoted attributes, the parser expects a terminating
14673 // space, so let's guarantee that there's always a terminating space.
14678 if ($cursor >= $size) {
14682 $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
14685 $key_begin = $cursor; //we're currently at the start of the key
14687 // scroll past all characters that are the key (not whitespace or =)
14688 $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
14690 $key_end = $cursor; // now at the end of the key
14692 $key = substr($string, $key_begin, $key_end - $key_begin);
14695 if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
14696 $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
14697 continue; // empty key
14700 // scroll past all whitespace
14701 $cursor += strspn($string, $this->_whitespace, $cursor);
// ran off the end right after a key: record it as a valueless attribute
14703 if ($cursor >= $size) {
14704 $array[$key] = $key;
14708 // if the next character is an equal sign, we've got a regular
14709 // pair, otherwise, it's a bool attribute
14710 $first_char = @$string[$cursor];
14712 if ($first_char == '=') {
14716 $cursor += strspn($string, $this->_whitespace, $cursor);
// NOTE(review): $cursor has just been advanced with += and is therefore
// an integer here, so this comparison looks unreachable - confirm.
14718 if ($cursor === false) {
14723 // we might be in front of a quote right now
14725 $char = @$string[$cursor];
14727 if ($char == '"' || $char == "'") {
14728 // it's quoted, end bound is $char
14730 $value_begin = $cursor;
// strpos() gives false when the closing quote is missing; that case is
// picked up by the "premature end" check further down
14731 $cursor = strpos($string, $char, $cursor);
14732 $value_end = $cursor;
14734 // it's not quoted, end bound is whitespace
14735 $value_begin = $cursor;
14736 $cursor += strcspn($string, $this->_whitespace, $cursor);
14737 $value_end = $cursor;
14740 // we reached a premature end
14741 if ($cursor === false) {
14743 $value_end = $cursor;
14746 $value = substr($string, $value_begin, $value_end - $value_begin);
14747 if ($value === false) $value = '';
14748 $array[$key] = $this->parseData($value);
14754 $array[$key] = $key;
14756 // purely theoretical
14757 if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
/**
 * Composite strategy that runs multiple strategies on tokens.
 */
abstract class HTMLPurifier_Strategy_Composite extends HTMLPurifier_Strategy
{

    /**
     * Strategies to apply, in order of execution.
     */
    protected $strategies = array();

    /**
     * Concrete composites must provide their own constructor.
     */
    abstract public function __construct();

    /**
     * Feeds the tokens through every registered strategy in turn, handing
     * the output of each strategy to the next one.
     * @param $tokens  Tokens to process.
     * @param $config  Config object passed on to each strategy.
     * @param $context Context object passed on to each strategy.
     * @return Tokens as produced by the last strategy.
     */
    public function execute($tokens, $config, $context) {
        $result = $tokens;
        foreach ($this->strategies as $step) {
            $result = $step->execute($result, $config, $context);
        }
        return $result;
    }

}
/**
 * Core strategy composed of the big four strategies.
 */
class HTMLPurifier_Strategy_Core extends HTMLPurifier_Strategy_Composite
{

    /**
     * Queues the four core strategies in the order they have to run:
     * RemoveForeignElements, MakeWellFormed, FixNesting, ValidateAttributes.
     */
    public function __construct() {
        array_push(
            $this->strategies,
            new HTMLPurifier_Strategy_RemoveForeignElements(),
            new HTMLPurifier_Strategy_MakeWellFormed(),
            new HTMLPurifier_Strategy_FixNesting(),
            new HTMLPurifier_Strategy_ValidateAttributes()
        );
    }

}
14817 * Takes a well formed list of tokens and fixes their nesting.
14819 * HTML elements dictate which elements are allowed to be their children,
14820 * for example, you can't have a p tag in a span tag. Other elements have
14821 * much more rigorous definitions: tables, for instance, require a specific
14822 * order for their elements. There are also constraints not expressible by
14823 * document type definitions, such as the chameleon nature of ins/del
14824 * tags and global child exclusions.
14826 * The first major objective of this strategy is to iterate through all the
14827 * nodes (not tokens) of the list of tokens and determine whether or not
14828 * their children conform to the element's definition. If they do not, the
14829 * child definition may optionally supply an amended list of elements that
14830 * is valid or require that the entire node be deleted (and the previous node rescanned).
14833 * The second objective is to ensure that explicitly excluded elements of
14834 * an element do not appear in its children. Code that accomplishes this
14835 * task is pervasive through the strategy, though the two are distinct tasks
14836 * and could, theoretically, be separated (although it's not recommended).
14838 * @note Whether or not unrecognized children are silently dropped or
14839 * translated into text depends on the child definitions.
14841 * @todo Enable nodes to be bubbled out of the structure.
14844 class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
14847 public function execute($tokens, $config, $context) {
// Validates the children of every node in the token list against the
// child definitions of the HTML definition and repairs nodes that fail:
// a node is left alone, removed entirely, or has its children replaced.
// @param $tokens  List of tokens to process.
// @param $config  Config object; supplies the HTML definition.
// @param $context Context object; IsInline and CurrentToken are registered
//                 on it for the duration of the call and destroyed at the end.
// NOTE(review): the return statement is not visible in this listing;
// presumably the repaired $tokens are returned - confirm.
14848 //####################################################################//
14851 // get a copy of the HTML definition
14852 $definition = $config->getHTMLDefinition();
14854 // insert implicit "parent" node, will be removed at end.
14856 $parent_name = $definition->info_parent;
14857 array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name));
14858 $tokens[] = new HTMLPurifier_Token_End($parent_name);
14860 // setup the context variable 'IsInline', for chameleon processing
14861 // is 'false' when we are not inline, 'true' when it must always
14862 // be inline, and an integer when it is inline for a certain
14863 // branch of the document tree
14864 $is_inline = $definition->info_parent_def->descendants_are_inline;
14865 $context->register('IsInline', $is_inline);
14867 // setup error collector
14868 $e =& $context->get('ErrorCollector', true);
14870 //####################################################################//
14871 // Loop initialization
14873 // stack that contains the indexes of all parents,
14874 // $stack[count($stack)-1] being the current parent
14877 // stack that contains all elements that are excluded
14878 // it is organized by parent elements, similar to $stack,
14879 // but it is only populated when an element with exclusions is
14880 // processed, i.e. there won't be empty exclusions.
14881 $exclude_stack = array();
14883 // variable that contains the start token while we are processing
14884 // nodes. This enables error reporting to do its job
14885 $start_token = false;
14886 $context->register('CurrentToken', $start_token);
14888 //####################################################################//
14891 // iterate through all start nodes. Determining the start node
14892 // is complicated so it has been omitted from the loop construct
14893 for ($i = 0, $size = count($tokens) ; $i < $size; ) {
14895 //################################################################//
14896 // Gather information on children
14898 // child token accumulator
14899 $child_tokens = array();
14901 // scroll to the end of this node, report number, and collect
14903 for ($j = $i, $depth = 0; ; $j++) {
14904 if ($tokens[$j] instanceof HTMLPurifier_Token_Start) {
14906 // skip token assignment on first iteration, this is the
14907 // token we currently are on
14908 if ($depth == 1) continue;
14909 } elseif ($tokens[$j] instanceof HTMLPurifier_Token_End) {
14911 // skip token assignment on last iteration, this is the
14912 // end token of the token we're currently on
14913 if ($depth == 0) break;
14915 $child_tokens[] = $tokens[$j];
14918 // $i is index of start token
14919 // $j is index of end token
14921 $start_token = $tokens[$i]; // to make token available via CurrentToken
14923 //################################################################//
14924 // Gather information on parent
14926 // calculate parent information
14927 if ($count = count($stack)) {
14928 $parent_index = $stack[$count-1];
14929 $parent_name = $tokens[$parent_index]->name;
// index 0 is the implicit parent inserted at the top of this method
14930 if ($parent_index == 0) {
14931 $parent_def = $definition->info_parent_def;
14933 $parent_def = $definition->info[$parent_name];
14936 // processing as if the parent were the "root" node
14937 // unknown info, it won't be used anyway, in the future,
14938 // we may want to enforce one element only (this is
14939 // necessary for HTML Purifier to clean entire documents
14940 $parent_index = $parent_name = $parent_def = null;
14943 // calculate context
14944 if ($is_inline === false) {
14945 // check if conditions make it inline
14946 if (!empty($parent_def) && $parent_def->descendants_are_inline) {
14947 $is_inline = $count - 1;
14950 // check if we're out of inline
14951 if ($count === $is_inline) {
14952 $is_inline = false;
14956 //################################################################//
14957 // Determine whether element is explicitly excluded SGML-style
14959 // determine whether or not element is excluded by checking all
14960 // parent exclusions. The array should not be very large, two
14961 // elements at most.
14963 if (!empty($exclude_stack)) {
14964 foreach ($exclude_stack as $lookup) {
14965 if (isset($lookup[$tokens[$i]->name])) {
14967 // no need to continue processing
14973 //################################################################//
14974 // Perform child validation
14977 // there is an exclusion, remove the entire node
14979 $excludes = array(); // not used, but good to initialize anyway
14983 // special processing for the first node
14984 $def = $definition->info_parent_def;
14986 $def = $definition->info[$tokens[$i]->name];
14990 if (!empty($def->child)) {
14991 // have DTD child def validate children
14992 $result = $def->child->validateChildren(
14993 $child_tokens, $config, $context);
14995 // weird, no child definition, get rid of everything
14999 // determine whether or not this element has any exclusions
15000 $excludes = $def->excludes;
15003 // $result is now a bool or array
15005 //################################################################//
15006 // Process result by interpreting $result
// true, or a list identical to the children, both mean nothing changes
15008 if ($result === true || $child_tokens === $result) {
15009 // leave the node as is
15011 // register start token as a parental node start
15014 // register exclusions if there are any
15015 if (!empty($excludes)) $exclude_stack[] = $excludes;
15017 // move cursor to next possible start node
15020 } elseif($result === false) {
15021 // remove entire node
15025 $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
15027 $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
15031 // calculate length of inner tokens and current tokens
// (start token at $i through end token at $j, both inclusive)
15032 $length = $j - $i + 1;
15035 // CODE HAS TO BE MOVED WITH UPGRADE START
// $tokens is either an HTMLPurifier_Array or a native array; each needs its own splice
15036 if ($tokens instanceof HTMLPurifier_Array) {
15037 $tokens->splice($i, $length);
15039 array_splice($tokens, $i, $length);
15041 // CODE HAS TO BE MOVED WITH UPGRADE END
15046 // there is no start token to register,
15047 // current node is now the next possible start node
15048 // unless it turns out that we need to do a double-check
15050 // this is a rough heuristic that covers 100% of HTML's
15051 // cases and 99% of all other cases. A child definition
15052 // that would be tricked by this would be something like:
15053 // ( | a b c) where it's all or nothing. Fortunately,
15054 // our current implementation claims that that case would
15055 // not allow empty, even if it did
15056 if (!$parent_def->child->allow_empty) {
15057 // we need to do a double-check
15058 $i = $parent_index;
15062 // PROJECTED OPTIMIZATION: Process all children elements before
15063 // reprocessing parent node.
15066 // replace node with $result
15068 // calculate length of inner tokens
// (only the tokens strictly between the start at $i and the end at $j)
15069 $length = $j - $i - 1;
15072 if (empty($result) && $length) {
15073 $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
15075 $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
15079 // perform replacement
15080 // CODE HAS TO BE MOVED WITH UPGRADE START
15081 if ($tokens instanceof HTMLPurifier_Array) {
15082 $tokens->splice($i + 1, $length, $result);
15084 array_splice($tokens, $i + 1, $length, $result);
15086 // CODE HAS TO BE MOVED WITH UPGRADE END
15090 $size += count($result);
15092 // register start token as a parental node start
15095 // register exclusions if there are any
15096 if (!empty($excludes)) $exclude_stack[] = $excludes;
15098 // move cursor to next possible start node
15103 //################################################################//
15104 // Scroll to next start node
15106 // We assume, at this point, that $i is the index of the token
15107 // that is the first possible new start point for a node.
15109 // Test if the token indeed is a start tag, if not, move forward
15111 $size = count($tokens);
15112 while ($i < $size and !$tokens[$i] instanceof HTMLPurifier_Token_Start) {
15113 if ($tokens[$i] instanceof HTMLPurifier_Token_End) {
15114 // pop a token index off the stack if we ended a node
15116 // pop an exclusion lookup off exclusion stack if
15117 // we ended node and that node had exclusions
15118 if ($i == 0 || $i == $size - 1) {
15119 // use specialized var if it's the super-parent
15120 $s_excludes = $definition->info_parent_def->excludes;
15122 $s_excludes = $definition->info[$tokens[$i]->name]->excludes;
15125 array_pop($exclude_stack);
15133 //####################################################################//
15136 // remove implicit parent tokens at the beginning and end
15137 array_shift($tokens);
15138 array_pop($tokens);
15140 // remove context variables
15141 $context->destroy('IsInline');
15142 $context->destroy('CurrentToken');
15144 //####################################################################//
15158 * Takes tokens and makes them well-formed (balance end tags, etc.)
15160 * Specification of the armor attributes this strategy uses:
15162 * - MakeWellFormed_TagClosedError: This armor field is used to
15163 * suppress tag closed errors for certain tokens [TagClosedSuppress],
15164 * in particular, if a tag was generated automatically by HTML
15165 * Purifier, we may rely on our infrastructure to close it for us
15166 * and shouldn't report an error to the user [TagClosedAuto].
15168 class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
15172 * Array stream of tokens being processed.
15177 * Current index in $tokens.
15182 * Current nesting of elements.
15187 * Injectors active in this stream processing.
15189 protected $injectors;
15192 * Current instance of HTMLPurifier_Config.
15197 * Current instance of HTMLPurifier_Context.
15199 protected $context;
15201 public function execute($tokens, $config, $context) {
15202 // CODE HAS TO BE MOVED WITH UPGRADE START
15203 $tokens = new HTMLPurifier_Array($tokens);
15204 // CODE HAS TO BE MOVED WITH UPGRADE END
15205 $definition = $config->getHTMLDefinition();
15208 $generator = new HTMLPurifier_Generator($config, $context);
15209 $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
15210 // used for autoclose early abortion
15211 $global_parent_allowed_elements = array();
15212 if (isset($definition->info[$definition->info_parent])) {
15213 // may be unset under testing circumstances
15214 $global_parent_allowed_elements = $definition->info[$definition->info_parent]->child->getAllowedElements($config);
15216 $e = $context->get('ErrorCollector', true);
15217 $t = false; // token index
15218 $i = false; // injector index
15219 $token = false; // the current token
15220 $reprocess = false; // whether or not to reprocess the same token
15223 // member variables
15224 $this->stack =& $stack;
15226 $this->tokens =& $tokens;
15227 $this->config = $config;
15228 $this->context = $context;
15230 // context variables
15231 $context->register('CurrentNesting', $stack);
15232 $context->register('InputIndex', $t);
15233 $context->register('InputTokens', $tokens);
15234 $context->register('CurrentToken', $token);
15236 // -- begin INJECTOR --
15238 $this->injectors = array();
15240 $injectors = $config->getBatch('AutoFormat');
15241 $def_injectors = $definition->info_injector;
15242 $custom_injectors = $injectors['Custom'];
15243 unset($injectors['Custom']); // special case
15244 foreach ($injectors as $injector => $b) {
15245 // XXX: Fix with a legitimate lookup table of enabled filters
15246 if (strpos($injector, '.') !== false) continue;
15247 $injector = "HTMLPurifier_Injector_$injector";
15249 $this->injectors[] = new $injector;
15251 foreach ($def_injectors as $injector) {
15252 // assumed to be objects
15253 $this->injectors[] = $injector;
15255 foreach ($custom_injectors as $injector) {
15256 if (!$injector) continue;
15257 if (is_string($injector)) {
15258 $injector = "HTMLPurifier_Injector_$injector";
15259 $injector = new $injector;
15261 $this->injectors[] = $injector;
15264 // give the injectors references to the definition and context
15265 // variables for performance reasons
15266 foreach ($this->injectors as $ix => $injector) {
15267 $error = $injector->prepare($config, $context);
15268 if (!$error) continue;
15269 array_splice($this->injectors, $ix, 1); // rm the injector
15270 trigger_error("Cannot enable {$injector->name} injector because $error is not allowed", E_USER_WARNING);
15273 // -- end INJECTOR --
15275 // a note on reprocessing:
15276 // In order to reduce code duplication, whenever some code needs
15277 // to make HTML changes in order to make things "correct", the
15278 // new HTML gets sent through the purifier, regardless of its
15279 // status. This means that if we add a start token, because it
15280 // was totally necessary, we don't have to update nesting; we just
15281 // punt ($reprocess = true; continue;) and it does that for us.
15283 // isset is in loop because $tokens size changes during loop exec
15286 $t == 0 || isset($tokens[$t - 1]);
15287 // only increment if we don't need to reprocess
15288 $reprocess ? $reprocess = false : $t++
15291 // check for a rewind
15292 if (is_int($i) && $i >= 0) {
15293 // possibility: disable rewinding if the current token has a
15294 // rewind set on it already. This would offer protection from
15295 // infinite loop, but might hinder some advanced rewinding.
15296 $rewind_to = $this->injectors[$i]->getRewind();
15297 if (is_int($rewind_to) && $rewind_to < $t) {
15298 if ($rewind_to < 0) $rewind_to = 0;
15299 while ($t > $rewind_to) {
15301 $prev = $tokens[$t];
15302 // indicate that other injectors should not process this token,
15303 // but we need to reprocess it
15304 unset($prev->skip[$i]);
15305 $prev->rewind = $i;
15306 if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->stack);
15307 elseif ($prev instanceof HTMLPurifier_Token_End) $this->stack[] = $prev->start;
15313 // handle case of document end
15314 if (!isset($tokens[$t])) {
15315 // kill processing if stack is empty
15316 if (empty($this->stack)) break;
15319 $top_nesting = array_pop($this->stack);
15320 $this->stack[] = $top_nesting;
15322 // send error [TagClosedSuppress]
15323 if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
15324 $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
15327 // append, don't splice, since this is the end
15328 $tokens[] = new HTMLPurifier_Token_End($top_nesting->name);
15335 $token = $tokens[$t];
15337 //echo '<br>'; printTokens($tokens, $t); printTokens($this->stack);
15340 // quick-check: if it's not a tag, no need to process
15341 if (empty($token->is_tag)) {
15342 if ($token instanceof HTMLPurifier_Token_Text) {
15343 foreach ($this->injectors as $i => $injector) {
15344 if (isset($token->skip[$i])) continue;
15345 if ($token->rewind !== null && $token->rewind !== $i) continue;
15346 $injector->handleText($token);
15347 $this->processToken($token, $i);
15352 // another possibility is a comment
15356 if (isset($definition->info[$token->name])) {
15357 $type = $definition->info[$token->name]->child->type;
15359 $type = false; // Type is unknown, treat accordingly
15362 // quick tag checks: anything that's *not* an end tag
15364 if ($type === 'empty' && $token instanceof HTMLPurifier_Token_Start) {
15365 // claims to be a start tag but is empty
15366 $token = new HTMLPurifier_Token_Empty($token->name, $token->attr, $token->line, $token->col, $token->armor);
15368 } elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
15369 // claims to be empty but really is a start tag
15370 $this->swap(new HTMLPurifier_Token_End($token->name));
15371 $this->insertBefore(new HTMLPurifier_Token_Start($token->name, $token->attr, $token->line, $token->col, $token->armor));
15372 // punt (since we had to modify the input stream in a non-trivial way)
15375 } elseif ($token instanceof HTMLPurifier_Token_Empty) {
15376 // real empty token
15378 } elseif ($token instanceof HTMLPurifier_Token_Start) {
15381 // ...unless they also have to close their parent
15382 if (!empty($this->stack)) {
15384 // Performance note: you might think that it's rather
15385 // inefficient, recalculating the autoclose information
15386 // for every tag that a token closes (since when we
15387 // do an autoclose, we push a new token into the
15388 // stream and then /process/ that, before
15389 // re-processing this token.) But this is
15390 // necessary, because an injector can make an
15391 // arbitrary transformations to the autoclosing
15392 // tokens we introduce, so things may have changed
15393 // in the meantime. Also, doing the inefficient thing is
15394 // "easy" to reason about (for certain perverse definitions
15397 $parent = array_pop($this->stack);
15398 $this->stack[] = $parent;
15400 if (isset($definition->info[$parent->name])) {
15401 $elements = $definition->info[$parent->name]->child->getAllowedElements($config);
15402 $autoclose = !isset($elements[$token->name]);
15404 $autoclose = false;
15407 if ($autoclose && $definition->info[$token->name]->wrap) {
15408 // Check if an element can be wrapped by another
15409 // element to make it valid in a context (for
15410 // example, <ul><ul> needs a <li> in between)
15411 $wrapname = $definition->info[$token->name]->wrap;
15412 $wrapdef = $definition->info[$wrapname];
15413 $elements = $wrapdef->child->getAllowedElements($config);
15414 $parent_elements = $definition->info[$parent->name]->child->getAllowedElements($config);
15415 if (isset($elements[$token->name]) && isset($parent_elements[$wrapname])) {
15416 $newtoken = new HTMLPurifier_Token_Start($wrapname);
15417 $this->insertBefore($newtoken);
15423 $carryover = false;
15424 if ($autoclose && $definition->info[$parent->name]->formatting) {
15429 // check if this autoclose is doomed to fail
15430 // (this rechecks $parent, which is harmless)
15431 $autoclose_ok = isset($global_parent_allowed_elements[$token->name]);
15432 if (!$autoclose_ok) {
15433 foreach ($this->stack as $ancestor) {
15434 $elements = $definition->info[$ancestor->name]->child->getAllowedElements($config);
15435 if (isset($elements[$token->name])) {
15436 $autoclose_ok = true;
15439 if ($definition->info[$token->name]->wrap) {
15440 $wrapname = $definition->info[$token->name]->wrap;
15441 $wrapdef = $definition->info[$wrapname];
15442 $wrap_elements = $wrapdef->child->getAllowedElements($config);
15443 if (isset($wrap_elements[$token->name]) && isset($elements[$wrapname])) {
15444 $autoclose_ok = true;
15450 if ($autoclose_ok) {
15451 // errors need to be updated
15452 $new_token = new HTMLPurifier_Token_End($parent->name);
15453 $new_token->start = $parent;
15455 $element = clone $parent;
15457 $element->armor['MakeWellFormed_TagClosedError'] = true;
15458 $element->carryover = true;
15459 $this->processToken(array($new_token, $token, $element));
15461 $this->insertBefore($new_token);
15463 // [TagClosedSuppress]
15464 if ($e && !isset($parent->armor['MakeWellFormed_TagClosedError'])) {
15466 $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
15468 $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag carryover', $parent);
15483 foreach ($this->injectors as $i => $injector) {
15484 if (isset($token->skip[$i])) continue;
15485 if ($token->rewind !== null && $token->rewind !== $i) continue;
15486 $injector->handleElement($token);
15487 $this->processToken($token, $i);
15492 // ah, nothing interesting happened; do normal processing
15493 $this->swap($token);
15494 if ($token instanceof HTMLPurifier_Token_Start) {
15495 $this->stack[] = $token;
15496 } elseif ($token instanceof HTMLPurifier_Token_End) {
15497 throw new HTMLPurifier_Exception('Improper handling of end tag in start code; possible error in MakeWellFormed');
15503 // sanity check: we should be dealing with a closing tag
15504 if (!$token instanceof HTMLPurifier_Token_End) {
15505 throw new HTMLPurifier_Exception('Unaccounted for tag token in input stream, bug in HTML Purifier');
15508 // make sure that we have something open
15509 if (empty($this->stack)) {
15510 if ($escape_invalid_tags) {
15511 if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
15512 $this->swap(new HTMLPurifier_Token_Text(
15513 $generator->generateFromToken($token)
15517 if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
15523 // first, check for the simplest case: everything closes neatly.
15524 // Eventually, everything passes through here; if there are problems
15525 // we modify the input stream accordingly and then punt, so that
15526 // the tokens get processed again.
15527 $current_parent = array_pop($this->stack);
15528 if ($current_parent->name == $token->name) {
15529 $token->start = $current_parent;
15530 foreach ($this->injectors as $i => $injector) {
15531 if (isset($token->skip[$i])) continue;
15532 if ($token->rewind !== null && $token->rewind !== $i) continue;
15533 $injector->handleEnd($token);
15534 $this->processToken($token, $i);
15535 $this->stack[] = $current_parent;
15542 // okay, so we're trying to close the wrong tag
15544 // undo the previous pop
15545 $this->stack[] = $current_parent;
15547 // scroll back the entire nest, trying to find our tag.
15548 // (feature could be to specify how far you'd like to go)
15549 $size = count($this->stack);
15550 // -2 because -1 is the last element, but we already checked that
15551 $skipped_tags = false;
15552 for ($j = $size - 2; $j >= 0; $j--) {
15553 if ($this->stack[$j]->name == $token->name) {
15554 $skipped_tags = array_slice($this->stack, $j);
15559 // we didn't find the tag, so remove
15560 if ($skipped_tags === false) {
15561 if ($escape_invalid_tags) {
15562 $this->swap(new HTMLPurifier_Token_Text(
15563 $generator->generateFromToken($token)
15565 if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
15568 if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
15574 // do errors, in REVERSE $j order: a,b,c with </a></b></c>
15575 $c = count($skipped_tags);
15577 for ($j = $c - 1; $j > 0; $j--) {
15578 // notice we exclude $j == 0, i.e. the current ending tag, from
15579 // the errors... [TagClosedSuppress]
15580 if (!isset($skipped_tags[$j]->armor['MakeWellFormed_TagClosedError'])) {
15581 $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$j]);
15586 // insert tags, in FORWARD $j order: c,b,a with </a></b></c>
15587 $replace = array($token);
15588 for ($j = 1; $j < $c; $j++) {
15589 // ...as well as from the insertions
15590 $new_token = new HTMLPurifier_Token_End($skipped_tags[$j]->name);
15591 $new_token->start = $skipped_tags[$j];
15592 array_unshift($replace, $new_token);
15593 if (isset($definition->info[$new_token->name]) && $definition->info[$new_token->name]->formatting) {
15595 $element = clone $skipped_tags[$j];
15596 $element->carryover = true;
15597 $element->armor['MakeWellFormed_TagClosedError'] = true;
15598 $replace[] = $element;
15601 $this->processToken($replace);
15606 $context->destroy('CurrentNesting');
15607 $context->destroy('InputTokens');
15608 $context->destroy('InputIndex');
15609 $context->destroy('CurrentToken');
15611 unset($this->injectors, $this->stack, $this->tokens, $this->t);
15612 // CODE HAS TO BE MOVED WITH UPGRADE START
15613 return $tokens->getArray();
15614 // CODE HAS TO BE MOVED WITH UPGRADE END
15618 * Processes arbitrary token values for complicated substitution patterns.
15621 * If $token is an array, it is a list of tokens to substitute for the
15622 * current token. These tokens then get individually processed. If there
15623 * is a leading integer in the list, that integer determines how many
15624 * tokens from the stream should be removed.
15626 * If $token is a regular token, it is swapped with the current token.
15628 * If $token is false, the current token is deleted.
15630 * If $token is an integer, that number of tokens (with the first token
15631 * being the current one) will be deleted.
15633 * @param $token Token substitution value
15634 * @param $injector Injector that performed the substitution; default is if
15635 * this is not an injector related operation.
15637 protected function processToken($token, $injector = -1) {
15639 // normalize forms of token
15640 if (is_object($token)) $token = array(1, $token);
15641 if (is_int($token)) $token = array($token);
15642 if ($token === false) $token = array(1);
15643 if (!is_array($token)) throw new HTMLPurifier_Exception('Invalid token type from injector');
15644 if (!is_int($token[0])) array_unshift($token, 1);
15645 if ($token[0] === 0) throw new HTMLPurifier_Exception('Deleting zero tokens is not valid');
15647 // $token is now an array with the following form:
15648 // array(number nodes to delete, new node 1, new node 2, ...)
15650 $delete = array_shift($token);
15651 // CODE HAS TO BE MOVED WITH UPGRADE START
15653 if ($this->tokens instanceof HTMLPurifier_Array) {
15654 $old = $this->tokens->splice($this->t, $delete, $token);
15656 $old = array_splice($this->tokens, $this->t, $delete, $token);
15658 // CODE HAS TO BE MOVED WITH UPGRADE END
15661 if ($injector > -1) {
15662 // determine appropriate skips
15663 $oldskip = isset($old[0]) ? $old[0]->skip : array();
15664 foreach ($token as $object) {
15665 $object->skip = $oldskip;
15666 $object->skip[$injector] = true;
15673 * Inserts a token before the current token. Cursor now points to
15674 * this token. You must reprocess after this.
15676 private function insertBefore($token) {
15677 // CODE HAS TO BE MOVED WITH UPGRADE START
15678 $this->tokens->insertBefore($this->t, $token);
15679 // CODE HAS TO BE MOVED WITH UPGRADE END
15683 * Removes current token. Cursor now points to new token occupying previously
15684 * occupied space. You must reprocess after this.
15686 private function remove() {
15687 // CODE HAS TO BE MOVED WITH UPGRADE START
15688 $this->tokens->remove($this->t);
15689 // CODE HAS TO BE MOVED WITH UPGRADE END
15693 * Swap current token with new token. Cursor points to new token (no
15694 * change). You must reprocess after this.
15696 private function swap($token) {
15697 $this->tokens[$this->t] = $token;
15707 * Removes all unrecognized tags from the list of tokens.
15709 * This strategy iterates through all the tokens and removes unrecognized
15710 * tokens. If a token is not recognized but a TagTransform is defined for
15711 * that element, the element will be transformed accordingly.
15714 class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
15717 public function execute($tokens, $config, $context) {
15718 $definition = $config->getHTMLDefinition();
15719 $generator = new HTMLPurifier_Generator($config, $context);
15722 $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
15723 $remove_invalid_img = $config->get('Core.RemoveInvalidImg');
15725 // currently only used to determine if comments should be kept
15726 $trusted = $config->get('HTML.Trusted');
15728 $remove_script_contents = $config->get('Core.RemoveScriptContents');
15729 $hidden_elements = $config->get('Core.HiddenElements');
15731 // remove script contents compatibility
15732 if ($remove_script_contents === true) {
15733 $hidden_elements['script'] = true;
15734 } elseif ($remove_script_contents === false && isset($hidden_elements['script'])) {
15735 unset($hidden_elements['script']);
15738 $attr_validator = new HTMLPurifier_AttrValidator();
15740 // removes tokens until it reaches a closing tag with its value
15741 $remove_until = false;
15743 // converts comments into text tokens when this is equal to a tag name
15744 $textify_comments = false;
15747 $context->register('CurrentToken', $token);
15750 if ($config->get('Core.CollectErrors')) {
15751 $e =& $context->get('ErrorCollector');
15754 foreach($tokens as $token) {
15755 if ($remove_until) {
15756 if (empty($token->is_tag) || $token->name !== $remove_until) {
15760 if (!empty( $token->is_tag )) {
15763 // before any processing, try to transform the element
15765 isset($definition->info_tag_transform[$token->name])
15767 $original_name = $token->name;
15768 // there is a transformation for this tag
15770 $token = $definition->
15771 info_tag_transform[$token->name]->
15772 transform($token, $config, $context);
15773 if ($e) $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Tag transform', $original_name);
15776 if (isset($definition->info[$token->name])) {
15778 // mostly everything's good, but
15779 // we need to make sure required attributes are in order
15781 ($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) &&
15782 $definition->info[$token->name]->required_attr &&
15783 ($token->name != 'img' || $remove_invalid_img) // ensure config option still works
15785 $attr_validator->validateToken($token, $config, $context);
15787 foreach ($definition->info[$token->name]->required_attr as $name) {
15788 if (!isset($token->attr[$name])) {
15794 if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Missing required attribute', $name);
15797 $token->armor['ValidateAttributes'] = true;
15800 if (isset($hidden_elements[$token->name]) && $token instanceof HTMLPurifier_Token_Start) {
15801 $textify_comments = $token->name;
15802 } elseif ($token->name === $textify_comments && $token instanceof HTMLPurifier_Token_End) {
15803 $textify_comments = false;
15806 } elseif ($escape_invalid_tags) {
15807 // invalid tag, generate HTML representation and insert in
15808 if ($e) $e->send(E_WARNING, 'Strategy_RemoveForeignElements: Foreign element to text');
15809 $token = new HTMLPurifier_Token_Text(
15810 $generator->generateFromToken($token)
15813 // check if we need to destroy all of the tag's children
15814 // CAN BE GENERICIZED
15815 if (isset($hidden_elements[$token->name])) {
15816 if ($token instanceof HTMLPurifier_Token_Start) {
15817 $remove_until = $token->name;
15818 } elseif ($token instanceof HTMLPurifier_Token_Empty) {
15819 // do nothing: we're still looking
15821 $remove_until = false;
15823 if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign meta element removed');
15825 if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed');
15829 } elseif ($token instanceof HTMLPurifier_Token_Comment) {
15830 // textify comments in script tags when they are allowed
15831 if ($textify_comments !== false) {
15832 $data = $token->data;
15833 $token = new HTMLPurifier_Token_Text($data);
15834 } elseif ($trusted) {
15835 // keep, but perform comment cleaning
15837 // perform check whether or not there's a trailing hyphen
15838 if (substr($token->data, -1) == '-') {
15839 $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Trailing hyphen in comment removed');
15842 $token->data = rtrim($token->data, '-');
15843 $found_double_hyphen = false;
15844 while (strpos($token->data, '--') !== false) {
15845 if ($e && !$found_double_hyphen) {
15846 $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed');
15848 $found_double_hyphen = true; // prevent double-erroring
15849 $token->data = str_replace('--', '-', $token->data);
15853 if ($e) $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed');
15856 } elseif ($token instanceof HTMLPurifier_Token_Text) {
15860 $result[] = $token;
15862 if ($remove_until && $e) {
15863 // we removed tokens until the end, throw error
15864 $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Token removed to end', $remove_until);
15867 $context->destroy('CurrentToken');
15879 * Validate all attributes in the tokens.
class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
{

    /**
     * Runs the attribute validator over every start and empty tag in the
     * token list and hands the list back.
     *
     * @param $tokens List of tokens to walk.
     * @param $config Configuration object, forwarded to the validator.
     * @param $context Context object; 'CurrentToken' is registered for the
     *        duration of the walk and destroyed afterwards.
     * @return The token list, with validated tokens written back in place.
     */
    public function execute($tokens, $config, $context) {

        $validator = new HTMLPurifier_AttrValidator();

        // $token is registered by reference, so the loop variable below must
        // keep this exact name for the context to see the token in progress
        $token = false;
        $context->register('CurrentToken', $token);

        foreach ($tokens as $key => $token) {

            $carries_attributes =
                $token instanceof HTMLPurifier_Token_Start ||
                $token instanceof HTMLPurifier_Token_Empty;

            // end tags, text and comments have nothing to validate; armored
            // tokens have opted out of this strategy
            if (!$carries_attributes || !empty($token->armor['ValidateAttributes'])) {
                continue;
            }

            // note that we have no facilities here for removing tokens
            $validator->validateToken($token, $config, $context);

            $tokens[$key] = $token; // for PHP 4
        }

        $context->destroy('CurrentToken');

        return $tokens;
    }

}
15919 * Transforms FONT tags to the proper form (SPAN with CSS styling)
15921 * This transformation takes the three proprietary attributes of FONT and
15922 * transforms them into their corresponding CSS attributes. These are color,
15925 * @note Size is an interesting case because it doesn't map cleanly to CSS.
15927 * http://style.cleverchimp.com/font_size_intervals/altintervals.html
15928 * for reasonable mappings.
15929 * @warning This doesn't work completely correctly; specifically, this
15930 * TagTransform operates before well-formedness is enforced, so
15931 * the "active formatting elements" algorithm doesn't get applied.
15933 class HTMLPurifier_TagTransform_Font extends HTMLPurifier_TagTransform
15936 public $transform_to = 'span';
15938 protected $_size_lookup = array(
/**
 * Converts a FONT tag into the element named by $transform_to, moving its
 * color, face and size attributes into inline CSS that is placed in front
 * of any style attribute the tag already had.
 *
 * @param $tag Tag token to transform (start, empty or end).
 * @param $config Configuration object (not read here).
 * @param $context Context object (not read here).
 * @return Renamed clone of $tag; the token passed in is not modified.
 */
public function transform($tag, $config, $context) {

    // end tags carry no attributes: a rename is all that is needed
    if ($tag instanceof HTMLPurifier_Token_End) {
        $new_tag = clone $tag;
        $new_tag->name = $this->transform_to;
        return $new_tag;
    }

    $attr = $tag->attr;
    $prepend_style = '';

    // handle color transform
    if (isset($attr['color'])) {
        $prepend_style .= 'color:' . $attr['color'] . ';';
        unset($attr['color']);
    }

    // handle face transform
    if (isset($attr['face'])) {
        $prepend_style .= 'font-family:' . $attr['face'] . ';';
        unset($attr['face']);
    }

    // handle size transform
    if (isset($attr['size'])) {
        // normalize large numbers
        if ($attr['size'] !== '') {
            // Square brackets, not the curly-brace string offset syntax:
            // the latter is a parse error as of PHP 8.0.
            $first_char = $attr['size'][0];
            if ($first_char == '+' || $first_char == '-') {
                // relative size: clamp to the supported -2 .. +4 range
                $size = (int) $attr['size'];
                if ($size < -2) $attr['size'] = '-2';
                if ($size > 4) $attr['size'] = '+4';
            } else {
                // absolute size: anything above 7 is treated as 7
                $size = (int) $attr['size'];
                if ($size > 7) $attr['size'] = '7';
            }
        }
        // sizes missing from the lookup table are dropped without a style
        if (isset($this->_size_lookup[$attr['size']])) {
            $prepend_style .= 'font-size:' .
                $this->_size_lookup[$attr['size']] . ';';
        }
        unset($attr['size']);
    }

    if ($prepend_style) {
        $attr['style'] = isset($attr['style']) ?
            $prepend_style . $attr['style'] :
            $prepend_style;
    }

    $new_tag = clone $tag;
    $new_tag->name = $this->transform_to;
    $new_tag->attr = $attr;

    return $new_tag;

}
16018 * Simple transformation, just change tag name to something else,
16019 * and possibly add some styling. This will cover most of the deprecated
class HTMLPurifier_TagTransform_Simple extends HTMLPurifier_TagTransform
{

    protected $style;

    /**
     * @param $transform_to Tag name to transform to.
     * @param $style CSS style to add to the tag
     */
    public function __construct($transform_to, $style = null) {
        $this->style = $style;
        $this->transform_to = $transform_to;
    }

    /**
     * Renames a clone of the tag and, when a style was configured, prepends
     * that style to the attributes of start and empty tags.
     *
     * @param $tag Tag token to transform.
     * @param $config Configuration object (not read here).
     * @param $context Context object (not read here).
     * @return Transformed clone of $tag.
     */
    public function transform($tag, $config, $context) {
        $new_tag = clone $tag;
        $new_tag->name = $this->transform_to;

        // no style configured: the rename is the whole transformation
        if (is_null($this->style)) {
            return $new_tag;
        }

        // only start and empty tags receive the extra CSS
        $takes_style =
            $new_tag instanceof HTMLPurifier_Token_Start ||
            $new_tag instanceof HTMLPurifier_Token_Empty;
        if ($takes_style) {
            $this->prependCSS($new_tag->attr, $this->style);
        }

        return $new_tag;
    }

}
16054 * Concrete comment token class. Generally will be ignored.
16056 class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
16058 public $data; /**< Character data within comment. */
16059 public $is_whitespace = true;
16061 * Transparent constructor.
16063 * @param $data String comment data.
16065 public function __construct($data, $line = null, $col = null) {
16066 $this->data = $data;
16067 $this->line = $line;
16077 * Abstract class of a tag token (start, end or empty), and its behavior.
16079 class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
16082 * Static bool marker that indicates the class is a tag.
16084 * This allows us to check objects with <tt>!empty($obj->is_tag)</tt>
16085 * without having to use a function call <tt>is_a()</tt>.
16087 public $is_tag = true;
16090 * The lower-case name of the tag, like 'a', 'b' or 'blockquote'.
16092 * @note Strictly speaking, XML tags are case sensitive, so we shouldn't
16093 * be lower-casing them, but these tokens cater to HTML tags, which are
16099 * Associative array of the tag's attributes.
16101 public $attr = array();
16104 * Non-overloaded constructor, which lower-cases passed tag name.
16106 * @param $name String name.
16107 * @param $attr Associative array of attributes.
public function __construct($name, $attr = array(), $line = null, $col = null, $armor = array()) {
    // ctype_lower() is only a fast path: names that are already lower-case
    // skip the strtolower() call
    $this->name = ctype_lower((string) $name) ? $name : strtolower($name);
    foreach ($attr as $key => $value) {
        // PHP turns numeric attribute names into integer array keys.
        // ctype_lower() reads an integer as an ASCII code point (and
        // deprecates non-string arguments as of PHP 8.1), so work on the
        // string form of the key throughout.
        $key_string = (string) $key;
        // normalization only necessary when key is not lowercase
        if (!ctype_lower($key_string)) {
            $new_key = strtolower($key_string);
            // an attribute already present in lower-case wins over the
            // mixed-case duplicate
            if (!isset($attr[$new_key])) {
                $attr[$new_key] = $attr[$key];
            }
            // keys without letters (digits, punctuation) lower-case to
            // themselves and must not be removed
            if ($new_key !== $key_string) {
                unset($attr[$key]);
            }
        }
    }
    $this->attr = $attr;
    $this->line = $line;
    $this->col = $col;
    $this->armor = $armor;
}
16135 * Concrete empty token class.
16137 class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
16147 * Concrete end token class.
16149 * @warning This class accepts attributes even though end tags cannot. This
16150 * is for optimization reasons, as under normal circumstances, the Lexers
16151 * do not pass attributes.
16153 class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
16156 * Token that started this node. Added by MakeWellFormed. Please
16157 * do not edit this!
16167 * Concrete start token class.
16169 class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
16179 * Concrete text token class.
16181 * Text tokens comprise regular parsed character data (PCDATA) and raw
16182 * character data (from the CDATA sections). Internally, their
16183 * data is parsed with all entities expanded. Surprisingly, the text token
16184 * does have a "tag name" called #PCDATA, which is how the DTD represents it
16185 * in permissible child nodes.
16187 class HTMLPurifier_Token_Text extends HTMLPurifier_Token
16190 public $name = '#PCDATA'; /**< PCDATA tag name compatible with DTD. */
16191 public $data; /**< Parsed character data of text. */
16192 public $is_whitespace; /**< Bool indicating if node is whitespace. */
16195 * Constructor, accepts data and determines if it is whitespace.
16197 * @param $data String parsed character data.
16199 public function __construct($data, $line = null, $col = null) {
16200 $this->data = $data;
16201 $this->is_whitespace = ctype_space($data);
16202 $this->line = $line;
class HTMLPurifier_URIFilter_DisableExternal extends HTMLPurifier_URIFilter
{
    public $name = 'DisableExternal';

    /**
     * Labels of our own host in reverse order (top-level domain first), or
     * false when the URI definition declares no host.
     */
    protected $ourHostParts = false;

    /**
     * Reads the host from the URI definition and caches its reversed labels.
     *
     * @param $config Configuration object
     */
    public function prepare($config) {
        $our_host = $config->getDefinition('URI')->host;
        if ($our_host !== null) $this->ourHostParts = array_reverse(explode('.', $our_host));
    }

    /**
     * Accepts URIs without a host, and URIs whose host is our own host or a
     * subdomain of it; everything else is rejected.
     *
     * @param $uri URI object to inspect (not modified here).
     * @param $config Configuration object (not read here).
     * @param $context Context object (not read here).
     * @return True when the URI may stay, false when it must be removed.
     */
    public function filter(&$uri, $config, $context) {
        if (is_null($uri->host)) return true;
        // without a known host of our own, every URI with a host is external
        if ($this->ourHostParts === false) return false;
        $host_parts = array_reverse(explode('.', $uri->host));
        foreach ($this->ourHostParts as $i => $our_part) {
            if (!isset($host_parts[$i])) return false;
            // Strict comparison is required: with a loose one PHP compares
            // numeric-looking labels as numbers, so "1e1" would match "10".
            if ($host_parts[$i] !== $our_part) return false;
        }
        return true;
    }
}
class HTMLPurifier_URIFilter_DisableExternalResources extends HTMLPurifier_URIFilter_DisableExternal
{
    public $name = 'DisableExternalResources';

    /**
     * Applies the parent's filter to embedded URIs only; URIs that are not
     * flagged as embedded in the context are always accepted.
     */
    public function filter(&$uri, $config, $context) {
        $is_embedded = $context->get('EmbeddedURI', true);
        if ($is_embedded) {
            return parent::filter($uri, $config, $context);
        }
        return true;
    }
}
class HTMLPurifier_URIFilter_DisableResources extends HTMLPurifier_URIFilter
{
    public $name = 'DisableResources';

    /**
     * Rejects every URI the context flags as embedded and accepts the rest.
     */
    public function filter(&$uri, $config, $context) {
        $is_embedded = $context->get('EmbeddedURI', true);
        return $is_embedded ? false : true;
    }
}
class HTMLPurifier_URIFilter_HostBlacklist extends HTMLPurifier_URIFilter
{
    public $name = 'HostBlacklist';

    /** Host fragments read from the URI.HostBlacklist directive. */
    protected $blacklist = array();

    /**
     * Loads the blacklist from the configuration.
     *
     * @param $config Configuration object
     */
    public function prepare($config) {
        $this->blacklist = $config->get('URI.HostBlacklist');
        return true;
    }

    /**
     * Rejects any URI whose host contains one of the blacklisted fragments.
     *
     * @param $uri URI object to inspect (not modified here).
     * @param $config Configuration object (not read here).
     * @param $context Context object (not read here).
     * @return False when the host matches a fragment, true otherwise.
     */
    public function filter(&$uri, $config, $context) {
        // A URI without a host cannot match any fragment. Returning here
        // keeps the original outcome while avoiding strpos() on null, which
        // is deprecated as of PHP 8.1.
        if (is_null($uri->host)) return true;
        foreach ($this->blacklist as $blacklisted_host_fragment) {
            if (strpos($uri->host, $blacklisted_host_fragment) !== false) {
                return false;
            }
        }
        return true;
    }
}
16283 // does not support network paths
16285 class HTMLPurifier_URIFilter_MakeAbsolute extends HTMLPurifier_URIFilter
16287 public $name = 'MakeAbsolute';
16289 protected $basePathStack = array();
16290 public function prepare($config) {
16291 $def = $config->getDefinition('URI');
16292 $this->base = $def->base;
16293 if (is_null($this->base)) {
16294 trigger_error('URI.MakeAbsolute is being ignored due to lack of value for URI.Base configuration', E_USER_WARNING);
16297 $this->base->fragment = null; // fragment is invalid for base URI
16298 $stack = explode('/', $this->base->path);
16299 array_pop($stack); // discard last segment
16300 $stack = $this->_collapseStack($stack); // do pre-parsing
16301 $this->basePathStack = $stack;
16304 public function filter(&$uri, $config, $context) {
16305 if (is_null($this->base)) return true; // abort early
16307 $uri->path === '' && is_null($uri->scheme) &&
16308 is_null($uri->host) && is_null($uri->query) && is_null($uri->fragment)
16310 // reference to current document
16311 $uri = clone $this->base;
16314 if (!is_null($uri->scheme)) {
16315 // absolute URI already: don't change
16316 if (!is_null($uri->host)) return true;
16317 $scheme_obj = $uri->getSchemeObj($config, $context);
16318 if (!$scheme_obj) {
16319 // scheme not recognized
16322 if (!$scheme_obj->hierarchical) {
16323 // non-hierarchical URI with explicit scheme, don't change
16326 // special case: had a scheme but always is hierarchical and had no authority
16328 if (!is_null($uri->host)) {
16329 // network path, don't bother
16332 if ($uri->path === '') {
16333 $uri->path = $this->base->path;
16334 } elseif ($uri->path[0] !== '/') {
16335 // relative path, needs more complicated processing
16336 $stack = explode('/', $uri->path);
16337 $new_stack = array_merge($this->basePathStack, $stack);
16338 if ($new_stack[0] !== '' && !is_null($this->base->host)) {
16339 array_unshift($new_stack, '');
16341 $new_stack = $this->_collapseStack($new_stack);
16342 $uri->path = implode('/', $new_stack);
16344 // absolute path, but still we should collapse
16345 $uri->path = implode('/', $this->_collapseStack(explode('/', $uri->path)));
16348 $uri->scheme = $this->base->scheme;
16349 if (is_null($uri->userinfo)) $uri->userinfo = $this->base->userinfo;
16350 if (is_null($uri->host)) $uri->host = $this->base->host;
16351 if (is_null($uri->port)) $uri->port = $this->base->port;
16356 * Resolve dots and double-dots in a path stack
16358 private function _collapseStack($stack) {
16360 $is_folder = false;
16361 for ($i = 0; isset($stack[$i]); $i++) {
16362 $is_folder = false;
16363 // absorb an internally duplicated slash
16364 if ($stack[$i] == '' && $i && isset($stack[$i+1])) continue;
16365 if ($stack[$i] == '..') {
16366 if (!empty($result)) {
16367 $segment = array_pop($result);
16368 if ($segment === '' && empty($result)) {
16369 // error case: attempted to back out too far:
16370 // restore the leading slash
16372 } elseif ($segment === '..') {
16373 $result[] = '..'; // cannot remove .. with ..
16376 // relative path, preserve the double-dots
16382 if ($stack[$i] == '.') {
16387 $result[] = $stack[$i];
16389 if ($is_folder) $result[] = '';
16398 class HTMLPurifier_URIFilter_Munge extends HTMLPurifier_URIFilter
16400 public $name = 'Munge';
16401 public $post = true;
16402 private $target, $parser, $doEmbed, $secretKey;
16404 protected $replace = array();
16406 public function prepare($config) {
16407 $this->target = $config->get('URI.' . $this->name);
16408 $this->parser = new HTMLPurifier_URIParser();
16409 $this->doEmbed = $config->get('URI.MungeResources');
16410 $this->secretKey = $config->get('URI.MungeSecretKey');
16413 public function filter(&$uri, $config, $context) {
16414 if ($context->get('EmbeddedURI', true) && !$this->doEmbed) return true;
16416 $scheme_obj = $uri->getSchemeObj($config, $context);
16417 if (!$scheme_obj) return true; // ignore unknown schemes, maybe another postfilter did it
16418 if (is_null($uri->host) || empty($scheme_obj->browsable)) {
16421 // don't redirect if target host is our host
16422 if ($uri->host === $config->getDefinition('URI')->host) {
16426 $this->makeReplace($uri, $config, $context);
16427 $this->replace = array_map('rawurlencode', $this->replace);
16429 $new_uri = strtr($this->target, $this->replace);
16430 $new_uri = $this->parser->parse($new_uri);
16431 // don't redirect if the target host is the same as the
16433 if ($uri->host === $new_uri->host) return true;
16434 $uri = $new_uri; // overwrite
16438 protected function makeReplace($uri, $config, $context) {
16439 $string = $uri->toString();
16440 // always available
16441 $this->replace['%s'] = $string;
16442 $this->replace['%r'] = $context->get('EmbeddedURI', true);
16443 $token = $context->get('CurrentToken', true);
16444 $this->replace['%n'] = $token ? $token->name : null;
16445 $this->replace['%m'] = $context->get('CurrentAttr', true);
16446 $this->replace['%p'] = $context->get('CurrentCSSProperty', true);
16447 // not always available
16448 if ($this->secretKey) $this->replace['%t'] = sha1($this->secretKey . ':' . $string);
16458 * Implements data: URI for base64 encoded images supported by GD.
16460 class HTMLPurifier_URIScheme_data extends HTMLPurifier_URIScheme {
16462 public $browsable = true;
16463 public $allowed_types = array(
16464 // you better write validation code for other types if you
16465 // decide to allow them
16466 'image/jpeg' => true,
16467 'image/gif' => true,
16468 'image/png' => true,
16470 // this is actually irrelevant since we only write out the path
16472 public $may_omit_host = true;
/**
 * Validates a data: URI by decoding its payload, sniffing the real image
 * type from the bytes and rewriting the path as a base64 payload with the
 * detected content type.
 *
 * @param $uri URI object; on success its path is rewritten and its
 *        userinfo, host, port, fragment and query are cleared.
 * @param $config Configuration object (not read here).
 * @param $context Context object (not read here).
 * @return True when the URI holds an image of an allowed type, else false.
 */
public function doValidate(&$uri, $config, $context) {
    $result = explode(',', $uri->path, 2);
    $is_base64 = false;
    $charset = null;
    $content_type = null;
    if (count($result) == 2) {
        list($metadata, $data) = $result;
        // do some legwork on the metadata
        $metas = explode(';', $metadata);
        while (!empty($metas)) {
            $cur = array_shift($metas);
            if ($cur == 'base64') {
                $is_base64 = true;
                break;
            }
            if (substr($cur, 0, 8) == 'charset=') {
                // doesn't match if there are arbitrary spaces
                if ($charset !== null) continue; // garbage
                $charset = substr($cur, 8); // not used
            } else {
                if ($content_type !== null) continue; // garbage
                $content_type = $cur;
            }
        }
    } else {
        // no comma: the whole path is treated as the payload
        $data = $result[0];
    }
    if ($content_type !== null && empty($this->allowed_types[$content_type])) {
        return false;
    }
    if ($charset !== null) {
        // error; we don't allow plaintext stuff
        $charset = null;
    }
    $data = rawurldecode($data);
    if ($is_base64) {
        $raw_data = base64_decode($data);
    } else {
        $raw_data = $data;
    }
    // XXX probably want to refactor this into a general mechanism
    // for filtering arbitrary content types
    //
    // The image sniffers only accept a filename, so the payload goes through
    // a scratch file. It is created in the system temp directory rather than
    // a hard-coded /tmp, and removed on every path below so that validating
    // data: URIs does not pile up files.
    $file = tempnam(sys_get_temp_dir(), '');
    if ($file === false) {
        // no scratch file, no way to verify the payload: reject
        return false;
    }
    file_put_contents($file, $raw_data);
    if (function_exists('exif_imagetype')) {
        $image_code = exif_imagetype($file);
        unlink($file);
    } elseif (function_exists('getimagesize')) {
        set_error_handler(array($this, 'muteErrorHandler'));
        $info = getimagesize($file);
        restore_error_handler();
        unlink($file);
        if ($info == false) return false;
        $image_code = $info[2];
    } else {
        unlink($file);
        trigger_error("could not find exif_imagetype or getimagesize functions", E_USER_ERROR);
    }
    $real_content_type = image_type_to_mime_type($image_code);
    if ($real_content_type != $content_type) {
        // we're nice guys; if the content type is something else we
        // support, change it over
        if (empty($this->allowed_types[$real_content_type])) return false;
        $content_type = $real_content_type;
    }
    // ok, it's kosher, rewrite what we need
    $uri->userinfo = null;
    $uri->host = null;
    $uri->port = null;
    $uri->fragment = null;
    $uri->query = null;
    $uri->path = "$content_type;base64," . base64_encode($raw_data);
    return true;
}
16547 public function muteErrorHandler($errno, $errstr) {}
16555 * Validates file as defined by RFC 1630 and RFC 1738.
16557 class HTMLPurifier_URIScheme_file extends HTMLPurifier_URIScheme {
16559 // Generally file:// URLs are not accessible from most
16560 // machines, so placing them as an img src is incorrect.
16561 public $browsable = false;
16563 // Basically the *only* URI scheme for which this is true, since
16564 // accessing files on the local machine is very common. In fact,
16565 // browsers on some operating systems don't understand the
16566 // authority, though I hear it is used on Windows to refer to
16568 public $may_omit_host = true;
16570 public function doValidate(&$uri, $config, $context) {
16571 // Authentication method is not supported
16572 $uri->userinfo = null;
16573 // file:// makes no provisions for accessing the resource
16575 // While it seems to work on Firefox, the querystring has
16576 // no possible effect and is thus stripped.
16577 $uri->query = null;
16588 * Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738.
16590 class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
16592 public $default_port = 21;
16593 public $browsable = true; // usually
16594 public $hierarchical = true;
/**
 * Strips the query and normalises an optional trailing ";type=X" typecode.
 * Only the typecodes a, i and d are kept; every other semicolon in the path
 * ends up percent-encoded.
 *
 * @param $uri URI object; its query and path are rewritten in place.
 * @param $config Configuration object (not read here).
 * @param $context Context object (not read here).
 * @return Always true.
 */
public function doValidate(&$uri, $config, $context) {
    $uri->query = null;

    // the typecode, if there is one, follows the LAST semicolon
    $cut = strrpos($uri->path, ';');
    if ($cut === false) {
        return true;
    }

    $type = substr($uri->path, $cut + 1); // text after the semicolon
    $uri->path = substr($uri->path, 0, $cut);

    $suffix = '';
    if (strpos($type, '=') === false) {
        // not a key=value pair: put it back, encoded
        $uri->path .= '%3B' . $type;
    } else {
        list($key, $typecode) = explode('=', $type, 2);
        if ($key !== 'type') {
            // invalid key, tack it back on encoded
            $uri->path .= '%3B' . $type;
        } elseif (in_array($typecode, array('a', 'i', 'd'), true)) {
            $suffix = ';type=' . $typecode;
        }
        // a "type" key with any other typecode is dropped entirely
    }

    // encode whatever semicolons remain, then re-attach the valid typecode
    $uri->path = str_replace(';', '%3B', $uri->path) . $suffix;

    return true;
}
/**
 * Validates http (HyperText Transfer Protocol) as defined by RFC 2616
 */
class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme {

    public $default_port = 80;
    public $browsable = true;
    public $hierarchical = true;

    /**
     * Removes the userinfo component; everything else is left untouched.
     *
     * @param HTMLPurifier_URI $uri URI object, modified in place
     * @param HTMLPurifier_Config $config Unused by this scheme
     * @param HTMLPurifier_Context $context Unused by this scheme
     * @return bool Always true
     */
    public function doValidate(&$uri, $config, $context) {
        $uri->userinfo = null;
        return true;
    }

}
/**
 * Validates https (Secure HTTP) according to http scheme.
 *
 * Inherits all validation from HTMLPurifier_URIScheme_http and only
 * overrides the default port.
 */
class HTMLPurifier_URIScheme_https extends HTMLPurifier_URIScheme_http {

    public $default_port = 443;

}
// VERY RELAXED! Shouldn't cause problems, not even Firefox checks if the
// email is valid, but be careful!

/**
 * Validates mailto (for E-mail) according to RFC 2368
 * @todo Validate the email address
 * @todo Filter allowed query parameters
 */

class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme {

    public $browsable = false;
    public $may_omit_host = true;

    /**
     * Clears the authority components (userinfo, host, port). The path and
     * the query string are passed through unchecked.
     *
     * @param HTMLPurifier_URI $uri URI object, modified in place
     * @param HTMLPurifier_Config $config Unused by this scheme
     * @param HTMLPurifier_Context $context Unused by this scheme
     * @return bool Always true
     */
    public function doValidate(&$uri, $config, $context) {
        $uri->userinfo = null;
        $uri->host = null;
        $uri->port = null;
        // we need to validate path against RFC 2368's addr-spec
        return true;
    }

}
/**
 * Validates news (Usenet) as defined by generic RFC 1738
 */
class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme {

    public $browsable = false;
    public $may_omit_host = true;

    /**
     * Clears the authority components and the query string, leaving only
     * the path (which is not validated further).
     *
     * @param HTMLPurifier_URI $uri URI object, modified in place
     * @param HTMLPurifier_Config $config Unused by this scheme
     * @param HTMLPurifier_Context $context Unused by this scheme
     * @return bool Always true
     */
    public function doValidate(&$uri, $config, $context) {
        $uri->userinfo = null;
        $uri->host = null;
        $uri->port = null;
        $uri->query = null;
        // typecode check needed on path
        return true;
    }

}
/**
 * Validates nntp (Network News Transfer Protocol) as defined by generic RFC 1738
 */
class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme {

    public $default_port = 119;
    public $browsable = false;

    /**
     * Removes the userinfo and query components.
     *
     * @param HTMLPurifier_URI $uri URI object, modified in place
     * @param HTMLPurifier_Config $config Unused by this scheme
     * @param HTMLPurifier_Context $context Unused by this scheme
     * @return bool Always true
     */
    public function doValidate(&$uri, $config, $context) {
        $uri->userinfo = null;
        $uri->query = null;
        return true;
    }

}
/**
 * Performs safe variable parsing based on types which can be used by
 * users. This may not be able to represent all possible data inputs,
 * however.
 */
class HTMLPurifier_VarParser_Flexible extends HTMLPurifier_VarParser
{

    /**
     * Leniently converts $var towards the requested type.
     *
     * Conversions performed here:
     *  - INT: strings made only of digits become integers.
     *  - FLOAT: numeric strings and integers become floats.
     *  - BOOL: integers 0/1 and the strings on/true/1, off/false/0.
     *  - ALIST/HASH/LOOKUP: comma- or newline-separated strings become
     *    arrays; for HASH each item is split once on ':' into key and value
     *    and items without a ':' are skipped.
     *
     * Values that cannot be converted are mostly returned unchanged.
     *
     * @param mixed $var Raw value to convert
     * @param int $type One of the type constants inherited from the parent
     * @param bool $allow_null Whether a null $var is passed through as null
     * @return mixed The converted value
     * @throws HTMLPurifier_VarParserException On an unrecognised boolean string
     */
    protected function parseImplementation($var, $type, $allow_null) {
        if ($allow_null && $var === null) return null;
        switch ($type) {
            // Note: if code "breaks" from the switch, it triggers a generic
            // exception to be thrown. Specific errors can be specifically
            // done here.
            case self::MIXED :
            case self::ISTRING :
            case self::STRING :
            case self::TEXT :
            case self::ITEXT :
                return $var;
            case self::INT :
                if (is_string($var) && ctype_digit($var)) $var = (int) $var;
                return $var;
            case self::FLOAT :
                if ((is_string($var) && is_numeric($var)) || is_int($var)) $var = (float) $var;
                return $var;
            case self::BOOL :
                if (is_int($var) && ($var === 0 || $var === 1)) {
                    $var = (bool) $var;
                } elseif (is_string($var)) {
                    if ($var == 'on' || $var == 'true' || $var == '1') {
                        $var = true;
                    } elseif ($var == 'off' || $var == 'false' || $var == '0') {
                        $var = false;
                    } else {
                        throw new HTMLPurifier_VarParserException("Unrecognized value '$var' for $type");
                    }
                }
                return $var;
            case self::ALIST :
            case self::HASH :
            case self::LOOKUP :
                if (is_string($var)) {
                    // special case: technically, this is an array with
                    // a single empty string item, but having an empty
                    // array is more intuitive
                    if ($var == '') return array();
                    if (strpos($var, "\n") === false && strpos($var, "\r") === false) {
                        // simplistic string to array method that only works
                        // for simple lists of tag names or alphanumeric characters
                        $var = explode(',',$var);
                    } else {
                        $var = preg_split('/(,|[\n\r]+)/', $var);
                    }
                    // remove spaces
                    foreach ($var as $i => $j) $var[$i] = trim($j);
                    if ($type === self::HASH) {
                        // key:value,key2:value2
                        $nvar = array();
                        foreach ($var as $keypair) {
                            $c = explode(':', $keypair, 2);
                            if (!isset($c[1])) continue;
                            $nvar[trim($c[0])] = trim($c[1]);
                        }
                        $var = $nvar;
                    }
                }
                if (!is_array($var)) break;
                $keys = array_keys($var);
                // true when the keys are exactly 0..n-1, i.e. a plain list
                if ($keys === array_keys($keys)) {
                    if ($type == self::ALIST) return $var;
                    elseif ($type == self::LOOKUP) {
                        // turn list items into keys of a lookup table
                        $new = array();
                        foreach ($var as $key) {
                            $new[$key] = true;
                        }
                        return $new;
                    } else break;
                }
                if ($type === self::ALIST) {
                    trigger_error("Array list did not have consecutive integer indexes", E_USER_WARNING);
                    return array_values($var);
                }
                if ($type === self::LOOKUP) {
                    foreach ($var as $key => $value) {
                        if ($value !== true) {
                            trigger_error("Lookup array has non-true value at key '$key'; maybe your input array was not indexed numerically", E_USER_WARNING);
                        }
                        $var[$key] = true;
                    }
                }
                return $var;
            default:
                $this->errorInconsistent(__CLASS__, $type);
        }
        $this->errorGeneric($var, $type);
    }

}
/**
 * This variable parser uses PHP's internal code engine. Because it does
 * this, it can represent all inputs; however, it is dangerous and cannot
 * be used by users.
 */
class HTMLPurifier_VarParser_Native extends HTMLPurifier_VarParser
{

    /**
     * Evaluates $var as a PHP expression and returns the result.
     *
     * @param string $var PHP expression source; $type and $allow_null are
     *                    not consulted here
     * @param int $type Unused
     * @param bool $allow_null Unused
     * @return mixed Value the expression evaluated to
     * @throws HTMLPurifier_VarParserException If the expression does not parse
     */
    protected function parseImplementation($var, $type, $allow_null) {
        return $this->evalExpression($var);
    }

    /**
     * Runs "$var = <expr>;" through eval() and returns the assigned value.
     *
     * SECURITY: this executes arbitrary PHP. It must only ever receive
     * trusted, developer-authored expressions, never user input.
     *
     * @param string $expr PHP expression source
     * @return mixed Value the expression evaluated to
     * @throws HTMLPurifier_VarParserException If the expression does not parse
     */
    protected function evalExpression($expr) {
        $var = null;
        try {
            $result = eval("\$var = $expr;");
        } catch (ParseError $e) {
            // PHP 7+ reports a syntax error in eval()'d code by throwing
            // instead of returning false; translate it to the exception
            // type this parser is expected to raise. On older PHP versions
            // the ParseError class does not exist and this clause is inert.
            throw new HTMLPurifier_VarParserException("Fatal error in evaluated code", 0, $e);
        }
        if ($result === false) {
            // PHP 5 behaviour: eval() returns false on a parse error
            throw new HTMLPurifier_VarParserException("Fatal error in evaluated code");
        }
        return $var;
    }

}
// CODE HAS TO BE MOVED WITH UPGRADE START
/**
 * Doubly linked list that can be addressed by integer position through
 * ArrayAccess.
 *
 * The most recently resolved position is cached ($offset / $offsetItem) so
 * that a lookup close to the previous one does not have to walk the list
 * from the head again.
 *
 * Nodes are objects, so they are linked with plain assignment; PHP already
 * passes object handles around without the need for reference binding.
 */
class HTMLPurifier_Array implements ArrayAccess
{
    /**
     * First node of the list, or null when the list is empty.
     * @var HTMLPurifier_ArrayNode|null
     */
    public $head = null;

    /**
     * Number of nodes currently in the list.
     * @var int
     */
    protected $count = 0;

    /**
     * Position of the cached node.
     * @var int
     */
    protected $offset = 0;

    /**
     * Cached node located at position $offset, if any.
     * @var HTMLPurifier_ArrayNode|null
     */
    protected $offsetItem = null;

    /**
     * Builds the list from the values of $array, in iteration order.
     * Array keys are ignored.
     *
     * @param array $array Initial values
     */
    public function __construct(array $array = array())
    {
        /**
         * @var HTMLPurifier_ArrayNode $previous
         */
        $previous = null;
        $i = 0;

        // HTMLPurifier_ArrayNode binds its value by reference, so iterate by
        // reference and release the loop variable after every element;
        // otherwise the next iteration would write through to this node.
        foreach ($array as &$v) {
            $item = new HTMLPurifier_ArrayNode($v);

            if ($this->head == null) {
                $this->head = $item;
            }
            if ($previous instanceof HTMLPurifier_ArrayNode) {
                $item->prev = $previous;
                $previous->next = $item;
            }
            $previous = $item;

            $i++;

            unset($item, $v);
        }
        $this->count = $i;
        $this->offset = 0;
        $this->offsetItem = $this->head;
    }

    /**
     * Locates the node at position $offset.
     *
     * The walk starts at the cached node when that is at or before $offset,
     * or when it is after $offset but closer to it than the head is;
     * otherwise it starts at the head. A successful lookup refreshes the
     * cache.
     *
     * @param int $offset Zero-based position
     * @return array 'correct' => bool, whether a node exists at $offset;
     *               'value' => that node when found, else the node the walk
     *               stopped on (null for an empty list)
     */
    protected function findIndex($offset)
    {
        if ($this->head == null) {
            return array(
                'correct' => false,
                'value' => null
            );
        }

        $current = $this->head;
        $index = 0;
        $forward = true;

        if ($this->offset <= $offset && $this->offsetItem instanceof HTMLPurifier_ArrayNode) {
            $current = $this->offsetItem;
            $index = $this->offset;
        } elseif ($this->offset > $offset && ($this->offset - $offset) < $offset && $this->offsetItem instanceof HTMLPurifier_ArrayNode) {
            $current = $this->offsetItem;
            $index = $this->offset;
            $forward = false;
        }

        if ($forward) {
            while ($current->next instanceof HTMLPurifier_ArrayNode && $index != $offset) {
                $current = $current->next;
                $index++;
            }
        } else {
            while ($current->prev instanceof HTMLPurifier_ArrayNode && $index != $offset) {
                $current = $current->prev;
                $index--;
            }
        }

        if ($index == $offset) {
            $this->offset = $offset;
            $this->offsetItem = $current;
            return array(
                'correct' => true,
                'value' => $current
            );
        }

        return array(
            'correct' => false,
            'value' => $current
        );
    }

    /**
     * Inserts $value so that it ends up at position $offset. When no node
     * exists at $offset the value is appended to the end of the list.
     *
     * @param int $offset Zero-based position
     * @param mixed $value Value to store
     */
    public function insertBefore($offset, $value)
    {
        $result = $this->findIndex($offset);

        $this->count++;
        $item = new HTMLPurifier_ArrayNode($value);
        if ($result['correct'] == false) {
            // no node at $offset: hook the new node after the last one seen
            if ($result['value'] instanceof HTMLPurifier_ArrayNode) {
                $result['value']->next = $item;
                $item->prev = $result['value'];
            }
        } else {
            if ($result['value'] instanceof HTMLPurifier_ArrayNode) {
                $item->prev = $result['value']->prev;
                $item->next = $result['value'];
            }
            if ($item->prev instanceof HTMLPurifier_ArrayNode) {
                $item->prev->next = $item;
            }
            if ($result['value'] instanceof HTMLPurifier_ArrayNode) {
                $result['value']->prev = $item;
            }
        }
        // An empty list has no node to hook onto, so the new node must
        // become the head whatever offset was requested; otherwise it would
        // be counted but unreachable.
        if ($offset == 0 || $this->head == null) {
            $this->head = $item;
        }
        // Inserting at or before the cached position shifts the cached node
        // one place to the right; step the cache back so that $this->offset
        // keeps naming the node actually stored at that position.
        if ($offset <= $this->offset && $this->offsetItem instanceof HTMLPurifier_ArrayNode) {
            $this->offsetItem = $this->offsetItem->prev;
        }
    }

    /**
     * Removes the node at position $offset; does nothing when there is none.
     *
     * @param int $offset Zero-based position
     */
    public function remove($offset)
    {
        $result = $this->findIndex($offset);

        if ($result['correct']) {
            $this->count--;
            $item = $result['value'];
            if ($item->prev instanceof HTMLPurifier_ArrayNode) {
                $item->prev->next = $item->next;
            }
            if ($item->next instanceof HTMLPurifier_ArrayNode) {
                $item->next->prev = $item->prev;
            }
            if ($offset == 0) {
                $this->head = $item->next;
            }
            if ($offset < $this->offset) {
                $this->offset--;
            } elseif ($offset == $this->offset) {
                // the successor slides into the removed node's position
                $this->offsetItem = $item->next;
            }
        }
    }

    /**
     * Removes up to $length values starting at $offset and inserts the
     * values of $replacement in their place, in iteration order.
     *
     * The keys of $replacement are ignored, so sparse or string-keyed arrays
     * are inserted contiguously. A non-array $replacement is cast with
     * (array), as array_splice() does.
     *
     * @param int $offset Zero-based position
     * @param int $length Number of values to remove
     * @param mixed $replacement Values to insert
     * @return array The removed values
     */
    public function splice($offset, $length = 0, $replacement = null)
    {
        $old = array();

        for ($i = 0; $i < $length; $i++) {
            $result = $this->findIndex($offset);
            if ($result['correct']) {
                $old[] = $result['value']->value;
                $this->remove($offset);
            }
        }
        // Count insert positions ourselves instead of deriving them from the
        // replacement's keys, which need not be 0..n-1.
        $position = $offset;
        foreach ((array)$replacement as $v) {
            $this->insertBefore($position, $v);
            $position++;
        }

        return $old;
    }

    /**
     * Returns the stored values as a plain PHP list.
     *
     * @return array
     */
    public function getArray()
    {
        $return = array();
        $node = $this->head;

        while ($node instanceof HTMLPurifier_ArrayNode) {
            $return[] = $node->value;
            $node = $node->next;
        }

        return $return;
    }

    /**
     * @param int $offset Zero-based position
     * @return bool Whether $offset lies within 0..count-1
     */
    #[\ReturnTypeWillChange]
    public function offsetExists($offset)
    {
        return $offset >= 0 && $offset < $this->count;
    }

    /**
     * @param int $offset Zero-based position
     * @return mixed Stored value, or null when there is no such position
     */
    #[\ReturnTypeWillChange]
    public function offsetGet($offset)
    {
        $result = $this->findIndex($offset);
        if ($result['correct']) {
            return $result['value']->value;
        }

        return null;
    }

    /**
     * Overwrites the value at an existing position; other offsets are
     * silently ignored.
     *
     * @param int $offset Zero-based position
     * @param mixed $value New value
     */
    #[\ReturnTypeWillChange]
    public function offsetSet($offset, $value)
    {
        $result = $this->findIndex($offset);
        if ($result['correct']) {
            // rebind rather than assign so that nothing the old value slot
            // may still be bound to is written through
            $result['value']->value = &$value;
        }
    }

    /**
     * @param int $offset Zero-based position
     */
    #[\ReturnTypeWillChange]
    public function offsetUnset($offset)
    {
        $this->remove($offset);
    }
}

/**
 * One element of an HTMLPurifier_Array: a value plus links to both
 * neighbours.
 */
class HTMLPurifier_ArrayNode
{
    /**
     * @param mixed $value Value to hold. It is bound by reference, so the
     *                     node shares the caller's variable until that
     *                     variable is unset or goes out of scope.
     */
    public function __construct(&$value)
    {
        $this->value = &$value;
    }

    /**
     * Previous node, or null at the start of the list.
     * @var HTMLPurifier_ArrayNode
     */
    public $prev = null;

    /**
     * Next node, or null at the end of the list.
     * @var HTMLPurifier_ArrayNode
     */
    public $next = null;

    /**
     * @var mixed
     */
    public $value = null;
}
// CODE HAS TO BE MOVED WITH UPGRADE END