5 * This file was auto-generated by generate-includes.php and includes all of
6 * the core files required by HTML Purifier. Use this if performance is a
7 * primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS
8 * FILE, changes will be overwritten the next time the script is run.
13 * You must *not* include any other HTML Purifier files before this file,
14 * because 'require' not 'require_once' is used.
17 * This file requires that the include path contains the HTML Purifier
18 * library directory; this is not auto-set.
25 * HTML Purifier is an HTML filter that will take an arbitrary snippet of
26 * HTML and rigorously test, validate and filter it into a version that
27 * is safe for output onto webpages. It achieves this by:
29 * -# Lexing (parsing into tokens) the document,
30 * -# Executing various strategies on the tokens:
31 * -# Removing all elements not in the whitelist,
32 * -# Making the tokens well-formed,
33 * -# Fixing the nesting of the nodes, and
34 * -# Validating attributes of the nodes; and
35 * -# Generating HTML from the purified tokens.
37 * However, most users will only need to interface with the HTMLPurifier
38 * and HTMLPurifier_Config.
42 HTML Purifier 4.3.0 - Standards Compliant HTML Filtering
43 Copyright (C) 2006-2008 Edward Z. Yang
45 This library is free software; you can redistribute it and/or
46 modify it under the terms of the GNU Lesser General Public
47 License as published by the Free Software Foundation; either
48 version 2.1 of the License, or (at your option) any later version.
50 This library is distributed in the hope that it will be useful,
51 but WITHOUT ANY WARRANTY; without even the implied warranty of
52 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
53 Lesser General Public License for more details.
55 You should have received a copy of the GNU Lesser General Public
56 License along with this library; if not, write to the Free Software
57 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/**
 * Facade that coordinates HTML Purifier's subsystems in order to purify HTML.
 *
 * @note There are several points in which configuration can be specified
 *       for HTML Purifier.  The precedence of these (from lowest to
 *       highest) is as follows:
 *          -# Instance: new HTMLPurifier($config)
 *          -# Invocation: purify($html, $config)
 *       These configurations are entirely independent of each other and
 *       are *not* merged (this behavior may change in the future).
 *
 * @todo We need an easier way to inject strategies using the configuration
 *       object.
 */
class HTMLPurifier
{

    /** Version of HTML Purifier */
    public $version = '4.3.0';

    /** Constant with version of HTML Purifier */
    const VERSION = '4.3.0';

    /** Global configuration object */
    public $config;

    /** Array of extra HTMLPurifier_Filter objects to run on HTML, for backwards compatibility */
    private $filters = array();

    /** Single instance of HTML Purifier */
    private static $instance;

    /** Strategy built in the constructor, and generator rebuilt on every purify() call */
    protected $strategy, $generator;

    /**
     * Resultant HTMLPurifier_Context of last run purification. Is an array
     * of contexts if the last called method was purifyArray().
     */
    public $context;

    /**
     * Initializes the purifier.
     * @param $config Optional HTMLPurifier_Config object for all instances of
     *                the purifier, if omitted, a default configuration is
     *                supplied (which can be overridden on a per-use basis).
     *                The parameter can also be any type that
     *                HTMLPurifier_Config::create() supports.
     */
    public function __construct($config = null) {

        $this->config = HTMLPurifier_Config::create($config);

        $this->strategy = new HTMLPurifier_Strategy_Core();

    }

    /**
     * Adds a filter to process the output. First come first serve
     * @param $filter HTMLPurifier_Filter object
     * @deprecated Emits an E_USER_WARNING on every call; the filter is still
     *             appended and will run after the configured filters.
     */
    public function addFilter($filter) {
        trigger_error('HTMLPurifier->addFilter() is deprecated, use configuration directives in the Filter namespace or Filter.Custom', E_USER_WARNING);
        $this->filters[] = $filter;
    }

    /**
     * Filters an HTML snippet/document to be XSS-free and standards-compliant.
     *
     * @param $html String of HTML to purify
     * @param $config HTMLPurifier_Config object for this operation, if omitted,
     *                defaults to the config object specified during this
     *                object's construction. The parameter can also be any type
     *                that HTMLPurifier_Config::create() supports.
     * @return Purified HTML
     */
    public function purify($html, $config = null) {

        // :TODO: make the config merge in, instead of replace
        $config = $config ? HTMLPurifier_Config::create($config) : $this->config;

        // implementation is partially environment dependant, partially
        // configuration dependant
        $lexer = HTMLPurifier_Lexer::create($config);

        $context = new HTMLPurifier_Context();

        // setup HTML generator
        $this->generator = new HTMLPurifier_Generator($config, $context);
        $context->register('Generator', $this->generator);

        // set up global context variables
        if ($config->get('Core.CollectErrors')) {
            // may get moved out if other facilities use it
            $language_factory = HTMLPurifier_LanguageFactory::instance();
            $language = $language_factory->create($config, $context);
            $context->register('Locale', $language);

            $error_collector = new HTMLPurifier_ErrorCollector($context);
            $context->register('ErrorCollector', $error_collector);
        }

        // setup id_accumulator context, necessary due to the fact that
        // AttrValidator can be called from many places
        $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
        $context->register('IDAccumulator', $id_accumulator);

        $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);

        // setup filters: every truthy, non-namespaced flag in the Filter
        // batch names a built-in filter class; Filter.Custom carries
        // ready-made filter objects
        $filter_flags = $config->getBatch('Filter');
        $custom_filters = $filter_flags['Custom'];
        unset($filter_flags['Custom']);
        $filters = array();
        foreach ($filter_flags as $filter => $flag) {
            if (!$flag) continue;
            if (strpos($filter, '.') !== false) continue;
            $class = "HTMLPurifier_Filter_$filter";
            $filters[] = new $class;
        }
        foreach ($custom_filters as $filter) {
            // maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat
            $filters[] = $filter;
        }
        // filters added through the deprecated addFilter() run last
        $filters = array_merge($filters, $this->filters);
        // maybe prepare(), but later

        for ($i = 0, $filter_size = count($filters); $i < $filter_size; $i++) {
            $html = $filters[$i]->preFilter($html, $config, $context);
        }

        // purified HTML
        $html =
            $this->generator->generateFromTokens(
                // list of tokens
                $this->strategy->execute(
                    // list of un-purified tokens
                    $lexer->tokenizeHTML(
                        // un-purified HTML
                        $html, $config, $context
                    ),
                    $config, $context
                )
            );

        // post-filters run in the reverse order of the pre-filters
        for ($i = $filter_size - 1; $i >= 0; $i--) {
            $html = $filters[$i]->postFilter($html, $config, $context);
        }

        $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
        $this->context =& $context;
        return $html;
    }

    /**
     * Filters an array of HTML snippets
     * @param $array_of_html Array of HTML strings to purify; keys are preserved
     * @param $config Optional HTMLPurifier_Config object for this operation.
     *                See HTMLPurifier::purify() for more details.
     * @return Array of purified HTML
     */
    public function purifyArray($array_of_html, $config = null) {
        $context_array = array();
        foreach ($array_of_html as $key => $html) {
            $array_of_html[$key] = $this->purify($html, $config);
            $context_array[$key] = $this->context;
        }
        // expose one context per snippet, indexed like the input
        $this->context = $context_array;
        return $array_of_html;
    }

    /**
     * Singleton for enforcing just one HTML Purifier in your system
     * @param $prototype Optional prototype HTMLPurifier instance to
     *                   overload singleton with, or HTMLPurifier_Config
     *                   instance to configure the generated version with.
     * @return The shared HTMLPurifier instance; it is replaced whenever a
     *         truthy $prototype is passed.
     */
    public static function instance($prototype = null) {
        if (!self::$instance || $prototype) {
            if ($prototype instanceof HTMLPurifier) {
                self::$instance = $prototype;
            } elseif ($prototype) {
                self::$instance = new HTMLPurifier($prototype);
            } else {
                self::$instance = new HTMLPurifier();
            }
        }
        return self::$instance;
    }

    /**
     * @note Backwards compatibility, see instance()
     */
    public static function getInstance($prototype = null) {
        return HTMLPurifier::instance($prototype);
    }

}
/**
 * Defines common attribute collections that modules reference
 */
class HTMLPurifier_AttrCollections
{

    /**
     * Associative array of attribute collections, indexed by name
     */
    public $info = array();

    /**
     * Performs all expansions on internal data for use by other inclusions
     * It also collects all attribute collection extensions from
     * the modules
     * @param $attr_types HTMLPurifier_AttrTypes instance
     * @param $modules Hash array of HTMLPurifier_HTMLModule members
     */
    public function __construct($attr_types, $modules) {
        // load extensions from the modules
        foreach ($modules as $module) {
            foreach ($module->attr_collections as $coll_i => $coll) {
                if (!isset($this->info[$coll_i])) {
                    $this->info[$coll_i] = array();
                }
                foreach ($coll as $attr_i => $attr) {
                    if ($attr_i === 0 && isset($this->info[$coll_i][$attr_i])) {
                        // merge in includes rather than overwriting them
                        $this->info[$coll_i][$attr_i] = array_merge(
                            $this->info[$coll_i][$attr_i], $attr);
                        continue;
                    }
                    $this->info[$coll_i][$attr_i] = $attr;
                }
            }
        }
        // perform internal expansions and inclusions
        foreach ($this->info as $name => $attr) {
            // merge attribute collections that include others
            $this->performInclusions($this->info[$name]);
            // replace string identifiers with actual attribute objects
            $this->expandIdentifiers($this->info[$name], $attr_types);
        }
    }

    /**
     * Takes a reference to an attribute associative array and performs
     * all inclusions specified by the zero index.
     * @param &$attr Reference to attribute array; the zero index is removed
     *               once its inclusions have been merged in.
     */
    public function performInclusions(&$attr) {
        if (!isset($attr[0])) return;
        $merge = $attr[0];
        $seen  = array(); // recursion guard
        // loop through all the inclusions; $merge may grow while we iterate
        for ($i = 0; isset($merge[$i]); $i++) {
            if (isset($seen[$merge[$i]])) continue;
            $seen[$merge[$i]] = true;
            // foreach attribute of the inclusion, copy it over
            if (!isset($this->info[$merge[$i]])) continue;
            foreach ($this->info[$merge[$i]] as $key => $value) {
                if (isset($attr[$key])) continue; // also catches more inclusions
                $attr[$key] = $value;
            }
            if (isset($this->info[$merge[$i]][0])) {
                // recursion: queue the inclusions of the included collection
                $merge = array_merge($merge, $this->info[$merge[$i]][0]);
            }
        }
        unset($attr[0]);
    }

    /**
     * Expands all string identifiers in an attribute array by replacing
     * them with the appropriate values inside HTMLPurifier_AttrTypes
     * @param &$attr Reference to attribute array
     * @param $attr_types HTMLPurifier_AttrTypes instance
     */
    public function expandIdentifiers(&$attr, $attr_types) {

        // because foreach may process new elements we add, make sure we
        // never handle the same key twice
        $processed = array();

        foreach ($attr as $def_i => $def) {
            // skip inclusions
            if ($def_i === 0) continue;

            if (isset($processed[$def_i])) continue;

            // determine whether or not attribute is required
            if ($required = (strpos($def_i, '*') !== false)) {
                // rename the definition
                unset($attr[$def_i]);
                $def_i = trim($def_i, '*');
                $attr[$def_i] = $def;
            }

            $processed[$def_i] = true;

            // if we've already got a literal object, move on
            if (is_object($def)) {
                // preserve previous required
                $attr[$def_i]->required = ($required || $attr[$def_i]->required);
                continue;
            }

            // an explicit false removes the attribute
            if ($def === false) {
                unset($attr[$def_i]);
                continue;
            }

            if ($t = $attr_types->get($def)) {
                $attr[$def_i] = $t;
                $attr[$def_i]->required = $required;
            } else {
                unset($attr[$def_i]);
            }
        }

    }

}
/**
 * Base class for all validating attribute definitions.
 *
 * This family of classes forms the core for not only HTML attribute validation,
 * but also any sort of string that needs to be validated or cleaned (which
 * means CSS properties and composite definitions are defined here too).
 * Besides defining (through code) what precisely makes the string valid,
 * subclasses are also responsible for cleaning the code if possible.
 */
abstract class HTMLPurifier_AttrDef
{

    /**
     * Tells us whether or not an HTML attribute is minimized. Has no
     * meaning in other contexts.
     */
    public $minimized = false;

    /**
     * Tells us whether or not an HTML attribute is required. Has no
     * meaning in other contexts
     */
    public $required = false;

    /**
     * Validates and cleans passed string according to a definition.
     *
     * @param $string String to be validated and cleaned.
     * @param $config Mandatory HTMLPurifier_Config object.
     * @param $context Mandatory HTMLPurifier_AttrContext object.
     */
    abstract public function validate($string, $config, $context);

    /**
     * Convenience method that parses a string as if it were CDATA.
     *
     * This method trims the string and then replaces every remaining line
     * feed, tab and carriage return with a single space. It is modelled on
     * <http://www.w3.org/TR/html4/types.html#h-6.2>. While most useful for
     * HTML attributes specified as CDATA, it can also be applied to most CSS
     * values.
     *
     * @note This method is not entirely standards compliant, as trim() removes
     *       more types of whitespace than specified in the spec. In practice,
     *       this is rarely a problem, as those extra characters usually have
     *       already been removed by HTMLPurifier_Encoder.
     *
     * @warning This processing is inconsistent with XML's whitespace handling
     *          as specified by section 3.3.3 and referenced XHTML 1.0 section
     *          4.7.  However, note that we are NOT necessarily
     *          parsing XML, thus, this behavior may still be correct. We
     *          assume that newlines have been normalized.
     *
     * @param $string String to normalize
     * @return Normalized string
     */
    public function parseCDATA($string) {
        $string = trim($string);
        $string = str_replace(array("\n", "\t", "\r"), ' ', $string);
        return $string;
    }

    /**
     * Factory method for creating this class from a string.
     * @param $string String construction info
     * @return Created AttrDef object corresponding to $string
     */
    public function make($string) {
        // default implementation, return a flyweight of this object.
        // If $string has an effect on the returned object (i.e. you
        // need to overload this method), it is best
        // to clone or instantiate new copies. (Instantiation is safer.)
        return $this;
    }

    /**
     * Removes spaces from rgb(0, 0, 0) so that shorthand CSS properties work
     * properly. THIS IS A HACK!
     * @param $string CSS text possibly containing rgb() triples
     * @return The text with whitespace around the commas of each rgb() removed
     */
    protected function mungeRgb($string) {
        return preg_replace('/rgb\((\d+)\s*,\s*(\d+)\s*,\s*(\d+)\)/', 'rgb(\1,\2,\3)', $string);
    }

    /**
     * Parses a possibly escaped CSS string and returns the "pure"
     * version of it.
     * @param $string CSS string that may contain backslash escapes
     * @return String with the escapes resolved
     */
    protected function expandCSSEscape($string) {
        // flexibly parse it
        $ret = '';
        for ($i = 0, $c = strlen($string); $i < $c; $i++) {
            if ($string[$i] === '\\') {
                $i++;
                if ($i >= $c) {
                    // lone trailing backslash: keep it literally
                    $ret .= '\\';
                    break;
                }
                if (ctype_xdigit($string[$i])) {
                    // hexadecimal escape: consume up to six hex digits
                    $code = $string[$i];
                    for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
                        if (!ctype_xdigit($string[$i])) break;
                        $code .= $string[$i];
                    }
                    // We have to be extremely careful when adding
                    // new characters, to make sure we're not breaking
                    // the encoding.
                    $char = HTMLPurifier_Encoder::unichr(hexdec($code));
                    if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
                    $ret .= $char;
                    // a single whitespace character after the escape is
                    // swallowed; anything else must be re-read by the loop
                    if ($i < $c && trim($string[$i]) !== '') $i--;
                    continue;
                }
                // backslash-newline is a line continuation: drop both
                if ($string[$i] === "\n") continue;
            }
            $ret .= $string[$i];
        }
        return $ret;
    }

}
/**
 * Processes an entire attribute array for corrections needing multiple values.
 *
 * Occasionally, a certain attribute will need to be removed and popped onto
 * another value.  Instead of creating a complex return syntax for
 * HTMLPurifier_AttrDef, we just pass the whole attribute array to a
 * specialized object and have that do the special work.  That is the
 * family of HTMLPurifier_AttrTransform.
 *
 * An attribute transformation can be assigned to run before or after
 * HTMLPurifier_AttrDef validation.  See HTMLPurifier_HTMLDefinition for
 * more details.
 */
abstract class HTMLPurifier_AttrTransform
{

    /**
     * Abstract: makes changes to the attributes dependent on multiple values.
     *
     * @param $attr Assoc array of attributes, usually from
     *              HTMLPurifier_Token_Tag::$attr
     * @param $config Mandatory HTMLPurifier_Config object.
     * @param $context Mandatory HTMLPurifier_Context object
     * @returns Processed attribute array.
     */
    abstract public function transform($attr, $config, $context);

    /**
     * Prepends CSS properties to the style attribute, creating the
     * attribute if it doesn't exist.
     * @param $attr Attribute array to process (passed by reference)
     * @param $css CSS to prepend
     */
    public function prependCSS(&$attr, $css) {
        $attr['style'] = isset($attr['style']) ? $attr['style'] : '';
        $attr['style'] = $css . $attr['style'];
    }

    /**
     * Retrieves and removes an attribute
     * @param $attr Attribute array to process (passed by reference)
     * @param $key Key of attribute to confiscate
     * @return The removed value, or null if the attribute was not set
     */
    public function confiscateAttr(&$attr, $key) {
        if (!isset($attr[$key])) return null;
        $value = $attr[$key];
        unset($attr[$key]);
        return $value;
    }

}
/**
 * Provides lookup array of attribute types to HTMLPurifier_AttrDef objects
 */
class HTMLPurifier_AttrTypes
{
    /**
     * Lookup array of attribute string identifiers to concrete implementations
     */
    protected $info = array();

    /**
     * Constructs the info array, supplying default implementations for attribute
     * types.
     */
    public function __construct() {
        // pseudo-types, must be instantiated via shorthand
        $this->info['Enum']    = new HTMLPurifier_AttrDef_Enum();
        $this->info['Bool']    = new HTMLPurifier_AttrDef_HTML_Bool();

        $this->info['CDATA']    = new HTMLPurifier_AttrDef_Text();
        $this->info['ID']       = new HTMLPurifier_AttrDef_HTML_ID();
        $this->info['Length']   = new HTMLPurifier_AttrDef_HTML_Length();
        $this->info['MultiLength'] = new HTMLPurifier_AttrDef_HTML_MultiLength();
        $this->info['NMTOKENS'] = new HTMLPurifier_AttrDef_HTML_Nmtokens();
        $this->info['Pixels']   = new HTMLPurifier_AttrDef_HTML_Pixels();
        $this->info['Text']     = new HTMLPurifier_AttrDef_Text();
        $this->info['URI']      = new HTMLPurifier_AttrDef_URI();
        $this->info['LanguageCode'] = new HTMLPurifier_AttrDef_Lang();
        $this->info['Color']    = new HTMLPurifier_AttrDef_HTML_Color();

        // unimplemented aliases
        $this->info['ContentType'] = new HTMLPurifier_AttrDef_Text();
        $this->info['ContentTypes'] = new HTMLPurifier_AttrDef_Text();
        $this->info['Charsets'] = new HTMLPurifier_AttrDef_Text();
        $this->info['Character'] = new HTMLPurifier_AttrDef_Text();

        // "proprietary" types
        $this->info['Class'] = new HTMLPurifier_AttrDef_HTML_Class();

        // number is really a positive integer (one or more digits)
        // FIXME: ^^ not always, see start and value of list items
        $this->info['Number']   = new HTMLPurifier_AttrDef_Integer(false, false, true);
    }

    /**
     * Retrieves a type
     * @param $type String type name, optionally followed by '#' and extra
     *              construction info that is handed to the definition's make()
     * @return Object AttrDef for type, or null (after raising an
     *         E_USER_ERROR) when the type is not registered
     */
    public function get($type) {

        // determine if there is any extra info tacked on; default to the
        // empty string so make() never receives an undefined variable
        $string = '';
        if (strpos($type, '#') !== false) list($type, $string) = explode('#', $type, 2);

        if (!isset($this->info[$type])) {
            trigger_error('Cannot retrieve undefined attribute type ' . $type, E_USER_ERROR);
            // only reached when an error handler swallows the error
            return null;
        }

        return $this->info[$type]->make($string);

    }

    /**
     * Sets a new implementation for a type
     * @param $type String type name
     * @param $impl Object AttrDef for type
     */
    public function set($type, $impl) {
        $this->info[$type] = $impl;
    }
}
/**
 * Validates the attributes of a token. Doesn't manage required attributes
 * very well. The only reason we factored this out was because RemoveForeignElements
 * also needed it besides ValidateAttributes.
 */
class HTMLPurifier_AttrValidator
{

    /**
     * Validates the attributes of a token, returning a modified token
     * that has valid tokens
     * @param $token Reference to token to validate. We require a reference
     *     because the operation this class performs on the token are
     *     not atomic, so the context CurrentToken to be updated
     *     throughout
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return The untouched token when it is neither a start nor an empty
     *     tag; otherwise nothing, the token is modified through the reference.
     */
    public function validateToken(&$token, &$config, $context) {

        $definition = $config->getHTMLDefinition();
        $e =& $context->get('ErrorCollector', true);

        // initialize IDAccumulator if necessary
        $ok =& $context->get('IDAccumulator', true);
        if (!$ok) {
            $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
            $context->register('IDAccumulator', $id_accumulator);
        }

        // Only start and empty tags carry attributes. Bail out before
        // CurrentToken is registered so that the early return cannot leave a
        // stale registration behind in the context.
        if (
            !$token instanceof HTMLPurifier_Token_Start &&
            !$token instanceof HTMLPurifier_Token_Empty
        ) return $token;

        // initialize CurrentToken if necessary
        $current_token =& $context->get('CurrentToken', true);
        if (!$current_token) $context->register('CurrentToken', $token);

        // create alias to global definition array, see also $defs
        // DEFINITION CALL
        $d_defs = $definition->info_global_attr;

        // don't update token until the very end, to ensure an atomic update
        $attr = $token->attr;

        // do global transformations (pre)
        // nothing currently utilizes this
        foreach ($definition->info_attr_transform_pre as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        // do local transformations only applicable to this element (pre)
        // ex. <p align="right"> to <p style="text-align:right;">
        foreach ($definition->info[$token->name]->attr_transform_pre as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        // create alias to this element's attribute definition array, see
        // also $d_defs (global attribute definition array)
        // DEFINITION CALL
        $defs = $definition->info[$token->name]->attr;

        // registered by reference: the loop below reuses $attr_key so the
        // context always reflects the attribute being validated
        $attr_key = false;
        $context->register('CurrentAttr', $attr_key);

        // iterate through all the attribute keypairs
        // Watch out for name collisions: $key has previously been used
        foreach ($attr as $attr_key => $value) {

            // call the definition
            if ( isset($defs[$attr_key]) ) {
                // there is a local definition defined
                if ($defs[$attr_key] === false) {
                    // We've explicitly been told not to allow this element.
                    // This is usually when there's a global definition
                    // that must be overridden.
                    // Theoretically speaking, we could have a
                    // AttrDef_DenyAll, but this is faster!
                    $result = false;
                } else {
                    // validate according to the element's definition
                    $result = $defs[$attr_key]->validate(
                                    $value, $config, $context
                               );
                }
            } elseif ( isset($d_defs[$attr_key]) ) {
                // there is a global definition defined, validate according
                // to the global definition
                $result = $d_defs[$attr_key]->validate(
                                $value, $config, $context
                           );
            } else {
                // system never heard of the attribute? DELETE!
                $result = false;
            }

            // put the results into effect
            if ($result === false || $result === null) {
                // this is a generic error message that should replaced
                // with more specific ones when possible
                if ($e) $e->send(E_ERROR, 'AttrValidator: Attribute removed');

                // remove the attribute
                unset($attr[$attr_key]);
            } elseif (is_string($result)) {
                // generally, if a substitution is happening, there
                // was some sort of implicit correction going on. We'll
                // delegate it to the attribute classes to say exactly what.

                // simple substitution
                $attr[$attr_key] = $result;
            } else {
                // any other result leaves the attribute untouched
            }

            // we'd also want slightly more complicated substitution
            // involving an array as the return value,
            // although we're not sure how colliding attributes would
            // resolve (certain ones would be completely overriden,
            // others would prepend themselves).
        }

        $context->destroy('CurrentAttr');

        // post transforms

        // global (error reporting untested)
        foreach ($definition->info_attr_transform_post as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        // local (error reporting untested)
        foreach ($definition->info[$token->name]->attr_transform_post as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        $token->attr = $attr;

        // destroy CurrentToken if we made it ourselves
        if (!$current_token) $context->destroy('CurrentToken');

    }

}
// constants are slow, so we use as few as possible
if (!defined('HTMLPURIFIER_PREFIX')) {
    // default to the "standalone" directory next to this file and put it
    // at the front of the include path
    define('HTMLPURIFIER_PREFIX', dirname(__FILE__) . '/standalone');
    set_include_path(HTMLPURIFIER_PREFIX . PATH_SEPARATOR . get_include_path());
}

// accommodations for versions earlier than 5.0.2
// borrowed from PHP_Compat, LGPL licensed, by Aidan Lister <aidan@php.net>
if (!defined('PHP_EOL')) {
    // pick the line ending from the first three letters of the OS name
    switch (strtoupper(substr(PHP_OS, 0, 3))) {
        case 'WIN':
            define('PHP_EOL', "\r\n");
            break;
        case 'DAR':
            define('PHP_EOL', "\r");
            break;
        default:
            define('PHP_EOL', "\n");
    }
}
/**
 * Bootstrap class that contains meta-functionality for HTML Purifier such as
 * the autoload function.
 *
 * @note
 *      This class may be used without any other files from HTML Purifier.
 */
class HTMLPurifier_Bootstrap
{

    /**
     * Autoload function for HTML Purifier
     * @param $class Class to load
     * @return True if a file was included, false if the class does not map
     *         to an existing file under HTMLPURIFIER_PREFIX
     */
    public static function autoload($class) {
        $file = HTMLPurifier_Bootstrap::getPath($class);
        if (!$file) return false;
        // Technically speaking, it should be ok and more efficient to
        // just do 'require', but Antonio Parraga reports that with
        // Zend extensions such as Zend debugger and APC, this invariant
        // may be broken.  Since we have efficient alternatives, pay
        // the cost here and avoid the bug.
        require_once HTMLPURIFIER_PREFIX . '/' . $file;
        return true;
    }

    /**
     * Returns the path for a specific class.
     * @param $class Class name to resolve
     * @return Path relative to HTMLPURIFIER_PREFIX, or false when the name
     *         does not start with "HTMLPurifier" or the file does not exist
     */
    public static function getPath($class) {
        if (strncmp('HTMLPurifier', $class, 12) !== 0) return false;
        // Custom implementations
        if (strncmp('HTMLPurifier_Language_', $class, 22) === 0) {
            // language classes live in one directory and use dashes
            $code = str_replace('_', '-', substr($class, 22));
            $file = 'HTMLPurifier/Language/classes/' . $code . '.php';
        } else {
            $file = str_replace('_', '/', $class) . '.php';
        }
        if (!file_exists(HTMLPURIFIER_PREFIX . '/' . $file)) return false;
        return $file;
    }

    /**
     * "Pre-registers" our autoloader on the SPL stack.
     *
     * Existing autoloaders are unregistered, ours is registered first, and
     * the previous ones are then registered again behind it.
     */
    public static function registerAutoload() {
        $autoload = array('HTMLPurifier_Bootstrap', 'autoload');
        if ( ($funcs = spl_autoload_functions()) === false ) {
            spl_autoload_register($autoload);
        } elseif (function_exists('spl_autoload_unregister')) {
            $buggy  = version_compare(PHP_VERSION, '5.2.11', '<');
            $compat = version_compare(PHP_VERSION, '5.1.2', '<=') &&
                      version_compare(PHP_VERSION, '5.1.0', '>=');
            foreach ($funcs as $func) {
                if ($buggy && is_array($func)) {
                    // :TRICKY: There are some compatibility issues and some
                    // places where we need to error out
                    $reflector = new ReflectionMethod($func[0], $func[1]);
                    if (!$reflector->isStatic()) {
                        throw new Exception('
                            HTML Purifier autoloader registrar is not compatible
                            with non-static object methods due to PHP Bug #44144;
                            Please do not use HTMLPurifier.autoload.php (or any
                            file that includes this file); instead, place the code:
                            spl_autoload_register(array(\'HTMLPurifier_Bootstrap\', \'autoload\'))
                            after your own autoloaders.
                        ');
                    }
                    // Suprisingly, spl_autoload_register supports the
                    // Class::staticMethod callback format, although call_user_func doesn't
                    if ($compat) $func = implode('::', $func);
                }
                spl_autoload_unregister($func);
            }
            spl_autoload_register($autoload);
            foreach ($funcs as $func) spl_autoload_register($func);
        }
    }

}
/**
 * Super-class for definition datatype objects, implements serialization
 * functions for the class.
 */
abstract class HTMLPurifier_Definition
{

    /**
     * Has setup() been called yet?
     */
    public $setup = false;

    /**
     * If true, write out the final definition object to the cache after
     * setup.  This will be true only if all invocations to get a raw
     * definition object are also optimized.  This does not cause file
     * system thrashing because on subsequent calls the cached object
     * is used and any writes to the raw definition object are short
     * circuited.  See enduser-customize.html for the high-level
     * picture.
     */
    public $optimized = null;

    /**
     * What type of definition is it?
     */
    public $type;

    /**
     * Sets up the definition object into the final form, something
     * not done by the constructor
     * @param $config HTMLPurifier_Config instance
     */
    abstract protected function doSetup($config);

    /**
     * Setup function that aborts if already setup
     * @param $config HTMLPurifier_Config instance
     */
    public function setup($config) {
        if ($this->setup) return;
        // flag first so that doSetup() runs at most once per object
        $this->setup = true;
        $this->doSetup($config);
    }

}
970 * Defines allowed CSS attributes and what their values are.
971 * @see HTMLPurifier_HTMLDefinition
973 class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
976 public $type = 'CSS';
979 * Assoc array of attribute name to definition object.
981 public $info = array();
984 * Constructs the info array. The meat of this class.
986 protected function doSetup($config) {
988 $this->info['text-align'] = new HTMLPurifier_AttrDef_Enum(
989 array('left', 'right', 'center', 'justify'), false);
992 $this->info['border-bottom-style'] =
993 $this->info['border-right-style'] =
994 $this->info['border-left-style'] =
995 $this->info['border-top-style'] = new HTMLPurifier_AttrDef_Enum(
996 array('none', 'hidden', 'dotted', 'dashed', 'solid', 'double',
997 'groove', 'ridge', 'inset', 'outset'), false);
999 $this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_style);
1001 $this->info['clear'] = new HTMLPurifier_AttrDef_Enum(
1002 array('none', 'left', 'right', 'both'), false);
1003 $this->info['float'] = new HTMLPurifier_AttrDef_Enum(
1004 array('none', 'left', 'right'), false);
1005 $this->info['font-style'] = new HTMLPurifier_AttrDef_Enum(
1006 array('normal', 'italic', 'oblique'), false);
1007 $this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum(
1008 array('normal', 'small-caps'), false);
1010 $uri_or_none = new HTMLPurifier_AttrDef_CSS_Composite(
1012 new HTMLPurifier_AttrDef_Enum(array('none')),
1013 new HTMLPurifier_AttrDef_CSS_URI()
1017 $this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum(
1018 array('inside', 'outside'), false);
1019 $this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum(
1020 array('disc', 'circle', 'square', 'decimal', 'lower-roman',
1021 'upper-roman', 'lower-alpha', 'upper-alpha', 'none'), false);
1022 $this->info['list-style-image'] = $uri_or_none;
1024 $this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config);
1026 $this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum(
1027 array('capitalize', 'uppercase', 'lowercase', 'none'), false);
1028 $this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color();
1030 $this->info['background-image'] = $uri_or_none;
1031 $this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum(
1032 array('repeat', 'repeat-x', 'repeat-y', 'no-repeat')
1034 $this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum(
1035 array('scroll', 'fixed')
1037 $this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition();
1040 $this->info['border-top-color'] =
1041 $this->info['border-bottom-color'] =
1042 $this->info['border-left-color'] =
1043 $this->info['border-right-color'] =
1044 $this->info['background-color'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1045 new HTMLPurifier_AttrDef_Enum(array('transparent')),
1046 new HTMLPurifier_AttrDef_CSS_Color()
1049 $this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config);
1051 $this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_color);
1054 $this->info['border-top-width'] =
1055 $this->info['border-bottom-width'] =
1056 $this->info['border-left-width'] =
1057 $this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1058 new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')),
1059 new HTMLPurifier_AttrDef_CSS_Length('0') //disallow negative
1062 $this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_width);
1064 $this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1065 new HTMLPurifier_AttrDef_Enum(array('normal')),
1066 new HTMLPurifier_AttrDef_CSS_Length()
1069 $this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1070 new HTMLPurifier_AttrDef_Enum(array('normal')),
1071 new HTMLPurifier_AttrDef_CSS_Length()
1074 $this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1075 new HTMLPurifier_AttrDef_Enum(array('xx-small', 'x-small',
1076 'small', 'medium', 'large', 'x-large', 'xx-large',
1077 'larger', 'smaller')),
1078 new HTMLPurifier_AttrDef_CSS_Percentage(),
1079 new HTMLPurifier_AttrDef_CSS_Length()
1082 $this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1083 new HTMLPurifier_AttrDef_Enum(array('normal')),
1084 new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives
1085 new HTMLPurifier_AttrDef_CSS_Length('0'),
1086 new HTMLPurifier_AttrDef_CSS_Percentage(true)
1090 $this->info['margin-top'] =
1091 $this->info['margin-bottom'] =
1092 $this->info['margin-left'] =
1093 $this->info['margin-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1094 new HTMLPurifier_AttrDef_CSS_Length(),
1095 new HTMLPurifier_AttrDef_CSS_Percentage(),
1096 new HTMLPurifier_AttrDef_Enum(array('auto'))
1099 $this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($margin);
1103 $this->info['padding-top'] =
1104 $this->info['padding-bottom'] =
1105 $this->info['padding-left'] =
1106 $this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1107 new HTMLPurifier_AttrDef_CSS_Length('0'),
1108 new HTMLPurifier_AttrDef_CSS_Percentage(true)
1111 $this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($padding);
1113 $this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1114 new HTMLPurifier_AttrDef_CSS_Length(),
1115 new HTMLPurifier_AttrDef_CSS_Percentage()
1118 $trusted_wh = new HTMLPurifier_AttrDef_CSS_Composite(array(
1119 new HTMLPurifier_AttrDef_CSS_Length('0'),
1120 new HTMLPurifier_AttrDef_CSS_Percentage(true),
1121 new HTMLPurifier_AttrDef_Enum(array('auto'))
1123 $max = $config->get('CSS.MaxImgLength');
1125 $this->info['width'] =
1126 $this->info['height'] =
1129 new HTMLPurifier_AttrDef_Switch('img',
1131 new HTMLPurifier_AttrDef_CSS_Composite(array(
1132 new HTMLPurifier_AttrDef_CSS_Length('0', $max),
1133 new HTMLPurifier_AttrDef_Enum(array('auto'))
1135 // For everyone else:
1139 $this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();
1141 $this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily();
1143 // this could use specialized code
1144 $this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum(
1145 array('normal', 'bold', 'bolder', 'lighter', '100', '200', '300',
1146 '400', '500', '600', '700', '800', '900'), false);
1148 // MUST be called after other font properties, as it references
1149 // a CSSDefinition object
1150 $this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config);
1153 $this->info['border'] =
1154 $this->info['border-bottom'] =
1155 $this->info['border-top'] =
1156 $this->info['border-left'] =
1157 $this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config);
1159 $this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(array(
1160 'collapse', 'separate'));
1162 $this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(array(
1165 $this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(array(
1168 $this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
1169 new HTMLPurifier_AttrDef_Enum(array('baseline', 'sub', 'super',
1170 'top', 'text-top', 'middle', 'bottom', 'text-bottom')),
1171 new HTMLPurifier_AttrDef_CSS_Length(),
1172 new HTMLPurifier_AttrDef_CSS_Percentage()
1175 $this->info['border-spacing'] = new HTMLPurifier_AttrDef_CSS_Multiple(new HTMLPurifier_AttrDef_CSS_Length(), 2);
1178 $this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(array('nowrap'));
1180 if ($config->get('CSS.Proprietary')) {
1181 $this->doSetupProprietary($config);
1184 if ($config->get('CSS.AllowTricky')) {
1185 $this->doSetupTricky($config);
1188 if ($config->get('CSS.Trusted')) {
1189 $this->doSetupTrusted($config);
1192 $allow_important = $config->get('CSS.AllowImportant');
1193 // wrap all attr-defs with decorator that handles !important
1194 foreach ($this->info as $k => $v) {
1195 $this->info[$k] = new HTMLPurifier_AttrDef_CSS_ImportantDecorator($v, $allow_important);
1198 $this->setupConfigStuff($config);
/**
 * Registers the additional properties that doSetup() only enables when
 * the CSS.Proprietary directive is on.
 * @param $config HTMLPurifier_Config object (not read by this method)
 */
protected function doSetupProprietary($config) {
    // Internet Explorer only scrollbar colors; every property receives
    // its own validator instance
    $scrollbar_colors = array(
        'scrollbar-arrow-color',
        'scrollbar-base-color',
        'scrollbar-darkshadow-color',
        'scrollbar-face-color',
        'scrollbar-highlight-color',
        'scrollbar-shadow-color',
    );
    foreach ($scrollbar_colors as $property) {
        $this->info[$property] = new HTMLPurifier_AttrDef_CSS_Color();
    }

    // technically not proprietary, but CSS3, and no one supports it
    $opacity_variants = array('opacity', '-moz-opacity', '-khtml-opacity');
    foreach ($opacity_variants as $property) {
        $this->info[$property] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
    }

    // only opacity, for now
    $this->info['filter'] = new HTMLPurifier_AttrDef_CSS_Filter();
}
/**
 * Registers the keyword-only properties that doSetup() enables when the
 * CSS.AllowTricky directive is on.
 * @param $config HTMLPurifier_Config object (not read by this method)
 */
protected function doSetupTricky($config) {
    // property name => list of accepted keywords
    $keywords = array(
        'display' => array(
            'inline', 'block', 'list-item', 'run-in', 'compact',
            'marker', 'table', 'inline-table', 'table-row-group',
            'table-header-group', 'table-footer-group', 'table-row',
            'table-column-group', 'table-column', 'table-cell', 'table-caption', 'none'
        ),
        'visibility' => array('visible', 'hidden', 'collapse'),
        'overflow' => array('visible', 'hidden', 'auto', 'scroll'),
    );
    foreach ($keywords as $property => $values) {
        $this->info[$property] = new HTMLPurifier_AttrDef_Enum($values);
    }
}
/**
 * Registers the positioning properties that doSetup() enables when the
 * CSS.Trusted directive is on.
 * @param $config HTMLPurifier_Config object (not read by this method)
 */
protected function doSetupTrusted($config) {
    $this->info['position'] = new HTMLPurifier_AttrDef_Enum(
        array('static', 'relative', 'absolute', 'fixed')
    );

    // A single validator instance is shared by all four offsets. The keys
    // are written right-to-left, which is the order a chained assignment
    // (top = left = right = bottom = ...) inserts them.
    $offset = new HTMLPurifier_AttrDef_CSS_Composite(array(
        new HTMLPurifier_AttrDef_CSS_Length(),
        new HTMLPurifier_AttrDef_CSS_Percentage(),
        new HTMLPurifier_AttrDef_Enum(array('auto')),
    ));
    $this->info['bottom'] = $offset;
    $this->info['right']  = $offset;
    $this->info['left']   = $offset;
    $this->info['top']    = $offset;

    $stacking = new HTMLPurifier_AttrDef_CSS_Composite(array(
        new HTMLPurifier_AttrDef_Integer(),
        new HTMLPurifier_AttrDef_Enum(array('auto')),
    ));
    $this->info['z-index'] = $stacking;
}
1252 * Performs extra config-based processing. Based off of
1253 * HTMLPurifier_HTMLDefinition.
1254 * @todo Refactor duplicate elements into common class (probably using
1255 * composition, not inheritance).
1257 protected function setupConfigStuff($config) {
1259 // setup allowed elements
1260 $support = "(for information on implementing this, see the ".
1262 $allowed_properties = $config->get('CSS.AllowedProperties');
1263 if ($allowed_properties !== null) {
1264 foreach ($this->info as $name => $d) {
1265 if(!isset($allowed_properties[$name])) unset($this->info[$name]);
1266 unset($allowed_properties[$name]);
1269 foreach ($allowed_properties as $name => $d) {
1270 // :TODO: Is this htmlspecialchars() call really necessary?
1271 $name = htmlspecialchars($name);
1272 trigger_error("Style attribute '$name' is not supported $support", E_USER_WARNING);
1276 $forbidden_properties = $config->get('CSS.ForbiddenProperties');
1277 if ($forbidden_properties !== null) {
1278 foreach ($this->info as $name => $d) {
1279 if (isset($forbidden_properties[$name])) {
1280 unset($this->info[$name]);
1293 * Defines allowed child nodes and validates tokens against it.
1295 abstract class HTMLPurifier_ChildDef
1298 * Type of child definition, usually right-most part of class name lowercase.
1299 * Used occasionally in terms of context.
1304 * Bool that indicates whether or not an empty array of children is okay
1306 * This is necessary for redundant checking when changes affecting
1307 * a child node may cause a parent node to now be disallowed.
1309 public $allow_empty;
1312 * Lookup array of all elements that this definition could possibly allow
1314 public $elements = array();
/**
 * Get lookup of tag names that should not close this element
 * automatically. All other elements will do so.
 *
 * This base implementation hands back the $elements lookup unchanged.
 * @param $config HTMLPurifier_Config object (not read here)
 * @return Lookup array stored in $this->elements
 */
public function getAllowedElements($config) {
    $lookup = $this->elements;
    return $lookup;
}
1325 * Validates nodes according to definition and returns modification.
1327 * @param $tokens_of_children Array of HTMLPurifier_Token
1328 * @param $config HTMLPurifier_Config object
1329 * @param $context HTMLPurifier_Context object
1330 * @return bool true to leave nodes as is
1331 * @return bool false to remove parent node
1332 * @return array of replacement child tokens
1334 abstract public function validateChildren($tokens_of_children, $config, $context);
1342 * Configuration object that triggers customizable behavior.
1344 * @warning This class is strongly defined: that means that the class
1345 * will fail if an undefined directive is retrieved or set.
1347 * @note Many classes that could (although many times don't) use the
1348 * configuration object make it a mandatory parameter. This is
1349 * because a configuration object should always be forwarded,
1350 * otherwise, you run the risk of missing a parameter and then
1351 * being stumped when a configuration directive doesn't work.
1353 * @todo Reconsider some of the public member variables
1355 class HTMLPurifier_Config
1359 * HTML Purifier's version
1361 public $version = '4.3.0';
1364 * Bool indicator whether or not to automatically finalize
1365 * the object if a read operation is done
1367 public $autoFinalize = true;
1369 // protected member variables
1372 * Namespace indexed array of serials for specific namespaces (see
1373 * getSerial() for more info).
1375 protected $serials = array();
1378 * Serial for entire configuration object
1383 * Parser for variables
1388 * Reference HTMLPurifier_ConfigSchema for value checking
1389 * @note This is public for introspective purposes. Please don't
1395 * Indexed array of definitions
1397 protected $definitions;
1400 * Bool indicator whether or not config is finalized
1402 protected $finalized = false;
1405 * Property list containing configuration directives.
1410 * Whether or not a set is taking place due to an
1416 * Set to false if you do not want line and file numbers in errors
1417 * (useful when unit testing). This will also compress some errors
1420 public $chatty = true;
1423 * Current lock; only gets to this namespace are allowed.
/**
 * @param $definition HTMLPurifier_ConfigSchema that defines which
 *                    directives may be read or set on this object.
 * @param $parent Optional property list to inherit values from; when it
 *                is omitted (or otherwise falsy) the schema's
 *                defaultPlist is used as the parent.
 */
public function __construct($definition, $parent = null) {
    if (!$parent) {
        $parent = $definition->defaultPlist;
    }
    $this->plist  = new HTMLPurifier_PropertyList($parent);
    $this->def    = $definition; // keep a copy around for checking
    $this->parser = new HTMLPurifier_VarParser_Flexible();
}
1439 * Convenience constructor that creates a config object based on a mixed var
1440 * @param mixed $config Variable that defines the state of the config
1441 * object. Can be: a HTMLPurifier_Config() object,
1442 * an array of directives based on loadArray(),
1443 * or a string filename of an ini file.
1444 * @param HTMLPurifier_ConfigSchema Schema object
1445 * @return Configured HTMLPurifier_Config object
1447 public static function create($config, $schema = null) {
1448 if ($config instanceof HTMLPurifier_Config) {
1453 $ret = HTMLPurifier_Config::createDefault();
1455 $ret = new HTMLPurifier_Config($schema);
1457 if (is_string($config)) $ret->loadIni($config);
1458 elseif (is_array($config)) $ret->loadArray($config);
/**
 * Creates a new config object that inherits from a previous one: the new
 * object reuses the given object's schema and takes its property list
 * as parent.
 * @param HTMLPurifier_Config $config Configuration object to inherit
 *        from.
 * @return HTMLPurifier_Config object with $config as its parent.
 */
public static function inherit(HTMLPurifier_Config $config) {
    $schema       = $config->def;
    $parent_plist = $config->plist;
    return new HTMLPurifier_Config($schema, $parent_plist);
}
1473 * Convenience constructor that creates a default configuration object.
1474 * @return Default HTMLPurifier_Config object.
1476 public static function createDefault() {
1477 $definition = HTMLPurifier_ConfigSchema::instance();
1478 $config = new HTMLPurifier_Config($definition);
1483 * Retrieves a value from the configuration.
1484 * @param $key String key
1486 public function get($key, $a = null) {
1488 $this->triggerError("Using deprecated API: use \$config->get('$key.$a') instead", E_USER_WARNING);
1491 if (!$this->finalized) $this->autoFinalize();
1492 if (!isset($this->def->info[$key])) {
1493 // can't add % due to SimpleTest bug
1494 $this->triggerError('Cannot retrieve value of undefined directive ' . htmlspecialchars($key),
1498 if (isset($this->def->info[$key]->isAlias)) {
1499 $d = $this->def->info[$key];
1500 $this->triggerError('Cannot get value from aliased directive, use real name ' . $d->key,
1505 list($ns) = explode('.', $key);
1506 if ($ns !== $this->lock) {
1507 $this->triggerError('Cannot get value of namespace ' . $ns . ' when lock for ' . $this->lock . ' is active, this probably indicates a Definition setup method is accessing directives that are not within its namespace', E_USER_ERROR);
1511 return $this->plist->get($key);
1515 * Retrieves an array of directives to values from a given namespace
1516 * @param $namespace String namespace
1518 public function getBatch($namespace) {
1519 if (!$this->finalized) $this->autoFinalize();
1520 $full = $this->getAll();
1521 if (!isset($full[$namespace])) {
1522 $this->triggerError('Cannot retrieve undefined namespace ' . htmlspecialchars($namespace),
1526 return $full[$namespace];
/**
 * Returns a md5 signature of a segment of the configuration object
 * that uniquely identifies that particular configuration. The value is
 * memoized per namespace in $this->serials.
 * @note Revision is handled specially and is removed from the batch
 *       before processing!
 * @param $namespace Namespace to get serial for
 * @return String md5 hash of the serialized batch
 */
public function getBatchSerial($namespace) {
    // a previously computed, non-empty signature is reused as is
    if (!empty($this->serials[$namespace])) {
        return $this->serials[$namespace];
    }
    $directives = $this->getBatch($namespace);
    unset($directives['DefinitionRev']);
    $signature = md5(serialize($directives));
    $this->serials[$namespace] = $signature;
    return $signature;
}
/**
 * Returns a md5 signature for the entire configuration object
 * that uniquely identifies that particular configuration. The value is
 * computed from getAll() on first use and kept in $this->serial.
 * @return String md5 hash
 */
public function getSerial() {
    if (!empty($this->serial)) {
        return $this->serial;
    }
    $everything = $this->getAll();
    $this->serial = md5(serialize($everything));
    return $this->serial;
}
1557 * Retrieves all directives, organized by namespace
1558 * @warning This is a pretty inefficient function, avoid if you can
1560 public function getAll() {
1561 if (!$this->finalized) $this->autoFinalize();
1563 foreach ($this->plist->squash() as $name => $value) {
1564 list($ns, $key) = explode('.', $name, 2);
1565 $ret[$ns][$key] = $value;
1571 * Sets a value to configuration.
1572 * @param $key String key
1573 * @param $value Mixed value
1575 public function set($key, $value, $a = null) {
1576 if (strpos($key, '.') === false) {
1578 $directive = $value;
1580 $key = "$key.$directive";
1581 $this->triggerError("Using deprecated API: use \$config->set('$key', ...) instead", E_USER_NOTICE);
1583 list($namespace) = explode('.', $key);
1585 if ($this->isFinalized('Cannot set directive after finalization')) return;
1586 if (!isset($this->def->info[$key])) {
1587 $this->triggerError('Cannot set undefined directive ' . htmlspecialchars($key) . ' to value',
1591 $def = $this->def->info[$key];
1593 if (isset($def->isAlias)) {
1594 if ($this->aliasMode) {
1595 $this->triggerError('Double-aliases not allowed, please fix '.
1596 'ConfigSchema bug with' . $key, E_USER_ERROR);
1599 $this->aliasMode = true;
1600 $this->set($def->key, $value);
1601 $this->aliasMode = false;
1602 $this->triggerError("$key is an alias, preferred directive name is {$def->key}", E_USER_NOTICE);
1606 // Raw type might be negative when using the fully optimized form
1607 // of stdclass, which indicates allow_null == true
1608 $rtype = is_int($def) ? $def : $def->type;
1614 $allow_null = isset($def->allow_null);
1618 $value = $this->parser->parse($value, $type, $allow_null);
1619 } catch (HTMLPurifier_VarParserException $e) {
1620 $this->triggerError('Value for ' . $key . ' is of invalid type, should be ' . HTMLPurifier_VarParser::getTypeName($type), E_USER_WARNING);
1623 if (is_string($value) && is_object($def)) {
1624 // resolve value alias if defined
1625 if (isset($def->aliases[$value])) {
1626 $value = $def->aliases[$value];
1628 // check to see if the value is allowed
1629 if (isset($def->allowed) && !isset($def->allowed[$value])) {
1630 $this->triggerError('Value not supported, valid values are: ' .
1631 $this->_listify($def->allowed), E_USER_WARNING);
1635 $this->plist->set($key, $value);
1637 // reset definitions if the directives they depend on changed
1638 // this is a very costly process, so it's discouraged
1639 // with finalization
1640 if ($namespace == 'HTML' || $namespace == 'CSS' || $namespace == 'URI') {
1641 $this->definitions[$namespace] = null;
1644 $this->serials[$namespace] = false;
1648 * Convenience function for error reporting
1650 private function _listify($lookup) {
1652 foreach ($lookup as $name => $b) $list[] = $name;
1653 return implode(', ', $list);
/**
 * Retrieves object reference to the HTML definition; shorthand for
 * getDefinition() with the type fixed to 'HTML'.
 * @param $raw Return a copy that has not been setup yet. Must be
 *             called before it's been setup, otherwise won't work.
 * @param $optimized If true, this method may return null, to
 *             indicate that a cached version of the modified
 *             definition object is available and no further edits
 *             are necessary. Consider using
 *             maybeGetRawHTMLDefinition, whose name makes the
 *             possible null result explicit.
 */
public function getHTMLDefinition($raw = false, $optimized = false) {
    $type = 'HTML';
    return $this->getDefinition($type, $raw, $optimized);
}
/**
 * Retrieves object reference to the CSS definition; shorthand for
 * getDefinition() with the type fixed to 'CSS'.
 * @param $raw Return a copy that has not been setup yet. Must be
 *             called before it's been setup, otherwise won't work.
 * @param $optimized If true, this method may return null, to
 *             indicate that a cached version of the modified
 *             definition object is available and no further edits
 *             are necessary. Consider using
 *             maybeGetRawCSSDefinition, whose name makes the
 *             possible null result explicit.
 */
public function getCSSDefinition($raw = false, $optimized = false) {
    $type = 'CSS';
    return $this->getDefinition($type, $raw, $optimized);
}
/**
 * Retrieves object reference to the URI definition; shorthand for
 * getDefinition() with the type fixed to 'URI'.
 * @param $raw Return a copy that has not been setup yet. Must be
 *             called before it's been setup, otherwise won't work.
 * @param $optimized If true, this method may return null, to
 *             indicate that a cached version of the modified
 *             definition object is available and no further edits
 *             are necessary. Consider using
 *             maybeGetRawURIDefinition, whose name makes the
 *             possible null result explicit.
 */
public function getURIDefinition($raw = false, $optimized = false) {
    $type = 'URI';
    return $this->getDefinition($type, $raw, $optimized);
}
1702 * Retrieves a definition
1703 * @param $type Type of definition: HTML, CSS, etc
1704 * @param $raw Whether or not definition should be returned raw
1705 * @param $optimized Only has an effect when $raw is true. Whether
1706 * or not to return null if the result is already present in
1707 * the cache. This is off by default for backwards
1708 * compatibility reasons, but you need to do things this
1709 * way in order to ensure that caching is done properly.
1710 * Check out enduser-customize.html for more details.
1711 * We probably won't ever change this default, as much as the
1712 * maybe semantics is the "right thing to do."
1714 public function getDefinition($type, $raw = false, $optimized = false) {
1715 if ($optimized && !$raw) {
1716 throw new HTMLPurifier_Exception("Cannot set optimized = true when raw = false");
1718 if (!$this->finalized) $this->autoFinalize();
1719 // temporarily suspend locks, so we can handle recursive definition calls
1720 $lock = $this->lock;
1722 $factory = HTMLPurifier_DefinitionCacheFactory::instance();
1723 $cache = $factory->create($type, $this);
1724 $this->lock = $lock;
1728 // check if definition is in memory
1729 if (!empty($this->definitions[$type])) {
1730 $def = $this->definitions[$type];
1731 // check if the definition is setup
1736 if ($def->optimized) $cache->add($def, $this);
1740 // check if definition is in cache
1741 $def = $cache->get($this);
1743 // definition in cache, save to memory and return it
1744 $this->definitions[$type] = $def;
1748 $def = $this->initDefinition($type);
1750 $this->lock = $type;
1754 $cache->add($def, $this);
1760 // check preconditions
1763 if (is_null($this->get($type . '.DefinitionID'))) {
1764 // fatally error out if definition ID not set
1765 throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
1768 if (!empty($this->definitions[$type])) {
1769 $def = $this->definitions[$type];
1770 if ($def->setup && !$optimized) {
1771 $extra = $this->chatty ? " (try moving this code block earlier in your initialization)" : "";
1772 throw new HTMLPurifier_Exception("Cannot retrieve raw definition after it has already been setup" . $extra);
1774 if ($def->optimized === null) {
1775 $extra = $this->chatty ? " (try flushing your cache)" : "";
1776 throw new HTMLPurifier_Exception("Optimization status of definition is unknown" . $extra);
1778 if ($def->optimized !== $optimized) {
1779 $msg = $optimized ? "optimized" : "unoptimized";
1780 $extra = $this->chatty ? " (this backtrace is for the first inconsistent call, which was for a $msg raw definition)" : "";
1781 throw new HTMLPurifier_Exception("Inconsistent use of optimized and unoptimized raw definition retrievals" . $extra);
1784 // check if definition was in memory
1787 // invariant: $optimized === true (checked above)
1793 // if optimized, check if definition was in cache
1794 // (because we do the memory check first, this formulation
1795 // is prone to cache slamming, but I think
1796 // guaranteeing that either /all/ of the raw
1797 // setup code or /none/ of it is run is more important.)
1799 // This code path only gets run once; once we put
1800 // something in $definitions (which is guaranteed by the
1801 // trailing code), we always short-circuit above.
1802 $def = $cache->get($this);
1804 // save the full definition for later, but don't
1806 $this->definitions[$type] = $def;
1810 // check invariants for creation
1812 if (!is_null($this->get($type . '.DefinitionID'))) {
1813 if ($this->chatty) {
1814 $this->triggerError("Due to a documentation error in previous version of HTML Purifier, your definitions are not being cached. If this is OK, you can remove the %$type.DefinitionRev and %$type.DefinitionID declaration. Otherwise, modify your code to use maybeGetRawDefinition, and test if the returned value is null before making any edits (if it is null, that means that a cached version is available, and no raw operations are necessary). See <a href='http://htmlpurifier.org/docs/enduser-customize.html#optimized'>Customize</a> for more details", E_USER_WARNING);
1816 $this->triggerError("Useless DefinitionID declaration", E_USER_WARNING);
1821 $def = $this->initDefinition($type);
1822 $def->optimized = $optimized;
1825 throw new HTMLPurifier_Exception("The impossible happened!");
1828 private function initDefinition($type) {
1829 // quick checks failed, let's create the object
1830 if ($type == 'HTML') {
1831 $def = new HTMLPurifier_HTMLDefinition();
1832 } elseif ($type == 'CSS') {
1833 $def = new HTMLPurifier_CSSDefinition();
1834 } elseif ($type == 'URI') {
1835 $def = new HTMLPurifier_URIDefinition();
1837 throw new HTMLPurifier_Exception("Definition of $type type not supported");
1839 $this->definitions[$type] = $def;
/**
 * Requests the raw, optimized definition of the given type, i.e.
 * getDefinition() with both $raw and $optimized set to true.
 * @param $name Type of definition: HTML, CSS, etc
 * @return Whatever getDefinition() returns for that request; per its
 *         documentation this may be null when a cached version exists.
 */
public function maybeGetRawDefinition($name) {
    $raw = true;
    $optimized = true;
    return $this->getDefinition($name, $raw, $optimized);
}
/**
 * Requests the raw, optimized HTML definition, i.e. getDefinition()
 * for type 'HTML' with both $raw and $optimized set to true.
 * @return Whatever getDefinition() returns for that request; per its
 *         documentation this may be null when a cached version exists.
 */
public function maybeGetRawHTMLDefinition() {
    $raw = true;
    $optimized = true;
    return $this->getDefinition('HTML', $raw, $optimized);
}
/**
 * Requests the raw, optimized CSS definition, i.e. getDefinition()
 * for type 'CSS' with both $raw and $optimized set to true.
 * @return Whatever getDefinition() returns for that request; per its
 *         documentation this may be null when a cached version exists.
 */
public function maybeGetRawCSSDefinition() {
    $raw = true;
    $optimized = true;
    return $this->getDefinition('CSS', $raw, $optimized);
}
/**
 * Requests the raw, optimized URI definition, i.e. getDefinition()
 * for type 'URI' with both $raw and $optimized set to true.
 * @return Whatever getDefinition() returns for that request; per its
 *         documentation this may be null when a cached version exists.
 */
public function maybeGetRawURIDefinition() {
    $raw = true;
    $optimized = true;
    return $this->getDefinition('URI', $raw, $optimized);
}
1860 * Loads configuration values from an array with the following structure:
1861 * Namespace.Directive => Value
1862 * @param $config_array Configuration associative array
1864 public function loadArray($config_array) {
1865 if ($this->isFinalized('Cannot load directives after finalization')) return;
1866 foreach ($config_array as $key => $value) {
1867 $key = str_replace('_', '.', $key);
1868 if (strpos($key, '.') !== false) {
1869 $this->set($key, $value);
1872 $namespace_values = $value;
1873 foreach ($namespace_values as $directive => $value) {
1874 $this->set($namespace .'.'. $directive, $value);
1881 * Returns a list of array(namespace, directive) for all directives
1882 * that are allowed in a web-form context as per an allowed
1883 * namespaces/directives list.
1884 * @param $allowed List of allowed namespaces/directives
1886 public static function getAllowedDirectivesForForm($allowed, $schema = null) {
1888 $schema = HTMLPurifier_ConfigSchema::instance();
1890 if ($allowed !== true) {
1891 if (is_string($allowed)) $allowed = array($allowed);
1892 $allowed_ns = array();
1893 $allowed_directives = array();
1894 $blacklisted_directives = array();
1895 foreach ($allowed as $ns_or_directive) {
1896 if (strpos($ns_or_directive, '.') !== false) {
1898 if ($ns_or_directive[0] == '-') {
1899 $blacklisted_directives[substr($ns_or_directive, 1)] = true;
1901 $allowed_directives[$ns_or_directive] = true;
1905 $allowed_ns[$ns_or_directive] = true;
1910 foreach ($schema->info as $key => $def) {
1911 list($ns, $directive) = explode('.', $key, 2);
1912 if ($allowed !== true) {
1913 if (isset($blacklisted_directives["$ns.$directive"])) continue;
1914 if (!isset($allowed_directives["$ns.$directive"]) && !isset($allowed_ns[$ns])) continue;
1916 if (isset($def->isAlias)) continue;
1917 if ($directive == 'DefinitionID' || $directive == 'DefinitionRev') continue;
1918 $ret[] = array($ns, $directive);
1924 * Loads configuration values from $_GET/$_POST that were posted
1926 * @param $array $_GET or $_POST array to import
1927 * @param $index Index/name that the config variables are in
1928 * @param $allowed List of allowed namespaces/directives
1929 * @param $mq_fix Boolean whether or not to enable magic quotes fix
1930 * @param $schema Instance of HTMLPurifier_ConfigSchema to use, if not global copy
1932 public static function loadArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
1933 $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $schema);
1934 $config = HTMLPurifier_Config::create($ret, $schema);
/**
 * Merges in configuration values from $_GET/$_POST to object. NOT STATIC.
 * The form data is filtered through prepareArrayFromForm() against this
 * object's own schema and the result is handed to loadArray().
 * @note Same parameters as loadArrayFromForm
 */
public function mergeArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true) {
    $directives = HTMLPurifier_Config::prepareArrayFromForm(
        $array,
        $index,
        $allowed,
        $mq_fix,
        $this->def
    );
    $this->loadArray($directives);
}
1948 * Prepares an array from a form into something usable for the more
1949 * strict parts of HTMLPurifier_Config
1951 public static function prepareArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
1952 if ($index !== false) $array = (isset($array[$index]) && is_array($array[$index])) ? $array[$index] : array();
1953 $mq = $mq_fix && function_exists('get_magic_quotes_gpc') && get_magic_quotes_gpc();
1955 $allowed = HTMLPurifier_Config::getAllowedDirectivesForForm($allowed, $schema);
1957 foreach ($allowed as $key) {
1958 list($ns, $directive) = $key;
1959 $skey = "$ns.$directive";
1960 if (!empty($array["Null_$skey"])) {
1961 $ret[$ns][$directive] = null;
1964 if (!isset($array[$skey])) continue;
1965 $value = $mq ? stripslashes($array[$skey]) : $array[$skey];
1966 $ret[$ns][$directive] = $value;
/**
 * Loads configuration values from an ini file. Does nothing (beyond the
 * error raised by isFinalized()) once the object is finalized.
 * @param $filename Name of ini file
 */
public function loadIni($filename) {
    if ($this->isFinalized('Cannot load directives after finalization')) return;
    $array = parse_ini_file($filename, true);
    if ($array === false) {
        // parse_ini_file() signals failure with false; handing that to
        // loadArray() would only produce an unrelated foreach warning,
        // so report the real problem and load nothing.
        $this->triggerError('Cannot load directives from ini file ' . htmlspecialchars($filename),
            E_USER_WARNING);
        return;
    }
    $this->loadArray($array);
}
/**
 * Checks whether or not the configuration object is finalized.
 * @param $error String error message, or false for no error. When a
 *               message is given and the object is finalized, it is
 *               raised through triggerError() as an E_USER_ERROR.
 * @return Current value of the finalized flag
 */
public function isFinalized($error = false) {
    $must_report = $this->finalized && $error;
    if ($must_report) {
        $this->triggerError($error, E_USER_ERROR);
    }
    return $this->finalized;
}
1993 * Finalizes configuration only if auto finalize is on and not
1996 public function autoFinalize() {
1997 if ($this->autoFinalize) {
2000 $this->plist->squash(true);
/**
 * Finalizes a configuration object, prohibiting further change.
 * Marks the object as finalized and discards the value parser.
 */
public function finalize() {
    unset($this->parser);
    $this->finalized = true;
}
2013 * Produces a nicely formatted error message by supplying the
2014 * stack frame information OUTSIDE of HTMLPurifier_Config.
2016 protected function triggerError($msg, $no) {
2017 // determine previous stack frame
2019 if ($this->chatty) {
2020 $trace = debug_backtrace();
2021 // zip(tail(trace), trace) -- but PHP is not Haskell har har
2022 for ($i = 0, $c = count($trace); $i < $c - 1; $i++) {
2023 if ($trace[$i + 1]['class'] === 'HTMLPurifier_Config') {
2026 $frame = $trace[$i];
2027 $extra = " invoked on line {$frame['line']} in file {$frame['file']}";
2031 trigger_error($msg . $extra, $no);
/**
 * Returns a serialized form of the configuration object. The HTML, CSS
 * and URI definitions are requested (in that order) before the object
 * is passed to PHP's serialize().
 * @return String produced by serialize($this)
 */
public function serialize() {
    $definition_types = array('HTML', 'CSS', 'URI');
    foreach ($definition_types as $type) {
        $this->getDefinition($type);
    }
    return serialize($this);
}
2052 * Configuration definition, defines directives and their defaults.
2054 class HTMLPurifier_ConfigSchema {
2057 * Defaults of the directives and namespaces.
2058 * @note This shares the exact same structure as HTMLPurifier_Config::$conf
2060 public $defaults = array();
2063 * The default property list. Do not edit this property list.
2065 public $defaultPlist;
2068 * Definition of the directives. The structure of this is:
2071 * 'Namespace' => array(
2072 * 'Directive' => new stdclass(),
2076 * The stdclass may have the following properties:
2078 * - If isAlias isn't set:
2079 * - type: Integer type of directive, see HTMLPurifier_VarParser for definitions
2080 * - allow_null: If set, this directive allows null values
2081 * - aliases: If set, an associative array of value aliases to real values
2082 * - allowed: If set, a lookup array of allowed (string) values
2083 * - If isAlias is set:
2084 * - namespace: Namespace this directive aliases to
2085 * - name: Directive name this directive aliases to
2087 * In certain degenerate cases, stdclass will actually be an integer. In
2088 * that case, the value is equivalent to an stdclass with the type
2089 * property set to the integer. If the integer is negative, type is
2090 * equal to the absolute value of integer, and allow_null is true.
2092 * This class is friendly with HTMLPurifier_Config. If you need introspection
2093 * about the schema, you're better off using the ConfigSchema_Interchange,
2094 * which uses more memory but has much richer information.
2096 public $info = array();
2099 * Application-wide singleton
2101 static protected $singleton;
/**
 * Starts the schema off with an empty default property list.
 */
public function __construct() {
    $empty_plist = new HTMLPurifier_PropertyList();
    $this->defaultPlist = $empty_plist;
}
2108 * Unserializes the default ConfigSchema.
2110 public static function makeFromSerial() {
2111 $contents = file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser');
2112 $r = unserialize($contents);
2114 $hash = sha1($contents);
2115 trigger_error("Unserialization of configuration schema failed, sha1 of file was $hash", E_USER_ERROR);
/**
 * Retrieves an instance of the application-wide configuration definition.
 * @param $prototype Controls which schema becomes the singleton:
 *        - null (default): reuse the current singleton, building it with
 *          makeFromSerial() on first use;
 *        - true: discard the current singleton and rebuild it with
 *          makeFromSerial();
 *        - anything else: install the given value as the singleton.
 * @return The singleton after the above has been applied
 */
public static function instance($prototype = null) {
    if ($prototype === true) {
        // Must be tested before the generic "not null" case: otherwise
        // the boolean itself would be stored as the schema and the
        // reload would never happen.
        HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial();
    } elseif ($prototype !== null) {
        HTMLPurifier_ConfigSchema::$singleton = $prototype;
    } elseif (HTMLPurifier_ConfigSchema::$singleton === null) {
        HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial();
    }
    return HTMLPurifier_ConfigSchema::$singleton;
}
/**
 * Defines a directive for configuration.
 * @warning This method's signature is slightly different from the legacy
 *          define() static method! Beware!
 * @param $key Identifier of the directive; used as the index into
 *        $info, $defaults and the default property list
 * @param $default Default value of directive
 * @param $type Allowed type of the directive, either the integer type
 *        itself or a name that is looked up in
 *        HTMLPurifier_VarParser::$types
 * @param $allow_null Whether or not to allow null values
 */
public function add($key, $default, $type, $allow_null) {
    $definition = new stdclass();
    if (is_int($type)) {
        $definition->type = $type;
    } else {
        // symbolic type name: translate to its integer constant
        $definition->type = HTMLPurifier_VarParser::$types[$type];
    }
    if ($allow_null) {
        // the property is only present when nulls are allowed
        $definition->allow_null = true;
    }

    $this->info[$key]     = $definition;
    $this->defaults[$key] = $default;
    $this->defaultPlist->set($key, $default);
}
/**
 * Defines a directive value alias.
 *
 * Directive value aliases are convenient for developers because it lets
 * them set a directive to several values and get the same result.
 * @param $key Identifier of the directive (index into $info)
 * @param $aliases Hash of aliased values to the real value
 */
public function addValueAliases($key, $aliases) {
    // start from whatever aliases are already registered, if any
    $merged = isset($this->info[$key]->aliases)
        ? $this->info[$key]->aliases
        : array();
    foreach ($aliases as $from => $to) {
        // later definitions override earlier ones for the same alias
        $merged[$from] = $to;
    }
    $this->info[$key]->aliases = $merged;
}
/**
 * Defines a set of allowed values for a directive, replacing any set
 * that was stored for it before.
 * @warning This is slightly different from the corresponding static
 *          method definition.
 * @param $key Identifier of the directive (index into $info)
 * @param $allowed Lookup array of allowed values
 */
public function addAllowedValues($key, $allowed) {
    $this->info[$key]->allowed = $allowed;
}
/**
 * Defines a directive alias for backwards compatibility.
 *
 * Stores a marker object under $key whose "key" property names the
 * directive being aliased and whose "isAlias" property is true.
 * @param $key Identifier of the directive that will be aliased
 * @param $new_key Identifier of the directive that the alias will be to
 */
public function addAlias($key, $new_key) {
    $alias = new stdclass;
    $alias->key     = $new_key;
    $alias->isAlias = true;
    $this->info[$key] = $alias;
}
/**
 * Replaces any stdclass that only has the type property with type integer.
 *
 * An entry that has exactly the two properties type and allow_null is
 * replaced by the negated type integer. All other entries are left alone.
 */
public function postProcess() {
    foreach ($this->info as $name => $directive) {
        $propertyCount = count((array) $directive);
        if ($propertyCount == 1) {
            // only "type" is present
            $this->info[$name] = $directive->type;
            continue;
        }
        if ($propertyCount == 2 && isset($directive->allow_null)) {
            // "type" plus "allow_null": encode nullability in the sign
            $this->info[$name] = -$directive->type;
        }
    }
}
class HTMLPurifier_ContentSets
{

    /**
     * List of content set strings (pipe separators) indexed by name.
     */
    public $info = array();

    /**
     * List of content set lookups (element => true) indexed by name.
     * @note This is in HTMLPurifier_HTMLDefinition->info_content_sets
     */
    public $lookup = array();

    /**
     * Synchronized list of defined content sets (keys of info)
     */
    protected $keys = array();

    /**
     * Synchronized list of defined content values (values of info)
     */
    protected $values = array();

    /**
     * Merges in module's content sets, expands identifiers in the content
     * sets and populates the keys, values and lookup member variables.
     * @param $modules List of HTMLPurifier_HTMLModule (a single module is
     *        also accepted and wrapped in an array)
     */
    public function __construct($modules) {
        if (!is_array($modules)) $modules = array($modules);
        // populate content_sets based on module hints
        // sorry, no way of overloading
        foreach ($modules as $module) {
            foreach ($module->content_sets as $key => $value) {
                $temp = $this->convertToLookup($value);
                if (isset($this->lookup[$key])) {
                    // add it into the existing content set
                    $this->lookup[$key] = array_merge($this->lookup[$key], $temp);
                } else {
                    $this->lookup[$key] = $temp;
                }
            }
        }
        // Expand references to other content sets until a full pass over
        // the lookup table no longer changes anything.
        $old_lookup = false;
        while ($old_lookup !== $this->lookup) {
            $old_lookup = $this->lookup;
            foreach ($this->lookup as $i => $set) {
                $add = array();
                foreach ($set as $element => $x) {
                    if (isset($this->lookup[$element])) {
                        // $element names another content set: splice in
                        // its members and drop the reference itself
                        $add += $this->lookup[$element];
                        unset($this->lookup[$i][$element]);
                    }
                }
                $this->lookup[$i] += $add;
            }
        }

        foreach ($this->lookup as $key => $lookup) {
            $this->info[$key] = implode(' | ', array_keys($lookup));
        }
        $this->keys   = array_keys($this->info);
        $this->values = array_values($this->info);
    }

    /**
     * Accepts a definition; generates and assigns a ChildDef for it
     * @param $def HTMLPurifier_ElementDef reference
     * @param $module Module that defined the ElementDef
     */
    public function generateChildDef(&$def, $module) {
        if (!empty($def->child)) return; // already done!
        $content_model = $def->content_model;
        // With no content sets defined there is nothing to expand; skipping
        // avoids building the pattern /\b()\b/, which matches empty strings
        // and would index $this->info with a non-existent '' key.
        if (is_string($content_model) && !empty($this->keys)) {
            $alternatives = array();
            foreach ($this->keys as $key) {
                // quote so that a key is always matched literally
                $alternatives[] = preg_quote($key, '/');
            }
            $def->content_model = preg_replace_callback(
                '/\b(' . implode('|', $alternatives) . ')\b/',
                array($this, 'generateChildDefCallback'),
                $content_model
            );
        }
        $def->child = $this->getChildDef($def, $module);
    }

    /**
     * Callback for generateChildDef(): maps a matched content set name to
     * its expanded pipe-separated string.
     * @param $matches Match array from preg_replace_callback
     * @return Expanded content set string
     */
    public function generateChildDefCallback($matches) {
        return $this->info[$matches[0]];
    }

    /**
     * Instantiates a ChildDef based on content_model and content_model_type
     * member variables in HTMLPurifier_ElementDef
     * @note This will also defer to modules for custom HTMLPurifier_ChildDef
     *       subclasses that need content set expansion
     * @param $def HTMLPurifier_ElementDef to have ChildDef extracted
     * @param $module Module that is asked for a ChildDef when the content
     *        model type is not one of the built-in ones
     * @return HTMLPurifier_ChildDef corresponding to ElementDef
     */
    public function getChildDef($def, $module) {
        $value = $def->content_model;
        if (is_object($value)) {
            trigger_error(
                'Literal object child definitions should be stored in '.
                'ElementDef->child not ElementDef->content_model',
                E_USER_NOTICE
            );
            return $value;
        }
        switch ($def->content_model_type) {
            case 'required':
                return new HTMLPurifier_ChildDef_Required($value);
            case 'optional':
                return new HTMLPurifier_ChildDef_Optional($value);
            case 'empty':
                return new HTMLPurifier_ChildDef_Empty();
            case 'custom':
                return new HTMLPurifier_ChildDef_Custom($value);
        }
        // defer to its module
        $return = false;
        if ($module->defines_child_def) { // save a func call
            $return = $module->getChildDef($def);
        }
        if ($return !== false) return $return;
        // error-out
        trigger_error(
            'Could not determine which ChildDef class to instantiate',
            E_USER_ERROR
        );
        return false;
    }

    /**
     * Converts a string list of elements separated by pipes into
     * a lookup array.
     * @param $string List of elements
     * @return Lookup array of elements
     */
    protected function convertToLookup($string) {
        $array = explode('|', str_replace(' ', '', $string));
        $ret = array();
        foreach ($array as $k) {
            $ret[$k] = true;
        }
        return $ret;
    }

}
/**
 * Registry object that contains information about the current context.
 * @note Existence checks are done with array_key_exists(), so a variable
 *       whose current value is null is still considered registered.
 * @note Since the variables Context deals with may not be objects,
 *       references are very important here! Do not remove!
 */
class HTMLPurifier_Context
{

    /**
     * Private array that stores the references.
     */
    private $_storage = array();

    /**
     * Registers a variable into the context.
     * Raises E_USER_ERROR if the name is already taken.
     * @param $name String name
     * @param $ref Reference to variable to be registered
     */
    public function register($name, &$ref) {
        if (array_key_exists($name, $this->_storage)) {
            trigger_error("Name $name produces collision, cannot re-register",
                          E_USER_ERROR);
            return;
        }
        $this->_storage[$name] =& $ref;
    }

    /**
     * Retrieves a variable reference from the context.
     * @param $name String name
     * @param $ignore_error Boolean whether or not to ignore error
     * @return Reference to the registered variable, or a reference to a
     *         null placeholder if the name is not registered
     */
    public function &get($name, $ignore_error = false) {
        if (!array_key_exists($name, $this->_storage)) {
            if (!$ignore_error) {
                trigger_error("Attempted to retrieve non-existent variable $name",
                              E_USER_ERROR);
            }
            $var = null; // so we can return by reference
            return $var;
        }
        return $this->_storage[$name];
    }

    /**
     * Destroys a variable in the context.
     * Raises E_USER_ERROR if the name is not registered.
     * @param $name String name
     */
    public function destroy($name) {
        if (!array_key_exists($name, $this->_storage)) {
            trigger_error("Attempted to destroy non-existent variable $name",
                          E_USER_ERROR);
            return;
        }
        unset($this->_storage[$name]);
    }

    /**
     * Checks whether or not the variable exists.
     * @param $name String name
     * @return Boolean true if a variable is registered under $name
     */
    public function exists($name) {
        return array_key_exists($name, $this->_storage);
    }

    /**
     * Loads a series of variables from an associative array
     * @param $context_array Assoc array of variables to load
     */
    public function loadArray($context_array) {
        foreach ($context_array as $key => $discard) {
            // index the array instead of using the loop value so that the
            // registered reference points at the array element
            $this->register($key, $context_array[$key]);
        }
    }

}
/**
 * Abstract class representing Definition cache managers that implements
 * useful common methods and is a factory.
 * @todo Create a separate maintenance file advanced users can use to
 *       cache their custom HTMLDefinition, which can be loaded
 *       via a configuration directive
 * @todo Implement memcached
 */
abstract class HTMLPurifier_DefinitionCache
{

    /**
     * Type of definition objects this instance of the cache handles.
     */
    public $type;

    /**
     * @param $type Type of definition objects this instance of the
     *      cache will handle.
     */
    public function __construct($type) {
        $this->type = $type;
    }

    /**
     * Generates a unique identifier for a particular configuration
     * @param $config Instance of HTMLPurifier_Config
     * @return String of the form "version,serial,revision"
     */
    public function generateKey($config) {
        return $config->version . ',' . // possibly replace with function calls
               $config->getBatchSerial($this->type) . ',' .
               $config->get($this->type . '.DefinitionRev');
    }

    /**
     * Tests whether or not a key is old with respect to the configuration's
     * version and revision number.
     * @param $key Key to test
     * @param $config Instance of HTMLPurifier_Config to test against
     * @return Boolean true if the key is malformed, belongs to another
     *         version, or has the same serial but a lower revision
     */
    public function isOld($key, $config) {
        if (substr_count($key, ',') < 2) return true;
        list($version, $hash, $revision) = explode(',', $key, 3);
        $compare = version_compare($version, $config->version);
        // version mismatch, is always old
        if ($compare != 0) return true;
        // versions match, ids match, check revision number.
        // The serials are compared as strings: a loose == would treat
        // numeric-looking strings such as "0e1" and "0e2" as equal.
        if (
            $hash === (string) $config->getBatchSerial($this->type) &&
            $revision < $config->get($this->type . '.DefinitionRev')
        ) return true;
        return false;
    }

    /**
     * Checks if a definition's type jives with the cache's type
     * @note Throws an error on failure
     * @param $def Definition object to check
     * @return Boolean true if good, false if not
     */
    public function checkDefType($def) {
        if ($def->type !== $this->type) {
            trigger_error("Cannot use definition of type {$def->type} in cache for {$this->type}");
            return false;
        }
        return true;
    }

    /**
     * Adds a definition object to the cache
     */
    abstract public function add($def, $config);

    /**
     * Unconditionally saves a definition object to the cache
     */
    abstract public function set($def, $config);

    /**
     * Replace an object in the cache
     */
    abstract public function replace($def, $config);

    /**
     * Retrieves a definition object from the cache
     */
    abstract public function get($config);

    /**
     * Removes a definition object from the cache
     */
    abstract public function remove($config);

    /**
     * Clears all objects from cache
     */
    abstract public function flush($config);

    /**
     * Clears all expired (older version or revision) objects from cache
     * @note Be careful implementing this method as flush. Flush must
     *       not interfere with other Definition types, and cleanup()
     *       should not be repeatedly called by userland code.
     */
    abstract public function cleanup($config);

}
/**
 * Responsible for creating definition caches.
 */
class HTMLPurifier_DefinitionCacheFactory
{

    /** Cache objects already created, indexed by implementation name and definition type */
    protected $caches = array('Serializer' => array());
    /** Registered implementations: short name => class name */
    protected $implementations = array();
    /** Decorator objects applied to every new cache, indexed by decorator name */
    protected $decorators = array();

    /**
     * Initialize default decorators
     */
    public function setup() {
        $this->addDecorator('Cleanup');
    }

    /**
     * Retrieves an instance of global definition cache factory.
     * @param $prototype Optional. Pass an object to install it as the
     *        global factory; pass true to force creation of a fresh,
     *        set-up factory; omit (null) to lazily create and/or return
     *        the current one.
     */
    public static function instance($prototype = null) {
        static $instance;
        if ($prototype === true) {
            // Forced reset. Tested before the generic "prototype given"
            // branch, otherwise the literal true would be stored as the
            // global factory.
            $instance = new HTMLPurifier_DefinitionCacheFactory();
            $instance->setup();
        } elseif ($prototype !== null) {
            $instance = $prototype;
        } elseif ($instance === null) {
            $instance = new HTMLPurifier_DefinitionCacheFactory();
            $instance->setup();
        }
        return $instance;
    }

    /**
     * Registers a new definition cache object
     * @param $short Short name of cache object, for reference
     * @param $long Full class name of cache object, for construction
     */
    public function register($short, $long) {
        $this->implementations[$short] = $long;
    }

    /**
     * Factory method that creates a cache object based on configuration
     * @param $type Name of definitions handled by cache
     * @param $config Instance of HTMLPurifier_Config
     * @return Cache object for $type; created objects are remembered and
     *         returned again for the same implementation and type
     */
    public function create($type, $config) {
        $method = $config->get('Cache.DefinitionImpl');
        if ($method === null) {
            return new HTMLPurifier_DefinitionCache_Null($type);
        }
        if (!empty($this->caches[$method][$type])) {
            return $this->caches[$method][$type];
        }
        if (
          isset($this->implementations[$method]) &&
          class_exists($class = $this->implementations[$method], false)
        ) {
            $cache = new $class($type);
        } else {
            if ($method != 'Serializer') {
                trigger_error("Unrecognized DefinitionCache $method, using Serializer instead", E_USER_WARNING);
            }
            $cache = new HTMLPurifier_DefinitionCache_Serializer($type);
        }
        // wrap the cache in every registered decorator, in registration order
        foreach ($this->decorators as $decorator) {
            $new_cache = $decorator->decorate($cache);
            // prevent infinite recursion in PHP 4
            unset($cache);
            $cache = $new_cache;
        }
        $this->caches[$method][$type] = $cache;
        return $this->caches[$method][$type];
    }

    /**
     * Registers a decorator to add to all new cache objects
     * @param $decorator A decorator object, or a string suffix that is
     *        appended to "HTMLPurifier_DefinitionCache_Decorator_" to form
     *        the class to instantiate
     */
    public function addDecorator($decorator) {
        if (is_string($decorator)) {
            $class = "HTMLPurifier_DefinitionCache_Decorator_$decorator";
            $decorator = new $class;
        }
        $this->decorators[$decorator->name] = $decorator;
    }

}
/**
 * Represents a document type, contains information on which modules
 * need to be loaded.
 * @note This class is inspected by Printer_HTMLDefinition->renderDoctype.
 *       If structure changes, please update that function.
 */
class HTMLPurifier_Doctype
{

    /**
     * Full name of doctype
     */
    public $name;

    /**
     * List of standard modules (string identifiers or literal objects)
     * that this doctype uses
     */
    public $modules = array();

    /**
     * List of modules to use for tidying up code
     */
    public $tidyModules = array();

    /**
     * Is the language derived from XML (i.e. XHTML)?
     */
    public $xml = true;

    /**
     * List of aliases for this doctype
     */
    public $aliases = array();

    /**
     * Public DTD identifier
     */
    public $dtdPublic;

    /**
     * System DTD identifier
     */
    public $dtdSystem;

    /**
     * Stores every argument in the property of the corresponding name.
     * @param $name Full name of doctype
     * @param $xml Whether the language is derived from XML
     * @param $modules List of standard modules
     * @param $tidyModules List of modules used for tidying
     * @param $aliases List of aliases for this doctype
     * @param $dtd_public Public DTD identifier
     * @param $dtd_system System DTD identifier
     */
    public function __construct(
        $name = null,
        $xml = true,
        $modules = array(),
        $tidyModules = array(),
        $aliases = array(),
        $dtd_public = null,
        $dtd_system = null
    ) {
        // identification
        $this->name      = $name;
        $this->aliases   = $aliases;
        $this->dtdPublic = $dtd_public;
        $this->dtdSystem = $dtd_system;
        // behaviour
        $this->xml         = $xml;
        $this->modules     = $modules;
        $this->tidyModules = $tidyModules;
    }
}
class HTMLPurifier_DoctypeRegistry
{

    /**
     * Hash of doctype names to doctype objects
     */
    protected $doctypes;

    /**
     * Lookup table of aliases to real doctype names
     */
    protected $aliases;

    /**
     * Registers a doctype to the registry
     * @note Accepts a fully-formed doctype object, or the
     *       parameters for constructing a doctype object
     * @param $doctype Name of doctype or literal doctype object
     * @param $xml Passed through to the HTMLPurifier_Doctype constructor
     * @param $modules Modules doctype will load
     * @param $tidy_modules Modules doctype will load for tidying
     * @param $aliases Alias names for doctype
     * @param $dtd_public Passed through to the HTMLPurifier_Doctype constructor
     * @param $dtd_system Passed through to the HTMLPurifier_Doctype constructor
     * @return Editable registered doctype
     */
    public function register(
        $doctype,
        $xml = true,
        $modules = array(),
        $tidy_modules = array(),
        $aliases = array(),
        $dtd_public = null,
        $dtd_system = null
    ) {
        // scalars are accepted as shorthand for one-element lists
        if (!is_array($modules)) {
            $modules = array($modules);
        }
        if (!is_array($tidy_modules)) {
            $tidy_modules = array($tidy_modules);
        }
        if (!is_array($aliases)) {
            $aliases = array($aliases);
        }
        if (!is_object($doctype)) {
            // a name was given: build the doctype object from the arguments
            $doctype = new HTMLPurifier_Doctype(
                $doctype, $xml, $modules, $tidy_modules, $aliases, $dtd_public, $dtd_system
            );
        }
        $name = $doctype->name;
        $this->doctypes[$name] = $doctype;
        // hookup aliases, but never let one shadow a registered doctype
        foreach ($doctype->aliases as $alias) {
            if (!isset($this->doctypes[$alias])) {
                $this->aliases[$alias] = $name;
            }
        }
        // remove old aliases: the name now refers to a real doctype
        if (isset($this->aliases[$name])) {
            unset($this->aliases[$name]);
        }
        return $doctype;
    }

    /**
     * Retrieves reference to a doctype of a certain name
     * @note This function resolves aliases
     * @note When possible, use the more fully-featured make()
     * @param $doctype Name of doctype
     * @return Editable doctype object
     */
    public function get($doctype) {
        if (isset($this->aliases[$doctype])) {
            $doctype = $this->aliases[$doctype];
        }
        if (isset($this->doctypes[$doctype])) {
            return $this->doctypes[$doctype];
        }
        trigger_error('Doctype ' . htmlspecialchars($doctype) . ' does not exist', E_USER_ERROR);
        // only reached if the error handler lets execution continue
        $anon = new HTMLPurifier_Doctype($doctype);
        return $anon;
    }

    /**
     * Creates a doctype based on a configuration object,
     * will perform initialization on the doctype
     * @note Use this function to get a copy of doctype that config
     *       can hold on to (this is necessary in order to tell
     *       Generator whether or not the current document is XML
     *       based or not).
     * @param $config Configuration object the doctype name is read from
     * @return Clone of the registered doctype
     */
    public function make($config) {
        $name = $this->getDoctypeFromConfig($config);
        $registered = $this->get($name);
        return clone $registered;
    }

    /**
     * Retrieves the doctype from the configuration object
     * @param $config Configuration object to read HTML.Doctype,
     *        HTML.CustomDoctype, HTML.XHTML and HTML.Strict from
     * @return Name of the doctype
     */
    public function getDoctypeFromConfig($config) {
        // recommended test
        $explicit = $config->get('HTML.Doctype');
        if (!empty($explicit)) {
            return $explicit;
        }
        $custom = $config->get('HTML.CustomDoctype');
        if (!empty($custom)) {
            return $custom;
        }
        // backwards-compatibility
        $family  = $config->get('HTML.XHTML') ? 'XHTML 1.0' : 'HTML 4.01';
        $variant = $config->get('HTML.Strict') ? ' Strict' : ' Transitional';
        return $family . $variant;
    }

}
/**
 * Structure that stores an HTML element definition. Used by
 * HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
 * @note This class is inspected by HTMLPurifier_Printer_HTMLDefinition.
 *       Please update that class too.
 * @warning If you add new properties to this class, you MUST update
 *          the mergeIn() method.
 */
class HTMLPurifier_ElementDef
{

    /**
     * Does the definition work by itself, or is it created solely
     * for the purpose of merging into another definition?
     */
    public $standalone = true;

    /**
     * Associative array of attribute name to HTMLPurifier_AttrDef
     * @note Before being processed by HTMLPurifier_AttrCollections
     *       when modules are finalized during
     *       HTMLPurifier_HTMLDefinition->setup(), this array may also
     *       contain an array at index 0 that indicates which attribute
     *       collections to load into the full array. It may also
     *       contain string identifiers in lieu of HTMLPurifier_AttrDef,
     *       see HTMLPurifier_AttrTypes on how they are expanded during
     *       HTMLPurifier_HTMLDefinition->setup() processing.
     */
    public $attr = array();

    /**
     * Indexed list of tag's HTMLPurifier_AttrTransform to be done before validation
     */
    public $attr_transform_pre = array();

    /**
     * Indexed list of tag's HTMLPurifier_AttrTransform to be done after validation
     */
    public $attr_transform_post = array();

    /**
     * HTMLPurifier_ChildDef of this tag.
     */
    public $child;

    /**
     * Abstract string representation of internal ChildDef rules. See
     * HTMLPurifier_ContentSets for how this is parsed and then transformed
     * into an HTMLPurifier_ChildDef.
     * @warning This is a temporary variable that is not available after
     *      being processed by HTMLDefinition
     */
    public $content_model;

    /**
     * Value of $child->type, used to determine which ChildDef to use,
     * used in combination with $content_model.
     * @warning This must be lowercase
     * @warning This is a temporary variable that is not available after
     *      being processed by HTMLDefinition
     */
    public $content_model_type;

    /**
     * Does the element have a content model (#PCDATA | Inline)*? This
     * is important for chameleon ins and del processing in
     * HTMLPurifier_ChildDef_Chameleon. Dynamically set: modules don't
     * have to worry about this one.
     */
    public $descendants_are_inline = false;

    /**
     * List of the names of required attributes this element has. Dynamically
     * populated by HTMLPurifier_HTMLDefinition::getElement
     */
    public $required_attr = array();

    /**
     * Lookup table of tags excluded from all descendants of this tag.
     * @note SGML permits exclusions for all descendants, but this is
     *       not possible with DTDs or XML Schemas. W3C has elected to
     *       use complicated compositions of content_models to simulate
     *       exclusion for children, but we go the simpler, SGML-style
     *       route of flat-out exclusions, which correctly apply to
     *       all descendants and not just children. Note that the XHTML
     *       Modularization Abstract Modules are blithely unaware of such
     *       distinctions.
     */
    public $excludes = array();

    /**
     * This tag is explicitly auto-closed by the following tags.
     */
    public $autoclose = array();

    /**
     * If a foreign element is found in this element, test if it is
     * allowed by this sub-element; if it is, instead of closing the
     * current element, place it inside this element.
     */
    public $wrap;

    /**
     * Whether or not this is a formatting element affected by the
     * "Active Formatting Elements" algorithm.
     */
    public $formatting;

    /**
     * Low-level factory constructor for creating new standalone element defs
     * @param $content_model Value for the content_model property
     * @param $content_model_type Value for the content_model_type property
     * @param $attr Value for the attr property
     * @return New HTMLPurifier_ElementDef
     */
    public static function create($content_model, $content_model_type, $attr) {
        $element = new HTMLPurifier_ElementDef();
        $element->attr               = $attr;
        $element->content_model      = $content_model;
        $element->content_model_type = $content_model_type;
        return $element;
    }

    /**
     * Merges the values of another element definition into this one.
     * Values from the new element def take precedence if a value is
     * not mergeable.
     * @param $def Element definition whose values are merged in
     */
    public function mergeIn($def) {
        // later keys takes precedence
        foreach ($def->attr as $name => $value) {
            if ($name === 0) {
                // merge in the includes
                // sorry, no way to override an include
                foreach ($value as $include) {
                    $this->attr[0][] = $include;
                }
            } elseif ($value === false) {
                // false means "remove this attribute"
                unset($this->attr[$name]);
            } else {
                $this->attr[$name] = $value;
            }
        }
        $this->_mergeAssocArray($this->attr_transform_pre, $def->attr_transform_pre);
        $this->_mergeAssocArray($this->attr_transform_post, $def->attr_transform_post);
        $this->_mergeAssocArray($this->excludes, $def->excludes);

        if (!empty($def->content_model)) {
            // #SUPER in the new model stands for the current model
            $this->content_model = str_replace(
                "#SUPER",
                $this->content_model,
                $def->content_model
            );
            $this->child = false;
        }
        if (!empty($def->content_model_type)) {
            $this->content_model_type = $def->content_model_type;
            $this->child = false;
        }
        if ($def->child !== null) {
            $this->child = $def->child;
        }
        if ($def->formatting !== null) {
            $this->formatting = $def->formatting;
        }
        if ($def->descendants_are_inline) {
            $this->descendants_are_inline = $def->descendants_are_inline;
        }
    }

    /**
     * Merges one array into another, removes values which equal false
     * @param $a1 Array by reference that is merged into
     * @param $a2 Array that merges into $a1
     */
    private function _mergeAssocArray(&$a1, $a2) {
        foreach ($a2 as $key => $value) {
            if ($value === false) {
                unset($a1[$key]);
            } else {
                $a1[$key] = $value;
            }
        }
    }

}
3006 * A UTF-8 specific character encoder that handles cleaning and transforming.
3007 * @note All functions in this class should be static.
3009 class HTMLPurifier_Encoder
/**
 * Constructor throws fatal error if you attempt to instantiate class.
 * It is private, so it can only be reached from inside the class.
 */
private function __construct() {
    trigger_error(
        'Cannot instantiate encoder, call methods statically',
        E_USER_ERROR
    );
}
/**
 * Error-handler that mutes errors, alternative to shut-up operator.
 * Install it with set_error_handler() around a call whose errors should
 * be discarded, then call restore_error_handler().
 */
public static function muteErrorHandler() {
    // intentionally empty: every error passed to this handler is ignored
}
/**
 * Cleans a UTF-8 string for well-formedness and SGML validity
 *
 * It will parse according to UTF-8 and return a valid UTF8 string, with
 * non-SGML codepoints excluded.
 *
 * @note Just for reference, the non-SGML code points are 0 to 31 and
 *       127 to 159, inclusive.  However, we allow code points 9, 10
 *       and 13, which are the tab, line feed and carriage return
 *       respectively. 128 and above the code points map to multibyte
 *       UTF-8 representations.
 *
 * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
 *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
 *       LGPL license.  Notes on what changed are inside, but in general,
 *       the original code transformed UTF-8 text into an array of integer
 *       Unicode codepoints. Understandably, transforming that back to
 *       a string would be somewhat expensive, so the function was modded to
 *       directly operate on the string.  However, this discourages code
 *       reuse, and the logic enumerated here would be useful for any
 *       function that needs to be able to understand UTF-8 characters.
 *       As of right now, only smart lossless character encoding converters
 *       would need that, and I'm probably not going to implement them.
 *
 * @param $str String to clean
 * @param $force_php Accepted for interface compatibility; it is not
 *        consulted by this implementation
 * @return The input with malformed sequences and disallowed code points
 *         removed
 */
public static function cleanUTF8($str, $force_php = false) {

    // UTF-8 validity is checked since PHP 4.3.5
    // This is an optimization: if the string is already valid UTF-8, no
    // need to do PHP stuff. 99% of the time, this will be the case.
    // The regexp matches the XML char production, as well as excluding
    // non-SGML codepoints U+007F to U+009F
    if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
        return $str;
    }

    $mState = 0; // cached expected number of octets after the current octet
                 // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0; // cached Unicode character
    $mBytes = 1; // cached expected number of octets in the current sequence

    // original code involved an $out that was an array of Unicode
    // codepoints.  Instead of having to convert back into UTF-8, we've
    // decided to directly append valid UTF-8 characters onto a string
    // $out once they're done.  $char accumulates raw bytes, while $mUcs4
    // turns into the Unicode code point, so there's some redundancy.

    $out = '';
    $char = '';

    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        // square brackets: the curly-brace offset syntax is gone in PHP 8
        $in = ord($str[$i]);
        $char .= $str[$i]; // append byte to char
        if (0 == $mState) {
            // When mState is zero we expect either a US-ASCII character
            // or a multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                if (($in <= 31 || $in == 127) &&
                    !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
                ) {
                    // control characters, remove
                } else {
                    $out .= $char;
                }
                // reset
                $char = '';
                $mBytes = 1;
            } elseif (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            } elseif (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            } elseif (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            } elseif (0xF8 == (0xFC & ($in))) {
                // First octet of 5 octet sequence.
                //
                // This is illegal because the encoded codepoint must be
                // either:
                // (a) not the shortest form or
                // (b) outside the Unicode range of 0-0x10FFFF.
                // Rather than trying to resynchronize, we will carry on
                // until the end of the sequence and let the later error
                // handling code catch it.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;
            } elseif (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5
                // octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;
            } else {
                // Current octet is neither in the US-ASCII range nor a
                // legal first octet of a multi-octet sequence.
                $mState = 0;
                $mUcs4  = 0;
                $mBytes = 1;
                $char = '';
            }
        } else {
            // When mState is non-zero, we expect a continuation of the
            // multi-octet sequence
            if (0x80 == (0xC0 & ($in))) {
                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                if (0 == --$mState) {
                    // End of the multi-octet sequence. mUcs4 now contains
                    // the final Unicode codepoint to be output

                    // Check for illegal sequences and codepoints.

                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters = illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)
                    ) {
                        // illegal sequence: drop it
                    } elseif (0xFEFF != $mUcs4 && // omit BOM
                        // check for valid Char unicode codepoints
                        (
                            0x9 == $mUcs4 ||
                            0xA == $mUcs4 ||
                            0xD == $mUcs4 ||
                            (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
                            // 7F-9F is not strictly prohibited by XML,
                            // but it is non-SGML, and thus we don't allow it
                            (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
                            // keep this list in step with the regexp of
                            // the fast path above, which accepts this range
                            (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
                            (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
                        )
                    ) {
                        $out .= $char;
                    }
                    // initialize UTF8 cache (reset)
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                    $char = '';
                }
            } else {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence.
                // used to result in complete fail, but we'll reset
                $mState = 0;
                $mUcs4  = 0;
                $mBytes = 1;
                $char = '';
            }
        }
    }
    return $out;
}
/**
 * Translates a Unicode codepoint into its corresponding UTF-8 character.
 * @note Based on Feyd's function at
 *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
 *       which is in public domain.
 * @note While we're going to do code point parsing anyway, a good
 *       optimization would be to refuse to translate code points that
 *       are non-SGML characters.  However, this could lead to duplication.
 * @note This is very similar to the unichr function in
 *       maintenance/generate-entity-file.php (although this is superior,
 *       due to its sanity checks).
 * @param $code Code point to translate
 * @return UTF-8 encoded character, or an empty string when $code is
 *         negative, above 0x10FFFF or a surrogate (0xD800..0xDFFF)
 */

// +----------+----------+----------+----------+
// | 33222222 | 22221111 | 111111   |          |
// | 10987654 | 32109876 | 54321098 | 76543210 | bit
// +----------+----------+----------+----------+
// |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
// |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
// |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
// | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
// +----------+----------+----------+----------+
// | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
// | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
// +----------+----------+----------+----------+

public static function unichr($code) {
    $isSurrogate = ($code >= 55296 and $code <= 57343);
    if ($code > 1114111 or $code < 0 or $isSurrogate) {
        // bits are set outside the "valid" range as defined
        // by UNICODE 4.1.0
        return '';
    }

    if ($code < 128) {
        // regular ASCII character
        return chr($code);
    }

    // every multi-byte form ends with a continuation byte that carries
    // the low six bits
    $last = chr(($code & 63) | 128);

    if ($code < 2048) {
        // 2 bytes: 110yyyyy 10xxxxxx
        return chr((($code & 2047) >> 6) | 192) . $last;
    }

    $middle = chr((($code & 4032) >> 6) | 128);

    if ($code < 65536) {
        // 3 bytes: 1110zzzz 10yyyyyy 10xxxxxx
        return chr((($code >> 12) & 15) | 224) . $middle . $last;
    }

    // 4 bytes: 11110www 10wwzzzz 10yyyyyy 10xxxxxx
    return chr((($code >> 18) & 7) | 240)
         . chr((($code >> 12) & 63) | 128)
         . $middle
         . $last;
}
/**
 * Converts a string to UTF-8 based on configuration.
 *
 * The source encoding is read from the Core.Encoding directive. iconv is
 * used when it is available and not disabled through Test.ForceNoIconv;
 * otherwise only iso-8859-1 is supported.
 *
 * @param $str String in the configured encoding
 * @param $config Configuration object to read directives from
 * @param $context Context object (not used by this method)
 * @return UTF-8 string; an empty string if iconv rejects the encoding
 */
public static function convertToUTF8($str, $config, $context) {
    $encoding = $config->get('Core.Encoding');
    if ($encoding === 'utf-8') return $str;
    static $iconv = null;
    if ($iconv === null) $iconv = function_exists('iconv');
    // silence warnings of the conversion functions; every exit path
    // below must restore the previous handler
    set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
    if ($iconv && !$config->get('Test.ForceNoIconv')) {
        $str = iconv($encoding, 'utf-8//IGNORE', $str);
        if ($str === false) {
            // $encoding is not a valid encoding
            restore_error_handler();
            trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
            return '';
        }
        // If the string is bjorked by Shift_JIS or a similar encoding
        // that doesn't support all of ASCII, convert the naughty
        // characters to their true byte-wise ASCII/UTF-8 equivalents.
        $str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
        restore_error_handler();
        return $str;
    } elseif ($encoding === 'iso-8859-1') {
        $str = utf8_encode($str);
        restore_error_handler();
        return $str;
    }
    // Restore first: with the mute handler still installed the error
    // below would be swallowed and the handler would stay on the stack.
    restore_error_handler();
    trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
}
/**
 * Converts a string from UTF-8 based on configuration.
 *
 * The target encoding is read from the Core.Encoding directive. When
 * Core.EscapeNonASCIICharacters is set, non-ASCII characters are first
 * turned into numeric entities. iconv is used when it is available and
 * Test.ForceNoIconv is not set; otherwise only iso-8859-1 is supported
 * (via utf8_decode).
 *
 * @param $str UTF-8 string to convert
 * @param $config Configuration object; Core.Encoding,
 *        Core.EscapeNonASCIICharacters and Test.ForceNoIconv are read
 * @param $context Context object (not used by this method)
 * @return String in the configured encoding
 * @note Currently, this is a lossy conversion, with inexpressible
 *       characters being omitted.
 * @note Triggers E_USER_ERROR when no conversion routine is available
 *       for the configured encoding.
 */
public static function convertFromUTF8($str, $config, $context) {
    $encoding = $config->get('Core.Encoding');
    if ($encoding === 'utf-8') return $str;
    static $iconv = null;
    if ($iconv === null) $iconv = function_exists('iconv');
    if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
        $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
    }
    // Warnings raised by the conversion functions are routed to the mute
    // handler; every exit path below has to undo this registration.
    set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
    if ($iconv && !$config->get('Test.ForceNoIconv')) {
        // Undo our previous fix in convertToUTF8, otherwise iconv will barf
        $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
        // testEncodingSupportsASCII() returns false when iconv rejects the
        // encoding; array_flip() and strtr() need an array, so treat that
        // case as "nothing to fix" and let iconv() below report the failure.
        if (!is_array($ascii_fix)) $ascii_fix = array();
        if (!$escape && !empty($ascii_fix)) {
            // Without entity escaping, the UTF-8 characters that collide
            // with the remapped ASCII bytes cannot be represented: drop them.
            $clear_fix = array();
            foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
            $str = strtr($str, $clear_fix);
        }
        $str = strtr($str, array_flip($ascii_fix));
        $str = iconv('utf-8', $encoding . '//IGNORE', $str);
        restore_error_handler();
        return $str;
    } elseif ($encoding === 'iso-8859-1') {
        $str = utf8_decode($str);
        restore_error_handler();
        return $str;
    }
    // Put the previous handler back before reporting: otherwise the error
    // below is delivered to the mute handler instead of being reported,
    // and the mute handler stays installed for the rest of the request.
    restore_error_handler();
    trigger_error('Encoding not supported', E_USER_ERROR);
}
3332 * Lossless (character-wise) conversion of HTML to ASCII
3333 * @param $str UTF-8 string to be converted to ASCII
3334 * @returns ASCII encoded string with non-ASCII character entity-ized
3335 * @warning Adapted from MediaWiki, claiming fair use: this is a common
3336 * algorithm. If you disagree with this license fudgery,
3337 * implement it yourself.
3338 * @note Uses decimal numeric entities since they are best supported.
3339 * @note This is a DUMB function: it has no concept of keeping
3340 * character entities that the projected character encoding
3341 * can allow. We could possibly implement a smart version
3342 * but that would require it to also know which Unicode
3343 * codepoints the charset supported (not an easy task).
3344 * @note Sort of with cleanUTF8() but it assumes that $str is
3347 public static function convertToASCIIDumbLossless($str) {
// Walks $str one byte at a time, copying ASCII bytes through and turning
// each multi-byte sequence into a decimal numeric entity (&#NNN;).
// NOTE(review): the initialisation of $result, $working and $bytesleft, the
// updates to $bytesleft, and the return statement are not visible in this
// extract — confirm against the full source.
3351 $len = strlen($str);
3352 for( $i = 0; $i < $len; $i++ ) {
3353 $bytevalue = ord( $str[$i] );
3354 if( $bytevalue <= 0x7F ) { //0xxx xxxx
// Plain ASCII: copied to the output unchanged.
3355 $result .= chr( $bytevalue );
3357 } elseif( $bytevalue <= 0xBF ) { //10xx xxxx
// Continuation byte: shift the accumulator left and add this byte's six
// payload bits.
3358 $working = $working << 6;
3359 $working += ($bytevalue & 0x3F);
// Once no further continuation bytes are expected, emit the accumulated
// code point as a decimal entity.
3361 if( $bytesleft <= 0 ) {
3362 $result .= "&#" . $working . ";";
3364 } elseif( $bytevalue <= 0xDF ) { //110x xxxx
// Lead byte of a two-byte sequence: keep its five payload bits.
3365 $working = $bytevalue & 0x1F;
3367 } elseif( $bytevalue <= 0xEF ) { //1110 xxxx
// Lead byte of a three-byte sequence: keep its four payload bits.
3368 $working = $bytevalue & 0x0F;
3370 } else { //1111 0xxx
// Lead byte of a four-byte sequence: keep its three payload bits.
3371 $working = $bytevalue & 0x07;
3379 * This expensive function tests whether or not a given character
3380 * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
3381 * fail this test, and require special processing. Variable width
3382 * encodings shouldn't ever fail.
3384 * @param string $encoding Encoding name to test, as per iconv format
3385 * @param bool $bypass Whether or not to bypass the precompiled arrays.
3386 * @return Array of UTF-8 characters to their corresponding ASCII,
3387 * which can be used to "undo" any overzealous iconv action.
3389 public static function testEncodingSupportsASCII($encoding, $bypass = false) {
// Function-level static cache of computed results, keyed by the encoding
// name exactly as passed in.
3390 static $encodings = array();
// NOTE(review): the use of $bypass and the initialisation of $ret are not
// visible in this extract — confirm against the full source.
3392 if (isset($encodings[$encoding])) return $encodings[$encoding];
3393 $lenc = strtolower($encoding);
// Precompiled answers: maps of UTF-8 character => ASCII character.
// NOTE(review): the conditions selecting these two returns are not visible
// here; confirm which encodings they belong to.
3396 return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
3398 return array("\xE2\x82\xA9" => '\\');
// Encodings whose lower-cased name starts with "iso-8859-" need no fix-ups.
3400 if (strpos($lenc, 'iso-8859-') === 0) return array();
3403 set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
// NOTE(review): this early return leaves the handler installed on the
// previous line in place — no restore_error_handler() runs on this path.
// It also returns false rather than an array, and callers in this file
// hand the result to strtr() and array_flip(). Consider restoring the
// handler before returning.
3404 if (iconv('UTF-8', $encoding, 'a') === false) return false;
3405 for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
3406 $c = chr($i); // UTF-8 char
3407 $r = iconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion
3410 // This line is needed for iconv implementations that do not
3411 // omit characters that do not exist in the target character set
// Round-trip check: the byte came back unchanged from the first conversion,
// but converting it back from $encoding does not yield the same character.
3412 ($r === $c && iconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
3414 // Reverse engineer: what's the UTF-8 equiv of this byte
3415 // sequence? This assumes that there's no variable width
3416 // encoding that doesn't support ASCII.
// Key: what the raw byte means in $encoding (as UTF-8); value: the ASCII
// character that byte should have been.
3417 $ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
3420 restore_error_handler();
// Remember the result so the loop above is not repeated for this encoding.
3421 $encodings[$encoding] = $ret;
3433 * Object that provides entity lookup table from entity name to character
3435 class HTMLPurifier_EntityLookup {
3438 * Assoc array of entity name to character represented.
3443 * Sets up the entity lookup table from the serialized file contents.
3444 * @note The serialized contents are versioned, but were generated
3445 * using the maintenance script generate_entity_file.php
3446 * @warning This is not in constructor to help enforce the Singleton
3448 public function setup($file = false) {
// Default location of the serialized table, relative to HTMLPURIFIER_PREFIX.
// NOTE(review): the condition guarding this default is not visible in this
// extract; presumably it applies only when $file is falsy — confirm.
3450 $file = HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser';
// Reads the whole file and unserializes it into $this->table. unserialize()
// must only ever be given trusted data, so $file should never be derived
// from user input. A failed read is not checked here.
3452 $this->table = unserialize(file_get_contents($file));
3456 * Retrieves sole instance of the object.
3457 * @param Optional prototype of custom lookup table to overload with.
3459 public static function instance($prototype = false) {
3460 // no references, since PHP doesn't copy unless modified
// Function-level static: keeps the instance between calls.
3461 static $instance = null;
// A supplied prototype replaces whatever instance is currently stored.
// NOTE(review): the condition opening this branch is not visible in this
// extract — presumably it tests $prototype; confirm.
3463 $instance = $prototype;
3464 } elseif (!$instance) {
// First call without a prototype: create the default lookup object.
// NOTE(review): any setup() call and the return statement are not visible
// in this extract.
3465 $instance = new HTMLPurifier_EntityLookup();
3477 // if want to implement error collecting here, we'll need to use some sort
3478 // of global data (probably trigger_error) because it's impossible to pass
3479 // $config or $context to the callback functions.
3482 * Handles referencing and dereferencing character entities
3484 class HTMLPurifier_EntityParser
3488 * Reference to entity lookup table.
3490 protected $_entity_lookup;
3493 * Callback regex string for parsing entities.
// Three alternatives, one capture group each: hexadecimal (&#x...),
// decimal (&#..., leading zeros skipped) and named entities. The trailing
// semicolon is optional.
3495 protected $_substituteEntitiesRegex =
3496 '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
3497 // 1. hex 2. dec 3. string (XML style)
3501 * Decimal to parsed string conversion table for special entities.
// NOTE(review): the contents of the two tables below are not visible in
// this extract.
3503 protected $_special_dec2str =
3513 * Stripped entity names to decimal conversion table for special entities.
3515 protected $_special_ent2dec =
3524 * Substitutes non-special entities with their parsed equivalents. Since
3525 * running this whenever you have parsed character is t3h 5uck, we run
3526 * it before everything else.
3528 * @param $string String to have non-special entities parsed.
3529 * @returns Parsed string.
3531 public function substituteNonSpecialEntities($string) {
3532 // it will try to detect missing semicolons, but don't rely on it
// Every regex match is handed to nonSpecialEntityCallback(), whose return
// value replaces the match.
3533 return preg_replace_callback(
3534 $this->_substituteEntitiesRegex,
3535 array($this, 'nonSpecialEntityCallback'),
3541 * Callback function for substituteNonSpecialEntities() that does the work.
3543 * @param $matches PCRE matches array, with 0 the entire match, and
3544 * either index 1, 2 or 3 set with a hex value, dec value,
3545 * or string (respectively).
3546 * @returns Replacement string.
3549 protected function nonSpecialEntityCallback($matches) {
3550 // replaces all but big five
3551 $entity = $matches[0];
// Numeric entities have '#' as the second character of the match; the @
// suppresses the notice for an out-of-range string offset.
3552 $is_num = (@$matches[0][1] === '#');
// Hexadecimal entities have 'x' as the third character.
3554 $is_hex = (@$entity[2] === 'x');
// Group 1 holds the hex digits, group 2 the decimal digits.
3555 $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
3557 // abort for special characters
// Entities listed in $_special_dec2str are returned untouched.
3558 if (isset($this->_special_dec2str[$code])) return $entity;
// Everything else is replaced by the character for that code point.
3560 return HTMLPurifier_Encoder::unichr($code);
// Named entity: names listed in $_special_ent2dec are returned untouched.
3562 if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
// The lookup table is fetched lazily, on the first named entity seen.
3563 if (!$this->_entity_lookup) {
3564 $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
3566 if (isset($this->_entity_lookup->table[$matches[3]])) {
3567 return $this->_entity_lookup->table[$matches[3]];
// NOTE(review): the fallback for names missing from the table is not
// visible in this extract.
3575 * Substitutes only special entities with their parsed equivalents.
3577 * @notice We try to avoid calling this function because otherwise, it
3578 * would have to be called a lot (for every parsed section).
3580 * @param $string String to have non-special entities parsed.
3581 * @returns Parsed string.
3583 public function substituteSpecialEntities($string) {
// Same regex as substituteNonSpecialEntities(), different callback.
3584 return preg_replace_callback(
3585 $this->_substituteEntitiesRegex,
3586 array($this, 'specialEntityCallback'),
3591 * Callback function for substituteSpecialEntities() that does the work.
3593 * This callback has same syntax as nonSpecialEntityCallback().
3595 * @param $matches PCRE-style matches array, with 0 the entire match, and
3596 * either index 1, 2 or 3 set with a hex value, dec value,
3597 * or string (respectively).
3598 * @returns Replacement string.
3600 protected function specialEntityCallback($matches) {
3601 $entity = $matches[0];
// Same numeric/hex detection as in nonSpecialEntityCallback().
3602 $is_num = (@$matches[0][1] === '#');
3604 $is_hex = (@$entity[2] === 'x');
3605 $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
// Numeric entity: replaced only when its code is in $_special_dec2str.
// NOTE(review): the else-arm of this ternary is not visible in this extract.
3606 return isset($this->_special_dec2str[$int]) ?
3607 $this->_special_dec2str[$int] :
// Named entity: replaced only when its name is in $_special_ent2dec.
// NOTE(review): the else-arm of this ternary is not visible in this extract.
3610 return isset($this->_special_ent2dec[$matches[3]]) ?
3611 $this->_special_ent2dec[$matches[3]] :
3623 * Error collection class that enables HTML Purifier to report HTML
3624 * problems back to the user
3626 class HTMLPurifier_ErrorCollector
3630 * Identifiers for the returned error array. These are purposely numeric
3631 * so list() can be used.
// NOTE(review): the constant declarations (LINENO, SEVERITY, MESSAGE,
// CHILDREN are referenced below) and several property declarations are not
// visible in this extract.
3639 protected $_current;
3640 protected $_stacks = array(array());
3642 protected $generator;
// Error structs indexed as $lines[line][column]; index -1 holds the struct
// used when no integer line/column is available (see send()).
3645 protected $lines = array();
3647 public function __construct($context) {
3648 $this->locale =& $context->get('Locale');
3649 $this->context = $context;
// Both $_current and $errors are references to the first stack, so an
// error appended through one is visible through the other.
3650 $this->_current =& $this->_stacks[0];
3651 $this->errors =& $this->_stacks[0];
3655 * Sends an error message to the collector for later use
3656 * @param $severity int Error severity, PHP error style (don't use E_USER_)
3657 * @param $msg string Error message text
3658 * @param $subst1 string First substitution for $msg
3659 * @param $subst2 string ...
3661 public function send($severity, $msg) {
// Arguments beyond the two declared ones are the message substitutions.
// NOTE(review): how $args is initialised and trimmed is not visible in
// this extract.
3664 if (func_num_args() > 2) {
3665 $args = func_get_args();
// Location comes from the current token when there is one, otherwise from
// the CurrentLine / CurrentCol context variables.
// NOTE(review): the meaning of the second argument (true) to get() is not
// visible here — presumably it suppresses the error for a missing
// variable; confirm.
3670 $token = $this->context->get('CurrentToken', true);
3671 $line = $token ? $token->line : $this->context->get('CurrentLine', true);
3672 $col = $token ? $token->col : $this->context->get('CurrentCol', true);
3673 $attr = $this->context->get('CurrentAttr', true);
3675 // perform special substitutions, also add custom parameters
// NOTE(review): the initialisation of $subst is not visible in this extract.
3677 if (!is_null($token)) {
3678 $args['CurrentToken'] = $token;
3680 if (!is_null($attr)) {
3681 $subst['$CurrentAttr.Name'] = $attr;
3682 if (isset($token->attr[$attr])) $subst['$CurrentAttr.Value'] = $token->attr[$attr];
// Translate the message key through the locale, then fill in arguments.
// NOTE(review): the conditions around these two calls are not visible.
3686 $msg = $this->locale->getMessage($msg);
3688 $msg = $this->locale->formatMessage($msg, $args);
3691 if (!empty($subst)) $msg = strtr($msg, $subst);
3693 // (numerically indexed)
// Legacy flat record, appended to the current stack.
3695 self::LINENO => $line,
3696 self::SEVERITY => $severity,
3697 self::MESSAGE => $msg,
3698 self::CHILDREN => array()
3700 $this->_current[] = $error;
3703 // NEW CODE BELOW ...
3706 // Top-level errors are either:
3707 // TOKEN type, if $value is set appropriately, or
3708 // "syntax" type, if $value is null
3709 $new_struct = new HTMLPurifier_ErrorStruct();
3710 $new_struct->type = HTMLPurifier_ErrorStruct::TOKEN;
// The token is cloned, so the struct keeps its own copy of it.
3711 if ($token) $new_struct->value = clone $token;
// Reuse the struct already registered at this line/column, or register
// the new one.
3712 if (is_int($line) && is_int($col)) {
3713 if (isset($this->lines[$line][$col])) {
3714 $struct = $this->lines[$line][$col];
3716 $struct = $this->lines[$line][$col] = $new_struct;
3718 // These ksorts may present a performance problem
3719 ksort($this->lines[$line], SORT_NUMERIC);
// No integer position: everything is collected under index -1.
3721 if (isset($this->lines[-1])) {
3722 $struct = $this->lines[-1];
3724 $struct = $this->lines[-1] = $new_struct;
3727 ksort($this->lines, SORT_NUMERIC);
3729 // Now, check if we need to operate on a lower structure
3730 if (!empty($attr)) {
3731 $struct = $struct->getChild(HTMLPurifier_ErrorStruct::ATTR, $attr);
// The attribute value is still a literal placeholder string.
3732 if (!$struct->value) {
3733 $struct->value = array($attr, 'PUT VALUE HERE');
// NOTE(review): no assignment to $cssprop is visible in this method; if
// there is none in the full source, this branch can never run.
3736 if (!empty($cssprop)) {
3737 $struct = $struct->getChild(HTMLPurifier_ErrorStruct::CSSPROP, $cssprop);
3738 if (!$struct->value) {
3739 // if we tokenize CSS this might be a little more difficult to do
3740 $struct->value = array($cssprop, 'PUT VALUE HERE');
3744 // Ok, structs are all setup, now time to register the error
3745 $struct->addError($severity, $msg);
3749 * Retrieves raw error data for custom formatter to use
3750 * @param List of arrays in format of array(line of error,
3751 * error severity, error message,
3752 * recursive sub-errors array)
// NOTE(review): getRaw() takes no parameters; the tag above describes the
// return value and presumably should be @return.
3754 public function getRaw() {
3755 return $this->errors;
3759 * Default HTML formatting implementation for error messages
3760 * @param $config Configuration array, vital for HTML output nature
3761 * @param $errors Errors array to display; used for recursion.
3763 public function getHTMLFormatted($config, $errors = null) {
// NOTE(review): the initialisation of $ret is not visible in this extract.
3766 $this->generator = new HTMLPurifier_Generator($config, $this->context);
3767 if ($errors === null) $errors = $this->errors;
3769 // 'At line' message needs to be removed
3771 // generation code for new structure goes here. It needs to be recursive.
// Positioned errors first, in the order stored in $this->lines ...
3772 foreach ($this->lines as $line => $col_array) {
3773 if ($line == -1) continue;
3774 foreach ($col_array as $col => $struct) {
3775 $this->_renderStruct($ret, $struct, $line, $col);
// ... then the position-less ones collected under index -1.
3778 if (isset($this->lines[-1])) {
3779 $this->_renderStruct($ret, $this->lines[-1]);
// The "no errors" decision is based on the flat $errors list, not on $ret.
3782 if (empty($errors)) {
3783 return '<p>' . $this->locale->getMessage('ErrorCollector: No errors') . '</p>';
3785 return '<ul><li>' . implode('</li><li>', $ret) . '</li></ul>';
3790 private function _renderStruct(&$ret, $struct, $line = null, $col = null) {
// Iterative traversal: $stack holds the structs still to be rendered and
// $context_stack holds one context array per entry of $stack.
3791 $stack = array($struct);
3792 $context_stack = array(array());
3793 while ($current = array_pop($stack)) {
3794 $context = array_pop($context_stack);
3795 foreach ($current->errors as $error) {
3796 list($severity, $msg) = $error;
// NOTE(review): the initialisation of $string and the line that adds it
// to $ret are not visible in this extract.
3799 // W3C uses an icon to indicate the severity of the error.
3800 $error = $this->locale->getErrorName($severity);
3801 $string .= "<span class=\"error e$severity\"><strong>$error</strong></span> ";
3802 if (!is_null($line) && !is_null($col)) {
3803 $string .= "<em class=\"location\">Line $line, Column $col: </em> ";
3805 $string .= '<em class="location">End of Document: </em> ';
// Only the message text is escaped; the surrounding markup is literal.
3807 $string .= '<strong class="description">' . $this->generator->escape($msg) . '</strong> ';
3808 $string .= '</div>';
3809 // Here, have a marker for the character on the column appropriate.
3810 // Be sure to clip extremely long lines.
3811 //$string .= '<pre>';
3813 //$string .= '</pre>';
// Queue the children; array_reverse() is applied so that array_pop() above
// takes them in their original order. One context entry is pushed per
// queued child to keep both stacks the same height.
3816 foreach ($current->children as $type => $array) {
3817 $context[] = $current;
3818 $stack = array_merge($stack, array_reverse($array, true));
3819 for ($i = count($array); $i > 0; $i--) {
3820 $context_stack[] = $context;
3833 * Records errors for particular segments of an HTML document such as tokens,
3834 * attributes or CSS properties. They can contain error structs (which apply
3835 * to components of what they represent), but their main purpose is to hold
3836 * errors applying to whatever struct is being used.
3838 class HTMLPurifier_ErrorStruct
3842 * Possible values for $children first-key. Note that top-level structures
3843 * are automatically token-level.
// NOTE(review): the constant declarations and the $type / $value property
// declarations are not visible in this extract.
3850 * Type of this struct.
3855 * Value of the struct we are recording errors for. There are various
3857 * - TOKEN: Instance of HTMLPurifier_Token
3858 * - ATTR: array('attr-name', 'value')
3859 * - CSSPROP: array('prop-name', 'value')
3864 * Errors registered for this structure.
// Each entry is array($severity, $message), as stored by addError().
3866 public $errors = array();
3869 * Child ErrorStructs that are from this structure. For example, a TOKEN
3870 * ErrorStruct would contain ATTR ErrorStructs. This is a multi-dimensional
3871 * array in structure: [TYPE]['identifier']
3873 public $children = array();
// Returns the child struct for ($type, $id), creating and registering an
// empty one of that type on first access.
3875 public function getChild($type, $id) {
3876 if (!isset($this->children[$type][$id])) {
3877 $this->children[$type][$id] = new HTMLPurifier_ErrorStruct();
3878 $this->children[$type][$id]->type = $type;
3880 return $this->children[$type][$id];
// Appends one error to this struct as a (severity, message) pair.
3883 public function addError($severity, $message) {
3884 $this->errors[] = array($severity, $message);
3894 * Global exception class for HTML Purifier; any exceptions we throw
// Subclass of PHP's built-in Exception.
// NOTE(review): the class body is not visible in this extract.
3897 class HTMLPurifier_Exception extends Exception
3907 * Represents a pre or post processing filter on HTML Purifier's output
3909 * Sometimes, a little ad-hoc fixing of HTML has to be done before
3910 * it gets sent through HTML Purifier: you can use filters to achieve
3911 * this effect. For instance, YouTube videos can be preserved using
3912 * this manner. You could have used a decorator for this task, but
3913 * PHP's support for them is not terribly robust, so we're going
3914 * to just loop through the filters.
3916 * Filters should be exited first in, last out. If there are three filters,
3917 * named 1, 2 and 3, the order of execution should go 1->preFilter,
3918 * 2->preFilter, 3->preFilter, purify, 3->postFilter, 2->postFilter,
3921 * @note Methods are not declared abstract as it is perfectly legitimate
3922 * for an implementation not to want anything to happen on a step
3925 class HTMLPurifier_Filter
3929 * Name of the filter for identification purposes
3934 * Pre-processor function, handles HTML before HTML Purifier
// Receives the HTML string plus the config and context objects.
// NOTE(review): the method body is not visible in this extract.
3936 public function preFilter($html, $config, $context) {
3941 * Post-processor function, handles HTML after HTML Purifier
// Same signature as preFilter().
// NOTE(review): the method body is not visible in this extract.
3943 public function postFilter($html, $config, $context) {
3954 * Generates HTML from tokens.
3955 * @todo Refactor interface so that configuration/context is determined
3956 * upon instantiation, no need for messy generateFromTokens() calls
3957 * @todo Make some of the more internal functions protected, and have
3958 * unit tests work around that
3960 class HTMLPurifier_Generator
3964 * Whether or not generator should produce XML output
3966 private $_xhtml = true;
3969 * :HACK: Whether or not generator should comment the insides of <script> tags
3971 private $_scriptFix = false;
3974 * Cache of HTMLDefinition during HTML output to determine whether or
3975 * not attributes should be minimized.
3980 * Cache of %Output.SortAttr
3985 * Cache of %Output.FlashCompat
3987 private $_flashCompat;
3990 * Cache of %Output.FixInnerHTML
3992 private $_innerHTMLFix;
3995 * Stack for keeping track of object information when outputting IE
3996 * compatibility code.
3998 private $_flashStack = array();
4001 * Configuration for the generator
4006 * @param $config Instance of HTMLPurifier_Config
4007 * @param $context Instance of HTMLPurifier_Context
4009 public function __construct($config, $context) {
// All output-related directives are read once here and cached on the
// instance; $context is not used in the lines visible here.
4010 $this->config = $config;
4011 $this->_scriptFix = $config->get('Output.CommentScriptContents');
4012 $this->_innerHTMLFix = $config->get('Output.FixInnerHTML');
4013 $this->_sortAttr = $config->get('Output.SortAttr');
4014 $this->_flashCompat = $config->get('Output.FlashCompat');
4015 $this->_def = $config->getHTMLDefinition();
// XML-style output follows the doctype of the HTML definition.
4016 $this->_xhtml = $this->_def->doctype->xml;
4020 * Generates HTML from an array of tokens.
4021 * @param $tokens Array of HTMLPurifier_Token
4022 * @param $config HTMLPurifier_Config object
4023 * @return Generated HTML
// NOTE(review): the signature below takes only $tokens; the $config
// parameter documented above does not exist on this method.
4025 public function generateFromTokens($tokens) {
4026 if (!$tokens) return '';
// NOTE(review): the initialisation of $html is not visible in this extract.
4030 for ($i = 0, $size = count($tokens); $i < $size; $i++) {
// Script special case: a token named 'script' whose token two positions
// later is an end token, i.e. start, one content token, end.
4031 if ($this->_scriptFix && $tokens[$i]->name === 'script'
4032 && $i + 2 < $size && $tokens[$i+2] instanceof HTMLPurifier_Token_End) {
4033 // script special case
4034 // the contents of the script block must be ONE token
4035 // for this to work.
// The two post-increments consume the start tag and the content token.
4036 $html .= $this->generateFromToken($tokens[$i++]);
4037 $html .= $this->generateScriptFromToken($tokens[$i++]);
4039 $html .= $this->generateFromToken($tokens[$i]);
// Optional pretty-printing, only when the tidy extension is loaded and
// Output.TidyFormat is set.
// NOTE(review): the creation of $tidy and part of its option array are
// not visible in this extract.
4043 if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
4045 $tidy->parseString($html, array(
4047 'output-xhtml' => $this->_xhtml,
4048 'show-body-only' => true,
4049 'indent-spaces' => 2,
4052 $tidy->cleanRepair();
4053 $html = (string) $tidy; // explicit cast necessary
4056 // Normalize newlines to system defined value
4057 if ($this->config->get('Core.NormalizeNewlines')) {
// Output.Newline overrides the platform default (PHP_EOL) when it is set.
4058 $nl = $this->config->get('Output.Newline');
4059 if ($nl === null) $nl = PHP_EOL;
4060 if ($nl !== "\n") $html = str_replace("\n", $nl, $html);
4066 * Generates HTML from a single token.
4067 * @param $token HTMLPurifier_Token object.
4068 * @return Generated HTML
4070 public function generateFromToken($token) {
// Anything that is not a token object is reported as a warning.
// NOTE(review): the value returned in that case is not visible here.
4071 if (!$token instanceof HTMLPurifier_Token) {
4072 trigger_error('Cannot generate HTML from non-HTMLPurifier_Token object', E_USER_WARNING);
4075 } elseif ($token instanceof HTMLPurifier_Token_Start) {
4076 $attr = $this->generateAttributes($token->attr, $token->name);
// With FlashCompat enabled, every opened <object> pushes a record of its
// attributes and an (initially empty) param list onto $_flashStack.
4077 if ($this->_flashCompat) {
4078 if ($token->name == "object") {
4079 $flash = new stdclass();
4080 $flash->attr = $token->attr;
4081 $flash->param = array();
4082 $this->_flashStack[] = $flash;
4085 return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
4087 } elseif ($token instanceof HTMLPurifier_Token_End) {
// NOTE(review): the initialisation of $_extra is not visible in this
// extract.
4089 if ($this->_flashCompat) {
4090 if ($token->name == "object" && !empty($this->_flashStack)) {
4091 // doesn't do anything for now
4094 return $_extra . '</' . $token->name . '>';
4096 } elseif ($token instanceof HTMLPurifier_Token_Empty) {
// A <param> inside a tracked <object> is recorded on the innermost stack
// entry. NOTE(review): 'name' and 'value' are read without an isset()
// check, so a param lacking either attribute raises a notice here.
4097 if ($this->_flashCompat && $token->name == "param" && !empty($this->_flashStack)) {
4098 $this->_flashStack[count($this->_flashStack)-1]->param[$token->attr['name']] = $token->attr['value'];
4100 $attr = $this->generateAttributes($token->attr, $token->name);
4101 return '<' . $token->name . ($attr ? ' ' : '') . $attr .
4102 ( $this->_xhtml ? ' /': '' ) // <br /> v. <br>
4105 } elseif ($token instanceof HTMLPurifier_Token_Text) {
// Text nodes are escaped without touching quote characters.
4106 return $this->escape($token->data, ENT_NOQUOTES);
4108 } elseif ($token instanceof HTMLPurifier_Token_Comment) {
// Comment data is emitted as-is, without escaping.
4109 return '<!--' . $token->data . '-->';
4117 * Special case processor for the contents of script tags
4118 * @warning This runs into problems if there's already a literal
4119 * --> somewhere inside the script contents.
4121 public function generateScriptFromToken($token) {
// Non-text tokens fall back to the normal generator.
4122 if (!$token instanceof HTMLPurifier_Token_Text) return $this->generateFromToken($token);
4123 // Thanks <http://lachy.id.au/log/2005/05/script-comments>
// Strip a trailing "//" (plus whitespace), then wrap the trimmed script in
// the comment/CDATA markers, each on its own line.
4124 $data = preg_replace('#//\s*$#', '', $token->data);
4125 return '<!--//--><![CDATA[//><!--' . "\n" . trim($data) . "\n" . '//--><!]]>';
4129 * Generates attribute declarations from attribute array.
4130 * @note This does not include the leading or trailing space.
4131 * @param $assoc_array_of_attributes Attribute array
4132 * @param $element Name of element attributes are for, used to check
4133 * attribute minimization.
4134 * @return Generate HTML fragment for insertion.
4136 public function generateAttributes($assoc_array_of_attributes, $element = false) {
// NOTE(review): the initialisation of $html is not visible in this extract.
// Sorting works on the local copy of the array, not on the caller's.
4138 if ($this->_sortAttr) ksort($assoc_array_of_attributes);
4139 foreach ($assoc_array_of_attributes as $key => $value) {
// The next two rules apply to non-XML (HTML) output only.
4140 if (!$this->_xhtml) {
4141 // Remove namespaced attributes
4142 if (strpos($key, ':') !== false) continue;
4143 // Check if we should minimize the attribute: val="val" -> val
4144 if ($element && !empty($this->_def->info[$element]->attr[$key]->minimized)) {
4145 $html .= $key . ' ';
4149 // Workaround for Internet Explorer innerHTML bug.
4150 // Essentially, Internet Explorer, when calculating
4151 // innerHTML, omits quotes if there are no instances of
4152 // angled brackets, quotes or spaces. However, when parsing
4153 // HTML (for example, when you assign to innerHTML), it
4154 // treats backticks as quotes. Thus,
4160 // Fortunately, all we need to do is trigger an appropriate
4161 // quoting style, which we do by adding an extra space.
4162 // This also is consistent with the W3C spec, which states
4163 // that user agents may ignore leading or trailing
4164 // whitespace (in fact, most don't, at least for attributes
4165 // like alt, but an extra space at the end is barely
4166 // noticeable). Still, we have a configuration knob for
4167 // this, since this transformation is not necessary if you
4168 // don't process user input with innerHTML or you don't plan
4169 // on supporting Internet Explorer.
4170 if ($this->_innerHTMLFix) {
4171 if (strpos($value, '`') !== false) {
4172 // check if correct quoting style would not already be
// strcspn() equals the full length only when the value contains none of
// the characters " ' space < >.
// NOTE(review): the statement inside this condition is not visible in this
// extract; per the comment above it presumably appends a space to $value.
4174 if (strcspn($value, '"\' <>') === strlen($value)) {
4180 $html .= $key.'="'.$this->escape($value).'" ';
// Drops the trailing separator space left by the loop.
4182 return rtrim($html);
4186 * Escapes raw text data.
4187 * @todo This really ought to be protected, but until we have a facility
4188 * for properly generating HTML here w/o using tokens, it stays
4190 * @param $string String data to escape for HTML.
4191 * @param $quote Quoting style, like htmlspecialchars. ENT_NOQUOTES is
4192 * permissible for non-attribute output.
4193 * @return String escaped data.
4195 public function escape($string, $quote = null) {
4196 // Workaround for APC bug on Mac Leopard reported by sidepodcast
4197 // http://htmlpurifier.org/phorum/read.php?3,4823,4846
// The default is resolved at call time rather than in the signature
// because of the bug referenced above.
4198 if ($quote === null) $quote = ENT_COMPAT;
4199 return htmlspecialchars($string, $quote, 'UTF-8');
4209 * Definition of the purified HTML that describes allowed children,
4210 * attributes, and many other things.
4214 * All member variables that are prefixed with info
4215 * (including the main $info array) are used by HTML Purifier internals
4216 * and should not be directly edited when customizing the HTMLDefinition.
4217 * They can usually be set via configuration directives or custom
4220 * On the other hand, member variables without the info prefix are used
4221 * internally by the HTMLDefinition and MUST NOT be used by other HTML
4222 * Purifier internals. Many of them, however, are public, and may be
4223 * edited by userspace code to tweak the behavior of HTMLDefinition.
4225 * @note This class is inspected by Printer_HTMLDefinition; please
4226 * update that class if things here change.
4228 * @warning Directives that change this object's structure must be in
4229 * the HTML or Attr namespace!
4231 class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
4234 // FULLY-PUBLIC VARIABLES ---------------------------------------------
4237 * Associative array of element names to HTMLPurifier_ElementDef
4239 public $info = array();
4242 * Associative array of global attribute name to attribute definition.
4244 public $info_global_attr = array();
4247 * String name of parent element HTML will be going into.
4249 public $info_parent = 'div';
4252 * Definition for parent element, allows parent element to be a
4253 * tag that's not allowed inside the HTML fragment.
4255 public $info_parent_def;
4258 * String name of element used to wrap inline elements in block context
4259 * @note This is rarely used except for BLOCKQUOTEs in strict mode
4261 public $info_block_wrapper = 'p';
4264 * Associative array of deprecated tag name to HTMLPurifier_TagTransform
4266 public $info_tag_transform = array();
4269 * Indexed list of HTMLPurifier_AttrTransform to be performed before validation.
4271 public $info_attr_transform_pre = array();
4274 * Indexed list of HTMLPurifier_AttrTransform to be performed after validation.
4276 public $info_attr_transform_post = array();
4279 * Nested lookup array of content set name (Block, Inline) to
4280 * element name to whether or not it belongs in that content set.
4282 public $info_content_sets = array();
4285 * Indexed list of HTMLPurifier_Injector to be used.
4287 public $info_injector = array();
4296 // RAW CUSTOMIZATION STUFF --------------------------------------------
/**
 * Registers a custom attribute on an element through the anonymous module.
 * If the module has no entry for the element yet, a blank one is created
 * first; otherwise the existing entry is extended.
 * @note This is strictly a convenience method; HTMLPurifier_HTMLModule
 *       has no corresponding method.
 * @param $element_name String name of the element receiving the attribute
 * @param $attr_name String name of the attribute
 * @param $def Attribute definition, can be string or object, see
 *        HTMLPurifier_AttrTypes for details
 */
public function addAttribute($element_name, $attr_name, $def) {
    $anon = $this->getAnonymousModule();
    $element = isset($anon->info[$element_name])
        ? $anon->info[$element_name]
        : $anon->addBlankElement($element_name);
    $element->attr[$attr_name] = $def;
}
/**
 * Adds a custom element to your HTML definition by delegating to the
 * anonymous module.
 * @note See HTMLPurifier_HTMLModule::addElement for detailed
 *       parameter and return value descriptions.
 */
public function addElement($element_name, $type, $contents, $attr_collections, $attributes = array()) {
    // The caller is trusted here: an element added this way is assumed to
    // be safe. This may not be a good idea.
    return $this->getAnonymousModule()->addElement(
        $element_name,
        $type,
        $contents,
        $attr_collections,
        $attributes
    );
}
/**
 * Adds a blank element to your HTML definition, for overriding, by
 * delegating to the anonymous module.
 * @note See HTMLPurifier_HTMLModule::addBlankElement for detailed
 *       parameter and return value descriptions.
 */
public function addBlankElement($element_name) {
    return $this->getAnonymousModule()->addBlankElement($element_name);
}
/**
 * Retrieves the anonymous module, creating it on first use, so you can
 * bust out advanced features without having to make your own module.
 * The module is created with the name 'Anonymous'.
 */
public function getAnonymousModule() {
    if ($this->_anonModule) {
        return $this->_anonModule;
    }
    $this->_anonModule = new HTMLPurifier_HTMLModule();
    $this->_anonModule->name = 'Anonymous';
    return $this->_anonModule;
}
4355 private $_anonModule;
4358 // PUBLIC BUT INTERNAL VARIABLES --------------------------------------
4360 public $type = 'HTML';
4361 public $manager; /**< Instance of HTMLPurifier_HTMLModuleManager */
/**
 * Performs low-cost, preliminary initialization: only the module manager
 * is created here.
 */
public function __construct() {
    $manager = new HTMLPurifier_HTMLModuleManager();
    $this->manager = $manager;
}
/**
 * Builds the definition from the modules and the configuration, then
 * discards the data that was only needed while building.
 * @param $config Configuration object passed through to the setup steps
 */
protected function doSetup($config) {
    $this->processModules($config);
    $this->setupConfigStuff($config);
    unset($this->manager);

    // cleanup some of the element definitions
    foreach (array_keys($this->info) as $name) {
        unset($this->info[$name]->content_model);
        unset($this->info[$name]->content_model_type);
    }
}
/**
 * Extract out the information from the manager: registers the anonymous
 * module (if one was created), runs the manager's setup, and copies the
 * doctype, transforms, injectors, elements and content sets onto this
 * definition.
 * @param $config Configuration object handed to the manager's setup
 */
protected function processModules($config) {
    if ($this->_anonModule) {
        // for user specific changes
        // this is late-loaded so we don't have to deal with PHP4
        // reference wonky-ness
        $this->manager->addModule($this->_anonModule);
        unset($this->_anonModule);
    }

    $this->manager->setup($config);
    $this->doctype = $this->manager->doctype;

    // Per-module tables that are merged into the definition, in this order.
    // A value of false in a module removes the entry; anything else sets it.
    $merged = array(
        'info_tag_transform',
        'info_attr_transform_pre',
        'info_attr_transform_post',
        'info_injector',
    );
    foreach ($this->manager->modules as $module) {
        foreach ($merged as $prop) {
            foreach ($module->$prop as $k => $v) {
                if ($v === false) {
                    unset($this->{$prop}[$k]);
                } else {
                    $this->{$prop}[$k] = $v;
                }
            }
        }
    }

    $this->info = $this->manager->getElements();
    $this->info_content_sets = $this->manager->contentSets->lookup;
}
/**
 * Sets up stuff based on config. We need a better way of doing this.
 *
 * In order, this applies: the block wrapper (HTML.BlockWrapper), the
 * parent element (HTML.Parent), the element and attribute whitelists
 * (HTML.AllowedElements, HTML.AllowedAttributes, or HTML.Allowed as a
 * combined fallback), the blacklists (HTML.ForbiddenElements,
 * HTML.ForbiddenAttributes) and finally drops every injector whose
 * checkNeeded() reports something missing.
 *
 * Problems in the configuration are reported through trigger_error().
 *
 * @param $config Configuration object the directives are read from
 */
protected function setupConfigStuff($config) {

    $block_wrapper = $config->get('HTML.BlockWrapper');
    if (isset($this->info_content_sets['Block'][$block_wrapper])) {
        $this->info_block_wrapper = $block_wrapper;
    } else {
        trigger_error('Cannot use non-block element as block wrapper',
            E_USER_ERROR);
    }

    $parent = $config->get('HTML.Parent');
    $def = $this->manager->getElement($parent, true);
    if ($def) {
        $this->info_parent = $parent;
        $this->info_parent_def = $def;
    } else {
        trigger_error('Cannot use unrecognized element as parent',
            E_USER_ERROR);
        // only reached when an error handler swallowed the error above:
        // fall back to the parent that is already set on this object
        $this->info_parent_def = $this->manager->getElement($this->info_parent, true);
    }

    // support template text
    $support = "(for information on implementing this, see the ".
               "support forums) ";

    // setup allowed elements -----------------------------------------

    $allowed_elements = $config->get('HTML.AllowedElements');
    $allowed_attributes = $config->get('HTML.AllowedAttributes'); // retrieve early

    // HTML.Allowed is only consulted when neither specific list is set
    if (!is_array($allowed_elements) && !is_array($allowed_attributes)) {
        $allowed = $config->get('HTML.Allowed');
        if (is_string($allowed)) {
            list($allowed_elements, $allowed_attributes) = $this->parseTinyMCEAllowedList($allowed);
        }
    }

    if (is_array($allowed_elements)) {
        foreach ($this->info as $name => $d) {
            if (!isset($allowed_elements[$name])) unset($this->info[$name]);
            unset($allowed_elements[$name]);
        }
        // emit errors: whatever is left was requested but is not defined
        foreach ($allowed_elements as $element => $d) {
            $element = htmlspecialchars($element); // PHP doesn't escape errors, be careful!
            trigger_error("Element '$element' is not supported $support", E_USER_WARNING);
        }
    }

    // setup allowed attributes ---------------------------------------

    // entries are removed from this copy as they are matched, so what
    // remains at the end are whitelist entries that matched nothing
    $allowed_attributes_mutable = $allowed_attributes; // by copy!
    if (is_array($allowed_attributes)) {

        // This actually doesn't do anything, since we went away from
        // global attributes. It's possible that userland code uses
        // it, but HTMLModuleManager doesn't!
        foreach ($this->info_global_attr as $attr => $x) {
            $keys = array($attr, "*@$attr", "*.$attr");
            $delete = true;
            foreach ($keys as $key) {
                if ($delete && isset($allowed_attributes[$key])) {
                    $delete = false;
                }
                if (isset($allowed_attributes_mutable[$key])) {
                    unset($allowed_attributes_mutable[$key]);
                }
            }
            if ($delete) unset($this->info_global_attr[$attr]);
        }

        foreach ($this->info as $tag => $info) {
            foreach ($info->attr as $attr => $x) {
                // every spelling under which this attribute may be allowed
                $keys = array("$tag@$attr", $attr, "*@$attr", "$tag.$attr", "*.$attr");
                $delete = true;
                foreach ($keys as $key) {
                    if ($delete && isset($allowed_attributes[$key])) {
                        $delete = false;
                    }
                    if (isset($allowed_attributes_mutable[$key])) {
                        unset($allowed_attributes_mutable[$key]);
                    }
                }
                if ($delete) {
                    if ($this->info[$tag]->attr[$attr]->required) {
                        trigger_error("Required attribute '$attr' in element '$tag' was not allowed, which means '$tag' will not be allowed either", E_USER_WARNING);
                    }
                    unset($this->info[$tag]->attr[$attr]);
                }
            }
        }

        // emit errors
        foreach ($allowed_attributes_mutable as $elattr => $d) {
            $bits = preg_split('/[.@]/', $elattr, 2);
            $c = count($bits);
            switch ($c) {
                case 2:
                    if ($bits[0] !== '*') {
                        $element = htmlspecialchars($bits[0]);
                        $attribute = htmlspecialchars($bits[1]);
                        if (!isset($this->info[$element])) {
                            trigger_error("Cannot allow attribute '$attribute' if element '$element' is not allowed/supported $support");
                        } else {
                            trigger_error("Attribute '$attribute' in element '$element' not supported $support",
                                E_USER_WARNING);
                        }
                        break;
                    }
                    // otherwise fall through
                case 1:
                    $attribute = htmlspecialchars($bits[0]);
                    trigger_error("Global attribute '$attribute' is not ".
                        "supported in any elements $support",
                        E_USER_WARNING);
                    break;
            }
        }

    }

    // setup forbidden elements ---------------------------------------

    $forbidden_elements = $config->get('HTML.ForbiddenElements');
    $forbidden_attributes = $config->get('HTML.ForbiddenAttributes');

    foreach ($this->info as $tag => $info) {
        if (isset($forbidden_elements[$tag])) {
            unset($this->info[$tag]);
            continue;
        }
        foreach ($info->attr as $attr => $x) {
            if (
                isset($forbidden_attributes["$tag@$attr"]) ||
                isset($forbidden_attributes["*@$attr"]) ||
                isset($forbidden_attributes[$attr])
            ) {
                unset($this->info[$tag]->attr[$attr]);
                continue;
            } // this segment might get removed eventually
            elseif (isset($forbidden_attributes["$tag.$attr"])) {
                // $tag.$attr are not user supplied, so no worries!
                trigger_error("Error with $tag.$attr: tag.attr syntax not supported for HTML.ForbiddenAttributes; use tag@attr instead", E_USER_WARNING);
            }
        }
    }
    foreach ($forbidden_attributes as $key => $v) {
        if (strlen($key) < 2) continue;
        if ($key[0] != '*') continue;
        if ($key[1] == '.') {
            trigger_error("Error with $key: *.attr syntax not supported for HTML.ForbiddenAttributes; use attr instead", E_USER_WARNING);
        }
    }

    // setup injectors -----------------------------------------------------
    foreach ($this->info_injector as $i => $injector) {
        if ($injector->checkNeeded($config) !== false) {
            // remove injector that does not have its required
            // elements/attributes present, and is thus not needed.
            unset($this->info_injector[$i]);
        }
    }
}
/**
 * Parses a TinyMCE-flavored Allowed Elements and Attributes list into
 * separate lists for processing. Format is element[attr1|attr2],element2...
 *
 * Spaces and tabs are ignored; entries are separated by commas or line
 * breaks. The element name "*" is not added to the element list, but its
 * attributes are still recorded (as "*.attr").
 *
 * @warning Although it's largely drawn from TinyMCE's implementation,
 *      it is different, and you'll probably have to modify your lists
 * @param $list String list to parse
 * @return array($allowed_elements, $allowed_attributes), both lookup
 *      arrays with true as value; attribute keys are "element.attr"
 * @todo Give this its own class, probably static interface
 */
public function parseTinyMCEAllowedList($list) {

    $list = str_replace(array(' ', "\t"), '', $list);

    $elements = array();
    $attributes = array();

    $chunks = preg_split('/(,|[\n\r]+)/', $list);
    foreach ($chunks as $chunk) {
        if (empty($chunk)) continue;
        // remove TinyMCE element control characters
        if (!strpos($chunk, '[')) {
            // no bracket, or a bracket in first position: the whole
            // chunk is taken as the element name
            $element = $chunk;
            $attr = false;
        } else {
            list($element, $attr) = explode('[', $chunk);
        }
        if ($element !== '*') $elements[$element] = true;
        if (!$attr) continue;
        // remove trailing ] -- but only if there is one, otherwise an
        // unterminated entry such as "a[href" would lose a real character
        if (substr($attr, -1) === ']') {
            $attr = substr($attr, 0, -1);
        }
        $attr = explode('|', $attr);
        foreach ($attr as $key) {
            $attributes["$element.$key"] = true;
        }
    }

    return array($elements, $attributes);

}
/**
 * Represents an XHTML 1.1 module, with information on elements, tags
 * and attributes.
 * @note Even though this is technically XHTML 1.1, it is also used for
 *       regular HTML parsing. We are using modulization as a convenient
 *       way to represent the internals of HTMLDefinition, and our
 *       implementation is by no means conforming and does not directly
 *       use the normative DTDs or XML schemas.
 * @note The public variables in a module should almost directly
 *       correspond to the variables in HTMLPurifier_HTMLDefinition.
 *       However, the prefix info carries no special meaning in these
 *       objects (include it anyway if that's the correspondence though).
 * @todo Consider making some member functions protected
 */
class HTMLPurifier_HTMLModule
{

    // -- Overloadable ----------------------------------------------------

    /**
     * Short unique string identifier of the module
     */
    public $name;

    /**
     * Informally, a list of elements this module changes. Not used in
     * any significant way.
     */
    public $elements = array();

    /**
     * Associative array of element names to element definitions.
     * Some definitions may be incomplete, to be merged in later
     * with the full definition.
     */
    public $info = array();

    /**
     * Associative array of content set names to content set additions.
     * This is commonly used to, say, add an A element to the Inline
     * content set. This corresponds to an internal variable $content_sets
     * and NOT info_content_sets member variable of HTMLDefinition.
     */
    public $content_sets = array();

    /**
     * Associative array of attribute collection names to attribute
     * collection additions. More rarely used for adding attributes to
     * the global collections. Example is the StyleAttribute module adding
     * the style attribute to the Core. Corresponds to HTMLDefinition's
     * attr_collections->info, since the object's data is only info,
     * with extra behavior associated with it.
     */
    public $attr_collections = array();

    /**
     * Associative array of deprecated tag name to HTMLPurifier_TagTransform
     */
    public $info_tag_transform = array();

    /**
     * List of HTMLPurifier_AttrTransform to be performed before validation.
     */
    public $info_attr_transform_pre = array();

    /**
     * List of HTMLPurifier_AttrTransform to be performed after validation.
     */
    public $info_attr_transform_post = array();

    /**
     * List of HTMLPurifier_Injector to be performed during well-formedness fixing.
     * An injector will only be invoked if all of its pre-requisites are met;
     * if an injector fails setup, there will be no error; it will simply be
     * silently disabled.
     */
    public $info_injector = array();

    /**
     * Boolean flag that indicates whether or not getChildDef is implemented.
     * For optimization reasons: may save a call to a function. Be sure
     * to set it if you do implement getChildDef(), otherwise it will have
     * no effect!
     */
    public $defines_child_def = false;

    /**
     * Boolean flag whether or not this module is safe. If it is not safe, all
     * of its members are unsafe. Modules are safe by default (this might be
     * slightly dangerous, but it doesn't make much sense to force HTML Purifier,
     * which is based off of safe HTML, to explicitly say, "This is safe," even
     * though there are modules which are "unsafe")
     *
     * @note Previously, safety could be applied at an element level granularity.
     *       We've removed this ability, so in order to add "unsafe" elements
     *       or attributes, a dedicated module with this property set to false
     *       must be used.
     */
    public $safe = true;

    /**
     * Retrieves a proper HTMLPurifier_ChildDef subclass based on
     * content_model and content_model_type member variables of
     * the HTMLPurifier_ElementDef class. There is a similar function
     * in HTMLPurifier_HTMLDefinition.
     * This base implementation always returns false.
     * @param $def HTMLPurifier_ElementDef instance
     * @return HTMLPurifier_ChildDef subclass
     */
    public function getChildDef($def) {return false;}

    // -- Convenience -----------------------------------------------------

    /**
     * Convenience function that sets up a new element
     * @param $element Name of element to add
     * @param $type What content set should element be registered to?
     *              Set as false to skip this step.
     * @param $contents Allowed children in form of:
     *              "$content_model_type: $content_model"
     * @param $attr_includes What attribute collections to register to
     *              elements?
     * @param $attr What unique attributes does the element define?
     * @note See ElementDef for in-depth descriptions of these parameters.
     * @return Created element definition object, so you
     *         can set advanced parameters
     */
    public function addElement($element, $type, $contents, $attr_includes = array(), $attr = array()) {
        $this->elements[] = $element;
        // parse content_model
        list($content_model_type, $content_model) = $this->parseContents($contents);
        // merge in attribute inclusions
        $this->mergeInAttrIncludes($attr, $attr_includes);
        // add element to content sets
        if ($type) $this->addElementToContentSet($element, $type);
        // create element
        $this->info[$element] = HTMLPurifier_ElementDef::create(
            $content_model, $content_model_type, $attr
        );
        // literal object $contents means direct child manipulation
        if (!is_string($contents)) $this->info[$element]->child = $contents;
        return $this->info[$element];
    }

    /**
     * Convenience function that creates a totally blank, non-standalone
     * element.
     * If the element is already defined in this module, an error is
     * triggered and the existing definition is returned unchanged.
     * @param $element Name of element to create
     * @return Created element
     */
    public function addBlankElement($element) {
        if (!isset($this->info[$element])) {
            $this->elements[] = $element;
            $this->info[$element] = new HTMLPurifier_ElementDef();
            $this->info[$element]->standalone = false;
        } else {
            trigger_error("Definition for $element already exists in module, cannot redefine");
        }
        return $this->info[$element];
    }

    /**
     * Convenience function that registers an element to a content set.
     * Members of a set are accumulated as one string, joined by " | ".
     * @param $element Element to register
     * @param $type Name content set (warning: case sensitive, usually upper-case
     *        first letter)
     */
    public function addElementToContentSet($element, $type) {
        if (!isset($this->content_sets[$type])) $this->content_sets[$type] = '';
        else $this->content_sets[$type] .= ' | ';
        $this->content_sets[$type] .= $element;
    }

    /**
     * Convenience function that transforms single-string contents
     * into separate content model and content model type
     * @param $contents Allowed children in form of:
     *                  "$content_model_type: $content_model"
     *                  or one of the shorthands 'Empty', 'Inline', 'Flow'
     * @return array($content_model_type, $content_model); the type is
     *         lower-cased, both parts are trimmed
     * @note If contents is an object, an array of two nulls will be
     *       returned, and the callee needs to take the original $contents
     *       and use it directly.
     */
    public function parseContents($contents) {
        if (!is_string($contents)) return array(null, null); // defer
        switch ($contents) {
            // check for shorthand content model forms
            case 'Empty':
                return array('empty', '');
            case 'Inline':
                return array('optional', 'Inline | #PCDATA');
            case 'Flow':
                return array('optional', 'Flow | #PCDATA');
        }
        // Pad to two parts so that a string without a colon yields an
        // empty content model instead of an undefined-offset error.
        $parts = array_pad(explode(':', $contents), 2, '');
        $content_model_type = strtolower(trim($parts[0]));
        $content_model = trim($parts[1]);
        return array($content_model_type, $content_model);
    }

    /**
     * Convenience function that merges a list of attribute includes into
     * an attribute array. The includes are stored under index 0 of $attr.
     * @param $attr Reference to attr array to modify
     * @param $attr_includes Array of includes / string include to merge in
     */
    public function mergeInAttrIncludes(&$attr, $attr_includes) {
        if (!is_array($attr_includes)) {
            if (empty($attr_includes)) $attr_includes = array();
            else $attr_includes = array($attr_includes);
        }
        $attr[0] = $attr_includes;
    }

    /**
     * Convenience function that generates a lookup table with boolean
     * true value.
     * @param $list List of values to turn into a lookup
     * @note You can also pass an arbitrary number of arguments in
     *       place of the regular argument
     * @note null values are skipped
     * @return Lookup array equivalent of list
     */
    public function makeLookup($list) {
        if (is_string($list)) $list = func_get_args();
        $ret = array();
        foreach ($list as $value) {
            if (is_null($value)) continue;
            $ret[$value] = true;
        }
        return $ret;
    }

    /**
     * Lazy load construction of the module after determining whether
     * or not it's needed, and also when a finalized configuration object
     * is available.
     * @param $config Instance of HTMLPurifier_Config
     */
    public function setup($config) {}

}
/**
 * Keeps track of the registered modules and doctypes, activates the
 * modules belonging to the configured doctype and merges their element
 * definitions.
 */
class HTMLPurifier_HTMLModuleManager
{

    /**
     * Instance of HTMLPurifier_DoctypeRegistry
     */
    public $doctypes;

    /**
     * Instance of current doctype
     */
    public $doctype;

    /**
     * Instance of HTMLPurifier_AttrTypes
     */
    public $attrTypes;

    /**
     * Active instances of modules for the specified doctype are
     * indexed, by name, in this array.
     */
    public $modules = array();

    /**
     * Array of recognized HTMLPurifier_Module instances, indexed by
     * module's class name. This array is usually lazy loaded, but a
     * user can overload a module by pre-emptively registering it.
     */
    public $registeredModules = array();

    /**
     * List of extra modules that were added by the user using addModule().
     * These get unconditionally merged into the current doctype, whatever
     * it may be.
     */
    public $userModules = array();

    /**
     * Associative array of element name to list of modules that have
     * definitions for the element; this array is dynamically filled.
     */
    public $elementLookup = array();

    /** List of prefixes we should use for registering small names */
    public $prefixes = array('HTMLPurifier_HTMLModule_');

    public $contentSets;     /**< Instance of HTMLPurifier_ContentSets */
    public $attrCollections; /**< Instance of HTMLPurifier_AttrCollections */

    /** If set to true, unsafe elements and attributes will be allowed */
    public $trusted = false;

    /**
     * Creates the attribute type and doctype registries and registers
     * the built-in doctypes together with their module lists.
     */
    public function __construct() {

        // editable internal objects
        $this->attrTypes = new HTMLPurifier_AttrTypes();
        $this->doctypes  = new HTMLPurifier_DoctypeRegistry();

        // setup basic modules
        $common = array(
            'CommonAttributes', 'Text', 'Hypertext', 'List',
            'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
            'StyleAttribute',
            // Unsafe:
            'Scripting', 'Object', 'Forms',
            // Sorta legacy, but present in strict:
            'Name',
        );
        $transitional = array('Legacy', 'Target');
        $xml = array('XMLCommonAttributes');
        $non_xml = array('NonXMLCommonAttributes');

        // setup basic doctypes
        $this->doctypes->register(
            'HTML 4.01 Transitional', false,
            array_merge($common, $transitional, $non_xml),
            array('Tidy_Transitional', 'Tidy_Proprietary'),
            array(),
            '-//W3C//DTD HTML 4.01 Transitional//EN',
            'http://www.w3.org/TR/html4/loose.dtd'
        );

        $this->doctypes->register(
            'HTML 4.01 Strict', false,
            array_merge($common, $non_xml),
            array('Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
            array(),
            '-//W3C//DTD HTML 4.01//EN',
            'http://www.w3.org/TR/html4/strict.dtd'
        );

        $this->doctypes->register(
            'XHTML 1.0 Transitional', true,
            array_merge($common, $transitional, $xml, $non_xml),
            array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Name'),
            array(),
            '-//W3C//DTD XHTML 1.0 Transitional//EN',
            'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        );

        $this->doctypes->register(
            'XHTML 1.0 Strict', true,
            array_merge($common, $xml, $non_xml),
            array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
            array(),
            '-//W3C//DTD XHTML 1.0 Strict//EN',
            'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
        );

        $this->doctypes->register(
            'XHTML 1.1', true,
            array_merge($common, $xml, array('Ruby')),
            array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict', 'Tidy_Name'), // Tidy_XHTML1_1
            array(),
            '-//W3C//DTD XHTML 1.1//EN',
            'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
        );

    }

    /**
     * Registers a module to the recognized module list, useful for
     * overloading pre-existing modules.
     * @param $module Mixed: string module name, with or without
     *                HTMLPurifier_HTMLModule prefix, or instance of
     *                subclass of HTMLPurifier_HTMLModule.
     * @param $overload Boolean whether or not to overload previous modules.
     *                  If this is not set, and you do overload a module,
     *                  HTML Purifier will complain with a warning.
     * @note This function will not call autoload, you must instantiate
     *       (and thus invoke) autoload outside the method.
     * @note If a string is passed as a module name, different variants
     *       will be tested in this order:
     *          - Check for HTMLPurifier_HTMLModule_$name
     *          - Check all prefixes with $name in order they were added
     *          - Check for literal object name
     *          - Throw fatal error
     *       If your object name collides with an internal class, specify
     *       your module manually. All modules must have been included
     *       externally: registerModule will not perform inclusions for you!
     */
    public function registerModule($module, $overload = false) {
        if (is_string($module)) {
            // attempt to load the module
            $original_module = $module;
            $ok = false;
            foreach ($this->prefixes as $prefix) {
                $module = $prefix . $original_module;
                if (class_exists($module)) {
                    $ok = true;
                    break;
                }
            }
            if (!$ok) {
                $module = $original_module;
                if (!class_exists($module)) {
                    trigger_error($original_module . ' module does not exist',
                        E_USER_ERROR);
                    return;
                }
            }
            $module = new $module();
        }
        if (empty($module->name)) {
            trigger_error('Module instance of ' . get_class($module) . ' must have name');
            return;
        }
        if (!$overload && isset($this->registeredModules[$module->name])) {
            trigger_error('Overloading ' . $module->name . ' without explicit overload parameter', E_USER_WARNING);
        }
        $this->registeredModules[$module->name] = $module;
    }

    /**
     * Adds a module to the current doctype by first registering it,
     * and then tacking it on to the active doctype
     * @param $module Module name or module instance, see registerModule()
     */
    public function addModule($module) {
        $this->registerModule($module);
        if (is_object($module)) $module = $module->name;
        $this->userModules[] = $module;
    }

    /**
     * Adds a class prefix that registerModule() will use to resolve a
     * string name to a concrete class
     * @param $prefix Class name prefix to append to the list
     */
    public function addPrefix($prefix) {
        $this->prefixes[] = $prefix;
    }

    /**
     * Performs processing on modules, after being called you may
     * use getElement() and getElements()
     * @param $config Instance of HTMLPurifier_Config
     */
    public function setup($config) {

        $this->trusted = $config->get('HTML.Trusted');

        // generate
        $this->doctype = $this->doctypes->make($config);
        $modules = $this->doctype->modules;

        // take out the default modules that aren't allowed
        $lookup = $config->get('HTML.AllowedModules');
        $special_cases = $config->get('HTML.CoreModules');

        if (is_array($lookup)) {
            foreach ($modules as $k => $m) {
                if (isset($special_cases[$m])) continue;
                if (!isset($lookup[$m])) unset($modules[$k]);
            }
        }

        // custom modules
        if ($config->get('HTML.Proprietary')) {
            $modules[] = 'Proprietary';
        }
        if ($config->get('HTML.SafeObject')) {
            $modules[] = 'SafeObject';
        }
        if ($config->get('HTML.SafeEmbed')) {
            $modules[] = 'SafeEmbed';
        }
        if ($config->get('HTML.Nofollow')) {
            $modules[] = 'Nofollow';
        }

        // merge in custom modules
        $modules = array_merge($modules, $this->userModules);

        foreach ($modules as $module) {
            $this->processModule($module);
            $this->modules[$module]->setup($config);
        }

        // Some doctypes name the same tidy module more than once; since
        // modules are keyed by name a repeat would only run its setup
        // a second time, so repeats are skipped.
        foreach (array_unique($this->doctype->tidyModules) as $module) {
            $this->processModule($module);
            $this->modules[$module]->setup($config);
        }

        // prepare any injectors
        foreach ($this->modules as $module) {
            $n = array();
            foreach ($module->info_injector as $i => $injector) {
                if (!is_object($injector)) {
                    // short names are resolved to injector classes
                    $class = "HTMLPurifier_Injector_$injector";
                    $injector = new $class;
                }
                $n[$injector->name] = $injector;
            }
            $module->info_injector = $n;
        }

        // setup lookup table based on all valid modules
        foreach ($this->modules as $module) {
            foreach ($module->info as $name => $def) {
                if (!isset($this->elementLookup[$name])) {
                    $this->elementLookup[$name] = array();
                }
                $this->elementLookup[$name][] = $module->name;
            }
        }

        // note the different choice
        $this->contentSets = new HTMLPurifier_ContentSets(
            // content set assembly deals with all possible modules,
            // not just ones deemed to be "safe"
            $this->modules
        );
        $this->attrCollections = new HTMLPurifier_AttrCollections(
            $this->attrTypes,
            // there is no way to directly disable a global attribute,
            // but using AllowedAttributes or simply not including
            // the module in your custom doctype should be sufficient
            $this->modules
        );
    }

    /**
     * Takes a module and adds it to the active module collection,
     * registering it if necessary.
     * @param $module Module name, or module instance. An instance is
     *                always (re-)registered and then activated under
     *                its name.
     */
    public function processModule($module) {
        if (is_object($module)) {
            // an object cannot serve as an array offset, so continue
            // with the name once the instance is registered
            $this->registerModule($module);
            $module = $module->name;
        } elseif (!isset($this->registeredModules[$module])) {
            $this->registerModule($module);
        }
        $this->modules[$module] = $this->registeredModules[$module];
    }

    /**
     * Retrieves merged element definitions.
     * Modules that are not safe are skipped unless trusted mode is on.
     * @return Array of HTMLPurifier_ElementDef
     */
    public function getElements() {

        $elements = array();
        foreach ($this->modules as $module) {
            if (!$this->trusted && !$module->safe) continue;
            foreach ($module->info as $name => $v) {
                if (isset($elements[$name])) continue;
                $elements[$name] = $this->getElement($name);
            }
        }

        // remove dud elements, this happens when an element that
        // appeared to be safe actually wasn't
        foreach ($elements as $n => $v) {
            if ($v === false) unset($elements[$n]);
        }

        return $elements;

    }

    /**
     * Retrieves a single merged element definition
     * @param $name Name of element
     * @param $trusted Boolean trusted overriding parameter: set to true
     *                 if you want the full version of an element
     * @return Merged HTMLPurifier_ElementDef, or false if the element is
     *         unknown or no standalone definition is available for it
     * @note Modules are iterated over twice (once in getElements() and
     *       once here): every module that defines the element
     *       contributes to the merged definition.
     */
    public function getElement($name, $trusted = null) {

        if (!isset($this->elementLookup[$name])) {
            return false;
        }

        // setup global state variables
        $def = false;
        if ($trusted === null) $trusted = $this->trusted;

        // iterate through each module that has registered itself to this
        // element
        foreach ($this->elementLookup[$name] as $module_name) {

            $module = $this->modules[$module_name];

            // refuse to create/merge from a module that is deemed unsafe--
            // pretend the module doesn't exist--when trusted mode is not on.
            if (!$trusted && !$module->safe) {
                continue;
            }

            // clone is used because, ideally speaking, the original
            // definition should not be modified. Usually, this will
            // make no difference, but for consistency's sake
            $new_def = clone $module->info[$name];

            if (!$def && $new_def->standalone) {
                $def = $new_def;
            } elseif ($def) {
                // This will occur even if $new_def is standalone. In practice,
                // this will usually result in a full replacement.
                $def->mergeIn($new_def);
            } else {
                // :TODO:
                // non-standalone definitions that don't have a standalone
                // to merge into could be deferred to the end
                continue;
            }

            // attribute value expansions
            $this->attrCollections->performInclusions($def->attr);
            $this->attrCollections->expandIdentifiers($def->attr, $this->attrTypes);

            // descendants_are_inline, for ChildDef_Chameleon
            if (is_string($def->content_model) &&
                strpos($def->content_model, 'Inline') !== false) {
                if ($name != 'del' && $name != 'ins') {
                    // this is for you, ins/del
                    $def->descendants_are_inline = true;
                }
            }

            $this->contentSets->generateChildDef($def, $module);
        }

        // This can occur if there is a blank definition, but no base to
        // mix it in with
        if (!$def) return false;

        // add information on required attributes
        foreach ($def->attr as $attr_name => $attr_def) {
            if ($attr_def->required) {
                $def->required_attr[] = $attr_name;
            }
        }

        return $def;

    }

}
/**
 * Component of HTMLPurifier_AttrContext that accumulates IDs to prevent dupes
 * @note In Slashdot-speak, dupe means duplicate.
 * @note The default constructor does not accept $config or $context objects:
 *       you must use the static build() factory method to perform initialization.
 */
class HTMLPurifier_IDAccumulator
{

    /**
     * Lookup table of IDs we've accumulated.
     * @public
     */
    public $ids = array();

    /**
     * Builds an IDAccumulator, also initializing the default blacklist
     * (the Attr.IDBlacklist directive).
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context (not used here)
     * @return Fully initialized HTMLPurifier_IDAccumulator
     */
    public static function build($config, $context) {
        $id_accumulator = new HTMLPurifier_IDAccumulator();
        $id_accumulator->load($config->get('Attr.IDBlacklist'));
        return $id_accumulator;
    }

    /**
     * Add an ID to the lookup table.
     * @param $id ID to be added.
     * @return Bool status, true if success, false if there's a dupe
     */
    public function add($id) {
        if (isset($this->ids[$id])) return false;
        return $this->ids[$id] = true;
    }

    /**
     * Load a list of IDs into the lookup table
     * @param $array_of_ids Array of IDs to load
     * @note This function doesn't care about duplicates
     */
    public function load($array_of_ids) {
        foreach ($array_of_ids as $id) {
            $this->ids[$id] = true;
        }
    }

}
5338 * Injects tokens into the document while parsing for well-formedness.
5339 * This enables "formatter-like" functionality such as auto-paragraphing,
5340 * smiley-ification and linkification to take place.
5342 * A note on how handlers create changes: this is done by assigning a new
5343 * value to the $token reference. These values can take a variety of forms and
5344 * are best described in the documentation of HTMLPurifier_Strategy_MakeWellFormed->processToken().
5347 * @todo Allow injectors to request a re-run on their output. This
5348 * would help if an operation is recursive.
5350 abstract class HTMLPurifier_Injector
/**
 * Advisory name of injector, this is for friendly error messages
 */
public $name;

/**
 * Instance of HTMLPurifier_HTMLDefinition
 */
protected $htmlDefinition;

/**
 * Reference to CurrentNesting variable in Context. This is an array
 * list of tokens that we are currently "inside"
 */
protected $currentNesting;

/**
 * Reference to InputTokens variable in Context. This is an array
 * list of the input tokens that are being processed.
 */
protected $inputTokens;

/**
 * Reference to InputIndex variable in Context. This is an integer
 * array index for $this->inputTokens that indicates what token
 * is currently being processed.
 */
protected $inputIndex;

/**
 * Array of elements and attributes this injector creates and therefore
 * need to be allowed by the definition. Takes form of
 * array('element' => array('attr', 'attr2'), 'element2')
 */
public $needed = array();

/**
 * Index of inputTokens to rewind to.
 */
protected $rewind = false;
/**
 * Rewind to a spot to re-perform processing. This is useful if you
 * deleted a node, and now need to see if this change affected any
 * earlier nodes. Rewinding does not affect other injectors, and can
 * result in infinite loops if not used carefully.
 * @param $index Index of inputTokens to rewind to
 * @warning HTML Purifier will prevent you from fast-forwarding with this
 *          function.
 */
public function rewind($index) {
    $this->rewind = $index;
}
/**
 * Retrieves rewind, and then unsets it.
 * @return The index last passed to rewind(), or false if none is pending
 */
public function getRewind() {
    $r = $this->rewind;
    $this->rewind = false;
    return $r;
}
/**
 * Prepares the injector by giving it the config and context objects:
 * this allows references to important variables to be made within
 * the injector. This function also checks if the HTML environment
 * will work with the Injector (see checkNeeded()).
 * @param $config Instance of HTMLPurifier_Config
 * @param $context Instance of HTMLPurifier_Context
 * @return Boolean false if success, string of missing needed element/attribute if failure
 */
public function prepare($config, $context) {
    $this->htmlDefinition = $config->getHTMLDefinition();
    // Even though this might fail, some unit tests ignore this and
    // still test checkNeeded, so be careful. Maybe get rid of that
    // dependency.
    $result = $this->checkNeeded($config);
    if ($result !== false) return $result;
    // bound by reference, so later changes in the context are seen here
    $this->currentNesting =& $context->get('CurrentNesting');
    $this->inputTokens    =& $context->get('InputTokens');
    $this->inputIndex     =& $context->get('InputIndex');
    return false;
}
/**
 * This function checks if the HTML environment
 * will work with the Injector: if p tags are not allowed, the
 * Auto-Paragraphing injector should not be enabled.
 * @param $config Instance of HTMLPurifier_Config
 * @return Boolean false if success, string of missing needed element/attribute if failure
 */
public function checkNeeded($config) {
    $def = $config->getHTMLDefinition();
    foreach ($this->needed as $element => $attributes) {
        // integer key: the value is a bare element name without attributes
        if (is_int($element)) $element = $attributes;
        if (!isset($def->info[$element])) return $element;
        if (!is_array($attributes)) continue;
        foreach ($attributes as $name) {
            if (!isset($def->info[$element]->attr[$name])) return "$element.$name";
        }
    }
    return false;
}
/**
 * Tests if the context node allows a certain element
 * @param $name Name of element to test for
 * @return True if element is allowed, false if it is not
 */
public function allowsElement($name) {
    if (!empty($this->currentNesting)) {
        // peek at the innermost open element: pop it and put it back
        $parent_token = array_pop($this->currentNesting);
        $this->currentNesting[] = $parent_token;
        $parent = $this->htmlDefinition->info[$parent_token->name];
    } else {
        $parent = $this->htmlDefinition->info_parent_def;
    }
    if (!isset($parent->child->elements[$name]) || isset($parent->excludes[$name])) {
        return false;
    }
    // check for exclusion by any of the outer ancestors
    for ($i = count($this->currentNesting) - 2; $i >= 0; $i--) {
        $node = $this->currentNesting[$i];
        $def  = $this->htmlDefinition->info[$node->name];
        if (isset($def->excludes[$name])) return false;
    }
    return true;
}
/**
 * Iterator function, which starts with the next token and continues until
 * you reach the end of the input tokens.
 * @warning Please prevent previous references from interfering with this
 *          functions by setting $i = null beforehand!
 * @param &$i Current integer index variable for inputTokens
 * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
 * @return Boolean true if $current was set, false once the index is past the last token
 */
protected function forward(&$i, &$current) {
    if ($i === null) $i = $this->inputIndex + 1;
    else $i++;
    if (!isset($this->inputTokens[$i])) return false;
    $current = $this->inputTokens[$i];
    return true;
}
/**
 * Similar to forward(), but accepts a third parameter $nesting (which
 * should be initialized at 0, or null) and stops when we hit the end tag
 * for the node $this->inputIndex starts in.
 * @param &$i Current integer index variable for inputTokens
 * @param &$current Current token variable
 * @param &$nesting Running count of start tags opened since iteration began
 * @return True while iteration may continue, false at the end of the
 *         input or at the end tag closing the starting node
 */
protected function forwardUntilEndToken(&$i, &$current, &$nesting) {
    if (!$this->forward($i, $current)) {
        return false;
    }
    if ($nesting === null) {
        $nesting = 0;
    }
    if ($current instanceof HTMLPurifier_Token_Start) {
        $nesting++;
        return true;
    }
    if ($current instanceof HTMLPurifier_Token_End) {
        // An end tag seen at depth zero closes the node we started in.
        if ($nesting <= 0) {
            return false;
        }
        $nesting--;
    }
    return true;
}
/**
 * Backward iterator step: starts at the token before $this->inputIndex
 * and retreats by one on every subsequent call until the beginning of the
 * input tokens is passed.
 * @warning Please prevent previous references from interfering with this
 *          function by setting $i = null beforehand!
 * @param &$i Current integer index variable for inputTokens
 * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
 * @return True if $current was filled with a token, false once the index
 *         has dropped below zero
 */
protected function backward(&$i, &$current) {
    $i = ($i === null) ? $this->inputIndex - 1 : $i - 1;
    if ($i >= 0) {
        $current = $this->inputTokens[$i];
        return true;
    }
    return false;
}
/**
 * Initializes the iterator at the current position. Use in a do {} while;
 * loop to force the forward() and backward() functions to start at the
 * current location.
 * @warning Please prevent previous references from interfering with this
 *          function by setting $i = null beforehand!
 * @param &$i Current integer index variable for inputTokens
 * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
 */
protected function current(&$i, &$current) {
    if ($i === null) {
        $i = $this->inputIndex;
    }
    $current = $this->inputTokens[$i];
}
/**
 * Handler that is called when a text token is processed.
 * The base implementation does nothing.
 * @param &$token Text token, passed by reference
 */
public function handleText(&$token)
{
}
/**
 * Handler that is called when a start or empty token is processed.
 * The base implementation does nothing.
 * @param &$token Start or empty token, passed by reference
 */
public function handleElement(&$token)
{
}
/**
 * Handler that is called when an end token is processed.
 * The base implementation only forwards the token to notifyEnd().
 * @param &$token End token, passed by reference
 */
public function handleEnd(&$token)
{
    $this->notifyEnd($token);
}
/**
 * Notifier that is called when an end token is processed.
 * The base implementation does nothing.
 * @note This differs from handlers in that the token is read-only
 *       (it is received by value, not by reference).
 * @param $token End token
 */
public function notifyEnd($token)
{
}
/**
 * Represents a language and defines localizable string formatting and
 * other functions, as well as the localized messages for HTML Purifier.
 */
class HTMLPurifier_Language
{

    /**
     * ISO 639 language code of language. Prefers shortest possible version
     */
    public $code = 'en';

    /**
     * Fallback language code
     */
    public $fallback = false;

    /**
     * Array of localizable messages
     */
    public $messages = array();

    /**
     * Array of localizable error codes
     */
    public $errorNames = array();

    /**
     * True if no message file was found for this language, so English
     * is being used instead. Check this if you'd like to notify the
     * user that they've used a non-supported language.
     */
    public $error = false;

    /**
     * Has the language object been loaded yet?
     * @todo Make it private, fix usage in HTMLPurifier_LanguageTest
     */
    public $_loaded = false;

    /**
     * Instances of HTMLPurifier_Config and HTMLPurifier_Context
     */
    protected $config, $context;

    /**
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     */
    public function __construct($config, $context) {
        $this->config  = $config;
        $this->context = $context;
    }

    /**
     * Copies this language's data out of the language factory cache into
     * the properties named by the factory's key list.
     * @note This is a lazy loader: it does nothing once $_loaded is set.
     */
    public function load() {
        if ($this->_loaded) {
            return;
        }
        $factory = HTMLPurifier_LanguageFactory::instance();
        $factory->loadLanguage($this->code);
        foreach ($factory->keys as $property) {
            $this->$property = $factory->cache[$this->code][$property];
        }
        $this->_loaded = true;
    }

    /**
     * Retrieves a localised message.
     * @param $key string identifier of message
     * @return string localised message, or "[$key]" when none is defined
     */
    public function getMessage($key) {
        if (!$this->_loaded) {
            $this->load();
        }
        return isset($this->messages[$key]) ? $this->messages[$key] : "[$key]";
    }

    /**
     * Retrieves a localised error name.
     * @param $int integer error number, corresponding to PHP's error
     *             reporting
     * @return string localised message, or "[Error: $int]" when none is
     *         defined
     */
    public function getErrorName($int) {
        if (!$this->_loaded) {
            $this->load();
        }
        return isset($this->errorNames[$int]) ? $this->errorNames[$int] : "[Error: $int]";
    }

    /**
     * Converts an array list into a string readable representation,
     * joining items with the 'Item separator' message and placing the
     * 'Item separator last' message before the final item.
     * @param $array List (sequential integer keys) of items
     * @return string
     */
    public function listify($array) {
        $separator       = $this->getMessage('Item separator');
        $final_separator = $this->getMessage('Item separator last');
        $total  = count($array);
        $output = '';
        for ($idx = 0; $idx < $total; $idx++) {
            if ($idx > 0) {
                // every gap gets the plain separator except the last one
                $output .= ($idx + 1 < $total) ? $separator : $final_separator;
            }
            $output .= $array[$idx];
        }
        return $output;
    }

    /**
     * Formats a localised message with passed parameters
     * @param $key string identifier of message
     * @param $args Parameters to substitute in
     * @return string localised message
     * @todo Implement conditionals? Right now, some messages make
     *     reference to line numbers, but those aren't always available
     */
    public function formatMessage($key, $args = array()) {
        if (!$this->_loaded) {
            $this->load();
        }
        if (!isset($this->messages[$key])) {
            return "[$key]";
        }
        $template  = $this->messages[$key];
        $subst     = array();
        $generator = false; // fetched from the context on first token argument

        foreach ($args as $i => $value) {
            $placeholder = '$' . $i;

            if (is_object($value)) {
                // Only tokens are understood; other objects contribute
                // no substitutions at all.
                if ($value instanceof HTMLPurifier_Token) {
                    // factor this out some time
                    if (!$generator) {
                        $generator = $this->context->get('Generator');
                    }
                    if (isset($value->name)) {
                        $subst[$placeholder . '.Name'] = $value->name;
                    }
                    if (isset($value->data)) {
                        $subst[$placeholder . '.Data'] = $value->data;
                    }
                    $serialized = $generator->generateFromToken($value);
                    $subst[$placeholder . '.Serialized'] = $serialized;
                    $subst[$placeholder . '.Compact']    = $serialized;
                    // a more complex algorithm for compact representation
                    // could be introduced for all types of tokens. This
                    // may need to be factored out into a dedicated class
                    if (!empty($value->attr)) {
                        $stripped_token = clone $value;
                        $stripped_token->attr = array();
                        $subst[$placeholder . '.Compact'] = $generator->generateFromToken($stripped_token);
                    }
                    $subst[$placeholder . '.Line'] = $value->line ? $value->line : 'unknown';
                }
                continue;
            }

            if (is_array($value)) {
                $keys = array_keys($value);
                if (array_keys($keys) === $keys) {
                    // list
                    $subst[$placeholder] = $this->listify($value);
                } else {
                    // associative array
                    // no $i implementation yet, sorry
                    $subst[$placeholder . '.Keys']   = $this->listify($keys);
                    $subst[$placeholder . '.Values'] = $this->listify(array_values($value));
                }
                continue;
            }

            $subst[$placeholder] = $value;
        }

        return strtr($template, $subst);
    }

}
/**
 * Class responsible for generating HTMLPurifier_Language objects, managing
 * caching and fallbacks.
 * @note Thanks to MediaWiki for the general logic, although this version
 *       has been entirely rewritten
 * @todo Serialized cache for languages
 */
class HTMLPurifier_LanguageFactory
{

    /**
     * Cache of language code information used to load HTMLPurifier_Language objects
     * Structure is: $factory->cache[$language_code][$key] = $value
     * @value array map
     */
    public $cache;

    /**
     * Valid keys in the HTMLPurifier_Language object. Designates which
     * variables to slurp out of a message file.
     * @value array list
     */
    public $keys = array('fallback', 'messages', 'errorNames');

    /**
     * Instance of HTMLPurifier_AttrDef_Lang to validate language codes
     * @value object HTMLPurifier_AttrDef_Lang
     */
    protected $validator;

    /**
     * Base directory under which the Language/ sub-directories are looked
     * up, without trailing slash. Assigned by setup().
     * @value string filename
     */
    protected $dir;

    /**
     * Keys whose contents are a hash map and can be merged
     * @value array lookup
     */
    protected $mergeable_keys_map = array('messages' => true, 'errorNames' => true);

    /**
     * Keys whose contents are a list and can be merged
     * @value array lookup
     */
    protected $mergeable_keys_list = array();

    /**
     * Retrieve sole instance of the factory.
     * @param $prototype Optional prototype to overload sole instance with,
     *                   or bool true to reset to default factory.
     * @return The shared factory instance
     */
    public static function instance($prototype = null) {
        static $instance = null;
        // The reset request (true) has to be recognised before the generic
        // "a prototype was passed" case; otherwise the literal true would
        // itself be stored as the instance.
        if ($prototype === true || ($prototype === null && $instance === null)) {
            $instance = new HTMLPurifier_LanguageFactory();
            $instance->setup();
        } elseif ($prototype !== null) {
            $instance = $prototype;
        }
        return $instance;
    }

    /**
     * Sets up the singleton, much like a constructor
     * @note Prevents people from getting this outside of the singleton
     */
    public function setup() {
        $this->validator = new HTMLPurifier_AttrDef_Lang();
        $this->dir = HTMLPURIFIER_PREFIX . '/HTMLPurifier';
    }

    /**
     * Creates a language object, handles class fallbacks
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @param $code Code to override configuration with. Private parameter.
     * @return Language object whose $code is the validated language code
     */
    public function create($config, $context, $code = false) {

        // validate language code
        if ($code === false) {
            $code = $this->validator->validate(
                $config->get('Core.Language'), $config, $context
            );
        } else {
            $code = $this->validator->validate($code, $config, $context);
        }
        if ($code === false) $code = 'en'; // malformed code becomes English

        $pcode = str_replace('-', '_', $code); // make valid PHP classname

        if ($code == 'en') {
            $lang = new HTMLPurifier_Language($config, $context);
        } else {
            $class = 'HTMLPurifier_Language_' . $pcode;
            $file  = $this->dir . '/Language/classes/' . $code . '.php';
            if (file_exists($file) || class_exists($class, false)) {
                // The class file is not included here; when only the file
                // exists, instantiation depends on the class being loadable.
                $lang = new $class($config, $context);
            } else {
                // No dedicated class: build the fallback language instead
                $raw_fallback = $this->getFallbackFor($code);
                $fallback = $raw_fallback ? $raw_fallback : 'en';
                $lang = $this->create($config, $context, $fallback);
                if (!$raw_fallback) {
                    // no fallback declared: flag that English was substituted
                    $lang->error = true;
                }
            }
        }

        $lang->code = $code;

        return $lang;

    }

    /**
     * Returns the fallback language for language
     * @note Loads the original language into cache
     * @param $code string language code
     * @return Fallback entry cached for $code
     */
    public function getFallbackFor($code) {
        $this->loadLanguage($code);
        return $this->cache[$code]['fallback'];
    }

    /**
     * Loads language into the cache, handles message file and fallbacks
     * @param $code string language code
     */
    public function loadLanguage($code) {
        // Recursion guard: codes whose fallback chain is being resolved
        // right now. The variable is static and therefore shared between
        // factory instances, so each entry is removed again once its
        // recursive load has returned.
        static $languages_seen = array();

        // abort if we've already loaded it
        if (isset($this->cache[$code])) return;

        // generate filename
        $filename = $this->dir . '/Language/messages/' . $code . '.php';

        // default fallback : may be overwritten by the ensuing include
        $fallback = ($code != 'en') ? 'en' : false;

        // load primary localisation
        if (!file_exists($filename)) {
            // skip the include: will rely solely on fallback
            $filename = $this->dir . '/Language/messages/en.php';
            $cache = array();
        } else {
            // the message file defines (some of) the variables named in $this->keys
            include $filename;
            $cache = compact($this->keys);
        }

        // load fallback localisation
        if (!empty($fallback)) {

            // infinite recursion guard
            $circular = isset($languages_seen[$code]);
            if ($circular) {
                trigger_error('Circular fallback reference in language ' .
                    $code, E_USER_ERROR);
                // only reached when an error handler resumed execution
                $fallback = 'en';
            }
            $languages_seen[$code] = true;

            // load the fallback recursively
            $this->loadLanguage($fallback);
            $fallback_cache = $this->cache[$fallback];

            // The outermost visit of a code owns its guard entry.
            if (!$circular) unset($languages_seen[$code]);

            // merge fallback with current language
            foreach ($this->keys as $key) {
                if (isset($cache[$key]) && isset($fallback_cache[$key])) {
                    if (isset($this->mergeable_keys_map[$key])) {
                        // own entries win, fallback fills the gaps
                        $cache[$key] = $cache[$key] + $fallback_cache[$key];
                    } elseif (isset($this->mergeable_keys_list[$key])) {
                        $cache[$key] = array_merge($fallback_cache[$key], $cache[$key]);
                    }
                } else {
                    $cache[$key] = $fallback_cache[$key];
                }
            }

        }

        // save to cache for later retrieval
        $this->cache[$code] = $cache;

        return;
    }

}
/**
 * Represents a measurable length, with a string numeric magnitude
 * and a unit. This object is immutable.
 */
class HTMLPurifier_Length
{

    /**
     * String numeric magnitude.
     */
    protected $n;

    /**
     * String unit. False is permitted if $n = 0.
     */
    protected $unit;

    /**
     * Whether or not this length is valid. Null if not calculated yet.
     */
    protected $isValid;

    /**
     * Lookup array of units recognized by CSS 2.1
     */
    protected static $allowedUnits = array(
        'em' => true, 'ex' => true, 'px' => true, 'in' => true,
        'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true
    );

    /**
     * @param number $n Magnitude
     * @param string $u Unit
     */
    public function __construct($n = '0', $u = false) {
        $this->n = (string) $n;
        $this->unit = $u !== false ? (string) $u : false;
    }

    /**
     * Splits a string such as '2em' into magnitude and unit.
     * @param string $s Unit string, like '2em' or '3.4in'
     * @return HTMLPurifier_Length ($s itself when it already is one)
     * @warning Does not perform validation.
     */
    static public function make($s) {
        if ($s instanceof HTMLPurifier_Length) return $s;
        // the magnitude is the leading run of digits, dots and signs
        $n_length = strspn($s, '1234567890.+-');
        $n = substr($s, 0, $n_length);
        $unit = substr($s, $n_length);
        if ($unit === '') $unit = false;
        return new HTMLPurifier_Length($n, $unit);
    }

    /**
     * Validates the number and unit. Normalizes the unit to lower case
     * and replaces the magnitude with the validator's result.
     * @return bool
     */
    protected function validate() {
        // Special case: a bare zero needs no unit
        if ($this->n === '+0' || $this->n === '-0') $this->n = '0';
        if ($this->n === '0' && $this->unit === false) return true;
        // Cast first: the unit may be boolean false here, which the
        // ctype functions no longer accept silently.
        $unit = (string) $this->unit;
        if (!ctype_lower($unit)) $this->unit = strtolower($unit);
        if (!isset(self::$allowedUnits[$this->unit])) return false;
        // Hack: borrow the CSS number validator for the magnitude
        $def = new HTMLPurifier_AttrDef_CSS_Number();
        $result = $def->validate($this->n, false, false);
        if ($result === false) return false;
        $this->n = $result;
        return true;
    }

    /**
     * Returns string representation of number.
     * @return string, or false if the length is invalid
     */
    public function toString() {
        if (!$this->isValid()) return false;
        return $this->n . $this->unit;
    }

    /**
     * Retrieves string numeric magnitude.
     */
    public function getN() {return $this->n;}

    /**
     * Retrieves string unit.
     */
    public function getUnit() {return $this->unit;}

    /**
     * Returns true if this length unit is valid. The result of the
     * first check is remembered.
     */
    public function isValid() {
        if ($this->isValid === null) $this->isValid = $this->validate();
        return $this->isValid;
    }

    /**
     * Compares two lengths, and returns 1 if greater, -1 if less and 0 if equal.
     * @param $l HTMLPurifier_Length to compare against, or false
     * @return int 1, -1 or 0; false if $l is false or cannot be converted
     *         to this length's unit
     * @warning The magnitudes are compared with ordinary PHP arithmetic,
     *          so extremely large or small values may not compare exactly.
     */
    public function compareTo($l) {
        if ($l === false) return false;
        if ($l->unit !== $this->unit) {
            $converter = new HTMLPurifier_UnitConverter();
            $l = $converter->convert($l, $this->unit);
            if ($l === false) return false;
        }
        // Reduce the difference to its sign so the result matches the
        // documented contract; sign-based callers are unaffected.
        $difference = $this->n - $l->n;
        if ($difference > 0) return 1;
        if ($difference < 0) return -1;
        return 0;
    }

}
/**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 *
 * A lexer parses a string of SGML-style markup and converts them into
 * corresponding tokens. It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex). There are several implementations to choose
 * from.
 *
 * A lexer is HTML-oriented: it might work with XML, but it's not
 * recommended, as we adhere to a subset of the specification for optimization
 * reasons. This might change in the future. Also, most tokenizers are not
 * expected to handle DTDs or PIs.
 *
 * This class should not be directly instantiated, but you may use create() to
 * retrieve a default copy of the lexer. Being a supertype, this class
 * does not actually define any implementation, but offers commonly used
 * convenience functions for subclasses.
 *
 * @note The unit tests will instantiate this class for testing purposes, as
 *       many of the utility functions require a class to be instantiated.
 *       This means that, even though this class is not runnable, it will
 *       not be declared abstract.
 *
 * @par
 *
 * @note
 * We use tokens rather than create a DOM representation because DOM would:
 *
 * @par
 *  -# Require more processing and memory to create,
 *  -# Is not streamable, and
 *  -# Has the entire document structure (html and body not needed).
 *
 * @par
 * However, DOM is helpful in that it makes it easy to move around nodes
 * without a lot of lookaheads to see when a tag is closed. This is a
 * limitation of the token system and some workarounds would be nice.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implements line-number/column-number tracking.
     * If it does, set to true.
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser created by the constructor. Declared explicitly
     * (and public, matching its previous accessibility) so that it is not
     * created as a dynamic property.
     */
    public $_entity_parser;

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
     * a few exceptions involving special features that only DirectLex
     * implements.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @param $config Instance of HTMLPurifier_Config
     * @return Concrete lexer.
     * @throws HTMLPurifier_Exception if the lexer name is unrecognized, no
     *         lexer could be instantiated, or line tracking is required
     *         but unsupported by the chosen lexer
     */
    public static function create($config) {

        if ($config instanceof HTMLPurifier_Config) {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        } else {
            // Deprecated calling convention: the argument is the lexer
            // prototype itself, so there is no configuration to consult.
            $lexer = $config;
            trigger_error("Passing a prototype to
              HTMLPurifier_Lexer::create() is deprecated, please instead
              use %Core.LexerImpl", E_USER_WARNING);
            $needs_tracking = false;
        }

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {

            if (is_null($lexer)) { do {
                // auto-detection algorithm

                if ($needs_tracking) {
                    $lexer = 'DirectLex';
                    break;
                }

                if (
                    class_exists('DOMDocument') &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // check for DOM support, because while it's part of the
                    // core, it can be disabled compile time. Also, the PECL
                    // domxml extension overrides the default DOM, and is evil
                    // and nasty and we shan't bother to support it
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }

            } while(0); } // do..while so we can break

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
            }
        }

        if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated');

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
        }

        return $inst;

    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    public function __construct() {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;'  => '&',
            '&lt;'   => '<',
            '&gt;'   => '>',
            '&#39;'  => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones.
     *
     * @warning
     * You should be able to treat the output of this function as
     * completely parsed, but that's only because all other entities should
     * have been handled previously in substituteNonSpecialEntities()
     *
     * @param $string String character data to be parsed.
     * @returns Parsed character data.
     */
    public function parseData($string) {

        // following functions require at least one character
        if ($string === '') return '';

        // subtracts amps that cannot possibly be escaped
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);

        if (!$num_amp) return $string; // abort if no entities

        // Each escaped ampersand leaves one literal '&' behind after the
        // table substitution; those must not be mistaken for entities.
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);

        if ($num_amp_2 <= $num_esc_amp) return $string;

        // hmm... now we have some uncommon entities. Use the callback.
        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }

    /**
     * Lexes an HTML string into tokens. Must be implemented by
     * subclasses; this base version only raises an error.
     *
     * @param $string String HTML.
     * @return HTMLPurifier_Token array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context) {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     *
     * @param $string HTML string to process.
     * @returns HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string) {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     */
    protected static function escapeCommentedCDATA($string) {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special Internet Explorer conditional comments should be removed.
     */
    protected static function removeIEConditional($string) {
        return preg_replace(
            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
            '',
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() that does the work.
     *
     * @warning Calling it directly is not recommended.
     * @params $matches PCRE matches array, with index 0 the entire match
     *                  and 1 the inside of the CDATA section.
     * @returns Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches) {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     * @param $html HTML string to normalize
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return Normalized HTML string
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context) {

        // normalize newlines to \n
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", $html);
            $html = str_replace("\r", "\n", $html);
        }

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        $html = $this->removeIEConditional($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            if ($e && $new_html != $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // expand entities that aren't the big five
        $html = $this->_entity_parser->substituteNonSpecialEntities($html);

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // if processing instructions are to removed, remove them now
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * between the body tags, or the input unchanged when no body element
     * is found.
     * @todo Consider making protected
     */
    public function extractBody($html) {
        $matches = array();
        $result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
        if ($result) {
            return $matches[1];
        } else {
            return $html;
        }
    }

}
/**
 * Class that handles operations involving percent-encoding in URIs.
 *
 * @warning
 *      Be careful when reusing instances of PercentEncoder. The object
 *      you use for normalize() SHOULD NOT be used for encode(), or
 *      vice-versa.
 */
class HTMLPurifier_PercentEncoder
{

    /**
     * Reserved characters to preserve when using encode().
     * Lookup keyed by byte value.
     */
    protected $preserve = array();

    /**
     * @param $preserve String of characters that should be preserved
     *                  while using encode(), in addition to the
     *                  unreserved set; false for none.
     */
    public function __construct($preserve = false) {
        // unreserved letters, ought to const-ify
        $unreserved = array_merge(
            range(48, 57),            // digits
            range(65, 90),            // upper-case
            range(97, 122),           // lower-case
            array(45, 46, 95, 126)    // Dash -, Period ., Underscore _, Tilde ~
        );
        foreach ($unreserved as $byte) {
            $this->preserve[$byte] = true;
        }

        // extra letters not to escape
        if ($preserve !== false) {
            $count = strlen($preserve);
            for ($pos = 0; $pos < $count; $pos++) {
                $this->preserve[ord($preserve[$pos])] = true;
            }
        }
    }

    /**
     * Our replacement for urlencode, it encodes all non-reserved characters,
     * as well as any extra characters that were instructed to be preserved.
     * @note
     *      Assumes that the string has already been normalized, making any
     *      and all percent escape sequences valid. Percents will not be
     *      re-escaped, regardless of their status in $preserve
     * @param $string String to be encoded
     * @return Encoded string.
     */
    public function encode($string) {
        $encoded = '';
        $length = strlen($string);
        for ($pos = 0; $pos < $length; $pos++) {
            $char = $string[$pos];
            if ($char === '%') {
                // percent signs are passed through untouched
                $encoded .= $char;
                continue;
            }
            $byte = ord($char);
            if (isset($this->preserve[$byte])) {
                $encoded .= $char;
            } else {
                $encoded .= sprintf('%%%02X', $byte);
            }
        }
        return $encoded;
    }

    /**
     * Fix up percent-encoding by decoding unreserved characters and normalizing.
     * @warning This function is affected by $preserve, even though the
     *          usual desired behavior is for this not to preserve those
     *          characters. Be careful when reusing instances of PercentEncoder!
     * @param $string String to normalize
     * @return Normalized string
     */
    public function normalize($string) {
        if ($string == '') return '';
        $segments = explode('%', $string);
        // everything before the first percent sign is copied verbatim
        $normalized = array_shift($segments);
        foreach ($segments as $segment) {
            $hex = substr($segment, 0, 2);
            if (strlen($segment) < 2 || !ctype_xdigit($hex)) {
                // not a valid escape: the percent sign itself gets escaped
                $normalized .= '%25' . $segment;
                continue;
            }
            $rest = substr($segment, 2);
            $byte = hexdec($hex);
            if (isset($this->preserve[$byte])) {
                // escapes of preserved characters are decoded
                $normalized .= chr($byte) . $rest;
            } else {
                // remaining escapes get upper-case hex digits
                $normalized .= '%' . strtoupper($hex) . $rest;
            }
        }
        return $normalized;
    }

}
/**
 * Generic property list implementation
 */
class HTMLPurifier_PropertyList
{
    /**
     * Internal data-structure for properties
     */
    protected $data = array();

    /**
     * Parent plist, consulted for keys this plist does not define
     */
    protected $parent;

    /**
     * Result of the last squash(), or null when it must be rebuilt
     */
    protected $cache;

    /**
     * @param $parent Optional parent plist
     */
    public function __construct($parent = null) {
        $this->parent = $parent;
    }

    /**
     * Recursively retrieves the value for a key
     * @throws HTMLPurifier_Exception if neither this plist nor any of its
     *         ancestors defines the key
     */
    public function get($name) {
        if ($this->has($name)) return $this->data[$name];
        // possible performance bottleneck, convert to iterative if necessary
        if ($this->parent) return $this->parent->get($name);
        throw new HTMLPurifier_Exception("Key '$name' not found");
    }

    /**
     * Sets the value of a key, for this plist
     */
    public function set($name, $value) {
        $this->data[$name] = $value;
        $this->cache = null; // squashed copy is now stale
    }

    /**
     * Returns true if a given key exists in this plist (parents are not
     * consulted)
     */
    public function has($name) {
        return array_key_exists($name, $this->data);
    }

    /**
     * Resets a value to the value of it's parent, usually the default. If
     * no value is specified, the entire plist is reset.
     */
    public function reset($name = null) {
        // Strict comparison: keys such as '' or 0 must only remove
        // themselves, not wipe the whole list.
        if ($name === null) $this->data = array();
        else unset($this->data[$name]);
        $this->cache = null;
    }

    /**
     * Squashes this property list and all of its property lists into a single
     * array, and returns the array. This value is cached by default.
     * @note The cache is dropped whenever this plist itself is modified,
     *       but changes made to a parent afterwards are only picked up
     *       with $force.
     * @param $force If true, ignores the cache and regenerates the array.
     */
    public function squash($force = false) {
        if ($this->cache !== null && !$force) return $this->cache;
        if ($this->parent) {
            return $this->cache = array_merge($this->parent->squash($force), $this->data);
        } else {
            return $this->cache = $this->data;
        }
    }

    /**
     * Returns the parent plist.
     */
    public function getParent() {
        return $this->parent;
    }

    /**
     * Sets the parent plist.
     */
    public function setParent($plist) {
        $this->parent = $plist;
        $this->cache = null; // inherited values may have changed
    }
}
/**
 * Property list iterator. Do not instantiate this class directly.
 * Passes through only those entries whose key begins with a given prefix.
 */
class HTMLPurifier_PropertyListIterator extends FilterIterator
{

    /**
     * Length of the prefix, in bytes
     */
    protected $l;

    /**
     * Prefix that accepted keys must start with (empty string: accept all)
     */
    protected $filter;

    /**
     * @param $iterator Iterator over the data to filter
     * @param $filter Optional key prefix; only entries whose key starts
     *                with it are yielded
     */
    public function __construct(Iterator $iterator, $filter = null) {
        parent::__construct($iterator);
        // Normalize a missing filter to the empty string so the string
        // functions below never receive null.
        $this->filter = ($filter === null) ? '' : (string) $filter;
        $this->l = strlen($this->filter);
    }

    /**
     * @return bool True if the current key starts with the prefix
     */
    #[\ReturnTypeWillChange]
    public function accept() {
        if ($this->l === 0) {
            return true;
        }
        $key = (string) $this->getInnerIterator()->key();
        return strncmp($key, $this->filter, $this->l) === 0;
    }

}
/**
 * Supertype for classes that define a strategy for modifying/purifying tokens.
 *
 * While HTMLPurifier's core purpose is fixing HTML into something proper,
 * strategies provide plug points for extra configuration or even extra
 * features, such as custom tags, custom parsing of text, etc.
 */
abstract class HTMLPurifier_Strategy
{

    /**
     * Executes the strategy on the tokens.
     *
     * @param $tokens Array of HTMLPurifier_Token objects to be operated on.
     * @param $config Configuration options
     * @param $context Context object passed along with the configuration
     * @returns Processed array of token objects.
     */
    abstract public function execute($tokens, $config, $context);

}
/**
 * This is in almost every respect equivalent to an array except
 * that it keeps track of which keys were accessed.
 *
 * @warning For the sake of backwards compatibility with early versions
 *     of PHP 5, you must not use the $hash[$key] syntax; if you do
 *     our version of offsetGet is never called.
 */
class HTMLPurifier_StringHash extends ArrayObject
{
    /**
     * Lookup of the indexes read through offsetGet()
     */
    protected $accessed = array();

    /**
     * Retrieves a value, and logs the access.
     */
    #[\ReturnTypeWillChange]
    public function offsetGet($index) {
        // The attribute above keeps the untyped signature compatible with
        // ArrayObject::offsetGet() on newer PHP versions; older versions
        // read that line as a comment.
        $this->accessed[$index] = true;
        return parent::offsetGet($index);
    }

    /**
     * Returns a lookup array of all array indexes that have been accessed.
     * @return Array in form array($index => true).
     */
    public function getAccessed() {
        return $this->accessed;
    }

    /**
     * Resets the access array.
     */
    public function resetAccessed() {
        $this->accessed = array();
    }
}
/**
 * Parses string hash files. File format is as such:
 *
 *      DefaultKeyValue
 *      KEY: Value
 *      KEY2: Value2
 *      --MULTILINE-KEY--
 *      Multiline
 *      value.
 *
 * Which would output something similar to:
 *
 *      array(
 *          'ID' => 'DefaultKeyValue',
 *          'KEY' => 'Value',
 *          'KEY2' => 'Value2',
 *          'MULTILINE-KEY' => "Multiline\nvalue.\n",
 *      )
 *
 * We use this as an easy to use file-format for configuration schema
 * files, but the class itself is usage agnostic.
 *
 * You can use ---- to forcibly terminate parsing of a single string-hash;
 * this marker is used in multi string-hashes to delimit boundaries.
 */
class HTMLPurifier_StringHashParser
{

    /**
     * Key under which a line without an explicit key is stored
     */
    public $default = 'ID';

    /**
     * Parses a file that contains a single string-hash.
     * @param $file Path of the file to parse
     * @return Array of key => value pairs, or false if the file is
     *         missing or cannot be opened
     */
    public function parseFile($file) {
        if (!file_exists($file)) {
            return false;
        }
        $handle = fopen($file, 'r');
        if (!$handle) {
            return false;
        }
        $hash = $this->parseHandle($handle);
        fclose($handle);
        return $hash;
    }

    /**
     * Parses a file that contains multiple string-hashes delimited by '----'
     * @param $file Path of the file to parse
     * @return List of string-hash arrays, or false if the file is
     *         missing or cannot be opened
     */
    public function parseMultiFile($file) {
        if (!file_exists($file)) {
            return false;
        }
        $hashes = array();
        $handle = fopen($file, 'r');
        if (!$handle) {
            return false;
        }
        while (!feof($handle)) {
            $hashes[] = $this->parseHandle($handle);
        }
        fclose($handle);
        return $hashes;
    }

    /**
     * Internal parser that acepts a file handle.
     * @note While it's possible to simulate in-memory parsing by using
     *       custom stream wrappers, if such a use-case arises we should
     *       factor out the file handle into its own class.
     * @param $fh File handle with pointer at start of valid string-hash
     *            block.
     * @return Array of key => value pairs read up to the next '----'
     *         line or the end of the file
     */
    protected function parseHandle($fh) {
        $key       = false; // key currently being filled, false if none
        $one_liner = false; // true while handling a single-line entry
        $hash      = array();
        do {
            $line = fgets($fh);
            if ($line === false) {
                break;
            }
            $line = rtrim($line, "\n\r");

            // blank lines only matter inside a multiline value
            if (!$key && $line === '') {
                continue;
            }
            if ($line === '----') {
                break;
            }

            if (strncmp('--#', $line, 3) === 0) {
                // Comment
                continue;
            }
            if (strncmp('--', $line, 2) === 0) {
                // Multiline declaration
                $key = trim($line, '- ');
                if (!isset($hash[$key])) {
                    $hash[$key] = '';
                }
                continue;
            }

            if (!$key) {
                $one_liner = true;
                if (strpos($line, ':') === false) {
                    // Use default declaration
                    $key = $this->default;
                } else {
                    // Single-line declaration
                    list($key, $line) = explode(':', $line, 2);
                    $line = trim($line);
                }
            }

            if ($one_liner) {
                $hash[$key] = $line;
                $one_liner = false;
                $key = false;
            } else {
                $hash[$key] .= "$line\n";
            }
        } while (!feof($fh));
        return $hash;
    }

}
/**
 * Defines a mutation of an obsolete tag into a valid tag.
 */
abstract class HTMLPurifier_TagTransform
{

    /**
     * Tag name to transform the tag to.
     */
    public $transform_to;

    /**
     * Transforms the obsolete tag into the valid tag.
     * @param $tag Tag to be transformed.
     * @param $config Mandatory HTMLPurifier_Config object
     * @param $context Mandatory HTMLPurifier_Context object
     */
    abstract public function transform($tag, $config, $context);

    /**
     * Prepends CSS properties to the style attribute, creating the
     * attribute if it doesn't exist.
     * @warning Copied over from AttrTransform, be sure to keep in sync
     * @param $attr Attribute array to process (passed by reference)
     * @param $css CSS to prepend
     */
    protected function prependCSS(&$attr, $css) {
        if (!isset($attr['style'])) {
            // missing (or null) style starts out as the empty string
            $attr['style'] = '';
        }
        $attr['style'] = $css . $attr['style'];
    }

}
/**
 * Abstract base token class that all others inherit from.
 */
class HTMLPurifier_Token {
    public $line; /**< Line number node was on in source document. Null if unknown. */
    public $col;  /**< Column of line node was on in source document. Null if unknown. */

    /**
     * Lookup array of processing that this token is exempt from.
     * Currently, valid values are "ValidateAttributes" and
     * "MakeWellFormed_TagClosedError"
     */
    public $armor = array();

    /**
     * Used during MakeWellFormed.
     */
    public $skip;
    public $rewind;
    public $carryover;

    /**
     * Backwards-compatibility shim for the removed "type" property.
     * Raises an E_USER_NOTICE and maps the concrete token class to its
     * legacy type string; any other undefined property yields null.
     * @param $n Name of the property being read
     */
    public function __get($n) {
        if ($n !== 'type') {
            return;
        }
        trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE);
        $legacy_types = array(
            'HTMLPurifier_Token_Start'   => 'start',
            'HTMLPurifier_Token_Empty'   => 'empty',
            'HTMLPurifier_Token_End'     => 'end',
            'HTMLPurifier_Token_Text'    => 'text',
            'HTMLPurifier_Token_Comment' => 'comment',
        );
        $class = get_class($this);
        return isset($legacy_types[$class]) ? $legacy_types[$class] : null;
    }

    /**
     * Sets the position of the token in the source document.
     * @param $l Line number, or null if unknown
     * @param $c Column number, or null if unknown
     */
    public function position($l = null, $c = null) {
        $this->line = $l;
        $this->col  = $c;
    }

    /**
     * Convenience function for DirectLex settings line/col position.
     * A column of exactly -1 bumps the line number by one before storing.
     * @param $l Line number
     * @param $c Column number
     */
    public function rawPosition($l, $c) {
        if ($c === -1) {
            $l++;
        }
        $this->line = $l;
        $this->col  = $c;
    }

}
/**
 * Factory for token generation.
 *
 * @note Doing some benchmarking indicates that the new operator is much
 *       slower than the clone operator (even discounting the cost of the
 *       constructor).  This class is for that optimization.
 *       Other than that, there's not much point as we don't
 *       maintain parallel HTMLPurifier_Token hierarchies (the main reason why
 *       you'd want to use an abstract factory).
 * @todo Port DirectLex to use this
 */
class HTMLPurifier_TokenFactory
{

    /**
     * Prototypes that will be cloned.
     * @private
     */
    // p stands for prototype
    private $p_start, $p_end, $p_empty, $p_text, $p_comment;

    /**
     * Generates blank prototypes for cloning.
     */
    public function __construct() {
        $this->p_start   = new HTMLPurifier_Token_Start('', array());
        $this->p_end     = new HTMLPurifier_Token_End('');
        $this->p_empty   = new HTMLPurifier_Token_Empty('', array());
        $this->p_text    = new HTMLPurifier_Token_Text('');
        $this->p_comment = new HTMLPurifier_Token_Comment('');
    }

    /**
     * Creates a HTMLPurifier_Token_Start.
     * @param $name Tag name
     * @param $attr Associative array of attributes
     * @return Generated HTMLPurifier_Token_Start
     */
    public function createStart($name, $attr = array()) {
        $token = clone $this->p_start;
        // re-run the constructor on the clone to load the real values
        $token->__construct($name, $attr);
        return $token;
    }

    /**
     * Creates a HTMLPurifier_Token_End.
     * @param $name Tag name
     * @return Generated HTMLPurifier_Token_End
     */
    public function createEnd($name) {
        $token = clone $this->p_end;
        $token->__construct($name);
        return $token;
    }

    /**
     * Creates a HTMLPurifier_Token_Empty.
     * @param $name Tag name
     * @param $attr Associative array of attributes
     * @return Generated HTMLPurifier_Token_Empty
     */
    public function createEmpty($name, $attr = array()) {
        $token = clone $this->p_empty;
        $token->__construct($name, $attr);
        return $token;
    }

    /**
     * Creates a HTMLPurifier_Token_Text.
     * @param $data Data of text token
     * @return Generated HTMLPurifier_Token_Text
     */
    public function createText($data) {
        $token = clone $this->p_text;
        $token->__construct($data);
        return $token;
    }

    /**
     * Creates a HTMLPurifier_Token_Comment.
     * @param $data Data of comment token
     * @return Generated HTMLPurifier_Token_Comment
     */
    public function createComment($data) {
        $token = clone $this->p_comment;
        $token->__construct($data);
        return $token;
    }

}
/**
 * HTML Purifier's internal representation of a URI.
 * @note
 *      Internal data-structures are completely escaped. If the data needs
 *      to be used in a non-URI context (which is very unlikely), be sure
 *      to decode it first. The URI may not necessarily be well-formed until
 *      validate() is called.
 */
class HTMLPurifier_URI
{

    public $scheme, $userinfo, $host, $port, $path, $query, $fragment;

    /**
     * @note Automatically normalizes scheme and port
     * @param $scheme   Scheme name or null; lower-cased if it is not already
     * @param $userinfo Userinfo component or null
     * @param $host     Host component or null
     * @param $port     Port or null; any non-null value is cast to int
     * @param $path     Path component
     * @param $query    Query component or null
     * @param $fragment Fragment component or null
     */
    public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment) {
        $this->scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme);
        $this->userinfo = $userinfo;
        $this->host = $host;
        $this->port = is_null($port) ? $port : (int) $port;
        $this->path = $path;
        $this->query = $query;
        $this->fragment = $fragment;
    }

    /**
     * Retrieves a scheme object corresponding to the URI's scheme/default
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return Scheme object appropriate for validating this URI, or false
     *         if no scheme object could be obtained
     */
    public function getSchemeObj($config, $context) {
        $registry = HTMLPurifier_URISchemeRegistry::instance();
        if ($this->scheme !== null) {
            $scheme_obj = $registry->getScheme($this->scheme, $config, $context);
            if (!$scheme_obj) return false; // invalid scheme, clean it out
        } else {
            // no scheme: retrieve the default one
            $def = $config->getDefinition('URI');
            $scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context);
            if (!$scheme_obj) {
                // something funky happened to the default scheme object
                trigger_error(
                    'Default scheme object "' . $def->defaultScheme . '" was not readable',
                    E_USER_WARNING
                );
                return false;
            }
        }
        return $scheme_obj;
    }

    /**
     * Generic validation method applicable for all schemes. May modify
     * this URI in order to get it into a compliant form.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return True if validation/filtering succeeds, false if failure
     */
    public function validate($config, $context) {

        // ABNF definitions from RFC 3986
        $chars_sub_delims = '!$&\'()*+,;=';
        $chars_pchar = $chars_sub_delims . ':@';

        // validate host
        if (!is_null($this->host)) {
            $host_def = new HTMLPurifier_AttrDef_URI_Host();
            $this->host = $host_def->validate($this->host, $config, $context);
            if ($this->host === false) $this->host = null;
        }

        // validate scheme
        // NOTE: It's not appropriate to check whether or not this
        // scheme is in our registry, since a URIFilter may convert a
        // URI that we don't allow into one we do.  So instead, we just
        // check if the scheme can be dropped because there is no host
        // and it is our default scheme.
        // (&& binds tighter than ||; the parentheses only spell out the
        // grouping PHP already applies to this condition.)
        if ((!is_null($this->scheme) && is_null($this->host)) || $this->host === '') {
            // support for relative paths is pretty abysmal when the
            // scheme is present, so axe it when possible
            $def = $config->getDefinition('URI');
            if ($def->defaultScheme === $this->scheme) {
                $this->scheme = null;
            }
        }

        // validate username
        if (!is_null($this->userinfo)) {
            $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
            $this->userinfo = $encoder->encode($this->userinfo);
        }

        // validate port
        if (!is_null($this->port)) {
            if ($this->port < 1 || $this->port > 65535) $this->port = null;
        }

        // validate path
        $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
        if (!is_null($this->host)) { // this catches $this->host === ''
            // path-abempty (hier and relative)
            // http://www.example.com/my/path
            // //www.example.com/my/path (looks odd, but works, and
            //                            recognized by most browsers)
            // (this set is valid or invalid on a scheme by scheme
            // basis, so we'll deal with it later)
            // file:///my/path
            // ///my/path
            $this->path = $segments_encoder->encode($this->path);
        } elseif ($this->path !== '') {
            if ($this->path[0] === '/') {
                // path-absolute (hier and relative)
                // http:/my/path
                // /my/path
                if (strlen($this->path) >= 2 && $this->path[1] === '/') {
                    // This could happen if both the host gets stripped
                    // out
                    // http://my/path
                    // //my/path
                    $this->path = '';
                } else {
                    $this->path = $segments_encoder->encode($this->path);
                }
            } elseif (!is_null($this->scheme)) {
                // path-rootless (hier)
                // http:my/path
                // Short circuit evaluation means we don't need to check nz
                $this->path = $segments_encoder->encode($this->path);
            } else {
                // path-noscheme (relative)
                // my/path
                // (once again, not checking nz)
                // the first segment is encoded without ':' in its allowed set
                $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
                $c = strpos($this->path, '/');
                if ($c !== false) {
                    $this->path =
                        $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
                        $segments_encoder->encode(substr($this->path, $c));
                } else {
                    $this->path = $segment_nc_encoder->encode($this->path);
                }
            }
        } else {
            // path-empty (hier and relative)
            $this->path = ''; // just to be safe
        }

        // qf = query and fragment
        $qf_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/?');

        if (!is_null($this->query)) {
            $this->query = $qf_encoder->encode($this->query);
        }

        if (!is_null($this->fragment)) {
            $this->fragment = $qf_encoder->encode($this->fragment);
        }

        return true;

    }

    /**
     * Convert URI back to string
     * @return String URI appropriate for output
     */
    public function toString() {
        // reconstruct authority
        $authority = null;
        // there is a rendering difference between a null authority
        // (http:foo-bar) and an empty string authority
        // (http:///foo-bar).
        if (!is_null($this->host)) {
            $authority = '';
            if (!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
            $authority .= $this->host;
            if (!is_null($this->port))     $authority .= ':' . $this->port;
        }

        // Reconstruct the result
        // One might wonder about parsing quirks from browsers after
        // this reconstruction.  Unfortunately, parsing behavior depends
        // on what *scheme* was employed (file:///foo is handled *very*
        // differently than http:///foo), so unfortunately we have to
        // defer to the schemes to do the right thing.
        $result = '';
        if (!is_null($this->scheme))    $result .= $this->scheme . ':';
        if (!is_null($authority))       $result .= '//' . $authority;
        $result .= $this->path;
        if (!is_null($this->query))     $result .= '?' . $this->query;
        if (!is_null($this->fragment))  $result .= '#' . $this->fragment;

        return $result;
    }

}
class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
{

    public $type = 'URI';
    protected $filters = array();
    protected $postFilters = array();
    protected $registeredFilters = array();

    /**
     * HTMLPurifier_URI object of the base specified at %URI.Base
     */
    public $base;

    /**
     * String host to consider "home" base, derived off of $base
     */
    public $host;

    /**
     * Name of default scheme based on %URI.DefaultScheme and %URI.Base
     */
    public $defaultScheme;

    /**
     * Registers the built-in filters; they only become active during
     * setup if their URI.* configuration directive is set.
     */
    public function __construct() {
        $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal());
        $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources());
        $this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist());
        $this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute());
        $this->registerFilter(new HTMLPurifier_URIFilter_Munge());
    }

    /**
     * Makes a filter available for activation, keyed by its name.
     * @param $filter Filter object exposing a public $name
     */
    public function registerFilter($filter) {
        $this->registeredFilters[$filter->name] = $filter;
    }

    /**
     * Prepares a filter and activates it as a pre- or post-filter
     * depending on its $post flag. A filter whose prepare() returns
     * exactly false is dropped.
     * @param $filter Filter object to activate
     * @param $config Configuration handed to the filter's prepare()
     */
    public function addFilter($filter, $config) {
        $prepared = $filter->prepare($config);
        if ($prepared === false) {
            // null is ok, for backwards compat
            return;
        }
        if ($filter->post) {
            $this->postFilters[$filter->name] = $filter;
        } else {
            $this->filters[$filter->name] = $filter;
        }
    }

    protected function doSetup($config) {
        $this->setupMemberVariables($config);
        $this->setupFilters($config);
    }

    /**
     * Activates every registered filter whose "URI.<name>" directive is
     * neither false nor null, then discards the registration list.
     */
    protected function setupFilters($config) {
        foreach ($this->registeredFilters as $name => $filter) {
            $setting = $config->get('URI.' . $name);
            if ($setting === false || $setting === null) {
                continue;
            }
            $this->addFilter($filter, $config);
        }
        unset($this->registeredFilters);
    }

    /**
     * Populates $host, $base and $defaultScheme from configuration.
     * Values derived from %URI.Base take effect only where the explicit
     * directive is null.
     */
    protected function setupMemberVariables($config) {
        $this->host = $config->get('URI.Host');
        $base_uri = $config->get('URI.Base');
        if (!is_null($base_uri)) {
            $parser = new HTMLPurifier_URIParser();
            $this->base = $parser->parse($base_uri);
            $this->defaultScheme = $this->base->scheme;
            if (is_null($this->host)) {
                $this->host = $this->base->host;
            }
        }
        if (is_null($this->defaultScheme)) {
            $this->defaultScheme = $config->get('URI.DefaultScheme');
        }
    }

    /**
     * Runs the active pre-filters in order.
     * @return False as soon as one filter returns a falsy value, else true
     */
    public function filter(&$uri, $config, $context) {
        foreach ($this->filters as $name => $active_filter) {
            $ok = $active_filter->filter($uri, $config, $context);
            if (!$ok) {
                return false;
            }
        }
        return true;
    }

    /**
     * Runs the active post-filters in order.
     * @return False as soon as one filter returns a falsy value, else true
     */
    public function postFilter(&$uri, $config, $context) {
        foreach ($this->postFilters as $name => $active_filter) {
            $ok = $active_filter->filter($uri, $config, $context);
            if (!$ok) {
                return false;
            }
        }
        return true;
    }

}
/**
 * Chainable filters for custom URI processing.
 *
 * These filters can perform custom actions on a URI filter object,
 * including transformation or blacklisting.
 *
 * @warning This filter is called before scheme object validation occurs.
 *          Make sure, if you require a specific scheme object, that you
 *          check that it exists. This allows filters to convert
 *          proprietary URI schemes into regular ones.
 */
abstract class HTMLPurifier_URIFilter
{

    /**
     * Unique identifier of filter
     */
    public $name;

    /**
     * True if this filter should be run after scheme validation.
     */
    public $post = false;

    /**
     * Performs initialization for the filter
     * @param $config Instance of HTMLPurifier_Config
     * @return True in this base implementation
     */
    public function prepare($config) {
        return true;
    }

    /**
     * Filter a URI object
     * @param $uri Reference to URI object variable
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return bool Whether or not to continue processing: false indicates
     *         URL is no good, true indicates continue processing. Note that
     *         all changes are committed directly on the URI object
     */
    abstract public function filter(&$uri, $config, $context);

}
/**
 * Parses a URI into the components and fragment identifier as specified
 * by RFC 3986.
 */
class HTMLPurifier_URIParser
{

    /**
     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
     */
    protected $percentEncoder;

    public function __construct() {
        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
    }

    /**
     * Parses a URI.
     * @param $uri string URI to parse
     * @return HTMLPurifier_URI representation of URI. This representation has
     *         not been validated yet and may not conform to RFC. False is
     *         returned if the URI does not match the splitting regexp.
     */
    public function parse($uri) {

        $uri = $this->percentEncoder->normalize($uri);

        // Regexp is as per Appendix B.
        // Note that ["<>] are an addition to the RFC's recommended
        // characters, because they represent external delimiters.
        $r_URI = '!'.
            '(([^:/?#"<>]+):)?'. // 2. Scheme
            '(//([^/?#"<>]*))?'. // 4. Authority
            '([^?#"<>]*)'.       // 5. Path
            '(\?([^#"<>]*))?'.   // 7. Query
            '(#([^"<>]*))?'.     // 8. Fragment
            '!';

        $matches = array();
        $result = preg_match($r_URI, $uri, $matches);

        if (!$result) return false; // *really* invalid URI

        // separate out parts
        // The outer groups (1, 3, 6, 8) include their delimiter, so empty()
        // on them only tests whether the component was present at all.
        $scheme     = !empty($matches[1]) ? $matches[2] : null;
        $authority  = !empty($matches[3]) ? $matches[4] : null;
        $path       = $matches[5]; // always present, can be empty
        $query      = !empty($matches[6]) ? $matches[7] : null;
        $fragment   = !empty($matches[8]) ? $matches[9] : null;

        // further parse authority
        if ($authority !== null) {
            $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
            $matches = array();
            preg_match($r_authority, $authority, $matches);
            $userinfo   = !empty($matches[1]) ? $matches[2] : null;
            // Group 3 is the bare host text, so empty() must not be used
            // here: it would treat the host "0" as missing.
            $host       = isset($matches[3]) ? $matches[3] : '';
            $port       = !empty($matches[4]) ? (int) $matches[5] : null;
        } else {
            $port = $host = $userinfo = null;
        }

        return new HTMLPurifier_URI(
            $scheme, $userinfo, $host, $port, $path, $query, $fragment);
    }

}
/**
 * Validator for the components of a URI for a specific scheme
 */
abstract class HTMLPurifier_URIScheme
{

    /**
     * Scheme's default port (integer). If an explicit port number is
     * specified that coincides with the default port, it will be
     * elided.
     */
    public $default_port = null;

    /**
     * Whether or not URIs of this scheme are locatable by a browser
     * http and ftp are accessible, while mailto and news are not.
     */
    public $browsable = false;

    /**
     * Whether or not the URI always uses <hier_part>, resolves edge cases
     * with making relative URIs absolute
     */
    public $hierarchical = false;

    /**
     * Whether or not the URI may omit a hostname when the scheme is
     * explicitly specified, ala file:///path/to/file. As of writing,
     * 'file' is the only scheme that browsers support this properly.
     */
    public $may_omit_host = false;

    /**
     * Validates the components of a URI for a specific scheme.
     * @param $uri Reference to a HTMLPurifier_URI object
     * @param $config HTMLPurifier_Config object
     * @param $context HTMLPurifier_Context object
     * @return Bool success or failure
     */
    public abstract function doValidate(&$uri, $config, $context);

    /**
     * Public interface for validating components of a URI.  Performs a
     * bunch of default actions. Don't overload this method.
     * @param $uri Reference to a HTMLPurifier_URI object
     * @param $config HTMLPurifier_Config object
     * @param $context HTMLPurifier_Context object
     * @return Bool success or failure
     */
    public function validate(&$uri, $config, $context) {
        if ($this->default_port == $uri->port) {
            $uri->port = null;
        }

        // kludge: browsers do funny things when the scheme but not the
        // authority is set
        $has_scheme   = !is_null($uri->scheme);
        $host_blank   = ($uri->host === '');
        $host_missing = $host_blank || is_null($uri->host);

        // if the scheme is present, a missing host is always in error
        // (unless this scheme may omit its host)
        $scheme_without_host = !$this->may_omit_host && $has_scheme && $host_missing;
        // if the scheme is not present, a *blank* host is in error,
        // since this translates into '///path' which most browsers
        // interpret as being 'http://path'.
        $blank_host_only = !$has_scheme && $host_blank;

        if ($scheme_without_host || $blank_host_only) {
            if (!$has_scheme && substr($uri->path, 0, 2) != '//') {
                // dropping the blank host leaves an ordinary relative URI
                $uri->host = null;
            } else {
                // Either the scheme is present, or the URI is '////path'
                // so we cannot nullify the host to preserve semantics.
                // See if we can manually insert a hostname.
                $host = $config->get('URI.Host');
                if (is_null($host)) {
                    // we can't do anything sensible, reject the URL.
                    return false;
                }
                $uri->host = $host;
            }
        }
        return $this->doValidate($uri, $config, $context);
    }

}
/**
 * Registry for retrieving specific URI scheme validator objects.
 */
class HTMLPurifier_URISchemeRegistry
{

    /**
     * Retrieve sole instance of the registry.
     * @param $prototype Optional prototype to overload sole instance with,
     *                   or bool true to reset to default registry.
     * @note Pass a registry object $prototype with a compatible interface and
     *       the function will copy it and return it all further times.
     */
    public static function instance($prototype = null) {
        static $instance = null;
        if ($prototype === true) {
            // Explicit reset. This must be tested before the generic
            // "prototype given" branch, otherwise the boolean itself
            // would be stored as the registry.
            $instance = new HTMLPurifier_URISchemeRegistry();
        } elseif ($prototype !== null) {
            $instance = $prototype;
        } elseif ($instance === null) {
            // first use without a prototype: create the default registry
            $instance = new HTMLPurifier_URISchemeRegistry();
        }
        return $instance;
    }

    /**
     * Cache of retrieved schemes.
     */
    protected $schemes = array();

    /**
     * Retrieves a scheme validator object
     * @param $scheme String scheme name like http or mailto
     * @param $config HTMLPurifier_Config object
     * @param $context HTMLPurifier_Context object
     * @return Scheme validator object, or null if the scheme is not
     *         permitted or no validator class exists for it
     */
    public function getScheme($scheme, $config, $context) {
        if (!$config) $config = HTMLPurifier_Config::createDefault();

        // important, otherwise attacker could include arbitrary file
        $allowed_schemes = $config->get('URI.AllowedSchemes');
        if (!$config->get('URI.OverrideAllowedSchemes') &&
            !isset($allowed_schemes[$scheme])
        ) {
            return;
        }

        // With the override enabled, a scheme outside the allowed list is
        // only served from the cache (i.e. if it was register()ed); it is
        // never instantiated by class name.
        if (isset($this->schemes[$scheme])) return $this->schemes[$scheme];
        if (!isset($allowed_schemes[$scheme])) return;

        $class = 'HTMLPurifier_URIScheme_' . $scheme;
        if (!class_exists($class)) return;
        $this->schemes[$scheme] = new $class();
        return $this->schemes[$scheme];
    }

    /**
     * Registers a custom scheme to the cache, bypassing reflection.
     * @param $scheme Scheme name
     * @param $scheme_obj HTMLPurifier_URIScheme object
     */
    public function register($scheme, $scheme_obj) {
        $this->schemes[$scheme] = $scheme_obj;
    }

}
/**
 * Class for converting between different unit-lengths as specified by
 * CSS.
 */
class HTMLPurifier_UnitConverter
{

    const ENGLISH = 1;
    const METRIC = 2;
    const DIGITAL = 3;

    /**
     * Units information array. Units are grouped into measuring systems
     * (English, Metric), and are assigned an integer representing
     * the conversion factor between that unit and the smallest unit in
     * the system. Numeric indexes are actually magical constants that
     * encode conversion data from one system to the next, with a O(n^2)
     * constraint on memory (this is generally not a problem, since
     * the number of measuring systems is small.)
     */
    protected static $units = array(
        self::ENGLISH => array(
            'px' => 3, // This is as per CSS 2.1 and Firefox. Your mileage may vary
            'pt' => 4,
            'pc' => 48,
            'in' => 288,
            self::METRIC => array('pt', '0.352777778', 'mm'),
        ),
        self::METRIC => array(
            'mm' => 1,
            'cm' => 10,
            self::ENGLISH => array('mm', '2.83464567', 'pt'),
        ),
    );

    /**
     * Minimum bcmath precision for output.
     */
    protected $outputPrecision;

    /**
     * Bcmath precision for internal calculations.
     */
    protected $internalPrecision;

    /**
     * Whether or not BCMath is available
     */
    private $bcmath;

    /**
     * @param $output_precision   Minimum number of significant figures kept
     * @param $internal_precision Scale used for intermediate calculations
     * @param $force_no_bcmath    True to use native floats even when the
     *                            bcmul() function exists
     */
    public function __construct($output_precision = 4, $internal_precision = 10, $force_no_bcmath = false) {
        $this->outputPrecision = $output_precision;
        $this->internalPrecision = $internal_precision;
        $this->bcmath = !$force_no_bcmath && function_exists('bcmul');
    }

    /**
     * Converts a length object of one unit into another unit.
     * @param HTMLPurifier_Length $length
     *      Instance of HTMLPurifier_Length to convert. You must validate()
     *      it before passing it here!
     * @param string $to_unit
     *      Unit to convert to.
     * @return HTMLPurifier_Length in the requested unit, or false when the
     *      length is invalid or either unit is unknown
     * @note
     *      About precision: This conversion function pays very special
     *      attention to the incoming precision of values and attempts
     *      to maintain a number of significant figure. Results are
     *      fairly accurate up to nine digits. Some caveats:
     *          - If a number is zero-padded as a result of this significant
     *            figure tracking, the zeroes will be eliminated.
     *          - If a number contains less than four sigfigs ($outputPrecision)
     *            and this causes some decimals to be excluded, those
     *            decimals will be added on.
     */
    public function convert($length, $to_unit) {

        if (!$length->isValid()) {
            return false;
        }

        $n    = $length->getN();
        $unit = $length->getUnit();

        if ($n === '0' || $unit === false) {
            return new HTMLPurifier_Length('0', false);
        }

        // Locate the measuring system of the source and target units
        $state = $dest_state = false;
        foreach (self::$units as $system => $table) {
            if (isset($table[$unit])) {
                $state = $system;
            }
            if (isset($table[$to_unit])) {
                $dest_state = $system;
            }
        }
        if (!$state || !$dest_state) {
            return false;
        }

        // Some calculations about the initial precision of the number;
        // this will be useful when we need to do final rounding.
        $sigfigs = $this->getSigFigs($n);
        if ($sigfigs < $this->outputPrecision) {
            $sigfigs = $this->outputPrecision;
        }

        // BCMath's internal precision deals only with decimals. Use
        // our default if the initial number has no decimals, or increase
        // it by how ever many decimals, thus, the number of guard digits
        // will always be greater than or equal to internalPrecision.
        $log = (int) floor(log(abs($n), 10));
        $cp = ($log < 0) ? $this->internalPrecision - $log : $this->internalPrecision; // internal precision

        for ($i = 0; $i < 2; $i++) {

            // Determine what unit IN THIS SYSTEM we need to convert to
            if ($dest_state === $state) {
                // Simple conversion
                $dest_unit = $to_unit;
            } else {
                // Convert to the smallest unit, pending a system shift
                $dest_unit = self::$units[$state][$dest_state][0];
            }

            // Do the conversion if necessary
            if ($dest_unit !== $unit) {
                $factor = $this->div(self::$units[$state][$unit], self::$units[$state][$dest_unit], $cp);
                $n = $this->mul($n, $factor, $cp);
                $unit = $dest_unit;
            }

            // Output was zero, so bail out early. Shouldn't ever happen.
            if ($n === '') {
                $n = '0';
                $unit = $to_unit;
                break;
            }

            // It was a simple conversion, so bail out
            if ($dest_state === $state) {
                break;
            }

            if ($i !== 0) {
                // Conversion failed! Apparently, the system we forwarded
                // to didn't have this unit. This should never happen!
                return false;
            }

            // Pre-condition: $i == 0

            // Perform conversion to next system of units
            $n = $this->mul($n, self::$units[$state][$dest_state][1], $cp);
            $unit = self::$units[$state][$dest_state][2];
            $state = $dest_state;

            // One more loop around to convert the unit in the new system.

        }

        // Post-condition: $unit == $to_unit
        if ($unit !== $to_unit) {
            return false;
        }

        $n = $this->round($n, $sigfigs);
        // strip trailing zeroes after the decimal point, then a bare point
        if (strpos($n, '.') !== false) {
            $n = rtrim($n, '0');
        }
        $n = rtrim($n, '.');

        return new HTMLPurifier_Length($n, $unit);
    }

    /**
     * Returns the number of significant figures in a string number.
     * @param string $n Decimal number
     * @return int number of sigfigs
     */
    public function getSigFigs($n) {
        $digits = ltrim($n, '0+-');
        $dp = strpos($digits, '.'); // decimal position
        if ($dp === false) {
            // integer: trailing zeroes are not counted
            return strlen(rtrim($digits, '0'));
        }
        $sigfigs = strlen(ltrim($digits, '0.')); // eliminate extra decimal character
        if ($dp !== 0) {
            // the decimal point is still inside the remaining string
            $sigfigs--;
        }
        return $sigfigs;
    }

    /**
     * Adds two numbers, using arbitrary precision when available.
     */
    private function add($s1, $s2, $scale) {
        if ($this->bcmath) {
            return bcadd($s1, $s2, $scale);
        }
        return $this->scale($s1 + $s2, $scale);
    }

    /**
     * Multiplies two numbers, using arbitrary precision when available.
     */
    private function mul($s1, $s2, $scale) {
        if ($this->bcmath) {
            return bcmul($s1, $s2, $scale);
        }
        return $this->scale($s1 * $s2, $scale);
    }

    /**
     * Divides two numbers, using arbitrary precision when available.
     */
    private function div($s1, $s2, $scale) {
        if ($this->bcmath) {
            return bcdiv($s1, $s2, $scale);
        }
        return $this->scale($s1 / $s2, $scale);
    }

    /**
     * Rounds a number according to the number of sigfigs it should have,
     * using arbitrary precision when available.
     */
    private function round($n, $sigfigs) {
        $new_log = (int) floor(log(abs($n), 10)); // Number of digits left of decimal - 1
        $rp = $sigfigs - $new_log - 1; // Number of decimal places needed
        $neg = $n < 0 ? '-' : ''; // Negative sign
        if (!$this->bcmath) {
            return $this->scale(round($n, $sigfigs - $new_log - 1), $rp + 1);
        }
        if ($rp >= 0) {
            // add half a unit in the last kept place, then truncate
            $n = bcadd($n, $neg . '0.' .  str_repeat('0', $rp) . '5', $rp + 1);
            $n = bcdiv($n, '1', $rp);
        } else {
            // This algorithm partially depends on the standardized
            // form of numbers that comes out of bcmath.
            $n = bcadd($n, $neg . '5' . str_repeat('0', $new_log - $sigfigs), 0);
            $n = substr($n, 0, $sigfigs + strlen($neg)) . str_repeat('0', $new_log - $sigfigs + 1);
        }
        return $n;
    }

    /**
     * Scales a float to $scale digits right of decimal point, like BCMath.
     */
    private function scale($r, $scale) {
        if ($scale >= 0) {
            return sprintf('%.' . $scale . 'f', (float) $r);
        }
        // The f sprintf type doesn't support negative numbers, so we
        // need to kludge things manually. First get the string.
        $r = sprintf('%.0f', (float) $r);
        // Due to floating point precision loss, $r will more than likely
        // look something like 4652999999999.9234. We grab one more digit
        // than we need to precise from $r and then use that to round
        // appropriately.
        $precise = (string) round(substr($r, 0, strlen($r) + $scale), -1);
        // Now we return it, truncating the zero that was rounded off.
        return substr($precise, 0, -1) . str_repeat('0', -$scale + 1);
    }

}
/**
 * Parses string representations into their corresponding native PHP
 * variable type. The base implementation does a simple type-check.
 */
class HTMLPurifier_VarParser
{

    const STRING    = 1;
    const ISTRING   = 2;
    const TEXT      = 3;
    const ITEXT     = 4;
    const INT       = 5;
    const FLOAT     = 6;
    const BOOL      = 7;
    const LOOKUP    = 8;
    const ALIST     = 9;
    const HASH      = 10;
    const MIXED     = 11;

    /**
     * Lookup table of allowed types. Mainly for backwards compatibility, but
     * also convenient for transforming string type names to the integer constants.
     */
    static public $types = array(
        'string'    => self::STRING,
        'istring'   => self::ISTRING,
        'text'      => self::TEXT,
        'itext'     => self::ITEXT,
        'int'       => self::INT,
        'float'     => self::FLOAT,
        'bool'      => self::BOOL,
        'lookup'    => self::LOOKUP,
        'list'      => self::ALIST,
        'hash'      => self::HASH,
        'mixed'     => self::MIXED
    );

    /**
     * Lookup table of types that are string, and can have aliases or
     * allowed value lists.
     */
    static public $stringTypes = array(
        self::STRING    => true,
        self::ISTRING   => true,
        self::TEXT      => true,
        self::ITEXT     => true,
    );

    /**
     * Validate a variable according to type. Throws
     * HTMLPurifier_VarParserException if invalid.
     * It may return NULL as a valid type if $allow_null is true.
     *
     * @param $var Variable to validate
     * @param $type Type of variable, see HTMLPurifier_VarParser->types
     * @param $allow_null Whether or not to permit null as a value
     * @return Validated and type-coerced variable
     */
    final public function parse($var, $type, $allow_null = false) {
        if (is_string($type)) {
            // translate a type name into its integer constant
            if (!isset(HTMLPurifier_VarParser::$types[$type])) {
                throw new HTMLPurifier_VarParserException("Invalid type '$type'");
            }
            $type = HTMLPurifier_VarParser::$types[$type];
        }
        $var = $this->parseImplementation($var, $type, $allow_null);
        if ($allow_null && $var === null) {
            return null;
        }
        // These are basic checks, to make sure nothing horribly wrong
        // happened in our implementations.
        switch ($type) {
            case (self::STRING):
            case (self::ISTRING):
            case (self::TEXT):
            case (self::ITEXT):
                if (!is_string($var)) {
                    break;
                }
                // the case-insensitive variants are normalized to lowercase
                if ($type == self::ISTRING || $type == self::ITEXT) {
                    $var = strtolower($var);
                }
                return $var;
            case (self::INT):
                if (!is_int($var)) {
                    break;
                }
                return $var;
            case (self::FLOAT):
                if (!is_float($var)) {
                    break;
                }
                return $var;
            case (self::BOOL):
                if (!is_bool($var)) {
                    break;
                }
                return $var;
            case (self::LOOKUP):
            case (self::ALIST):
            case (self::HASH):
                if (!is_array($var)) {
                    break;
                }
                if ($type === self::LOOKUP) {
                    foreach ($var as $value) {
                        if ($value !== true) {
                            $this->error('Lookup table contains value other than true');
                        }
                    }
                } elseif ($type === self::ALIST) {
                    // a list must be keyed 0..n-1 in order
                    $keys = array_keys($var);
                    if (array_keys($keys) !== $keys) {
                        $this->error('Indices for list are not uniform');
                    }
                }
                return $var;
            case (self::MIXED):
                return $var;
            default:
                $this->errorInconsistent(get_class($this), $type);
        }
        // reached when a type check above bailed out with break
        $this->errorGeneric($var, $type);
    }

    /**
     * Actually implements the parsing. Base implementation is to not
     * do anything to $var. Subclasses should overload this!
     */
    protected function parseImplementation($var, $type, $allow_null) {
        return $var;
    }

    /**
     * Throws an exception.
     */
    protected function error($msg) {
        throw new HTMLPurifier_VarParserException($msg);
    }

    /**
     * Throws an inconsistency exception.
     * @note This should not ever be called. It would be called if we
     *       extend the allowed values of HTMLPurifier_VarParser without
     *       updating subclasses.
     */
    protected function errorInconsistent($class, $type) {
        throw new HTMLPurifier_Exception("Inconsistency in $class: ".HTMLPurifier_VarParser::getTypeName($type)." not implemented");
    }

    /**
     * Generic error for if a type didn't work.
     */
    protected function errorGeneric($var, $type) {
        $vtype = gettype($var);
        $this->error("Expected type ".HTMLPurifier_VarParser::getTypeName($type).", got $vtype");
    }

    /**
     * Maps an integer type constant back to its string name.
     * @param $type Integer type constant
     * @return Type name from $types, or 'unknown' if there is none
     */
    static public function getTypeName($type) {
        static $lookup;
        if (!$lookup) {
            // Lazy load the alternative lookup table
            $lookup = array_flip(HTMLPurifier_VarParser::$types);
        }
        return isset($lookup[$type]) ? $lookup[$type] : 'unknown';
    }

}
/**
 * Exception type for HTMLPurifier_VarParser
 */
class HTMLPurifier_VarParserException extends HTMLPurifier_Exception
{
    // Marker subclass: adds nothing beyond its own type, so callers can
    // catch parser failures separately from other HTMLPurifier_Exceptions.
}
/**
 * Validates the HTML attribute style, otherwise known as CSS.
 * @note We don't implement the whole CSS specification, so it might be
 *       difficult to reuse this component in the context of validating
 *       actual stylesheet declarations.
 * @note If we were really serious about validating the CSS, we would
 *       tokenize the styles and then parse the tokens. Obviously, we
 *       are not doing that. Doing that could seriously harm performance,
 *       but would make these components a lot more viable for a CSS
 *       filtering solution.
 */
class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
{

    /**
     * Validates a style attribute value declaration by declaration.
     *
     * @param string $css     Raw value of the style attribute
     * @param object $config  Configuration; supplies the CSS definition
     * @param object $context Context; 'CurrentCSSProperty' is registered
     *                        for the duration of the call
     * @return string|bool Re-serialised "prop:value;" list, or false when no
     *                     declaration survives
     */
    public function validate($css, $config, $context) {

        $css = $this->parseCDATA($css);

        $definition = $config->getCSSDefinition();

        // we're going to break the spec and explode by semicolons.
        // This is because semicolon rarely appears in escaped form
        // Doing this is generally flaky but fast
        // IT MIGHT APPEAR IN URIs, see HTMLPurifier_AttrDef_CSSURI
        // for details

        $declarations = explode(';', $css);
        $propvalues = array();

        /**
         * Name of the current CSS property being validated.
         */
        $property = false;
        $context->register('CurrentCSSProperty', $property);

        foreach ($declarations as $declaration) {
            if (!$declaration) continue;
            // also skips a declaration whose ':' is the very first character
            if (!strpos($declaration, ':')) continue;
            list($property, $value) = explode(':', $declaration, 2);
            $property = trim($property);
            $value    = trim($value);
            // look the property up as written, then lower-cased
            $ok = false;
            do {
                if (isset($definition->info[$property])) {
                    $ok = true;
                    break;
                }
                if (ctype_lower($property)) break;
                $property = strtolower($property);
                if (isset($definition->info[$property])) {
                    $ok = true;
                    break;
                }
            } while (0);
            if (!$ok) continue;
            // inefficient call, since the validator will do this again
            if (strtolower(trim($value)) !== 'inherit') {
                // inherit works for everything (but only on the base property)
                $result = $definition->info[$property]->validate(
                    $value, $config, $context );
            } else {
                $result = 'inherit';
            }
            if ($result === false) continue;
            // keyed by property, so a later duplicate overwrites an earlier one
            $propvalues[$property] = $result;
        }

        $context->destroy('CurrentCSSProperty');

        // procedure does not write the new CSS simultaneously, so it's
        // slightly inefficient, but it's the only way of getting rid of
        // duplicates. Perhaps config to optimize it, but not now.

        $new_declarations = '';
        foreach ($propvalues as $prop => $value) {
            $new_declarations .= "$prop:$value;";
        }

        return $new_declarations ? $new_declarations : false;

    }

}
// Enum = Enumerated
/**
 * Validates a keyword against a list of valid values.
 * @warning The case-insensitive compare of this function uses PHP's
 *          built-in strtolower and ctype_lower functions, which may
 *          cause problems with international comparisons
 */
class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
{

    /**
     * Lookup table of valid values.
     * @todo Make protected
     */
    public $valid_values   = array();

    /**
     * Bool indicating whether or not enumeration is case sensitive.
     * @note In general this is always case insensitive.
     */
    protected $case_sensitive = false; // values according to W3C spec

    /**
     * @param $valid_values List of valid values
     * @param $case_sensitive Bool indicating whether or not case sensitive
     */
    public function __construct(
        $valid_values = array(), $case_sensitive = false
    ) {
        // flipped so that membership is a single isset() on the key
        $this->valid_values = array_flip($valid_values);
        $this->case_sensitive = $case_sensitive;
    }

    /**
     * @param string $string Candidate keyword
     * @return string|bool Trimmed (and, when case-insensitive, lower-cased)
     *                     keyword, or false if it is not in the list
     */
    public function validate($string, $config, $context) {
        $string = trim($string);
        if (!$this->case_sensitive) {
            // we may want to do full case-insensitive libraries
            $string = ctype_lower($string) ? $string : strtolower($string);
        }
        $result = isset($this->valid_values[$string]);

        return $result ? $string : false;
    }

    /**
     * @param $string In form of comma-delimited list of case-insensitive
     *      valid values. Example: "foo,bar,baz". Prepend "s:" to make
     *      case sensitive
     * @return HTMLPurifier_AttrDef_Enum
     */
    public function make($string) {
        if (strlen($string) > 2 && $string[0] == 's' && $string[1] == ':') {
            $string = substr($string, 2);
            $sensitive = true;
        } else {
            $sensitive = false;
        }
        $values = explode(',', $string);
        return new HTMLPurifier_AttrDef_Enum($values, $sensitive);
    }

}
/**
 * Validates an integer.
 * @note While this class was modeled off the CSS definition, no currently
 *       allowed CSS uses this type.  The properties that do are: widows,
 *       orphans, z-index, counter-increment, counter-reset.  Some of the
 *       HTML attributes, however, find use for a non-negative version of this.
 */
class HTMLPurifier_AttrDef_Integer extends HTMLPurifier_AttrDef
{

    /**
     * Bool indicating whether or not negative values are allowed
     */
    protected $negative = true;

    /**
     * Bool indicating whether or not zero is allowed
     */
    protected $zero = true;

    /**
     * Bool indicating whether or not positive values are allowed
     */
    protected $positive = true;

    /**
     * @param $negative Bool indicating whether or not negative values are allowed
     * @param $zero Bool indicating whether or not zero is allowed
     * @param $positive Bool indicating whether or not positive values are allowed
     */
    public function __construct(
        $negative = true, $zero = true, $positive = true
    ) {
        $this->negative = $negative;
        $this->zero     = $zero;
        $this->positive = $positive;
    }

    /**
     * @param string $integer Candidate integer
     * @return string|bool Normalised integer string (no leading '+', no
     *                     '-0'), or false if it is malformed or out of scope
     */
    public function validate($integer, $config, $context) {

        $integer = $this->parseCDATA($integer);
        if ($integer === '') return false;

        // we could possibly simply typecast it to integer, but there are
        // certain fringe cases that must not return an integer.

        // clip leading sign
        if ( $this->negative && $integer[0] === '-' ) {
            $digits = substr($integer, 1);
            if ($digits === '0') $integer = '0'; // rm minus sign for zero
        } elseif( $this->positive && $integer[0] === '+' ) {
            $digits = $integer = substr($integer, 1); // rm unnecessary plus
        } else {
            // a disallowed sign stays in $digits and fails ctype_digit below
            $digits = $integer;
        }

        // test if it's numeric
        if (!ctype_digit($digits)) return false;

        // perform scope tests
        if (!$this->zero     && $integer == 0) return false;
        if (!$this->positive && $integer > 0) return false;
        if (!$this->negative && $integer < 0) return false;

        return $integer;

    }

}
/**
 * Validates the HTML attribute lang, effectively a language code.
 * @note Built according to RFC 3066, which obsoleted RFC 1766
 */
class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
{

    /**
     * @param string $string Candidate language tag
     * @return string|bool Lower-cased tag, truncated at the first invalid
     *                     subtag after the primary one, or false when the
     *                     primary subtag itself is invalid
     */
    public function validate($string, $config, $context) {

        $string = trim($string);
        if (!$string) return false;

        $subtags = explode('-', $string);
        $num_subtags = count($subtags);

        if ($num_subtags == 0) return false; // sanity check

        // process primary subtag : $subtags[0]
        $length = strlen($subtags[0]);
        switch ($length) {
            case 0:
                return false;
            case 1:
                // only the one-letter prefixes 'x' and 'i' are accepted
                if (! ($subtags[0] == 'x' || $subtags[0] == 'i') ) {
                    return false;
                }
                break;
            case 2:
            case 3:
                if (! ctype_alpha($subtags[0]) ) {
                    return false;
                } elseif (! ctype_lower($subtags[0]) ) {
                    $subtags[0] = strtolower($subtags[0]);
                }
                break;
            default:
                return false;
        }

        $new_string = $subtags[0];
        if ($num_subtags == 1) return $new_string;

        // process second subtag : $subtags[1]
        $length = strlen($subtags[1]);
        if ($length == 0 || ($length == 1 && $subtags[1] != 'x') || $length > 8 || !ctype_alnum($subtags[1])) {
            // invalid second subtag: keep what has been validated so far
            return $new_string;
        }
        if (!ctype_lower($subtags[1])) $subtags[1] = strtolower($subtags[1]);

        $new_string .= '-' . $subtags[1];
        if ($num_subtags == 2) return $new_string;

        // process all other subtags, index 2 and up
        for ($i = 2; $i < $num_subtags; $i++) {
            $length = strlen($subtags[$i]);
            if ($length == 0 || $length > 8 || !ctype_alnum($subtags[$i])) {
                return $new_string;
            }
            if (!ctype_lower($subtags[$i])) {
                $subtags[$i] = strtolower($subtags[$i]);
            }
            $new_string .= '-' . $subtags[$i];
        }

        return $new_string;

    }

}
/**
 * Decorator that, depending on a token, switches between two definitions.
 */
class HTMLPurifier_AttrDef_Switch
{

    /** Tag name that selects $withTag */
    protected $tag;
    protected $withTag, $withoutTag;

    /**
     * @param string $tag Tag name to switch upon
     * @param HTMLPurifier_AttrDef $with_tag Call if token matches tag
     * @param HTMLPurifier_AttrDef $without_tag Call if token doesn't match, or there is no token
     */
    public function __construct($tag, $with_tag, $without_tag) {
        $this->tag = $tag;
        $this->withTag = $with_tag;
        $this->withoutTag = $without_tag;
    }

    /**
     * Delegates to one of the two wrapped definitions.
     *
     * @param string $string  Attribute value
     * @param object $config  Passed through to the chosen definition
     * @param object $context Read for 'CurrentToken', then passed through
     * @return mixed Whatever the chosen definition's validate() returns
     */
    public function validate($string, $config, $context) {
        $token = $context->get('CurrentToken', true);
        if (!$token || $token->name !== $this->tag) {
            return $this->withoutTag->validate($string, $config, $context);
        } else {
            return $this->withTag->validate($string, $config, $context);
        }
    }

}
/**
 * Validates arbitrary text according to the HTML spec.
 */
class HTMLPurifier_AttrDef_Text extends HTMLPurifier_AttrDef
{

    /**
     * @param string $string Attribute value
     * @return string The value after parseCDATA() processing
     */
    public function validate($string, $config, $context) {
        return $this->parseCDATA($string);
    }

}
/**
 * Validates a URI as defined by RFC 3986.
 * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
 */
class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
{

    /** HTMLPurifier_URIParser used to split the string into a URI object */
    protected $parser;
    protected $embedsResource;

    /**
     * @param $embeds_resource Does the URI here result in an extra HTTP request?
     */
    public function __construct($embeds_resource = false) {
        $this->parser = new HTMLPurifier_URIParser();
        $this->embedsResource = (bool) $embeds_resource;
    }

    /**
     * @param string $string Cast to bool and used as the embeds-resource flag
     * @return HTMLPurifier_AttrDef_URI
     */
    public function make($string) {
        $embeds = (bool) $string;
        return new HTMLPurifier_AttrDef_URI($embeds);
    }

    /**
     * @param string $uri     Candidate URI
     * @param object $config  Configuration ('URI.Disable', URI definition)
     * @param object $context Context; 'EmbeddedURI' is registered for the
     *                        duration of the checks
     * @return string|bool Re-serialised URI, or false if any stage rejects it
     */
    public function validate($uri, $config, $context) {

        if ($config->get('URI.Disable')) return false;

        $uri = $this->parseCDATA($uri);

        // parse the URI
        $uri = $this->parser->parse($uri);
        if ($uri === false) return false;

        // add embedded flag to context for validators
        $context->register('EmbeddedURI', $this->embedsResource);

        // single-pass loop: any failing stage breaks out with $ok still
        // false, so the context entry below is always destroyed
        $ok = false;
        do {

            // generic validation
            $result = $uri->validate($config, $context);
            if (!$result) break;

            // chained filtering
            $uri_def = $config->getDefinition('URI');
            $result = $uri_def->filter($uri, $config, $context);
            if (!$result) break;

            // scheme-specific validation
            $scheme_obj = $uri->getSchemeObj($config, $context);
            if (!$scheme_obj) break;
            if ($this->embedsResource && !$scheme_obj->browsable) break;
            $result = $scheme_obj->validate($uri, $config, $context);
            if (!$result) break;

            // Post chained filtering
            $result = $uri_def->postFilter($uri, $config, $context);
            if (!$result) break;

            // survived gauntlet
            $ok = true;

        } while (false);

        $context->destroy('EmbeddedURI');
        if (!$ok) return false;

        // back to string
        return $uri->toString();

    }

}
/**
 * Validates a number as defined by the CSS spec.
 */
class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
{

    /**
     * Bool indicating whether or not only positive values allowed.
     */
    protected $non_negative = false;

    /**
     * @param $non_negative Bool indicating whether negatives are forbidden
     */
    public function __construct($non_negative = false) {
        $this->non_negative = $non_negative;
    }

    /**
     * @warning Some contexts do not pass $config, $context. These
     *          variables should not be used without checking HTMLPurifier_Length
     * @param string $number Candidate number
     * @return string|bool Normalised number (redundant zeros and a leading
     *                     '+' removed), or false if it is malformed
     */
    public function validate($number, $config, $context) {

        $number = $this->parseCDATA($number);

        if ($number === '') return false;
        if ($number === '0') return '0';

        $sign = '';
        switch ($number[0]) {
            case '-':
                if ($this->non_negative) return false;
                $sign = '-';
                // deliberate fall-through: strip the sign character
            case '+':
                $number = substr($number, 1);
        }

        if (ctype_digit($number)) {
            $number = ltrim($number, '0');
            return $number ? $sign . $number : '0';
        }

        // Period is the only non-numeric character allowed
        if (strpos($number, '.') === false) return false;

        list($left, $right) = explode('.', $number, 2);

        if ($left === '' && $right === '') return false;
        if ($left !== '' && !ctype_digit($left)) return false;

        $left  = ltrim($left,  '0');
        $right = rtrim($right, '0');

        if ($right === '') {
            return $left ? $sign . $left : '0';
        } elseif (!ctype_digit($right)) {
            return false;
        }

        return $sign . $left . '.' . $right;

    }

}
/**
 * Validates a CSS number and clamps it to the range 0 to 1.
 */
class HTMLPurifier_AttrDef_CSS_AlphaValue extends HTMLPurifier_AttrDef_CSS_Number
{

    public function __construct() {
        parent::__construct(false); // opacity is non-negative, but we will clamp it
    }

    /**
     * @param string $number Candidate number
     * @return string|bool '0' for values below 0, '1' for values above 1,
     *                     otherwise the parent's normalised number; false
     *                     if the parent rejects the input
     */
    public function validate($number, $config, $context) {
        $result = parent::validate($number, $config, $context);
        if ($result === false) return $result;
        $float = (float) $result;
        if ($float < 0.0) $result = '0';
        if ($float > 1.0) $result = '1';
        return $result;
    }

}
/**
 * Validates shorthand CSS property background.
 * @warning Does not support url tokens that have internal spaces.
 */
class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
{

    /**
     * Local copy of component validators.
     * @note See HTMLPurifier_AttrDef_Font::$info for a similar impl.
     */
    protected $info;

    public function __construct($config) {
        $def = $config->getCSSDefinition();
        $this->info['background-color'] = $def->info['background-color'];
        $this->info['background-image'] = $def->info['background-image'];
        $this->info['background-repeat'] = $def->info['background-repeat'];
        $this->info['background-attachment'] = $def->info['background-attachment'];
        $this->info['background-position'] = $def->info['background-position'];
    }

    /**
     * @param string $string Candidate shorthand value
     * @return string|bool Space-joined validated components in the order
     *                     color, image, repeat, attachment, position; false
     *                     if nothing validates
     */
    public function validate($string, $config, $context) {

        // regular pre-processing
        $string = $this->parseCDATA($string);
        if ($string === '') return false;

        // munge rgb() decl if necessary
        $string = $this->mungeRgb($string);

        // assumes URI doesn't have spaces in it
        $bits = explode(' ', strtolower($string)); // bits to process

        $caught = array();
        $caught['color']    = false;
        $caught['image']    = false;
        $caught['repeat']   = false;
        $caught['attachment'] = false;
        $caught['position'] = false;

        $i = 0; // number of catches

        foreach ($bits as $bit) {
            if ($bit === '') continue;
            foreach ($caught as $key => $status) {
                if ($key != 'position') {
                    if ($status !== false) continue;
                    $r = $this->info['background-' . $key]->validate($bit, $config, $context);
                } else {
                    // position is last in $caught, so it collects every bit
                    // the other validators turned down; checked as a whole below
                    $r = $bit;
                }
                if ($r === false) continue;
                if ($key == 'position') {
                    if ($caught[$key] === false) $caught[$key] = '';
                    $caught[$key] .= $r . ' ';
                } else {
                    $caught[$key] = $r;
                }
                $i++;
                break;
            }
        }

        if (!$i) return false;
        if ($caught['position'] !== false) {
            $caught['position'] = $this->info['background-position']->
                validate($caught['position'], $config, $context);
        }

        $ret = array();
        foreach ($caught as $value) {
            if ($value === false) continue;
            $ret[] = $value;
        }

        if (empty($ret)) return false;
        return implode(' ', $ret);

    }

}
/* W3C says:
    [ // adjective and number must be in correct order, even if
      // you could switch them without introducing ambiguity.
      // some browsers support that syntax
        [
            <percentage> | <length> | left | center | right
        ]
        [
            <percentage> | <length> | top | center | bottom
        ]?
    ] |
    [ // this signifies that the vertical and horizontal adjectives
      // can be arbitrarily ordered, however, there can only be two,
      // one of each, or none at all
        [
            left | center | right
        ] ||
        [
            top | center | bottom
        ]
    ]
    top, left = 0%
    center, (none) = 50%
    bottom, right = 100%
*/

/* QuirksMode says:
    keyword + length/percentage must be ordered correctly, as per W3C

    Internet Explorer and Opera, however, support arbitrary ordering. We
    should fix it up.

    Minor issue though, not strictly necessary.
*/

// control freaks may appreciate the ability to convert these to
// percentages or something, but it's not necessary
/**
 * Validates the value of background-position.
 */
class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef
{

    /** Validator for length measures */
    protected $length;
    /** Validator for percentage measures */
    protected $percentage;

    public function __construct() {
        $this->length     = new HTMLPurifier_AttrDef_CSS_Length();
        $this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage();
    }

    /**
     * @param string $string Candidate background-position value
     * @return string|bool At most two space-separated components,
     *                     horizontal first; false if nothing validates
     */
    public function validate($string, $config, $context) {
        $string = $this->parseCDATA($string);
        $bits = explode(' ', $string);

        $keywords = array();
        $keywords['h'] = false; // left, right
        $keywords['v'] = false; // top, bottom
        $keywords['ch'] = false; // center (first word)
        $keywords['cv'] = false; // center (second word)
        $measures = array();

        $i = 0; // number of values caught so far

        $lookup = array(
            'top' => 'v',
            'bottom' => 'v',
            'left' => 'h',
            'right' => 'h',
            'center' => 'c'
        );

        foreach ($bits as $bit) {
            if ($bit === '') continue;

            // test for keyword
            $lbit = ctype_lower($bit) ? $bit : strtolower($bit);
            if (isset($lookup[$lbit])) {
                $status = $lookup[$lbit];
                if ($status == 'c') {
                    // 'center' is horizontal if it comes first, else vertical
                    if ($i == 0) {
                        $status = 'ch';
                    } else {
                        $status = 'cv';
                    }
                }
                $keywords[$status] = $lbit;
                $i++;
            }

            // test for length
            $r = $this->length->validate($bit, $config, $context);
            if ($r !== false) {
                $measures[] = $r;
                $i++;
            }

            // test for percentage
            $r = $this->percentage->validate($bit, $config, $context);
            if ($r !== false) {
                $measures[] = $r;
                $i++;
            }

        }

        if (!$i) return false; // no valid values were caught

        $ret = array();

        // first keyword
        if     ($keywords['h'])     $ret[] = $keywords['h'];
        elseif ($keywords['ch']) {
            $ret[] = $keywords['ch'];
            $keywords['cv'] = false; // prevent re-use: center = center center
        }
        elseif (count($measures))   $ret[] = array_shift($measures);

        // second keyword
        if     ($keywords['v'])     $ret[] = $keywords['v'];
        elseif ($keywords['cv'])    $ret[] = $keywords['cv'];
        elseif (count($measures))   $ret[] = array_shift($measures);

        if (empty($ret)) return false;
        return implode(' ', $ret);

    }

}
/**
 * Validates the border property as defined by CSS.
 */
class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
{

    /**
     * Local copy of properties this property is shorthand for.
     */
    protected $info = array();

    public function __construct($config) {
        $def = $config->getCSSDefinition();
        $this->info['border-width'] = $def->info['border-width'];
        $this->info['border-style'] = $def->info['border-style'];
        $this->info['border-top-color'] = $def->info['border-top-color'];
    }

    /**
     * @param string $string Candidate border shorthand
     * @return string Validated components joined by single spaces, in input
     *                order; empty string when nothing validates
     */
    public function validate($string, $config, $context) {
        $string = $this->parseCDATA($string);
        $string = $this->mungeRgb($string);
        $bits = explode(' ', $string);
        $done = array(); // segments we've finished
        $ret = ''; // return value
        foreach ($bits as $bit) {
            foreach ($this->info as $propname => $validator) {
                // each sub-property may be supplied at most once
                if (isset($done[$propname])) continue;
                $r = $validator->validate($bit, $config, $context);
                if ($r !== false) {
                    $ret .= $r . ' ';
                    $done[$propname] = true;
                    break;
                }
            }
        }
        return rtrim($ret);
    }

}
/**
 * Validates Color as defined by CSS.
 */
class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
{

    /**
     * @param string $color   Candidate color: keyword, rgb() triad or hex
     * @param object $config  Configuration; 'Core.ColorKeywords' is read once
     *                        and cached in a function-static variable
     * @param object $context Unused
     * @return string|bool Normalised color, or false if it is invalid
     */
    public function validate($color, $config, $context) {

        static $colors = null;
        if ($colors === null) $colors = $config->get('Core.ColorKeywords');

        $color = trim($color);
        if ($color === '') return false;

        $lower = strtolower($color);
        if (isset($colors[$lower])) return $colors[$lower];

        // Must START with 'rgb(': the triad is cut out at fixed offset 4
        // below, so a match elsewhere in the string would be mis-parsed.
        if (strpos($color, 'rgb(') === 0) {
            // rgb literal handling
            $length = strlen($color);
            if (strpos($color, ')') !== $length - 1) return false;
            $triad = substr($color, 4, $length - 4 - 1);
            $parts = explode(',', $triad);
            if (count($parts) !== 3) return false;
            $type = false; // to ensure that they're all the same type
            $new_parts = array();
            foreach ($parts as $part) {
                $part = trim($part);
                if ($part === '') return false;
                $length = strlen($part);
                if ($part[$length - 1] === '%') {
                    // handle percents
                    if (!$type) {
                        $type = 'percentage';
                    } elseif ($type !== 'percentage') {
                        return false;
                    }
                    $num = (float) substr($part, 0, $length - 1);
                    if ($num < 0) $num = 0;
                    if ($num > 100) $num = 100;
                    $new_parts[] = "$num%";
                } else {
                    // handle integers
                    if (!$type) {
                        $type = 'integer';
                    } elseif ($type !== 'integer') {
                        return false;
                    }
                    $num = (int) $part;
                    if ($num < 0) $num = 0;
                    if ($num > 255) $num = 255;
                    $new_parts[] = (string) $num;
                }
            }
            $new_triad = implode(',', $new_parts);
            $color = "rgb($new_triad)";
        } else {
            // hexadecimal handling
            if ($color[0] === '#') {
                $hex = substr($color, 1);
            } else {
                // bare hex digits: add the missing hash to the output
                $hex = $color;
                $color = '#' . $color;
            }
            $length = strlen($hex);
            if ($length !== 3 && $length !== 6) return false;
            if (!ctype_xdigit($hex)) return false;
        }

        return $color;

    }

}
/**
 * Allows multiple validators to attempt to validate attribute.
 *
 * Composite is just what it sounds like: a composite of many validators.
 * This means that multiple HTMLPurifier_AttrDef objects will have a whack
 * at the string.  If one of them passes, that's what is returned.  This is
 * especially useful for CSS values, which often are a choice between
 * an enumerated set of predefined values or a flexible data type.
 */
class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef
{

    /**
     * List of HTMLPurifier_AttrDef objects that may process strings
     * @todo Make protected
     */
    public $defs;

    /**
     * @param $defs List of HTMLPurifier_AttrDef objects
     */
    public function __construct($defs) {
        $this->defs = $defs;
    }

    /**
     * @param string $string Attribute value
     * @return mixed Result of the first definition that does not return
     *               false; false if every definition rejects the value
     */
    public function validate($string, $config, $context) {
        foreach ($this->defs as $i => $def) {
            $result = $this->defs[$i]->validate($string, $config, $context);
            if ($result !== false) return $result;
        }
        return false;
    }

}
/**
 * Decorator which enables CSS properties to be disabled for specific elements.
 */
class HTMLPurifier_AttrDef_CSS_DenyElementDecorator extends HTMLPurifier_AttrDef
{
    public $def, $element;

    /**
     * @param $def Definition to wrap
     * @param $element Element to deny
     */
    public function __construct($def, $element) {
        $this->def = $def;
        $this->element = $element;
    }
    /**
     * Checks if CurrentToken is set and equal to $this->element
     *
     * @return mixed false when the current token is the denied element,
     *               otherwise the wrapped definition's result
     */
    public function validate($string, $config, $context) {
        $token = $context->get('CurrentToken', true);
        if ($token && $token->name == $this->element) return false;
        return $this->def->validate($string, $config, $context);
    }
}
/**
 * Microsoft's proprietary filter: CSS property
 * @note Currently supports the alpha filter. In the future, this will
 *       probably need an extensible framework
 */
class HTMLPurifier_AttrDef_CSS_Filter extends HTMLPurifier_AttrDef
{

    /** Validator applied to the opacity parameter */
    protected $intValidator;

    public function __construct() {
        $this->intValidator = new HTMLPurifier_AttrDef_Integer();
    }

    /**
     * @param string $value Candidate filter value
     * @return string|bool 'none', or "function(opacity=N)" with N clamped to
     *                     0..100 (the parameter list may be empty); false if
     *                     the function is not an alpha filter
     */
    public function validate($value, $config, $context) {
        $value = $this->parseCDATA($value);
        if ($value === 'none') return $value;
        // if we looped this we could support multiple filters
        $function_length = strcspn($value, '(');
        $function = trim(substr($value, 0, $function_length));
        if ($function !== 'alpha' &&
            $function !== 'Alpha' &&
            $function !== 'progid:DXImageTransform.Microsoft.Alpha'
            ) return false;
        $cursor = $function_length + 1;
        $parameters_length = strcspn($value, ')', $cursor);
        $parameters = substr($value, $cursor, $parameters_length);
        $params = explode(',', $parameters);
        $ret_params = array();
        $lookup = array(); // parameter names already emitted
        foreach ($params as $param) {
            // A parameter without '=' cannot be key=value; skip it instead of
            // letting list() read a missing array offset.
            if (strpos($param, '=') === false) continue;
            list($key, $param_value) = explode('=', $param);
            $key         = trim($key);
            $param_value = trim($param_value);
            if (isset($lookup[$key])) continue;
            if ($key !== 'opacity') continue;
            $param_value = $this->intValidator->validate($param_value, $config, $context);
            if ($param_value === false) continue;
            $int = (int) $param_value;
            if ($int > 100) $param_value = '100';
            if ($int < 0) $param_value = '0';
            $ret_params[] = "$key=$param_value";
            $lookup[$key] = true;
        }
        $ret_parameters = implode(',', $ret_params);
        $ret_function = "$function($ret_parameters)";
        return $ret_function;
    }

}
/**
 * Validates shorthand CSS property font.
 */
class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef
{

    /**
     * Local copy of component validators.
     *
     * @note If we moved specific CSS property definitions to their own
     *       classes instead of having them be assembled at run time by
     *       CSSDefinition, this wouldn't be necessary.  We'd instantiate
     *       our own copies.
     */
    protected $info = array();

    public function __construct($config) {
        $def = $config->getCSSDefinition();
        $this->info['font-style']   = $def->info['font-style'];
        $this->info['font-variant'] = $def->info['font-variant'];
        $this->info['font-weight']  = $def->info['font-weight'];
        $this->info['font-size']    = $def->info['font-size'];
        $this->info['line-height']  = $def->info['line-height'];
        $this->info['font-family']  = $def->info['font-family'];
    }

    /**
     * Parses the shorthand in three stages: optional style/variant/weight,
     * then font-size with optional "/line-height", then font-family.
     *
     * @param string $string Candidate font shorthand
     * @return string|bool Normalised shorthand or lower-cased system font
     *                     keyword; false if size or family is missing/invalid
     */
    public function validate($string, $config, $context) {

        static $system_fonts = array(
            'caption' => true,
            'icon' => true,
            'menu' => true,
            'message-box' => true,
            'small-caption' => true,
            'status-bar' => true
        );

        // regular pre-processing
        $string = $this->parseCDATA($string);
        if ($string === '') return false;

        // check if it's one of the keywords
        $lowercase_string = strtolower($string);
        if (isset($system_fonts[$lowercase_string])) {
            return $lowercase_string;
        }

        $bits = explode(' ', $string); // bits to process
        $stage = 0; // this indicates what we're looking for
        $caught = array(); // which stage 0 properties have we caught?
        $stage_1 = array('font-style', 'font-variant', 'font-weight');
        $final = ''; // output

        for ($i = 0, $size = count($bits); $i < $size; $i++) {
            if ($bits[$i] === '') continue;
            switch ($stage) {

                // attempting to catch font-style, font-variant or font-weight
                case 0:
                    foreach ($stage_1 as $validator_name) {
                        if (isset($caught[$validator_name])) continue;
                        $r = $this->info[$validator_name]->validate(
                                                $bits[$i], $config, $context);
                        if ($r !== false) {
                            $final .= $r . ' ';
                            $caught[$validator_name] = true;
                            break;
                        }
                    }
                    // all three caught, continue on
                    if (count($caught) >= 3) $stage = 1;
                    if ($r !== false) break;
                    // no match: deliberate fall-through, the bit may be a size

                // attempting to catch font-size and perhaps line-height
                case 1:
                    $found_slash = false;
                    if (strpos($bits[$i], '/') !== false) {
                        list($font_size, $line_height) =
                                                    explode('/', $bits[$i]);
                        if ($line_height === '') {
                            // ooh, there's a space after the slash!
                            $line_height = false;
                            $found_slash = true;
                        }
                    } else {
                        $font_size = $bits[$i];
                        $line_height = false;
                    }
                    $r = $this->info['font-size']->validate(
                                              $font_size, $config, $context);
                    if ($r !== false) {
                        $final .= $r;
                        // attempt to catch line-height
                        if ($line_height === false) {
                            // we need to scroll forward
                            for ($j = $i + 1; $j < $size; $j++) {
                                if ($bits[$j] === '') continue;
                                if ($bits[$j] === '/') {
                                    if ($found_slash) {
                                        return false;
                                    } else {
                                        $found_slash = true;
                                        continue;
                                    }
                                }
                                $line_height = $bits[$j];
                                break;
                            }
                        } else {
                            // slash already found
                            $found_slash = true;
                            $j = $i;
                        }
                        if ($found_slash) {
                            // skip over the bits consumed by the look-ahead
                            $i = $j;
                            $r = $this->info['line-height']->validate(
                                              $line_height, $config, $context);
                            if ($r !== false) {
                                $final .= '/' . $r;
                            }
                        }
                        $final .= ' ';
                        $stage = 2;
                        break;
                    }
                    return false;

                // attempting to catch font-family
                case 2:
                    $font_family =
                        implode(' ', array_slice($bits, $i, $size - $i));
                    $r = $this->info['font-family']->validate(
                                              $font_family, $config, $context);
                    if ($r !== false) {
                        $final .= $r . ' ';
                        // processing completed successfully
                        return rtrim($final);
                    }
                    return false;
            }
        }
        return false;
    }

}
9109 * Validates a font family list according to CSS spec
9111 class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
9114 protected $mask = null;
9116 public function __construct() {
9118 for ($c = 'a'; $c <= 'z'; $c++) $this->mask .= $c;
9119 for ($c = 'A'; $c <= 'Z'; $c++) $this->mask .= $c;
9120 for ($c = '0'; $c <= '9'; $c++) $this->mask .= $c; // cast-y, but should be fine
9121 // special bytes used by UTF-8
9122 for ($i = 0x80; $i <= 0xFF; $i++) {
9123 // We don't bother excluding invalid bytes in this range,
9124 // because the our restriction of well-formed UTF-8 will
9125 // prevent these from ever occurring.
9126 $this->mask .= chr($i);
9130 PHP's internal strcspn implementation is
9131 O(length of string * length of mask), making it inefficient
9132 for large masks. However, it's still faster than
9137 if (*spanp == c || p == s1_end) {
9140 } while (spanp++ < (s2_end - 1));
9144 // possible optimization: invert the mask.
9147 public function validate($string, $config, $context) {
9148 static $generic_names = array(
9150 'sans-serif' => true,
9151 'monospace' => true,
9155 $allowed_fonts = $config->get('CSS.AllowedFonts');
9157 // assume that no font names contain commas in them
9158 $fonts = explode(',', $string);
9160 foreach($fonts as $font) {
9161 $font = trim($font);
9162 if ($font === '') continue;
9163 // match a generic name
9164 if (isset($generic_names[$font])) {
9165 if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
9166 $final .= $font . ', ';
9170 // match a quoted name
9171 if ($font[0] === '"' || $font[0] === "'") {
9172 $length = strlen($font);
9173 if ($length <= 2) continue;
9175 if ($font[$length - 1] !== $quote) continue;
9176 $font = substr($font, 1, $length - 2);
9179 $font = $this->expandCSSEscape($font);
9181 // $font is a pure representation of the font name
9183 if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
9187 if (ctype_alnum($font) && $font !== '') {
9188 // very simple font, allow it in unharmed
9189 $final .= $font . ', ';
9193 // bugger out on whitespace. form feed (0C) really
9194 // shouldn't show up regardless
9195 $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);
9197 // Here, there are various classes of characters which need
9198 // to be treated differently:
9199 // - Alphanumeric characters are essentially safe. We
9200 // handled these above.
9201 // - Spaces require quoting, though most parsers will do
9202 // the right thing if there aren't any characters that
9203 // can be misinterpreted
9204 // - Dashes rarely occur, but they fairly unproblematic
9205 // for parsing/rendering purposes.
9206 // The above characters cover the majority of Western font
9208 // - Arbitrary Unicode characters not in ASCII. Because
9209 // most parsers give little thought to Unicode, treatment
9210 // of these codepoints is basically uniform, even for
9211 // punctuation-like codepoints. These characters can
9212 // show up in non-Western pages and are supported by most
9213 // major browsers, for example: "MS 明朝" is a
9214 // legitimate font-name
9215 // <http://ja.wikipedia.org/wiki/MS_明朝>. See
9216 // the CSS3 spec for more examples:
9217 // <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
9218 // You can see live samples of these on the Internet:
9219 // <http://www.google.co.jp/search?q=font-family+MS+明朝|ゴシック>
9220 // However, most of these fonts have ASCII equivalents:
9221 // for example, 'MS Mincho', and it's considered
9222 // professional to use ASCII font names instead of
9223 // Unicode font names. Thanks Takeshi Terada for
9224 // providing this information.
9225 // The following characters, to my knowledge, have not been
9226 // used to name font names.
9227 // - Single quote. While theoretically you might find a
9228 // font name that has a single quote in its name (serving
9229 // as an apostrophe, e.g. Dave's Scribble), I haven't
9230 // been able to find any actual examples of this.
9231 // Internet Explorer's cssText translation (which I
9232 // believe is invoked by innerHTML) normalizes any
9233 // quoting to single quotes, and fails to escape single
9234 // quotes. (Note that this is not IE's behavior for all
9235 // CSS properties, just some sort of special casing for
9236 // font-family). So a single quote *cannot* be used
9237 // safely in the font-family context if there will be an
9238 // innerHTML/cssText translation. Note that Firefox 3.x
9240 // - Double quote. In IE, these get normalized to
9241 // single-quotes, no matter what the encoding. (Fun
9242 // fact, in IE8, the 'content' CSS property gained
9243 // support, where they special cased to preserve encoded
9244 // double quotes, but still translate unadorned double
9245 // quotes into single quotes.) So, because their
9246 // fixpoint behavior is identical to single quotes, they
9247 // cannot be allowed either. Firefox 3.x displays
9248 // single-quote style behavior.
9249 // - Backslashes are reduced by one (so \\ -> \) every
9250 // iteration, so they cannot be used safely. This shows
9251 // up in IE7, IE8 and FF3
9252 // - Semicolons, commas and backticks are handled properly.
9253 // - The rest of the ASCII punctuation is handled properly.
9254 // We haven't checked what browsers do to unadorned
9255 // versions, but this is not important as long as the
9256 // browser doesn't /remove/ surrounding quotes (as IE does
9259 // With these results in hand, we conclude that there are
9260 // various levels of safety:
9261 // - Paranoid: alphanumeric, spaces and dashes(?)
9262 // - International: Paranoid + non-ASCII Unicode
9263 // - Edgy: Everything except quotes, backslashes
9264 // - NoJS: Standards compliance, e.g. sod IE. Note that
9265 // with some judicious character escaping (since certain
9266 // types of escaping doesn't work) this is theoretically
9267 // OK as long as innerHTML/cssText is not called.
9268 // We believe that international is a reasonable default
9269 // (that we will implement now), and once we do more
9270 // extensive research, we may feel comfortable with dropping
9273 // Edgy: alphanumeric, spaces, dashes and Unicode. Use of
9274 // str(c)spn assumes that the string was already well formed
9275 // Unicode (which of course it is).
9276 if (strspn($font, $this->mask) !== strlen($font)) {
9281 // In the absence of innerHTML/cssText, these ugly
9282 // transforms don't pose a security risk (as \\ and \"
9283 // might--these escapes are not supported by most browsers).
9284 // We could try to be clever and use single-quote wrapping
9285 // when there is a double quote present, but I have chosen
9286 // not to implement that. (NOTE: you can reduce the amount
9287 // of escapes by one depending on what quoting style you use)
9288 // $font = str_replace('\\', '\\5C ', $font);
9289 // $font = str_replace('"', '\\22 ', $font);
9290 // $font = str_replace("'", '\\27 ', $font);
9292 // font possibly with spaces, requires quoting
9293 $final .= "'$font', ";
9295 $final = rtrim($final, ', ');
9296 if ($final === '') return false;
/**
 * Decorator which enables !important to be used in CSS values.
 */
class HTMLPurifier_AttrDef_CSS_ImportantDecorator extends HTMLPurifier_AttrDef
{

    public $def, $allow;

    /**
     * @param $def Definition to wrap
     * @param $allow Whether or not to allow !important
     */
    public function __construct($def, $allow = false) {
        $this->def = $def;
        $this->allow = $allow;
    }

    /**
     * Intercepts and removes !important if necessary.
     *
     * The trailing "!important" (optionally written "! important") is
     * stripped, the remainder is handed to the wrapped definition, and the
     * marker is re-appended only when $allow is set.
     *
     * @return Validated value (with " !important" re-attached when allowed),
     *         or false when the wrapped definition rejects the value.
     */
    public function validate($string, $config, $context) {
        // test for ! and important tokens
        $string = trim($string);
        $is_important = false;
        // :TODO: optimization: test directly for !important and ! important
        if (strlen($string) >= 9 && substr($string, -9) === 'important') {
            $temp = rtrim(substr($string, 0, -9));
            // use a temp, because we might want to restore important
            if (strlen($temp) >= 1 && substr($temp, -1) === '!') {
                $string = rtrim(substr($temp, 0, -1));
                $is_important = true;
            }
        }
        $string = $this->def->validate($string, $config, $context);
        // A rejected value must stay rejected: appending to false would
        // turn it into the truthy string " !important".
        if ($string === false) return false;
        if ($this->allow && $is_important) $string .= ' !important';
        return $string;
    }

}
/**
 * Represents a Length as defined by CSS.
 */
class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
{

    protected $min, $max;

    /**
     * @param HTMLPurifier_Length $min Minimum length, or null for no bound. String is also acceptable.
     * @param HTMLPurifier_Length $max Maximum length, or null for no bound. String is also acceptable.
     */
    public function __construct($min = null, $max = null) {
        $this->min = ($min === null) ? null : HTMLPurifier_Length::make($min);
        $this->max = ($max === null) ? null : HTMLPurifier_Length::make($max);
    }

    /**
     * Validates a CSS length, enforcing the configured bounds.
     *
     * @return Normalized length string, or false if invalid / out of range.
     */
    public function validate($string, $config, $context) {
        $string = $this->parseCDATA($string);

        // Cheap rejections/acceptances before building a length object
        if ($string === '') return false;
        if ($string === '0') return '0';
        if (strlen($string) === 1) return false;

        $candidate = HTMLPurifier_Length::make($string);
        if (!$candidate->isValid()) return false;

        if ($this->min) {
            $cmp = $candidate->compareTo($this->min);
            // false means the comparison could not be made
            if ($cmp === false || $cmp < 0) return false;
        }
        if ($this->max) {
            $cmp = $candidate->compareTo($this->max);
            if ($cmp === false || $cmp > 0) return false;
        }

        return $candidate->toString();
    }

}
/**
 * Validates shorthand CSS property list-style.
 * @warning Does not support url tokens that have internal spaces.
 */
class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef
{

    /**
     * Local copy of component validators.
     * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
     */
    protected $info;

    public function __construct($config) {
        $def = $config->getCSSDefinition();
        $this->info['list-style-type']     = $def->info['list-style-type'];
        $this->info['list-style-position'] = $def->info['list-style-position'];
        $this->info['list-style-image']    = $def->info['list-style-image'];
    }

    /**
     * Splits the shorthand on spaces and matches each piece against the
     * type, position and image validators (first match wins per piece).
     *
     * @return Space-joined "type image position" string built from the
     *         components that validated, or false if none did.
     */
    public function validate($string, $config, $context) {

        // regular pre-processing
        $string = $this->parseCDATA($string);
        if ($string === '') return false;

        // assumes URI doesn't have spaces in it
        $bits = explode(' ', strtolower($string)); // bits to process

        $caught = array();
        $caught['type']     = false;
        $caught['position'] = false;
        $caught['image']    = false;

        $i = 0; // number of catches
        $none = false;

        foreach ($bits as $bit) {
            // All three components are filled; stop scanning. This used to
            // be a bare "return;", which handed null back to the caller
            // instead of the value assembled below.
            if ($i >= 3) break;
            if ($bit === '') continue;
            foreach ($caught as $key => $status) {
                if ($status !== false) continue;
                $r = $this->info['list-style-' . $key]->validate($bit, $config, $context);
                if ($r === false) continue;
                if ($r === 'none') {
                    // only the first "none" is honoured, and never as the image
                    if ($none) continue;
                    else $none = true;
                    if ($key == 'image') continue;
                }
                $caught[$key] = $r;
                $i++;
                break;
            }
        }

        if (!$i) return false;

        $ret = array();

        // construct type
        if ($caught['type']) $ret[] = $caught['type'];

        // construct image
        if ($caught['image']) $ret[] = $caught['image'];

        // construct position
        if ($caught['position']) $ret[] = $caught['position'];

        if (empty($ret)) return false;
        return implode(' ', $ret);

    }

}
/**
 * Framework class for strings that involve multiple values.
 *
 * Certain CSS properties such as border-width and margin allow multiple
 * lengths to be specified. This class can take a vanilla border-width
 * definition and multiply it, usually into a max of four.
 *
 * @note Even though the CSS specification isn't clear about it, inherit
 *       can only be used alone: it will never manifest as part of a multi
 *       shorthand declaration. Thus, this class does not allow inherit.
 */
class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
{

    /**
     * Instance of component definition to defer validation to.
     * @todo Make protected
     */
    public $single;

    /**
     * Max number of values allowed.
     * @todo Make protected
     */
    public $max;

    /**
     * @param $single HTMLPurifier_AttrDef to multiply
     * @param $max Max number of values allowed (usually four)
     */
    public function __construct($single, $max = 4) {
        $this->single = $single;
        $this->max = $max;
    }

    /**
     * Validates each space-separated piece with the component definition,
     * keeping at most $max accepted pieces.
     *
     * @return Accepted pieces joined by single spaces, or false if none.
     */
    public function validate($string, $config, $context) {
        $string = $this->parseCDATA($string);
        if ($string === '') return false;

        $accepted = array();
        // parseCDATA replaced \r, \t and \n, so a plain space split suffices
        foreach (explode(' ', $string) as $piece) {
            if (count($accepted) >= $this->max) break;
            if (ctype_space($piece)) continue;
            $validated = $this->single->validate($piece, $config, $context);
            if ($validated !== false) {
                $accepted[] = $validated;
            }
        }

        if (!$accepted) return false;
        return rtrim(implode(' ', $accepted));
    }

}
/**
 * Validates a Percentage as defined by the CSS spec.
 */
class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
{

    /**
     * Instance of HTMLPurifier_AttrDef_CSS_Number to defer number validation
     */
    protected $number_def;

    /**
     * @param Bool indicating whether to forbid negative values
     */
    public function __construct($non_negative = false) {
        $this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
    }

    /**
     * Accepts "<number>%"; the numeric part is checked by the number
     * definition.
     *
     * @return Validated number followed by "%", or false.
     */
    public function validate($string, $config, $context) {

        $string = $this->parseCDATA($string);

        $size = strlen($string);
        // need at least one character for the number plus the percent sign
        if ($string === '' || $size === 1) return false;
        if (substr($string, -1) !== '%') return false;

        $numeric = $this->number_def->validate(
            substr($string, 0, $size - 1), $config, $context
        );
        if ($numeric === false) return false;

        return $numeric . '%';

    }

}
/**
 * Validates the value for the CSS property text-decoration
 * @note This class could be generalized into a version that acts sort of
 *       like Enum except you can compound the allowed values.
 */
class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
{

    /**
     * Keeps only the recognized decoration keywords, in input order.
     *
     * @return "none", a space-joined list of recognized keywords, or false
     *         when no keyword was recognized.
     */
    public function validate($string, $config, $context) {

        static $allowed_values = array(
            'line-through' => true,
            'overline' => true,
            'underline' => true,
        );

        $string = strtolower($this->parseCDATA($string));

        if ($string === 'none') return $string;

        $kept = array();
        foreach (explode(' ', $string) as $keyword) {
            if (!isset($allowed_values[$keyword])) continue;
            $kept[] = $keyword;
        }

        $final = implode(' ', $kept);
        if ($final === '') return false;
        return $final;

    }

}
/**
 * Validates a URI in CSS syntax, which uses url('http://example.com')
 * @note While theoretically speaking a URI in a CSS document could
 *       be non-embedded, as of CSS2 there is no such usage so we're
 *       generalizing it. This may need to be changed in the future.
 * @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
 *          the separator, you cannot put a literal semicolon in
 *          the URI. Try percent encoding it, in that case.
 */
class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
{

    public function __construct() {
        parent::__construct(true); // always embedded
    }

    /**
     * Extracts the URI from a url(...) token, validates it with the parent
     * URI definition and re-wraps it as url("...").
     *
     * @return Normalized url("...") string, or false if malformed/invalid.
     */
    public function validate($uri_string, $config, $context) {
        // parse the URI out of the string and then pass it onto
        // the parent object

        $uri_string = $this->parseCDATA($uri_string);
        if (strpos($uri_string, 'url(') !== 0) return false;
        // cast: older PHP versions return false instead of '' here
        $uri_string = (string) substr($uri_string, 4);
        $new_length = strlen($uri_string) - 1;
        // nothing follows "url(", so there is no closing parenthesis; bail
        // out before indexing offset -1 of an empty string
        if ($new_length < 0) return false;
        if ($uri_string[$new_length] != ')') return false;
        $uri = trim((string) substr($uri_string, 0, $new_length));

        if ($uri !== '' && ($uri[0] == "'" || $uri[0] == '"')) {
            $quote = $uri[0];
            $new_length = strlen($uri) - 1;
            // a single quote character is an unterminated string, not an
            // empty quoted one: require distinct opening and closing quotes
            if ($new_length < 1 || $uri[$new_length] !== $quote) return false;
            $uri = (string) substr($uri, 1, $new_length - 1);
        }

        $uri = $this->expandCSSEscape($uri);

        $result = parent::validate($uri, $config, $context);

        if ($result === false) return false;

        // extra sanity check; should have been done by URI
        $result = str_replace(array('"', "\\", "\n", "\x0c", "\r"), "", $result);

        // suspicious characters are ()'; we're going to percent encode
        // them for safety.
        $result = str_replace(array('(', ')', "'"), array('%28', '%29', '%27'), $result);

        // there's an extra bug where ampersands lose their escaping on
        // an innerHTML cycle, so a very unlucky query parameter could
        // then change the meaning of the URL.  Unfortunately, there's
        // not much we can do about that...

        return "url(\"$result\")";

    }

}
/**
 * Validates a boolean attribute
 */
class HTMLPurifier_AttrDef_HTML_Bool extends HTMLPurifier_AttrDef
{

    protected $name;
    public $minimized = true;

    public function __construct($name = false) {
        $this->name = $name;
    }

    /**
     * @return The attribute name for any non-empty value, false otherwise.
     */
    public function validate($string, $config, $context) {
        return empty($string) ? false : $this->name;
    }

    /**
     * @param $string Name of attribute
     */
    public function make($string) {
        $definition = new HTMLPurifier_AttrDef_HTML_Bool($string);
        return $definition;
    }

}
/**
 * Validates contents based on NMTOKENS attribute type.
 */
class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
{

    /**
     * @return Surviving tokens joined by single spaces, or false if the
     *         input is empty/"0" or no token survives.
     */
    public function validate($string, $config, $context) {

        $string = trim($string);

        // early abort: '' and '0' (strings that convert to false) are invalid
        if ($string === '' || $string === '0') return false;

        $tokens = $this->filter(
            $this->split($string, $config, $context),
            $config,
            $context
        );
        return empty($tokens) ? false : implode(' ', $tokens);

    }

    /**
     * Splits a space separated list of tokens into its constituent parts.
     */
    protected function split($string, $config, $context) {
        // OPTIMIZABLE!
        // do the preg_match, capture all subpatterns for reformulation

        // we don't support U+00A1 and up codepoints or
        // escaping because I don't know how to do that with regexps
        // and plus it would complicate optimization efforts (you never
        // see that anyway).
        $leading  = '/(?:(?<=\s)|\A)';   // look behind for space or string start
        $token    = '((?:--|-?[A-Za-z_])[A-Za-z_\-0-9]*)';
        $trailing = '(?:(?=\s)|\z)/';    // look ahead for space or string end
        preg_match_all($leading . $token . $trailing, $string, $matches);
        return $matches[1];
    }

    /**
     * Template method for removing certain tokens based on arbitrary criteria.
     * @note If we wanted to be really functional, we'd do an array_filter
     *       with a callback. But... we're not.
     */
    protected function filter($tokens, $config, $context) {
        return $tokens;
    }

}
/**
 * Implements special behavior for class attribute (normally NMTOKENS)
 */
class HTMLPurifier_AttrDef_HTML_Class extends HTMLPurifier_AttrDef_HTML_Nmtokens
{

    /**
     * Uses the strict NMTOKENS splitter for XHTML 1.1 / XHTML 2.0 and a
     * plain whitespace split for every other doctype.
     */
    protected function split($string, $config, $context) {
        // really, this twiddle should be lazy loaded
        $name = $config->getDefinition('HTML')->doctype->name;
        $strict = ($name == "XHTML 1.1" || $name == "XHTML 2.0");
        if (!$strict) {
            return preg_split('/\s+/', $string);
        }
        return parent::split($string, $config, $context);
    }

    /**
     * Drops tokens that are not allowed, are forbidden, or are duplicates.
     */
    protected function filter($tokens, $config, $context) {
        $allowed = $config->get('Attr.AllowedClasses');
        $forbidden = $config->get('Attr.ForbiddenClasses');
        $ret = array();
        foreach ($tokens as $token) {
            if ($allowed !== null && !isset($allowed[$token])) continue;
            if (isset($forbidden[$token])) continue;
            // We need this O(n) check because of PHP's array
            // implementation that casts -0 to 0.
            if (in_array($token, $ret, true)) continue;
            $ret[] = $token;
        }
        return $ret;
    }

}
/**
 * Validates a color according to the HTML spec.
 */
class HTMLPurifier_AttrDef_HTML_Color extends HTMLPurifier_AttrDef
{

    /**
     * Accepts a configured color keyword or a 3/6 digit hex color (with or
     * without the leading "#").
     *
     * @return The keyword's configured value or a "#rrggbb" string, or
     *         false if the input is not a recognizable color.
     */
    public function validate($string, $config, $context) {

        static $colors = null;
        if ($colors === null) $colors = $config->get('Core.ColorKeywords');

        $string = trim($string);

        if (empty($string)) return false;
        // exact match first, so keywords configured with unusual casing
        // keep working
        if (isset($colors[$string])) return $colors[$string];
        // HTML color names are case-insensitive: "Red" and "RED" must
        // resolve just like "red"
        $lower = strtolower($string);
        if (isset($colors[$lower])) return $colors[$lower];
        if ($string[0] === '#') $hex = substr($string, 1);
        else $hex = $string;

        $length = strlen($hex);
        if ($length !== 3 && $length !== 6) return false;
        if (!ctype_xdigit($hex)) return false;
        // expand shorthand #rgb to #rrggbb
        if ($length === 3) $hex = $hex[0].$hex[0].$hex[1].$hex[1].$hex[2].$hex[2];

        return "#$hex";

    }

}
/**
 * Special-case enum attribute definition that lazy loads allowed frame targets
 */
class HTMLPurifier_AttrDef_HTML_FrameTarget extends HTMLPurifier_AttrDef_Enum
{

    public $valid_values = false; // uninitialized value
    protected $case_sensitive = false;

    public function __construct() {}

    /**
     * Loads the allowed targets from %Attr.AllowedFrameTargets on first use,
     * then defers to the parent enum validation.
     */
    public function validate($string, $config, $context) {
        if ($this->valid_values === false) {
            $this->valid_values = $config->get('Attr.AllowedFrameTargets');
        }
        $validated = parent::validate($string, $config, $context);
        return $validated;
    }

}
9848 * Validates the HTML attribute ID.
9849 * @warning Even though this is the id processor, it
9850 * will ignore the directive Attr:IDBlacklist, since it will only
9851 * go according to the ID accumulator. Since the accumulator is
9852 * automatically generated, it will have already absorbed the
9853 * blacklist. If you're hacking around, make sure you use load()!
9856 class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
9859 // ref functionality disabled, since we also have to verify
9860 // whether or not the ID it refers to exists
9862 public function validate($id, $config, $context) {
9864 if (!$config->get('Attr.EnableID')) return false;
9866 $id = trim($id); // trim it first
9868 if ($id === '') return false;
9870 $prefix = $config->get('Attr.IDPrefix');
9871 if ($prefix !== '') {
9872 $prefix .= $config->get('Attr.IDPrefixLocal');
9873 // prevent re-appending the prefix
9874 if (strpos($id, $prefix) !== 0) $id = $prefix . $id;
9875 } elseif ($config->get('Attr.IDPrefixLocal') !== '') {
9876 trigger_error('%Attr.IDPrefixLocal cannot be used unless '.
9877 '%Attr.IDPrefix is set', E_USER_WARNING);
9880 //if (!$this->ref) {
9881 $id_accumulator =& $context->get('IDAccumulator');
9882 if (isset($id_accumulator->ids[$id])) return false;
9885 // we purposely avoid using regex, hopefully this is faster
9887 if (ctype_alpha($id)) {
9890 if (!ctype_alpha(@$id[0])) return false;
9891 $trim = trim( // primitive style of regexps, I suppose
9895 $result = ($trim === '');
9898 $regexp = $config->get('Attr.IDBlacklistRegexp');
9899 if ($regexp && preg_match($regexp, $id)) {
9903 if (/*!$this->ref && */$result) $id_accumulator->add($id);
9905 // if no change was made to the ID, return the result
9906 // else, return the new id if stripping whitespace made it
9907 // valid, or return false.
9908 return $result ? $id : false;
/**
 * Validates an integer representation of pixels according to the HTML spec.
 */
class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
{

    protected $max;

    public function __construct($max = null) {
        $this->max = $max;
    }

    /**
     * Accepts a number with an optional "px" suffix and clamps it to the
     * range [0, $max].
     *
     * @return Integer pixel count as a string, or false if not numeric.
     */
    public function validate($string, $config, $context) {

        $string = trim($string);
        if ($string === '0') return $string;
        if ($string === '')  return false;

        $size = strlen($string);
        $has_unit = (substr($string, $size - 2) == 'px');
        if ($has_unit) {
            $string = substr($string, 0, $size - 2);
        }
        if (!is_numeric($string)) return false;

        $pixels = (int) $string;
        if ($pixels < 0) return '0';

        // upper-bound value, extremely high values can
        // crash operating systems, see <http://ha.ckers.org/imagecrash.html>
        // WARNING, above link WILL crash you if you're using Windows
        if ($this->max !== null && $pixels > $this->max) {
            return (string) $this->max;
        }

        return (string) $pixels;

    }

    /**
     * Builds a new instance of the same class; an empty string means
     * "no upper bound", anything else is cast to an integer maximum.
     */
    public function make($string) {
        $max = ($string === '') ? null : (int) $string;
        $class = get_class($this);
        return new $class($max);
    }

}
/**
 * Validates the HTML type length (not to be confused with CSS's length).
 *
 * This accepts integer pixels or percentages as lengths for certain
 * HTML attributes.
 */
class HTMLPurifier_AttrDef_HTML_Length extends HTMLPurifier_AttrDef_HTML_Pixels
{

    /**
     * Tries pixel validation first, then falls back to a percentage that
     * is clamped to the range 0%-100%.
     *
     * @return Pixel string, percentage string, or false.
     */
    public function validate($string, $config, $context) {

        $string = trim($string);
        if ($string === '') return false;

        $as_pixels = parent::validate($string, $config, $context);
        if ($as_pixels !== false) return $as_pixels;

        // not pixels: the only other accepted form ends in a percent sign
        if (substr($string, -1) !== '%') return false;

        $points = substr($string, 0, strlen($string) - 1);
        if (!is_numeric($points)) return false;

        $points = (int) $points;
        if ($points < 0)   return '0%';
        if ($points > 100) return '100%';

        return ((string) $points) . '%';

    }

}
/**
 * Validates a rel/rev link attribute against a directive of allowed values
 * @note We cannot use Enum because link types allow multiple
 *       values.
 * @note Assumes link types are ASCII text
 */
class HTMLPurifier_AttrDef_HTML_LinkTypes extends HTMLPurifier_AttrDef
{

    /** Name config attribute to pull. */
    protected $name;

    /**
     * @param $name Attribute being validated, either 'rel' or 'rev'
     */
    public function __construct($name) {
        $configLookup = array(
            'rel' => 'AllowedRel',
            'rev' => 'AllowedRev'
        );
        if (isset($configLookup[$name])) {
            $this->name = $configLookup[$name];
            return;
        }
        trigger_error('Unrecognized attribute name for link '.
            'relationship.', E_USER_ERROR);
    }

    /**
     * Keeps the allowed link types, lower-cased and de-duplicated, in
     * order of first appearance.
     *
     * @return Space-joined link types, or false if none are allowed.
     */
    public function validate($string, $config, $context) {

        $allowed = $config->get('Attr.' . $this->name);
        if (empty($allowed)) return false;

        $string = $this->parseCDATA($string);

        // keyed by link type so duplicates collapse
        $seen = array();
        foreach (explode(' ', $string) as $raw) {
            $type = strtolower(trim($raw));
            if (isset($allowed[$type])) {
                $seen[$type] = true;
            }
        }

        if (empty($seen)) return false;
        return implode(' ', array_keys($seen));

    }

}
/**
 * Validates a MultiLength as defined by the HTML spec.
 *
 * A multilength is either a integer (pixel count), a percentage, or
 * a relative number.
 */
class HTMLPurifier_AttrDef_HTML_MultiLength extends HTMLPurifier_AttrDef_HTML_Length
{

    /**
     * Tries the parent length validation first, then the relative "n*"
     * form.
     *
     * @return Length string, "*", "0", "n*", or false.
     */
    public function validate($string, $config, $context) {

        $string = trim($string);
        if ($string === '') return false;

        $as_length = parent::validate($string, $config, $context);
        if ($as_length !== false) return $as_length;

        // relative lengths are the only remaining form; they end in "*"
        if (substr($string, -1) !== '*') return false;

        $int = substr($string, 0, strlen($string) - 1);

        if ($int == '') return '*';
        if (!is_numeric($int)) return false;

        $int = (int) $int;

        if ($int < 0) return false;
        if ($int == 0) return '0';
        if ($int == 1) return '*';
        return ((string) $int) . '*';

    }

}
10105 abstract class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
10109 * Unpacks a mailbox into its display-name and address
10111 function unpack($string) {
10112 // needs to be implemented
10117 // sub-implementations
/**
 * Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
 */
class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
{

    /**
     * Instance of HTMLPurifier_AttrDef_URI_IPv4 sub-validator
     */
    protected $ipv4;

    /**
     * Instance of HTMLPurifier_AttrDef_URI_IPv6 sub-validator
     */
    protected $ipv6;

    public function __construct() {
        $this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
        $this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
    }

    /**
     * @return '' for an empty host, a bracketed IPv6 literal, an IPv4
     *         address, the unchanged domain name, or false if invalid.
     */
    public function validate($string, $config, $context) {
        $length = strlen($string);
        // empty hostname is OK; it's usually semantically equivalent:
        // the default host as defined by a URI scheme is used:
        //
        //      If the URI scheme defines a default for host, then that
        //      default applies when the host subcomponent is undefined
        //      or when the registered name is empty (zero length).
        if ($string === '') return '';

        $bracketed = $length > 1 && $string[0] === '[' && $string[$length-1] === ']';
        if ($bracketed) {
            // IPv6 literal: validate the part between the brackets
            $inner = substr($string, 1, $length - 2);
            $valid = $this->ipv6->validate($inner, $config, $context);
            return ($valid === false) ? false : '['. $valid . ']';
        }

        // need to do checks on unusual encodings too
        $ipv4 = $this->ipv4->validate($string, $config, $context);
        if ($ipv4 !== false) return $ipv4;

        // A regular domain name.

        // This breaks I18N domain names, but we don't have proper IRI support,
        // so force users to insert Punycode. If there's complaining we'll
        // try to fix things into an international friendly form.

        // The productions describing this are:
        $alpha        = '[a-z]';       // alpha
        $alphanum     = '[a-z0-9]';    // alphanum
        $alphanumDash = '[a-z0-9-]';   // alphanum | "-"
        // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
        $domainlabel = $alphanum . '(' . $alphanumDash . '*' . $alphanum . ')?';
        // toplabel    = alpha | alpha *( alphanum | "-" ) alphanum
        $toplabel    = $alpha . '(' . $alphanumDash . '*' . $alphanum . ')?';
        // hostname    = *( domainlabel "." ) toplabel [ "." ]
        $pattern = '/^(' . $domainlabel . '\.)*' . $toplabel . '\.?$/i';
        if (!preg_match($pattern, $string)) return false;

        return $string;
    }

}
/**
 * Validates an IPv4 address
 * @author Feyd @ forums.devnetwork.net (public domain)
 */
class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef
{

    /**
     * IPv4 regex, protected so that IPv6 can reuse it
     */
    protected $ip4;

    /**
     * @return The address unchanged when it is a dotted quad of 0-255
     *         octets, false otherwise.
     */
    public function validate($aIP, $config, $context) {

        if (!$this->ip4) $this->_loadRegex();

        $matched = preg_match('#^' . $this->ip4 . '$#s', $aIP);
        return $matched ? $aIP : false;

    }

    /**
     * Lazy load function to prevent regex from being stuffed in
     * cache.
     */
    protected function _loadRegex() {
        $oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255
        // four octets separated by literal dots
        $this->ip4 = '(?:' . implode('\\.', array($oct, $oct, $oct, $oct)) . ')';
    }

}
10233 * Validates an IPv6 address.
10234 * @author Feyd @ forums.devnetwork.net (public domain)
10235 * @note This function requires brackets to have been removed from address
10238 class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4
10241 public function validate($aIP, $config, $context) {
10243 if (!$this->ip4) $this->_loadRegex();
10247 $hex = '[0-9a-fA-F]';
10248 $blk = '(?:' . $hex . '{1,4})';
10249 $pre = '(?:/(?:12[0-8]|1[0-1][0-9]|[1-9][0-9]|[0-9]))'; // /0 - /128
10252 if (strpos($aIP, '/') !== false)
10254 if (preg_match('#' . $pre . '$#s', $aIP, $find))
10256 $aIP = substr($aIP, 0, 0-strlen($find[0]));
10265 // IPv4-compatibility check
10266 if (preg_match('#(?<=:'.')' . $this->ip4 . '$#s', $aIP, $find))
10268 $aIP = substr($aIP, 0, 0-strlen($find[0]));
10269 $ip = explode('.', $find[0]);
10270 $ip = array_map('dechex', $ip);
10271 $aIP .= $ip[0] . $ip[1] . ':' . $ip[2] . $ip[3];
10275 // compression check
10276 $aIP = explode('::', $aIP);
10284 list($first, $second) = $aIP;
10285 $first = explode(':', $first);
10286 $second = explode(':', $second);
10288 if (count($first) + count($second) > 8)
10293 while(count($first) < 8)
10295 array_push($first, '0');
10298 array_splice($first, 8 - count($second), 8, $second);
10300 unset($first,$second);
10304 $aIP = explode(':', $aIP[0]);
10313 // All the pieces should be 16-bit hex strings. Are they?
10314 foreach ($aIP as $piece)
10316 if (!preg_match('#^[0-9a-fA-F]{4}$#s', sprintf('%04s', $piece)))
/**
 * Primitive email validation class based on the regexp found at
 * http://www.regular-expressions.info/email.html
 */
class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_URI_Email
{

    /**
     * @return The trimmed address when it matches the simple pattern,
     *         false otherwise.
     */
    public function validate($string, $config, $context) {
        // no support for named mailboxes i.e. "Bob <bob@example.com>"
        // that needs more percent encoding to be done
        if ($string == '') return false;
        $address = trim($string);
        if (preg_match('/^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4}$/i', $address)) {
            return $address;
        }
        return false;
    }

}
/**
 * Pre-transform that changes proprietary background attribute to CSS.
 */
class HTMLPurifier_AttrTransform_Background extends HTMLPurifier_AttrTransform {

    /**
     * Moves the background attribute's value into a background-image
     * declaration that is prepended to the style attribute.
     */
    public function transform($attr, $config, $context) {

        if (isset($attr['background'])) {
            $image = $this->confiscateAttr($attr, 'background');
            // some validation should happen here
            $this->prependCSS($attr, 'background-image:url(' . $image . ');');
        }

        return $attr;

    }

}
10378 // this MUST be placed in post, as it assumes that any value in dir is valid
/**
 * Post-transform that ensures that bdo tags have the dir attribute set.
 */
class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform
{

    /**
     * Fills in dir from %Attr.DefaultTextDir when it is missing.
     */
    public function transform($attr, $config, $context) {
        if (!isset($attr['dir'])) {
            $attr['dir'] = $config->get('Attr.DefaultTextDir');
        }
        return $attr;
    }

}
/**
 * Pre-transform that changes deprecated bgcolor attribute to CSS.
 */
class HTMLPurifier_AttrTransform_BgColor extends HTMLPurifier_AttrTransform {

    /**
     * Moves the bgcolor attribute's value into a background-color
     * declaration that is prepended to the style attribute.
     */
    public function transform($attr, $config, $context) {

        if (isset($attr['bgcolor'])) {
            $color = $this->confiscateAttr($attr, 'bgcolor');
            // some validation should happen here
            $this->prependCSS($attr, 'background-color:' . $color . ';');
        }

        return $attr;

    }

}
/**
 * Pre-transform that converts a boolean attribute to fixed CSS
 */
class HTMLPurifier_AttrTransform_BoolToCSS extends HTMLPurifier_AttrTransform {

    /**
     * Name of boolean attribute that is trigger
     */
    protected $attr;

    /**
     * CSS declarations to add to style, needs trailing semicolon
     */
    protected $css;

    /**
     * @param $attr string attribute name to convert from
     * @param $css string CSS declarations to add to style (needs semicolon)
     */
    public function __construct($attr, $css) {
        $this->attr = $attr;
        $this->css  = $css;
    }

    /**
     * Removes the trigger attribute when present and prepends the fixed
     * CSS declarations to the style attribute.
     */
    public function transform($attr, $config, $context) {
        $trigger = $this->attr;
        if (isset($attr[$trigger])) {
            unset($attr[$trigger]);
            $this->prependCSS($attr, $this->css);
        }
        return $attr;
    }

}
/**
 * Pre-transform that changes deprecated border attribute to CSS.
 */
class HTMLPurifier_AttrTransform_Border extends HTMLPurifier_AttrTransform {

    /**
     * Moves the border attribute's value into a "border:<n>px solid;"
     * declaration that is prepended to the style attribute.
     */
    public function transform($attr, $config, $context) {
        if (isset($attr['border'])) {
            $width = $this->confiscateAttr($attr, 'border');
            // some validation should happen here
            $this->prependCSS($attr, 'border:' . $width . 'px solid;');
        }
        return $attr;
    }

}
/**
 * Generic pre-transform that converts an attribute with a fixed number of
 * values (enumerated) to CSS.
 */
class HTMLPurifier_AttrTransform_EnumToCSS extends HTMLPurifier_AttrTransform {

    /**
     * Name of attribute to transform from
     */
    protected $attr;

    /**
     * Lookup array of attribute values to CSS
     */
    protected $enumToCSS = array();

    /**
     * Case sensitivity of the matching
     * @warning Currently can only be guaranteed to work with ASCII
     *          values.
     */
    protected $caseSensitive = false;

    /**
     * @param $attr String attribute name to transform from
     * @param $enumToCSS Lookup array of attribute values to CSS
     * @param $case_sensitive Boolean case sensitivity indicator, default false
     */
    public function __construct($attr, $enum_to_css, $case_sensitive = false) {
        $this->attr = $attr;
        $this->enumToCSS = $enum_to_css;
        $this->caseSensitive = (bool) $case_sensitive;
    }

    /**
     * Always removes the source attribute when present; prepends the mapped
     * CSS only when the (trimmed, optionally lower-cased) value is known.
     */
    public function transform($attr, $config, $context) {

        $source = $this->attr;
        if (!isset($attr[$source])) return $attr;

        $value = trim($attr[$source]);
        unset($attr[$source]);

        if (!$this->caseSensitive) {
            $value = strtolower($value);
        }

        if (isset($this->enumToCSS[$value])) {
            $this->prependCSS($attr, $this->enumToCSS[$value]);
        }

        return $attr;

    }

}
10537 // must be called POST validation
/**
 * Transform that supplies default values for the src and alt attributes
 * in img tags, as well as prevents the img tag from being removed
 * because of a missing alt tag. This needs to be registered as both
 * a pre and post attribute transform.
 */
class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
{

    /**
     * Supplies src from %Attr.DefaultInvalidImage (unless
     * %Core.RemoveInvalidImg is set) and alt from the configured defaults
     * or the image's base name.
     */
    public function transform($attr, $config, $context) {

        $had_src = isset($attr['src']);
        if (!$had_src) {
            if ($config->get('Core.RemoveInvalidImg')) return $attr;
            $attr['src'] = $config->get('Attr.DefaultInvalidImage');
        }

        if (isset($attr['alt'])) return $attr;

        if (!$had_src) {
            // the src was substituted, so use the "invalid image" alt text
            $attr['alt'] = $config->get('Attr.DefaultInvalidImageAlt');
            return $attr;
        }

        $alt = $config->get('Attr.DefaultImageAlt');
        if ($alt !== null) {
            $attr['alt'] = $alt;
        } else {
            // truncate if the alt is too long
            $attr['alt'] = substr(basename($attr['src']),0,40);
        }

        return $attr;

    }

}
/**
 * Pre-transform that changes deprecated hspace and vspace attributes to CSS
 */
class HTMLPurifier_AttrTransform_ImgSpace extends HTMLPurifier_AttrTransform {

    protected $attr;
    protected $css = array(
        'hspace' => array('left', 'right'),
        'vspace' => array('top', 'bottom')
    );

    /**
     * @param $attr Attribute to handle, 'hspace' or 'vspace'; anything else
     *        triggers an error
     */
    public function __construct($attr) {
        $this->attr = $attr;
        if (!isset($this->css[$attr])) {
            trigger_error(htmlspecialchars($attr) . ' is not valid space attribute');
        }
    }

    /**
     * Replaces the space attribute with a pair of margin declarations
     * prepended to the style attribute.
     */
    public function transform($attr, $config, $context) {

        $name = $this->attr;
        if (!isset($attr[$name])) return $attr;

        $width = $this->confiscateAttr($attr, $name);
        // some validation could happen here

        if (!isset($this->css[$name])) return $attr;

        $declarations = array();
        foreach ($this->css[$name] as $side) {
            $declarations[] = 'margin-' . $side . ':' . $width . 'px;';
        }

        $this->prependCSS($attr, implode('', $declarations));

        return $attr;

    }

}
/**
 * Performs miscellaneous cross attribute validation and filtering for
 * input elements. This is meant to be a post-transform.
 */
class HTMLPurifier_AttrTransform_Input extends HTMLPurifier_AttrTransform {

    protected $pixels;

    public function __construct() {
        $this->pixels = new HTMLPurifier_AttrDef_HTML_Pixels();
    }

    /**
     * Removes or normalizes attributes that do not fit the input's type
     * (a missing type counts as "text").
     */
    public function transform($attr, $config, $context) {
        $t = isset($attr['type']) ? strtolower($attr['type']) : 'text';
        $is_textual = ($t === 'text' || $t === 'password');
        $is_toggle  = ($t === 'radio' || $t === 'checkbox');

        if (!$is_toggle && isset($attr['checked'])) {
            unset($attr['checked']);
        }
        if (!$is_textual && isset($attr['maxlength'])) {
            unset($attr['maxlength']);
        }
        if (!$is_textual && isset($attr['size'])) {
            // for non-text inputs the size is checked as a pixel value
            $size = $this->pixels->validate($attr['size'], $config, $context);
            if ($size === false) {
                unset($attr['size']);
            } else {
                $attr['size'] = $size;
            }
        }
        if ($t !== 'image' && isset($attr['src'])) {
            unset($attr['src']);
        }
        if ($is_toggle && !isset($attr['value'])) {
            $attr['value'] = '';
        }
        return $attr;
    }

}
/**
 * Post-transform that copies lang's value to xml:lang (and vice-versa)
 * @note Theoretically speaking, this could be a pre-transform, but putting
 *       post is more efficient.
 */
class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform
{

    /**
     * When xml:lang is set it overwrites lang; otherwise lang, if set, is
     * copied to xml:lang.
     */
    public function transform($attr, $config, $context) {

        $has_lang     = isset($attr['lang']);
        $has_xml_lang = isset($attr['xml:lang']);

        if ($has_xml_lang) {
            $attr['lang'] = $attr['xml:lang'];
        } elseif ($has_lang) {
            $attr['xml:lang'] = $attr['lang'];
        }

        return $attr;

    }

}
/**
 * Class for handling width/height length attribute transformations to CSS
 */
class HTMLPurifier_AttrTransform_Length extends HTMLPurifier_AttrTransform
{

    protected $name;
    protected $cssName;

    /**
     * @param $name Attribute to transform from
     * @param $css_name CSS property to emit; defaults to $name
     */
    public function __construct($name, $css_name = null) {
        $this->name = $name;
        if ($css_name) {
            $this->cssName = $css_name;
        } else {
            $this->cssName = $name;
        }
    }

    /**
     * Moves the attribute's value into a CSS declaration, appending "px"
     * when the value consists of digits only.
     */
    public function transform($attr, $config, $context) {
        if (!isset($attr[$this->name])) return $attr;
        $value = $this->confiscateAttr($attr, $this->name);
        $unit = ctype_digit($value) ? 'px' : '';
        $this->prependCSS($attr, $this->cssName . ':' . $value . $unit . ';');
        return $attr;
    }

}
/**
 * Pre-transform that converts the deprecated name attribute into an id
 * when no id is present.
 */
class HTMLPurifier_AttrTransform_Name extends HTMLPurifier_AttrTransform
{

    /**
     * @param $attr    Assoc array of attributes
     * @param $config  Config object; HTML.Attr.Name.UseCDATA disables the transform
     * @param $context Context object (unused)
     * @return Attribute array with name removed and, if needed, id filled in
     */
    public function transform($attr, $config, $context) {
        // Nothing to do under the relaxed definition of name, or when
        // there is no name at all
        if ($config->get('HTML.Attr.Name.UseCDATA') || !isset($attr['name'])) {
            return $attr;
        }
        $former_name = $this->confiscateAttr($attr, 'name');
        if (!isset($attr['id'])) {
            // an existing id always takes precedence over the old name
            $attr['id'] = $former_name;
        }
        return $attr;
    }

}
/**
 * Post-transform that performs validation to the name attribute; if
 * it is present with an equivalent id attribute, it is passed through;
 * otherwise validation is performed.
 */
class HTMLPurifier_AttrTransform_NameSync extends HTMLPurifier_AttrTransform
{

    /**
     * ID validator applied to name. Declared explicitly (and left public,
     * as a dynamically created property would be) so that assigning it in
     * the constructor does not rely on dynamic property creation.
     */
    public $idDef;

    public function __construct() {
        $this->idDef = new HTMLPurifier_AttrDef_HTML_ID();
    }

    /**
     * @param $attr    Assoc array of attributes
     * @param $config  Config object, forwarded to the ID validator
     * @param $context Context object, forwarded to the ID validator
     * @return Attribute array whose name is either validated or removed
     */
    public function transform($attr, $config, $context) {
        if (!isset($attr['name'])) return $attr;
        $name = $attr['name'];
        // a name identical to the id needs no second validation
        if (isset($attr['id']) && $attr['id'] === $name) return $attr;
        $result = $this->idDef->validate($name, $config, $context);
        if ($result === false) {
            unset($attr['name']);
        } else {
            $attr['name'] = $result;
        }
        return $attr;
    }

}
// must be called POST validation

/**
 * Adds rel="nofollow" to all outbound links.  This transform is
 * only attached if Attr.Nofollow is TRUE.
 */
class HTMLPurifier_AttrTransform_Nofollow extends HTMLPurifier_AttrTransform
{
    /** URI parser used to inspect href values. */
    private $parser;

    public function __construct() {
        $this->parser = new HTMLPurifier_URIParser();
    }

    /**
     * @param $attr    Assoc array of attributes of the link
     * @param $config  Config object, used to resolve the URI scheme
     * @param $context Context object, used to resolve the URI scheme
     * @return Attribute array, with "nofollow" present in rel when the
     *         href has a host and a browsable scheme
     */
    public function transform($attr, $config, $context) {

        if (!isset($attr['href'])) {
            return $attr;
        }

        // XXX Kind of inefficient
        $url = $this->parser->parse($attr['href']);
        if (!is_object($url)) {
            // the parser gave us nothing to inspect; leave the link alone
            // instead of calling a method on a non-object
            return $attr;
        }
        $scheme = $url->getSchemeObj($config, $context);

        if (!is_null($url->host) && $scheme !== false && $scheme->browsable) {
            if (isset($attr['rel'])) {
                // only append when the token is not already there, so that
                // rel="nofollow" does not become rel="nofollow nofollow"
                $rels = preg_split('/\s+/', strtolower($attr['rel']), -1, PREG_SPLIT_NO_EMPTY);
                if (!in_array('nofollow', $rels, true)) {
                    $attr['rel'] .= ' nofollow';
                }
            } else {
                $attr['rel'] = 'nofollow';
            }
        }

        return $attr;

    }

}
/**
 * Forces a fixed set of attribute values onto an embed element,
 * overwriting anything supplied for them.
 */
class HTMLPurifier_AttrTransform_SafeEmbed extends HTMLPurifier_AttrTransform
{
    public $name = "SafeEmbed";

    /**
     * @param $attr    Assoc array of attributes
     * @param $config  Config object (unused)
     * @param $context Context object (unused)
     * @return Attribute array with the forced values applied
     */
    public function transform($attr, $config, $context) {
        $forced = array(
            'allowscriptaccess' => 'never',
            'allownetworking'   => 'internal',
            'type'              => 'application/x-shockwave-flash',
        );
        foreach ($forced as $key => $value) {
            $attr[$key] = $value;
        }
        return $attr;
    }
}
/**
 * Writes default type for all objects. Currently only supports flash.
 */
class HTMLPurifier_AttrTransform_SafeObject extends HTMLPurifier_AttrTransform
{
    public $name = "SafeObject";

    /**
     * Visibility is spelled out (it was implicitly public) to match the
     * other attribute transforms.
     *
     * @param $attr    Assoc array of attributes
     * @param $config  Config object (unused)
     * @param $context Context object (unused)
     * @return Attribute array with a type guaranteed to be set
     */
    public function transform($attr, $config, $context) {
        if (!isset($attr['type'])) {
            $attr['type'] = 'application/x-shockwave-flash';
        }
        return $attr;
    }
}
/**
 * Validates name/value pairs in param tags to be used in safe objects. This
 * will only allow name values it recognizes, and pre-fill certain attributes
 * with required values.
 *
 * @warning
 *      This class only supports Flash. In the future, Quicktime support
 *      may be added.
 *
 * @warning
 *      This class expects an injector to add the necessary parameters tags.
 */
class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform
{
    public $name = "SafeParam";

    /** URI validator (embedded mode) used for movie/src values. */
    private $uri;

    /**
     * Enum validator for wmode. Declared explicitly, and public as the
     * previously dynamic property was, instead of being created on the fly
     * by the constructor.
     */
    public $wmode;

    public function __construct() {
        $this->uri = new HTMLPurifier_AttrDef_URI(true); // embedded
        $this->wmode = new HTMLPurifier_AttrDef_Enum(array('window', 'opaque', 'transparent'));
    }

    /**
     * @param $attr    Assoc array of attributes of the param element
     * @param $config  Config object; HTML.FlashAllowFullScreen is consulted
     * @param $context Context object, forwarded to the validators
     * @return Attribute array; unrecognised params get name and value
     *         set to null
     */
    public function transform($attr, $config, $context) {
        // Read both attributes defensively: a param tag without name or
        // value must not raise undefined-index notices. A missing name
        // ends up in the default branch, exactly as before.
        $name  = isset($attr['name'])  ? $attr['name']  : null;
        $value = isset($attr['value']) ? $attr['value'] : null;

        // If we add support for other objects, we'll need to alter the
        // transforms.
        switch ($name) {
            // application/x-shockwave-flash
            // Keep this synchronized with Injector/SafeObject.php
            case 'allowScriptAccess':
                $attr['value'] = 'never';
                break;
            case 'allowNetworking':
                $attr['value'] = 'internal';
                break;
            case 'allowFullScreen':
                if ($config->get('HTML.FlashAllowFullScreen')) {
                    $attr['value'] = ($value == 'true') ? 'true' : 'false';
                } else {
                    $attr['value'] = 'false';
                }
                break;
            case 'wmode':
                $attr['value'] = $this->wmode->validate($value, $config, $context);
                break;
            case 'movie':
            case 'src':
                $attr['name'] = "movie";
                $attr['value'] = $this->uri->validate($value, $config, $context);
                break;
            case 'flashvars':
                // we're going to allow arbitrary inputs to the SWF, on
                // the reasoning that it could only hack the SWF, not us.
                break;
            // add other cases to support other param name/value pairs
            default:
                $attr['name'] = $attr['value'] = null;
        }
        return $attr;
    }
}
/**
 * Supplies the required type attribute for <script> when it is missing.
 */
class HTMLPurifier_AttrTransform_ScriptRequired extends HTMLPurifier_AttrTransform
{
    /**
     * @param $attr    Assoc array of attributes
     * @param $config  Config object (unused)
     * @param $context Context object (unused)
     * @return Attribute array with type guaranteed to be set
     */
    public function transform($attr, $config, $context) {
        if (isset($attr['type'])) {
            return $attr;
        }
        $attr['type'] = 'text/javascript';
        return $attr;
    }
}
/**
 * Fills in default cols/rows for <textarea> when they are absent.
 */
class HTMLPurifier_AttrTransform_Textarea extends HTMLPurifier_AttrTransform
{

    /**
     * @param $attr    Assoc array of attributes
     * @param $config  Config object (unused)
     * @param $context Context object (unused)
     * @return Attribute array with cols and rows guaranteed to be set
     */
    public function transform($attr, $config, $context) {
        // Calculated from Firefox
        $defaults = array('cols' => '22', 'rows' => '3');
        foreach ($defaults as $key => $default) {
            if (!isset($attr[$key])) {
                $attr[$key] = $default;
            }
        }
        return $attr;
    }

}
/**
 * Child definition that delegates to one of two definitions depending on
 * context.
 *
 * The del and ins tags allow different elements depending on whether they
 * sit in a block or an inline context. Chameleon models this by holding
 * two definitions and picking one at validation time. While somewhat
 * generalized, it is specifically intended for those two tags.
 */
class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
{

    /**
     * Definition object to use when inline. Usually stricter.
     */
    public $inline;

    /**
     * Definition object to use when block.
     */
    public $block;

    public $type = 'chameleon';

    /**
     * @param $inline List of elements to allow when inline.
     * @param $block List of elements to allow when block.
     */
    public function __construct($inline, $block) {
        $this->inline = new HTMLPurifier_ChildDef_Optional($inline);
        $this->block  = new HTMLPurifier_ChildDef_Optional($block);
        // the block definition's element lookup is exposed as our own
        $this->elements = $this->block->elements;
    }

    /**
     * Validates children with the block definition when the context's
     * IsInline value is exactly false, and with the inline definition
     * in every other case.
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        $delegate = ($context->get('IsInline') === false)
            ? $this->block
            : $this->inline;
        return $delegate->validateChildren($tokens_of_children, $config, $context);
    }
}
/**
 * Custom validation class, accepts DTD child definitions
 *
 * @warning Currently this class is an all or nothing proposition, that is,
 *          it will only give a bool return value.
 */
class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
{
    public $type = 'custom';
    public $allow_empty = false;
    /**
     * Allowed child pattern as defined by the DTD
     */
    public $dtd_regex;
    /**
     * PCRE regex derived from $dtd_regex
     * @private
     */
    private $_pcre_regex;
    /**
     * @param $dtd_regex Allowed child pattern from the DTD
     */
    public function __construct($dtd_regex) {
        $this->dtd_regex = $dtd_regex;
        $this->_compileRegex();
    }
    /**
     * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
     * and records every element name found in it in $this->elements.
     */
    protected function _compileRegex() {
        $raw = str_replace(' ', '', $this->dtd_regex);
        // Wrap the pattern in parentheses unless it already starts with
        // one. Square-bracket offsets are used because the curly-brace
        // form ($raw{0}) no longer parses on PHP 8; the emptiness check
        // keeps an empty pattern from reading an invalid offset.
        if ($raw === '' || $raw[0] != '(') {
            $raw = "($raw)";
        }
        $el = '[#a-zA-Z0-9_.-]+';
        $reg = $raw;

        // COMPLICATED! AND MIGHT BE BUGGY! I HAVE NO CLUE WHAT I'M
        // DOING! Seriously: if there's problems, please report them.

        // collect all elements into the $elements array
        preg_match_all("/$el/", $reg, $matches);
        foreach ($matches[0] as $match) {
            $this->elements[$match] = true;
        }

        // setup all elements as parentheticals with leading commas
        $reg = preg_replace("/$el/", '(,\\0)', $reg);

        // remove commas when they were not solicited
        $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg);

        // remove all non-parenthetical commas: they are handled by first regex
        $reg = preg_replace("/,\(/", '(', $reg);

        $this->_pcre_regex = $reg;
    }
    /**
     * Builds a comma separated list of the names of the direct,
     * non-whitespace children and matches it against the compiled regex.
     * @return Bool: whether the children match the pattern
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        $list_of_children = '';
        $nesting = 0; // depth into the nest
        foreach ($tokens_of_children as $token) {
            if (!empty($token->is_whitespace)) continue;

            $is_child = ($nesting == 0); // direct

            if ($token instanceof HTMLPurifier_Token_Start) {
                $nesting++;
            } elseif ($token instanceof HTMLPurifier_Token_End) {
                $nesting--;
            }

            if ($is_child) {
                $list_of_children .= $token->name . ',';
            }
        }
        // add leading comma to deal with stray comma declarations
        $list_of_children = ',' . rtrim($list_of_children, ',');
        $okay =
            preg_match(
                '/^,?'.$this->_pcre_regex.'$/',
                $list_of_children
            );

        return (bool) $okay;
    }
}
/**
 * Child definition for elements that may not contain anything.
 * @warning validateChildren() in this class is actually never called, because
 *          empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed
 *          before child definitions are parsed in earnest by
 *          HTMLPurifier_Strategy_FixNesting.
 */
class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
{
    public $type = 'empty';
    public $allow_empty = true;

    public function __construct() {}

    /**
     * @return Always an empty array, i.e. no children survive.
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        $no_children = array();
        return $no_children;
    }
}
/**
 * Definition that allows a set of elements, but disallows empty children.
 */
class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
{
    /**
     * Lookup table of allowed elements.
     */
    public $elements = array();

    /**
     * Whether or not the last passed node was all whitespace.
     */
    protected $whitespace = false;

    public $allow_empty = false;
    public $type = 'required';

    /**
     * @param $elements List of allowed element names (lowercase). May be a
     *        '|' separated string, a plain list, or a ready-made lookup.
     */
    public function __construct($elements) {
        if (is_string($elements)) {
            $elements = explode('|', str_replace(' ', '', $elements));
        }
        $keys = array_keys($elements);
        if ($keys == array_keys($keys)) {
            // sequentially keyed list: convert it into a name => true lookup
            $elements = array_flip($elements);
            foreach (array_keys($elements) as $element) {
                if (empty($element)) {
                    unset($elements[$element]); // remove blank
                } else {
                    $elements[$element] = true;
                }
            }
        }
        $this->elements = $elements;
    }

    /**
     * @return false when nothing (or only whitespace) remains, true when
     *         the children are fine as they are, otherwise the filtered
     *         array of tokens.
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        // Flag for subclasses
        $this->whitespace = false;

        // no tokens at all: delete parent node
        if (empty($tokens_of_children)) return false;

        $kept            = array(); // the new set of children
        $depth           = 0;       // current depth into the nest
        $dropping        = false;   // inside a disallowed direct child?
        $only_whitespace = true;    // have we seen anything but whitespace?

        // whether parsed character data is allowed; this controls whether
        // a disallowed tag is silently dropped or turned into escaped text
        $text_allowed = isset($this->elements['#PCDATA']);

        $escape    = $config->get('Core.EscapeInvalidChildren');
        $generator = new HTMLPurifier_Generator($config, $context);

        foreach ($tokens_of_children as $token) {
            if (!empty($token->is_whitespace)) {
                $kept[] = $token;
                continue;
            }
            $only_whitespace = false;

            // a token is a direct child if it was reached at depth zero
            $direct = ($depth == 0);

            if ($token instanceof HTMLPurifier_Token_Start) {
                $depth++;
            } elseif ($token instanceof HTMLPurifier_Token_End) {
                $depth--;
            }

            if ($direct) {
                // each direct child decides afresh whether we are dropping
                $dropping = !isset($this->elements[$token->name]);
            }

            if (!$dropping || ($text_allowed && $token instanceof HTMLPurifier_Token_Text)) {
                $kept[] = $token;
            } elseif ($text_allowed && $escape) {
                $kept[] = new HTMLPurifier_Token_Text(
                    $generator->generateFromToken($token)
                );
            }
            // anything else is dropped silently
        }

        if (empty($kept)) return false;
        if ($only_whitespace) {
            $this->whitespace = true;
            return false;
        }
        if ($tokens_of_children == $kept) return true;
        return $kept;
    }
}
/**
 * Definition that allows a set of elements, and allows no children.
 * @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required,
 *       really, one shouldn't inherit from the other.  Only altered behavior
 *       is to overload a returned false with an array.  Thus, it will never
 *       return false.
 */
class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
{
    public $allow_empty = true;
    public $type = 'optional';

    /**
     * Runs the parent validation and replaces a false result: true for
     * empty input, the untouched tokens for all-whitespace input, and an
     * empty array otherwise.
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        $outcome = parent::validateChildren($tokens_of_children, $config, $context);
        if ($outcome !== false) {
            return $outcome;
        }
        // we assume that $tokens_of_children is not modified
        if (empty($tokens_of_children)) {
            return true;
        }
        if ($this->whitespace) {
            return $tokens_of_children;
        }
        return array();
    }
}
/**
 * Takes the contents of blockquote when in strict and reformats for
 * validation: runs of inline content are wrapped in the definition's
 * block wrapper element.
 */
class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required
{
    /** Element lookup actually allowed as direct children. */
    protected $real_elements;
    /** Widened lookup (Flow content set plus #PCDATA) shown to the parent. */
    protected $fake_elements;
    public $allow_empty = true;
    public $type = 'strictblockquote';
    /** Whether init() has already run. */
    protected $init = false;

    /**
     * @note We don't want MakeWellFormed to auto-close inline elements since
     *       they might be allowed.
     */
    public function getAllowedElements($config) {
        $this->init($config);
        return $this->fake_elements;
    }

    /**
     * @return Array of tokens with inline runs wrapped; an empty array
     *         when the parent validation rejects everything.
     */
    public function validateChildren($tokens_of_children, $config, $context) {

        $this->init($config);

        // trick the parent class into thinking it allows more
        $this->elements = $this->fake_elements;
        $result = parent::validateChildren($tokens_of_children, $config, $context);
        $this->elements = $this->real_elements;

        if ($result === false) return array();
        if ($result === true) $result = $tokens_of_children;

        $wrapper    = $config->getHTMLDefinition()->info_block_wrapper;
        $wrap_open  = new HTMLPurifier_Token_Start($wrapper);
        $wrap_close = new HTMLPurifier_Token_End($wrapper);

        $wrapping = false;   // currently inside an opened wrapper?
        $depth    = 0;
        $ret      = array();

        // assuming that there are no comment tokens
        foreach ($result as $token) {
            // wrappers are only opened or closed between direct children
            if (!$depth) {
                $is_text = $token instanceof HTMLPurifier_Token_Text;
                if (!$wrapping) {
                    $needs_wrapper =
                        ($is_text && !$token->is_whitespace) ||
                        (!$is_text && !isset($this->elements[$token->name]));
                    if ($needs_wrapper) {
                        $wrapping = true;
                        $ret[] = $wrap_open;
                    }
                } elseif (
                    ($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) &&
                    isset($this->elements[$token->name])
                ) {
                    // a really allowed element ends the inline run
                    $ret[] = $wrap_close;
                    $wrapping = false;
                }
            }
            $ret[] = $token;
            if ($token instanceof HTMLPurifier_Token_Start) $depth++;
            if ($token instanceof HTMLPurifier_Token_End)   $depth--;
        }
        if ($wrapping) $ret[] = $wrap_close;
        return $ret;
    }

    /**
     * Lazily captures the real element lookup and builds the widened one.
     */
    private function init($config) {
        if ($this->init) {
            return;
        }
        $def = $config->getHTMLDefinition();
        // allow all inline elements
        $this->real_elements = $this->elements;
        $this->fake_elements = $def->info_content_sets['Flow'];
        $this->fake_elements['#PCDATA'] = true;
        $this->init = true;
    }
}
/**
 * Definition for tables
 */
class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
{
    public $allow_empty = false;
    public $type = 'table';
    public $elements = array('tr' => true, 'tbody' => true, 'thead' => true,
        'tfoot' => true, 'caption' => true, 'colgroup' => true, 'col' => true);
    public function __construct() {}

    /**
     * Regroups the direct children of a table into the order caption,
     * cols/colgroups, thead, tfoot, then body content.
     *
     * @return false when there are no children or no tr/tbody content,
     *         true when the regrouped tokens are identical to the input,
     *         otherwise the regrouped array of tokens.
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        if (empty($tokens_of_children)) return false;

        // this ensures that the loop gets run one last time before closing
        // up. It's a little bit of a hack, but it works! Just make sure you
        // get rid of the token later.
        $tokens_of_children[] = false;

        // only one of these elements is allowed in a table
        $caption = false;
        $thead   = false;
        $tfoot   = false;

        // as many of these as you want
        $cols    = array();
        $content = array();

        $nesting = 0; // current depth so we can determine nodes
        $is_collecting = false; // are we globbing together tokens to package
                                // into one of the collectors?
        $collection = array(); // collected nodes
        $tag_index = 0; // the first node might be whitespace,
                        // so this tells us where the start tag is

        foreach ($tokens_of_children as $token) {
            $is_child = ($nesting == 0);

            if ($token === false) {
                // terminating sequence started
            } elseif ($token instanceof HTMLPurifier_Token_Start) {
                $nesting++;
            } elseif ($token instanceof HTMLPurifier_Token_End) {
                $nesting--;
            }

            // handle node collection
            if ($is_collecting) {
                if ($is_child) {
                    // okay, let's stash the tokens away
                    // first token tells us the type of the collection
                    switch ($collection[$tag_index]->name) {
                        case 'tr':
                        case 'tbody':
                            $content[] = $collection;
                            break;
                        case 'caption':
                            if ($caption !== false) break;
                            $caption = $collection;
                            break;
                        case 'thead':
                            if ($thead === false) {
                                $thead = $collection;
                            } else {
                                // a second thead: turn its first and last
                                // tokens into tbody tags and treat it as content
                                $collection[$tag_index]->name = 'tbody';
                                $collection[count($collection)-1]->name = 'tbody';
                                $content[] = $collection;
                            }
                            break;
                        case 'tfoot':
                            if ($tfoot === false) {
                                $tfoot = $collection;
                            } else {
                                // a second tfoot: same treatment as above
                                $collection[$tag_index]->name = 'tbody';
                                $collection[count($collection)-1]->name = 'tbody';
                                $content[] = $collection;
                            }
                            break;
                        case 'colgroup':
                            $cols[] = $collection;
                            break;
                    }
                    $collection = array();
                    $is_collecting = false;
                    $tag_index = 0;
                } else {
                    // add the node to the collection
                    $collection[] = $token;
                }
            }

            // terminate
            if ($token === false) break;

            if ($is_child) {
                // determine what we're dealing with
                if ($token->name == 'col') {
                    // the only empty tag in the possie, we can handle it
                    // immediately
                    $cols[] = array_merge($collection, array($token));
                    $collection = array();
                    $tag_index = 0;
                    continue;
                }
                // NOTE: the branches below end with "break", not
                // "continue". Inside a switch "continue" behaves like
                // "break" anyway, but newer PHP versions warn about it.
                // The switch is the last statement of the loop body, so
                // the next iteration starts either way.
                switch($token->name) {
                    case 'caption':
                    case 'colgroup':
                    case 'thead':
                    case 'tfoot':
                    case 'tbody':
                    case 'tr':
                        $is_collecting = true;
                        $collection[] = $token;
                        break;
                    default:
                        if (!empty($token->is_whitespace)) {
                            $collection[] = $token;
                            $tag_index++;
                        }
                        break;
                }
            }
        }

        if (empty($content)) return false;

        $ret = array();
        if ($caption !== false) $ret = array_merge($ret, $caption);
        if ($cols !== false)    foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
        if ($thead !== false)   $ret = array_merge($ret, $thead);
        if ($tfoot !== false)   $ret = array_merge($ret, $tfoot);
        foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
        if (!empty($collection) && $is_collecting == false){
            // grab the trailing space
            $ret = array_merge($ret, $collection);
        }

        array_pop($tokens_of_children); // remove phantom token

        return ($ret === $tokens_of_children) ? true : $ret;

    }
}
/**
 * Base decorator for definition caches: every cache operation is
 * forwarded unchanged to the wrapped cache object.
 */
class HTMLPurifier_DefinitionCache_Decorator extends HTMLPurifier_DefinitionCache
{

    /**
     * Cache object we are decorating
     */
    public $cache;

    public function __construct() {}

    /**
     * Lazy decorator function
     * @param $cache Reference to cache object to decorate
     * @return A copy of this decorator wrapping $cache and sharing its type
     */
    public function decorate(&$cache) {
        $wrapped = $this->copy();
        // reference is necessary for mocks in PHP 4
        $wrapped->cache =& $cache;
        $wrapped->type  = $cache->type;
        return $wrapped;
    }

    /**
     * Cross-compatible clone substitute
     */
    public function copy() {
        $fresh = new HTMLPurifier_DefinitionCache_Decorator();
        return $fresh;
    }

    public function add($def, $config) {
        $inner = $this->cache;
        return $inner->add($def, $config);
    }

    public function set($def, $config) {
        $inner = $this->cache;
        return $inner->set($def, $config);
    }

    public function replace($def, $config) {
        $inner = $this->cache;
        return $inner->replace($def, $config);
    }

    public function get($config) {
        $inner = $this->cache;
        return $inner->get($config);
    }

    public function remove($config) {
        $inner = $this->cache;
        return $inner->remove($config);
    }

    public function flush($config) {
        $inner = $this->cache;
        return $inner->flush($config);
    }

    public function cleanup($config) {
        $inner = $this->cache;
        return $inner->cleanup($config);
    }

}
/**
 * Null cache object to use when no caching is on. Every operation
 * reports failure by returning false.
 */
class HTMLPurifier_DefinitionCache_Null extends HTMLPurifier_DefinitionCache
{

    public function add($def, $config) {
        $stored = false;
        return $stored;
    }

    public function set($def, $config) {
        $stored = false;
        return $stored;
    }

    public function replace($def, $config) {
        $stored = false;
        return $stored;
    }

    public function remove($config) {
        $removed = false;
        return $removed;
    }

    public function get($config) {
        $found = false;
        return $found;
    }

    public function flush($config) {
        $flushed = false;
        return $flushed;
    }

    public function cleanup($config) {
        $cleaned = false;
        return $cleaned;
    }

}
/**
 * Definition cache that stores each definition as a serialized file
 * (<key>.ser) inside a per-type directory.
 */
class HTMLPurifier_DefinitionCache_Serializer extends
      HTMLPurifier_DefinitionCache
{

    /**
     * Writes the definition only if no cache file exists yet.
     * @return Bytes written, or false if the file exists or cannot be written
     */
    public function add($def, $config) {
        if (!$this->checkDefType($def)) return;
        $file = $this->generateFilePath($config);
        if (file_exists($file)) return false;
        if (!$this->_prepareDir($config)) return false;
        return $this->_write($file, serialize($def), $config);
    }

    /**
     * Writes the definition, overwriting any existing cache file.
     */
    public function set($def, $config) {
        if (!$this->checkDefType($def)) return;
        $file = $this->generateFilePath($config);
        if (!$this->_prepareDir($config)) return false;
        return $this->_write($file, serialize($def), $config);
    }

    /**
     * Writes the definition only if a cache file already exists.
     */
    public function replace($def, $config) {
        if (!$this->checkDefType($def)) return;
        $file = $this->generateFilePath($config);
        if (!file_exists($file)) return false;
        if (!$this->_prepareDir($config)) return false;
        return $this->_write($file, serialize($def), $config);
    }

    /**
     * @return The unserialized definition, or false if no cache file exists
     */
    public function get($config) {
        $file = $this->generateFilePath($config);
        if (!file_exists($file)) return false;
        return unserialize(file_get_contents($file));
    }

    /**
     * Deletes the cache file for this configuration, if there is one.
     */
    public function remove($config) {
        $file = $this->generateFilePath($config);
        if (!file_exists($file)) return false;
        return unlink($file);
    }

    /**
     * Deletes every non-hidden file in this type's cache directory.
     * @return false if the directory cannot be prepared or opened
     */
    public function flush($config) {
        if (!$this->_prepareDir($config)) return false;
        $dir = $this->generateDirectoryPath($config);
        $dh  = opendir($dir);
        // never hand a failed handle to readdir()
        if ($dh === false) return false;
        while (false !== ($filename = readdir($dh))) {
            if (empty($filename)) continue;
            if ($filename[0] === '.') continue;
            unlink($dir . '/' . $filename);
        }
        // release the directory handle instead of leaking it
        closedir($dh);
    }

    /**
     * Deletes the cache files whose key isOld() reports as outdated.
     * @return false if the directory cannot be prepared or opened
     */
    public function cleanup($config) {
        if (!$this->_prepareDir($config)) return false;
        $dir = $this->generateDirectoryPath($config);
        $dh  = opendir($dir);
        // never hand a failed handle to readdir()
        if ($dh === false) return false;
        while (false !== ($filename = readdir($dh))) {
            if (empty($filename)) continue;
            if ($filename[0] === '.') continue;
            // strip the four character ".ser" extension to get the key
            $key = substr($filename, 0, strlen($filename) - 4);
            if ($this->isOld($key, $config)) unlink($dir . '/' . $filename);
        }
        // release the directory handle instead of leaking it
        closedir($dh);
    }

    /**
     * Generates the file path to the serial file corresponding to
     * the configuration and definition name
     * @todo Make protected
     */
    public function generateFilePath($config) {
        $key = $this->generateKey($config);
        return $this->generateDirectoryPath($config) . '/' . $key . '.ser';
    }

    /**
     * Generates the path to the directory contain this cache's serial files
     * @note No trailing slash
     * @todo Make protected
     */
    public function generateDirectoryPath($config) {
        $base = $this->generateBaseDirectoryPath($config);
        return $base . '/' . $this->type;
    }

    /**
     * Generates path to base directory that contains all definition type
     * serials
     * @todo Make protected
     */
    public function generateBaseDirectoryPath($config) {
        $base = $config->get('Cache.SerializerPath');
        $base = is_null($base) ? HTMLPURIFIER_PREFIX . '/HTMLPurifier/DefinitionCache/Serializer' : $base;
        return $base;
    }

    /**
     * Convenience wrapper function for file_put_contents
     * @param $file File name to write to
     * @param $data Data to write into file
     * @param $config Config object
     * @return Number of bytes written if success, or false if failure.
     */
    private function _write($file, $data, $config) {
        $result = file_put_contents($file, $data);
        if ($result !== false) {
            // set permissions of the new file (no execute)
            $chmod = $config->get('Cache.SerializerPermissions');
            if (!$chmod) {
                $chmod = 0644; // invalid config or simpletest
            }
            $chmod = $chmod & 0666;
            chmod($file, $chmod);
        }
        return $result;
    }

    /**
     * Prepares the directory that this type stores the serials in
     * @param $config Config object
     * @return True if successful
     */
    private function _prepareDir($config) {
        $directory = $this->generateDirectoryPath($config);
        $chmod = $config->get('Cache.SerializerPermissions');
        if (!$chmod) {
            $chmod = 0755; // invalid config or simpletest
        }
        if (!is_dir($directory)) {
            $base = $this->generateBaseDirectoryPath($config);
            if (!is_dir($base)) {
                trigger_error('Base directory '.$base.' does not exist,
                    please create or change using %Cache.SerializerPath',
                    E_USER_WARNING);
                return false;
            } elseif (!$this->_testPermissions($base, $chmod)) {
                return false;
            }
            // clear the umask so mkdir applies exactly $chmod, then restore
            $old = umask(0000);
            mkdir($directory, $chmod);
            umask($old);
        } elseif (!$this->_testPermissions($directory, $chmod)) {
            return false;
        }
        return true;
    }

    /**
     * Tests permissions on a directory and throws out friendly
     * error messages and attempts to chmod it itself if possible
     * @param $dir Directory path
     * @param $chmod Permissions
     * @return True if directory writable
     */
    private function _testPermissions($dir, $chmod) {
        // early abort, if it is writable, everything is hunky-dory
        if (is_writable($dir)) return true;
        if (!is_dir($dir)) {
            // generally, you'll want to handle this beforehand
            // so a more specific error message can be given
            trigger_error('Directory '.$dir.' does not exist',
                E_USER_WARNING);
            return false;
        }
        if (function_exists('posix_getuid')) {
            // POSIX system, we can give more specific advice
            if (fileowner($dir) === posix_getuid()) {
                // we can chmod it ourselves
                $chmod = $chmod | 0700;
                if (chmod($dir, $chmod)) return true;
            } elseif (filegroup($dir) === posix_getgid()) {
                $chmod = $chmod | 0070;
            } else {
                // PHP's probably running as nobody, so we'll
                // need to give global permissions
                $chmod = $chmod | 0777;
            }
            trigger_error('Directory '.$dir.' not writable, '.
                'please chmod to ' . decoct($chmod),
                E_USER_WARNING);
        } else {
            // generic error message
            trigger_error('Directory '.$dir.' not writable, '.
                'please alter file permissions',
                E_USER_WARNING);
        }
        return false;
    }

}
/**
 * Definition cache decorator class that cleans up the cache
 * whenever there is a cache miss, i.e. whenever the wrapped operation
 * yields a falsy result.
 */
class HTMLPurifier_DefinitionCache_Decorator_Cleanup extends
      HTMLPurifier_DefinitionCache_Decorator
{

    public $name = 'Cleanup';

    public function copy() {
        return new HTMLPurifier_DefinitionCache_Decorator_Cleanup();
    }

    public function add($def, $config) {
        $outcome = parent::add($def, $config);
        if ($outcome) {
            return $outcome;
        }
        parent::cleanup($config);
        return $outcome;
    }

    public function set($def, $config) {
        $outcome = parent::set($def, $config);
        if ($outcome) {
            return $outcome;
        }
        parent::cleanup($config);
        return $outcome;
    }

    public function replace($def, $config) {
        $outcome = parent::replace($def, $config);
        if ($outcome) {
            return $outcome;
        }
        parent::cleanup($config);
        return $outcome;
    }

    public function get($config) {
        $found = parent::get($config);
        if ($found) {
            return $found;
        }
        parent::cleanup($config);
        return $found;
    }

}
/**
 * Definition cache decorator class that saves all cache retrievals
 * to PHP's memory; good for unit tests or circumstances where
 * there are lots of configuration objects floating around.
 */
class HTMLPurifier_DefinitionCache_Decorator_Memory extends
      HTMLPurifier_DefinitionCache_Decorator
{

    /** Definitions held in memory, indexed by generated key. */
    protected $definitions;
    public $name = 'Memory';

    public function copy() {
        return new HTMLPurifier_DefinitionCache_Decorator_Memory();
    }

    public function add($def, $config) {
        $stored = parent::add($def, $config);
        if ($stored) {
            // remember only what the wrapped cache accepted
            $key = $this->generateKey($config);
            $this->definitions[$key] = $def;
        }
        return $stored;
    }

    public function set($def, $config) {
        $stored = parent::set($def, $config);
        if ($stored) {
            $key = $this->generateKey($config);
            $this->definitions[$key] = $def;
        }
        return $stored;
    }

    public function replace($def, $config) {
        $stored = parent::replace($def, $config);
        if ($stored) {
            $key = $this->generateKey($config);
            $this->definitions[$key] = $def;
        }
        return $stored;
    }

    public function get($config) {
        $key = $this->generateKey($config);
        if (!isset($this->definitions[$key])) {
            // not in memory yet: ask the wrapped cache and keep the answer
            $this->definitions[$key] = parent::get($config);
        }
        return $this->definitions[$key];
    }

}
/**
 * XHTML 1.1 Bi-directional Text Module, defines elements that
 * declare directionality of content. Text Extension Module.
 */
class HTMLPurifier_HTMLModule_Bdo extends HTMLPurifier_HTMLModule
{

    public $name = 'Bdo';
    public $attr_collections = array(
        'I18N' => array('dir' => false)
    );

    /**
     * Registers the bdo element, attaches the transform that supplies its
     * dir attribute, and sets the I18N collection's dir definition.
     */
    public function setup($config) {
        $attributes = array(
            'dir' => 'Enum#ltr,rtl', // required
            // The Abstract Module specification has the attribute
            // inclusions wrong for bdo: bdo allows Lang
        );
        $collections = array('Core', 'Lang');
        $bdo = $this->addElement('bdo', 'Inline', 'Inline', $collections, $attributes);
        $bdo->attr_transform_post['required-dir'] = new HTMLPurifier_AttrTransform_BdoDir();

        $this->attr_collections['I18N']['dir'] = 'Enum#ltr,rtl';
    }

}
/**
 * Module that declares the shared attribute collections Core, Lang,
 * I18N and Common.
 */
class HTMLPurifier_HTMLModule_CommonAttributes extends HTMLPurifier_HTMLModule
{
    public $name = 'CommonAttributes';

    public $attr_collections = array(
        'Core' => array(
            // index 0 lists the collections this one includes
            array('Style'),
            // 'xml:space' => false,
            'class' => 'Class',
            'id'    => 'ID',
            'title' => 'CDATA',
        ),
        'Lang' => array(),
        'I18N' => array(
            array('Lang'), // proprietary, for xml:lang/lang
        ),
        'Common' => array(
            array('Core', 'I18N')
        )
    );

}
/**
 * XHTML 1.1 Edit Module, defines editing-related elements. Text Extension
 * Module.
 */
class HTMLPurifier_HTMLModule_Edit extends HTMLPurifier_HTMLModule
{

    public $name = 'Edit';

    // HTML 4.01 specifies that ins/del must not contain block
    // elements when used in an inline context, chameleon is
    // a complicated workaround to achieve this effect
    public $defines_child_def = true;

    /**
     * Registers del and ins with a chameleon content model.
     */
    public function setup($config) {
        // Inline context ! Block context (exclamation mark is
        // separator, see getChildDef for parsing)
        $contents = 'Chameleon: #PCDATA | Inline ! #PCDATA | Flow';
        $attr = array(
            'cite' => 'URI',
            // 'datetime' => 'Datetime', // not implemented
        );
        foreach (array('del', 'ins') as $element) {
            $this->addElement($element, 'Inline', $contents, 'Common', $attr);
        }
    }

    /**
     * @param $def Element definition whose content model is being resolved
     * @return A chameleon child definition built from the two halves of
     *         the content model, or false for any other content model type
     */
    public function getChildDef($def) {
        if ($def->content_model_type != 'chameleon') {
            return false;
        }
        $halves = explode('!', $def->content_model);
        return new HTMLPurifier_ChildDef_Chameleon($halves[0], $halves[1]);
    }

}
11973 * XHTML 1.1 Forms module, defines all form-related elements found in HTML 4.
11975 class HTMLPurifier_HTMLModule_Forms extends HTMLPurifier_HTMLModule
11977 public $name = 'Forms';
11978 public $safe = false;
11980 public $content_sets = array(
11982 'Inline' => 'Formctrl',
11985 public function setup($config) {
11986 $form = $this->addElement('form', 'Form',
11987 'Required: Heading | List | Block | fieldset', 'Common', array(
11988 'accept' => 'ContentTypes',
11989 'accept-charset' => 'Charsets',
11990 'action*' => 'URI',
11991 'method' => 'Enum#get,post',
11992 // really ContentType, but these two are the only ones used today
11993 'enctype' => 'Enum#application/x-www-form-urlencoded,multipart/form-data',
11995 $form->excludes = array('form' => true);
11997 $input = $this->addElement('input', 'Formctrl', 'Empty', 'Common', array(
11998 'accept' => 'ContentTypes',
11999 'accesskey' => 'Character',
12001 'checked' => 'Bool#checked',
12002 'disabled' => 'Bool#disabled',
12003 'maxlength' => 'Number',
12005 'readonly' => 'Bool#readonly',
12006 'size' => 'Number',
12007 'src' => 'URI#embeds',
12008 'tabindex' => 'Number',
12009 'type' => 'Enum#text,password,checkbox,button,radio,submit,reset,file,hidden,image',
12010 'value' => 'CDATA',
12012 $input->attr_transform_post[] = new HTMLPurifier_AttrTransform_Input();
12014 $this->addElement('select', 'Formctrl', 'Required: optgroup | option', 'Common', array(
12015 'disabled' => 'Bool#disabled',
12016 'multiple' => 'Bool#multiple',
12018 'size' => 'Number',
12019 'tabindex' => 'Number',
12022 $this->addElement('option', false, 'Optional: #PCDATA', 'Common', array(
12023 'disabled' => 'Bool#disabled',
12025 'selected' => 'Bool#selected',
12026 'value' => 'CDATA',
12028 // It's illegal for there to be more than one selected, but not
12029 // be multiple. Also, no selected means undefined behavior. This might
12030 // be difficult to implement; perhaps an injector, or a context variable.
12032 $textarea = $this->addElement('textarea', 'Formctrl', 'Optional: #PCDATA', 'Common', array(
12033 'accesskey' => 'Character',
12034 'cols*' => 'Number',
12035 'disabled' => 'Bool#disabled',
12037 'readonly' => 'Bool#readonly',
12038 'rows*' => 'Number',
12039 'tabindex' => 'Number',
12041 $textarea->attr_transform_pre[] = new HTMLPurifier_AttrTransform_Textarea();
12043 $button = $this->addElement('button', 'Formctrl', 'Optional: #PCDATA | Heading | List | Block | Inline', 'Common', array(
12044 'accesskey' => 'Character',
12045 'disabled' => 'Bool#disabled',
12047 'tabindex' => 'Number',
12048 'type' => 'Enum#button,submit,reset',
12049 'value' => 'CDATA',
12052 // For exclusions, ideally we'd specify content sets, not literal elements
12053 $button->excludes = $this->makeLookup(
12054 'form', 'fieldset', // Form
12055 'input', 'select', 'textarea', 'label', 'button', // Formctrl
12056 'a' // as per HTML 4.01 spec, this is omitted by modularization
12059 // Extra exclusion: img usemap="" is not permitted within this element.
12060 // We'll omit this for now, since we don't have any good way of
12061 // indicating it yet.
12063 // This is HIGHLY user-unfriendly; we need a custom child-def for this
12064 $this->addElement('fieldset', 'Form', 'Custom: (#WS?,legend,(Flow|#PCDATA)*)', 'Common');
12066 $label = $this->addElement('label', 'Formctrl', 'Optional: #PCDATA | Inline', 'Common', array(
12067 'accesskey' => 'Character',
12068 // 'for' => 'IDREF', // IDREF not implemented, cannot allow
12070 $label->excludes = array('label' => true);
12072 $this->addElement('legend', false, 'Optional: #PCDATA | Inline', 'Common', array(
12073 'accesskey' => 'Character',
12076 $this->addElement('optgroup', false, 'Required: option', 'Common', array(
12077 'disabled' => 'Bool#disabled',
12078 'label*' => 'Text',
12081 // Don't forget an injector for <isindex>. This one's a little complex
12082 // because it maps to multiple elements.
12092 * XHTML 1.1 Hypertext Module, defines hypertext links. Core Module.
12094 class HTMLPurifier_HTMLModule_Hypertext extends HTMLPurifier_HTMLModule
12097 public $name = 'Hypertext';
12099 public function setup($config) {
12100 $a = $this->addElement(
12101 'a', 'Inline', 'Inline', 'Common',
12103 // 'accesskey' => 'Character',
12104 // 'charset' => 'Charset',
12106 // 'hreflang' => 'LanguageCode',
12107 'rel' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rel'),
12108 'rev' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rev'),
12109 // 'tabindex' => 'Number',
12110 // 'type' => 'ContentType',
12113 $a->formatting = true;
12114 $a->excludes = array('a' => true);
12124 * XHTML 1.1 Image Module provides basic image embedding.
12125 * @note There is specialized code for removing empty images in
12126 * HTMLPurifier_Strategy_RemoveForeignElements
12128 class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
12131 public $name = 'Image';
12133 public function setup($config) {
12134 $max = $config->get('HTML.MaxImgLength');
12135 $img = $this->addElement(
12136 'img', 'Inline', 'Empty', 'Common',
12139 // According to the spec, it's Length, but percents can
12140 // be abused, so we allow only Pixels.
12141 'height' => 'Pixels#' . $max,
12142 'width' => 'Pixels#' . $max,
12143 'longdesc' => 'URI',
12144 'src*' => new HTMLPurifier_AttrDef_URI(true), // embedded
12147 if ($max === null || $config->get('HTML.Trusted')) {
12148 $img->attr['height'] =
12149 $img->attr['width'] = 'Length';
12152 // kind of strange, but splitting things up would be inefficient
12153 $img->attr_transform_pre[] =
12154 $img->attr_transform_post[] =
12155 new HTMLPurifier_AttrTransform_ImgRequired();
12165 * XHTML 1.1 Legacy module defines elements that were previously
12168 * @note Not all legacy elements have been implemented yet, which
12169 * is a bit of a reverse problem as compared to browsers! In
12170 * addition, this legacy module may implement a bit more than
12171 * mandated by XHTML 1.1.
12173 * This module can be used in combination with TransformToStrict in order
12174 * to transform as many deprecated elements as possible, but retain
12175 * questionably deprecated elements that do not have good alternatives
12176 * as well as transform elements that don't have an implementation.
12177 * See docs/ref-strictness.txt for more details.
12180 class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule
12183 public $name = 'Legacy';
12185 public function setup($config) {
12187 $this->addElement('basefont', 'Inline', 'Empty', false, array(
12188 'color' => 'Color',
12189 'face' => 'Text', // extremely broad, we should
12190 'size' => 'Text', // tighten it
12193 $this->addElement('center', 'Block', 'Flow', 'Common');
12194 $this->addElement('dir', 'Block', 'Required: li', 'Common', array(
12195 'compact' => 'Bool#compact'
12197 $this->addElement('font', 'Inline', 'Inline', array('Core', 'I18N'), array(
12198 'color' => 'Color',
12199 'face' => 'Text', // extremely broad, we should
12200 'size' => 'Text', // tighten it
12202 $this->addElement('menu', 'Block', 'Required: li', 'Common', array(
12203 'compact' => 'Bool#compact'
12206 $s = $this->addElement('s', 'Inline', 'Inline', 'Common');
12207 $s->formatting = true;
12209 $strike = $this->addElement('strike', 'Inline', 'Inline', 'Common');
12210 $strike->formatting = true;
12212 $u = $this->addElement('u', 'Inline', 'Inline', 'Common');
12213 $u->formatting = true;
12215 // setup modifications to old elements
12217 $align = 'Enum#left,right,center,justify';
12219 $address = $this->addBlankElement('address');
12220 $address->content_model = 'Inline | #PCDATA | p';
12221 $address->content_model_type = 'optional';
12222 $address->child = false;
12224 $blockquote = $this->addBlankElement('blockquote');
12225 $blockquote->content_model = 'Flow | #PCDATA';
12226 $blockquote->content_model_type = 'optional';
12227 $blockquote->child = false;
12229 $br = $this->addBlankElement('br');
12230 $br->attr['clear'] = 'Enum#left,all,right,none';
12232 $caption = $this->addBlankElement('caption');
12233 $caption->attr['align'] = 'Enum#top,bottom,left,right';
12235 $div = $this->addBlankElement('div');
12236 $div->attr['align'] = $align;
12238 $dl = $this->addBlankElement('dl');
12239 $dl->attr['compact'] = 'Bool#compact';
12241 for ($i = 1; $i <= 6; $i++) {
12242 $h = $this->addBlankElement("h$i");
12243 $h->attr['align'] = $align;
12246 $hr = $this->addBlankElement('hr');
12247 $hr->attr['align'] = $align;
12248 $hr->attr['noshade'] = 'Bool#noshade';
12249 $hr->attr['size'] = 'Pixels';
12250 $hr->attr['width'] = 'Length';
12252 $img = $this->addBlankElement('img');
12253 $img->attr['align'] = 'Enum#top,middle,bottom,left,right';
12254 $img->attr['border'] = 'Pixels';
12255 $img->attr['hspace'] = 'Pixels';
12256 $img->attr['vspace'] = 'Pixels';
12258 // figure out this integer business
12260 $li = $this->addBlankElement('li');
12261 $li->attr['value'] = new HTMLPurifier_AttrDef_Integer();
12262 $li->attr['type'] = 'Enum#s:1,i,I,a,A,disc,square,circle';
12264 $ol = $this->addBlankElement('ol');
12265 $ol->attr['compact'] = 'Bool#compact';
12266 $ol->attr['start'] = new HTMLPurifier_AttrDef_Integer();
12267 $ol->attr['type'] = 'Enum#s:1,i,I,a,A';
12269 $p = $this->addBlankElement('p');
12270 $p->attr['align'] = $align;
12272 $pre = $this->addBlankElement('pre');
12273 $pre->attr['width'] = 'Number';
12277 $table = $this->addBlankElement('table');
12278 $table->attr['align'] = 'Enum#left,center,right';
12279 $table->attr['bgcolor'] = 'Color';
12281 $tr = $this->addBlankElement('tr');
12282 $tr->attr['bgcolor'] = 'Color';
12284 $th = $this->addBlankElement('th');
12285 $th->attr['bgcolor'] = 'Color';
12286 $th->attr['height'] = 'Length';
12287 $th->attr['nowrap'] = 'Bool#nowrap';
12288 $th->attr['width'] = 'Length';
12290 $td = $this->addBlankElement('td');
12291 $td->attr['bgcolor'] = 'Color';
12292 $td->attr['height'] = 'Length';
12293 $td->attr['nowrap'] = 'Bool#nowrap';
12294 $td->attr['width'] = 'Length';
12296 $ul = $this->addBlankElement('ul');
12297 $ul->attr['compact'] = 'Bool#compact';
12298 $ul->attr['type'] = 'Enum#square,disc,circle';
12309 * XHTML 1.1 List Module, defines list-oriented elements. Core Module.
12311 class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
12314 public $name = 'List';
12316 // According to the abstract schema, the List content set is a fully formed
12317 // one or more expr, but it invariably occurs in an optional declaration
12318 // so we're not going to do that subtlety. It might cause trouble
12319 // if a user defines "List" and expects that multiple lists are
12320 // allowed to be specified, but then again, that's not very intuitive.
12321 // Furthermore, the actual XML Schema may disagree. Regardless,
12322 // we don't have support for such nested expressions without using
12323 // the incredibly inefficient and draconian Custom ChildDef.
12325 public $content_sets = array('Flow' => 'List');
12327 public function setup($config) {
12328 $ol = $this->addElement('ol', 'List', 'Required: li', 'Common');
12330 $ul = $this->addElement('ul', 'List', 'Required: li', 'Common');
12332 $this->addElement('dl', 'List', 'Required: dt | dd', 'Common');
12334 $this->addElement('li', false, 'Flow', 'Common');
12336 $this->addElement('dd', false, 'Flow', 'Common');
12337 $this->addElement('dt', false, 'Inline', 'Common');
12346 class HTMLPurifier_HTMLModule_Name extends HTMLPurifier_HTMLModule
12349 public $name = 'Name';
12351 public function setup($config) {
12352 $elements = array('a', 'applet', 'form', 'frame', 'iframe', 'img', 'map');
12353 foreach ($elements as $name) {
12354 $element = $this->addBlankElement($name);
12355 $element->attr['name'] = 'CDATA';
12356 if (!$config->get('HTML.Attr.Name.UseCDATA')) {
12357 $element->attr_transform_post['NameSync'] = new HTMLPurifier_AttrTransform_NameSync();
12369 * Module adds the nofollow attribute transformation to <a> tags. It
12370 * is enabled by HTML.Nofollow
12372 class HTMLPurifier_HTMLModule_Nofollow extends HTMLPurifier_HTMLModule
12375 public $name = 'Nofollow';
12377 public function setup($config) {
12378 $a = $this->addBlankElement('a');
12379 $a->attr_transform_post[] = new HTMLPurifier_AttrTransform_Nofollow();
12388 class HTMLPurifier_HTMLModule_NonXMLCommonAttributes extends HTMLPurifier_HTMLModule
12390 public $name = 'NonXMLCommonAttributes';
12392 public $attr_collections = array(
12394 'lang' => 'LanguageCode',
12404 * XHTML 1.1 Object Module, defines elements for generic object inclusion
12405 * @warning Users will commonly use <embed> to cater to legacy browsers: this
12406 * module does not allow this sort of behavior
12408 class HTMLPurifier_HTMLModule_Object extends HTMLPurifier_HTMLModule
12411 public $name = 'Object';
12412 public $safe = false;
12414 public function setup($config) {
12416 $this->addElement('object', 'Inline', 'Optional: #PCDATA | Flow | param', 'Common',
12418 'archive' => 'URI',
12419 'classid' => 'URI',
12420 'codebase' => 'URI',
12421 'codetype' => 'Text',
12423 'declare' => 'Bool#declare',
12424 'height' => 'Length',
12426 'standby' => 'Text',
12427 'tabindex' => 'Number',
12428 'type' => 'ContentType',
12429 'width' => 'Length'
12433 $this->addElement('param', false, 'Empty', false,
12439 'valuetype' => 'Enum#data,ref,object'
12452 * XHTML 1.1 Presentation Module, defines simple presentation-related
12453 * markup. Text Extension Module.
12454 * @note The official XML Schema and DTD specs further divide this into
12456 * - Block Presentation (hr)
12457 * - Inline Presentation (b, big, i, small, sub, sup, tt)
12458 * We have chosen not to heed this distinction, as content_sets
12459 * provides satisfactory disambiguation.
12461 class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule
12464 public $name = 'Presentation';
12466 public function setup($config) {
12467 $this->addElement('hr', 'Block', 'Empty', 'Common');
12468 $this->addElement('sub', 'Inline', 'Inline', 'Common');
12469 $this->addElement('sup', 'Inline', 'Inline', 'Common');
12470 $b = $this->addElement('b', 'Inline', 'Inline', 'Common');
12471 $b->formatting = true;
12472 $big = $this->addElement('big', 'Inline', 'Inline', 'Common');
12473 $big->formatting = true;
12474 $i = $this->addElement('i', 'Inline', 'Inline', 'Common');
12475 $i->formatting = true;
12476 $small = $this->addElement('small', 'Inline', 'Inline', 'Common');
12477 $small->formatting = true;
12478 $tt = $this->addElement('tt', 'Inline', 'Inline', 'Common');
12479 $tt->formatting = true;
12489 * Module defines proprietary tags and attributes in HTML.
12490 * @warning If this module is enabled, standards-compliance is off!
12492 class HTMLPurifier_HTMLModule_Proprietary extends HTMLPurifier_HTMLModule
12495 public $name = 'Proprietary';
12497 public function setup($config) {
12499 $this->addElement('marquee', 'Inline', 'Flow', 'Common',
12501 'direction' => 'Enum#left,right,up,down',
12502 'behavior' => 'Enum#alternate',
12503 'width' => 'Length',
12504 'height' => 'Length',
12505 'scrolldelay' => 'Number',
12506 'scrollamount' => 'Number',
12507 'loop' => 'Number',
12508 'bgcolor' => 'Color',
12509 'hspace' => 'Pixels',
12510 'vspace' => 'Pixels',
12523 * XHTML 1.1 Ruby Annotation Module, defines elements that indicate
12524 * short runs of text alongside base text for annotation or pronunciation.
12526 class HTMLPurifier_HTMLModule_Ruby extends HTMLPurifier_HTMLModule
12529 public $name = 'Ruby';
12531 public function setup($config) {
12532 $this->addElement('ruby', 'Inline',
12533 'Custom: ((rb, (rt | (rp, rt, rp))) | (rbc, rtc, rtc?))',
12535 $this->addElement('rbc', false, 'Required: rb', 'Common');
12536 $this->addElement('rtc', false, 'Required: rt', 'Common');
12537 $rb = $this->addElement('rb', false, 'Inline', 'Common');
12538 $rb->excludes = array('ruby' => true);
12539 $rt = $this->addElement('rt', false, 'Inline', 'Common', array('rbspan' => 'Number'));
12540 $rt->excludes = array('ruby' => true);
12541 $this->addElement('rp', false, 'Optional: #PCDATA', 'Common');
12551 * A "safe" embed module. See SafeObject. This is a proprietary element.
12553 class HTMLPurifier_HTMLModule_SafeEmbed extends HTMLPurifier_HTMLModule
12556 public $name = 'SafeEmbed';
12558 public function setup($config) {
12560 $max = $config->get('HTML.MaxImgLength');
12561 $embed = $this->addElement(
12562 'embed', 'Inline', 'Empty', 'Common',
12564 'src*' => 'URI#embedded',
12565 'type' => 'Enum#application/x-shockwave-flash',
12566 'width' => 'Pixels#' . $max,
12567 'height' => 'Pixels#' . $max,
12568 'allowscriptaccess' => 'Enum#never',
12569 'allownetworking' => 'Enum#internal',
12570 'flashvars' => 'Text',
12571 'wmode' => 'Enum#window,transparent,opaque',
12575 $embed->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeEmbed();
12586 * A "safe" object module. In theory, objects permitted by this module will
12587 * be safe, and untrusted users can be allowed to embed arbitrary flash objects
12588 * (maybe other types too, but only Flash is supported as of right now).
12589 * Highly experimental.
12591 class HTMLPurifier_HTMLModule_SafeObject extends HTMLPurifier_HTMLModule
12594 public $name = 'SafeObject';
12596 public function setup($config) {
12598 // These definitions are not intrinsically safe: the attribute transforms
12599 // are a vital part of ensuring safety.
12601 $max = $config->get('HTML.MaxImgLength');
12602 $object = $this->addElement(
12605 'Optional: param | Flow | #PCDATA',
12608 // While technically not required by the spec, we're forcing
12609 // it to this value.
12610 'type' => 'Enum#application/x-shockwave-flash',
12611 'width' => 'Pixels#' . $max,
12612 'height' => 'Pixels#' . $max,
12613 'data' => 'URI#embedded',
12614 'codebase' => new HTMLPurifier_AttrDef_Enum(array(
12615 'http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,40,0')),
12618 $object->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeObject();
12620 $param = $this->addElement('param', false, 'Empty', false,
12627 $param->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeParam();
12628 $this->info_injector[] = 'SafeObject';
12640 WARNING: THIS MODULE IS EXTREMELY DANGEROUS AS IT ENABLES INLINE SCRIPTING
12641 INSIDE HTML PURIFIER DOCUMENTS. USE ONLY WITH TRUSTED USER INPUT!!!
12646 * XHTML 1.1 Scripting module, defines elements that are used to contain
12647 * information pertaining to executable scripts or the lack of support
12648 * for executable scripts.
12649 * @note This module does not contain inline scripting elements
12651 class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule
12653 public $name = 'Scripting';
12654 public $elements = array('script', 'noscript');
12655 public $content_sets = array('Block' => 'script | noscript', 'Inline' => 'script | noscript');
12656 public $safe = false;
12658 public function setup($config) {
12659 // TODO: create custom child-definition for noscript that
12660 // auto-wraps stray #PCDATA in a similar manner to
12661 // blockquote's custom definition (we would use it but
12662 // blockquote's contents are optional while noscript's contents
12665 // TODO: convert this to new syntax, main problem is getting
12666 // both content sets working
12668 // In theory, this could be safe, but I don't see any reason to
12670 $this->info['noscript'] = new HTMLPurifier_ElementDef();
12671 $this->info['noscript']->attr = array( 0 => array('Common') );
12672 $this->info['noscript']->content_model = 'Heading | List | Block';
12673 $this->info['noscript']->content_model_type = 'required';
12675 $this->info['script'] = new HTMLPurifier_ElementDef();
12676 $this->info['script']->attr = array(
12677 'defer' => new HTMLPurifier_AttrDef_Enum(array('defer')),
12678 'src' => new HTMLPurifier_AttrDef_URI(true),
12679 'type' => new HTMLPurifier_AttrDef_Enum(array('text/javascript'))
12681 $this->info['script']->content_model = '#PCDATA';
12682 $this->info['script']->content_model_type = 'optional';
12683 $this->info['script']->attr_transform_pre['type'] =
12684 $this->info['script']->attr_transform_post['type'] =
12685 new HTMLPurifier_AttrTransform_ScriptRequired();
12694 * XHTML 1.1 Style Attribute Module, defines the style attribute via the 'Style' attribute collection.
12697 class HTMLPurifier_HTMLModule_StyleAttribute extends HTMLPurifier_HTMLModule
12700 public $name = 'StyleAttribute';
12701 public $attr_collections = array(
12702 // The inclusion routine differs from the Abstract Modules but
12703 // is in line with the DTD and XML Schemas.
12704 'Style' => array('style' => false), // see constructor
12705 'Core' => array(0 => array('Style'))
12708 public function setup($config) {
12709 $this->attr_collections['Style']['style'] = new HTMLPurifier_AttrDef_CSS();
12719 * XHTML 1.1 Tables Module, fully defines accessible table elements.
12721 class HTMLPurifier_HTMLModule_Tables extends HTMLPurifier_HTMLModule
12724 public $name = 'Tables';
12726 public function setup($config) {
12728 $this->addElement('caption', false, 'Inline', 'Common');
12730 $this->addElement('table', 'Block',
12731 new HTMLPurifier_ChildDef_Table(), 'Common',
12733 'border' => 'Pixels',
12734 'cellpadding' => 'Length',
12735 'cellspacing' => 'Length',
12736 'frame' => 'Enum#void,above,below,hsides,lhs,rhs,vsides,box,border',
12737 'rules' => 'Enum#none,groups,rows,cols,all',
12738 'summary' => 'Text',
12739 'width' => 'Length'
12743 // common attributes
12744 $cell_align = array(
12745 'align' => 'Enum#left,center,right,justify,char',
12746 'charoff' => 'Length',
12747 'valign' => 'Enum#top,middle,bottom,baseline',
12750 $cell_t = array_merge(
12753 'colspan' => 'Number',
12754 'rowspan' => 'Number',
12758 $this->addElement('td', false, 'Flow', 'Common', $cell_t);
12759 $this->addElement('th', false, 'Flow', 'Common', $cell_t);
12761 $this->addElement('tr', false, 'Required: td | th', 'Common', $cell_align);
12763 $cell_col = array_merge(
12765 'span' => 'Number',
12766 'width' => 'MultiLength',
12770 $this->addElement('col', false, 'Empty', 'Common', $cell_col);
12771 $this->addElement('colgroup', false, 'Optional: col', 'Common', $cell_col);
12773 $this->addElement('tbody', false, 'Required: tr', 'Common', $cell_align);
12774 $this->addElement('thead', false, 'Required: tr', 'Common', $cell_align);
12775 $this->addElement('tfoot', false, 'Required: tr', 'Common', $cell_align);
12786 * XHTML 1.1 Target Module, defines target attribute in link elements.
12788 class HTMLPurifier_HTMLModule_Target extends HTMLPurifier_HTMLModule
12791 public $name = 'Target';
12793 public function setup($config) {
12794 $elements = array('a');
12795 foreach ($elements as $name) {
12796 $e = $this->addBlankElement($name);
12798 'target' => new HTMLPurifier_AttrDef_HTML_FrameTarget()
12810 * XHTML 1.1 Text Module, defines basic text containers. Core Module.
12811 * @note In the normative XML Schema specification, this module
12812 * is further abstracted into the following modules:
12813 * - Block Phrasal (address, blockquote, pre, h1, h2, h3, h4, h5, h6)
12814 * - Block Structural (div, p)
12815 * - Inline Phrasal (abbr, acronym, cite, code, dfn, em, kbd, q, samp, strong, var)
12816 * - Inline Structural (br, span)
12817 * This module, functionally, does not distinguish between these
12818 * sub-modules, but the code is internally structured to reflect
12819 * these distinctions.
12821 class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule
12824 public $name = 'Text';
12825 public $content_sets = array(
12826 'Flow' => 'Heading | Block | Inline'
12829 public function setup($config) {
12831 // Inline Phrasal -------------------------------------------------
12832 $this->addElement('abbr', 'Inline', 'Inline', 'Common');
12833 $this->addElement('acronym', 'Inline', 'Inline', 'Common');
12834 $this->addElement('cite', 'Inline', 'Inline', 'Common');
12835 $this->addElement('dfn', 'Inline', 'Inline', 'Common');
12836 $this->addElement('kbd', 'Inline', 'Inline', 'Common');
12837 $this->addElement('q', 'Inline', 'Inline', 'Common', array('cite' => 'URI'));
12838 $this->addElement('samp', 'Inline', 'Inline', 'Common');
12839 $this->addElement('var', 'Inline', 'Inline', 'Common');
12841 $em = $this->addElement('em', 'Inline', 'Inline', 'Common');
12842 $em->formatting = true;
12844 $strong = $this->addElement('strong', 'Inline', 'Inline', 'Common');
12845 $strong->formatting = true;
12847 $code = $this->addElement('code', 'Inline', 'Inline', 'Common');
12848 $code->formatting = true;
12850 // Inline Structural ----------------------------------------------
12851 $this->addElement('span', 'Inline', 'Inline', 'Common');
12852 $this->addElement('br', 'Inline', 'Empty', 'Core');
12854 // Block Phrasal --------------------------------------------------
12855 $this->addElement('address', 'Block', 'Inline', 'Common');
12856 $this->addElement('blockquote', 'Block', 'Optional: Heading | Block | List', 'Common', array('cite' => 'URI') );
12857 $pre = $this->addElement('pre', 'Block', 'Inline', 'Common');
12858 $pre->excludes = $this->makeLookup(
12859 'img', 'big', 'small', 'object', 'applet', 'font', 'basefont' );
12860 $this->addElement('h1', 'Heading', 'Inline', 'Common');
12861 $this->addElement('h2', 'Heading', 'Inline', 'Common');
12862 $this->addElement('h3', 'Heading', 'Inline', 'Common');
12863 $this->addElement('h4', 'Heading', 'Inline', 'Common');
12864 $this->addElement('h5', 'Heading', 'Inline', 'Common');
12865 $this->addElement('h6', 'Heading', 'Inline', 'Common');
12867 // Block Structural -----------------------------------------------
12868 $p = $this->addElement('p', 'Block', 'Inline', 'Common');
12869 $p->autoclose = array_flip(array("address", "blockquote", "center", "dir", "div", "dl", "fieldset", "ol", "p", "ul"));
12871 $this->addElement('div', 'Block', 'Flow', 'Common');
12882 * Abstract class for a set of proprietary modules that clean up (tidy)
12883 * poorly written HTML.
12884 * @todo Figure out how to protect some of these methods/properties
12886 class HTMLPurifier_HTMLModule_Tidy extends HTMLPurifier_HTMLModule
12890 * List of supported levels. Index zero is a special case "no fixes"
12893 public $levels = array(0 => 'none', 'light', 'medium', 'heavy');
12896 * Default level to place all fixes in. Disabled by default
12898 public $defaultLevel = null;
12901 * Lists of fixes used by getFixesForLevel(). Format is:
12902 * HTMLModule_Tidy->fixesForLevel[$level] = array('fix-1', 'fix-2');
12904 public $fixesForLevel = array(
12905 'light' => array(),
12906 'medium' => array(),
12911 * Lazy load constructs the module by determining the necessary
12912 * fixes to create and then delegating to the populate() function.
12913 * @todo Wildcard matching and error reporting when an added or
12914 * subtracted fix has no effect.
12916 public function setup($config) {
12918 // create fixes, initialize fixesForLevel
12919 $fixes = $this->makeFixes();
12920 $this->makeFixesForLevel($fixes);
12922 // figure out which fixes to use
12923 $level = $config->get('HTML.TidyLevel');
12924 $fixes_lookup = $this->getFixesForLevel($level);
12926 // get custom fix declarations: these need namespace processing
12927 $add_fixes = $config->get('HTML.TidyAdd');
12928 $remove_fixes = $config->get('HTML.TidyRemove');
12930 foreach ($fixes as $name => $fix) {
12931 // needs to be refactored a little to implement globbing
12933 isset($remove_fixes[$name]) ||
12934 (!isset($add_fixes[$name]) && !isset($fixes_lookup[$name]))
12936 unset($fixes[$name]);
12940 // populate this module with necessary fixes
12941 $this->populate($fixes);
12946 * Retrieves all fixes per a level, returning fixes for that specific
12947 * level as well as all levels below it.
12948 * @param $level String level identifier, see $levels for valid values
12949 * @return Lookup table of fixes
12951 public function getFixesForLevel($level) {
12952 if ($level == $this->levels[0]) {
12955 $activated_levels = array();
12956 for ($i = 1, $c = count($this->levels); $i < $c; $i++) {
12957 $activated_levels[] = $this->levels[$i];
12958 if ($this->levels[$i] == $level) break;
12962 'Tidy level ' . htmlspecialchars($level) . ' not recognized',
12968 foreach ($activated_levels as $level) {
12969 foreach ($this->fixesForLevel[$level] as $fix) {
12977 * Dynamically populates the $fixesForLevel member variable using
12978 * the fixes array. It may be custom overloaded, used in conjunction
12979 * with $defaultLevel, or not used at all.
12981 public function makeFixesForLevel($fixes) {
12982 if (!isset($this->defaultLevel)) return;
12983 if (!isset($this->fixesForLevel[$this->defaultLevel])) {
12985 'Default level ' . $this->defaultLevel . ' does not exist',
12990 $this->fixesForLevel[$this->defaultLevel] = array_keys($fixes);
12994 * Populates the module with transforms and other special-case code
12995 * based on a list of fixes passed to it
12996 * @param $fixes Lookup table of fixes to activate
12998 public function populate($fixes) {
12999 foreach ($fixes as $name => $fix) {
13000 // determine what the fix is for
13001 list($type, $params) = $this->getFixType($name);
13003 case 'attr_transform_pre':
13004 case 'attr_transform_post':
13005 $attr = $params['attr'];
13006 if (isset($params['element'])) {
13007 $element = $params['element'];
13008 if (empty($this->info[$element])) {
13009 $e = $this->addBlankElement($element);
13011 $e = $this->info[$element];
13014 $type = "info_$type";
13017 // PHP does some weird parsing when I do
13018 // $e->$type[$attr], so I have to assign a ref.
13022 case 'tag_transform':
13023 $this->info_tag_transform[$params['element']] = $fix;
13026 case 'content_model_type':
13027 $element = $params['element'];
13028 if (empty($this->info[$element])) {
13029 $e = $this->addBlankElement($element);
13031 $e = $this->info[$element];
13036 trigger_error("Fix type $type not supported", E_USER_ERROR);
13043 * Parses a fix name and determines what kind of fix it is, as well
13044 * as other information defined by the fix
13045 * @param $name String name of fix
13046 * @return array(string $fix_type, array $fix_parameters)
13047 * @note $fix_parameters is type dependent, see populate() for usage
13048 * of these parameters
13050 public function getFixType($name) {
13052 $property = $attr = null;
13053 if (strpos($name, '#') !== false) list($name, $property) = explode('#', $name);
13054 if (strpos($name, '@') !== false) list($name, $attr) = explode('@', $name);
13056 // figure out the parameters
13058 if ($name !== '') $params['element'] = $name;
13059 if (!is_null($attr)) $params['attr'] = $attr;
13061 // special case: attribute transform
13062 if (!is_null($attr)) {
13063 if (is_null($property)) $property = 'pre';
13064 $type = 'attr_transform_' . $property;
13065 return array($type, $params);
13068 // special case: tag transform
13069 if (is_null($property)) {
13070 return array('tag_transform', $params);
13073 return array($property, $params);
13078 * Defines all fixes the module will perform in a compact
13079 * associative array of fix name to fix implementation.
13081 public function makeFixes() {}
13089 class HTMLPurifier_HTMLModule_XMLCommonAttributes extends HTMLPurifier_HTMLModule
13091 public $name = 'XMLCommonAttributes';
13093 public $attr_collections = array(
13095 'xml:lang' => 'LanguageCode',
13105 * Name is deprecated, but allowed in strict doctypes, so it is only transformed at the 'heavy' tidy level.
13107 class HTMLPurifier_HTMLModule_Tidy_Name extends HTMLPurifier_HTMLModule_Tidy
13109 public $name = 'Tidy_Name';
13110 public $defaultLevel = 'heavy';
13111 public function makeFixes() {
13115 // @name for img, a -----------------------------------------------
13116 // Technically, it's allowed even on strict, so we allow authors to use
13117 // it. However, it's deprecated in future versions of XHTML.
13119 $r['a@name'] = new HTMLPurifier_AttrTransform_Name();
13129 class HTMLPurifier_HTMLModule_Tidy_Proprietary extends HTMLPurifier_HTMLModule_Tidy
13132 public $name = 'Tidy_Proprietary';
13133 public $defaultLevel = 'light';
13135 public function makeFixes() {
13137 $r['table@background'] = new HTMLPurifier_AttrTransform_Background();
13138 $r['td@background'] = new HTMLPurifier_AttrTransform_Background();
13139 $r['th@background'] = new HTMLPurifier_AttrTransform_Background();
13140 $r['tr@background'] = new HTMLPurifier_AttrTransform_Background();
13141 $r['thead@background'] = new HTMLPurifier_AttrTransform_Background();
13142 $r['tfoot@background'] = new HTMLPurifier_AttrTransform_Background();
13143 $r['tbody@background'] = new HTMLPurifier_AttrTransform_Background();
13144 $r['table@height'] = new HTMLPurifier_AttrTransform_Length('height');
13154 class HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 extends HTMLPurifier_HTMLModule_Tidy
13157 public function makeFixes() {
13161 // == deprecated tag transforms ===================================
13163 $r['font'] = new HTMLPurifier_TagTransform_Font();
13164 $r['menu'] = new HTMLPurifier_TagTransform_Simple('ul');
13165 $r['dir'] = new HTMLPurifier_TagTransform_Simple('ul');
13166 $r['center'] = new HTMLPurifier_TagTransform_Simple('div', 'text-align:center;');
13167 $r['u'] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:underline;');
13168 $r['s'] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:line-through;');
13169 $r['strike'] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:line-through;');
13171 // == deprecated attribute transforms =============================
13173 $r['caption@align'] =
13174 new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
13175 // we're following IE's behavior, not Firefox's, due
13176 // to the fact that no one supports caption-side:right,
13177 // W3C included (with CSS 2.1). This is a slightly
13178 // unreasonable attribute!
13179 'left' => 'text-align:left;',
13180 'right' => 'text-align:right;',
13181 'top' => 'caption-side:top;',
13182 'bottom' => 'caption-side:bottom;' // not supported by IE
13185 // @align for img -------------------------------------------------
13187 new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
13188 'left' => 'float:left;',
13189 'right' => 'float:right;',
13190 'top' => 'vertical-align:top;',
13191 'middle' => 'vertical-align:middle;',
13192 'bottom' => 'vertical-align:baseline;',
13195 // @align for table -----------------------------------------------
13196 $r['table@align'] =
13197 new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
13198 'left' => 'float:left;',
13199 'center' => 'margin-left:auto;margin-right:auto;',
13200 'right' => 'float:right;'
13203 // @align for hr -----------------------------------------------
13205 new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
13206 // we use both text-align and margin because these work
13207 // for different browsers (IE and Firefox, respectively)
13208 // and the melange makes for a pretty cross-compatible
13210 'left' => 'margin-left:0;margin-right:auto;text-align:left;',
13211 'center' => 'margin-left:auto;margin-right:auto;text-align:center;',
13212 'right' => 'margin-left:auto;margin-right:0;text-align:right;'
13215 // @align for h1, h2, h3, h4, h5, h6, p, div ----------------------
13217 $align_lookup = array();
13218 $align_values = array('left', 'right', 'center', 'justify');
13219 foreach ($align_values as $v) $align_lookup[$v] = "text-align:$v;";
13229 new HTMLPurifier_AttrTransform_EnumToCSS('align', $align_lookup);
13231 // @bgcolor for table, tr, td, th ---------------------------------
13232 $r['table@bgcolor'] =
13235 new HTMLPurifier_AttrTransform_BgColor();
13237 // @border for img ------------------------------------------------
13238 $r['img@border'] = new HTMLPurifier_AttrTransform_Border();
13240 // @clear for br --------------------------------------------------
13242 new HTMLPurifier_AttrTransform_EnumToCSS('clear', array(
13243 'left' => 'clear:left;',
13244 'right' => 'clear:right;',
13245 'all' => 'clear:both;',
13246 'none' => 'clear:none;',
13249 // @height for td, th ---------------------------------------------
13252 new HTMLPurifier_AttrTransform_Length('height');
13254 // @hspace for img ------------------------------------------------
13255 $r['img@hspace'] = new HTMLPurifier_AttrTransform_ImgSpace('hspace');
13257 // @noshade for hr ------------------------------------------------
13258 // this transformation is not precise but often good enough.
13259 // different browsers use different styles to designate noshade
13261 new HTMLPurifier_AttrTransform_BoolToCSS(
13263 'color:#808080;background-color:#808080;border:0;'
13266 // @nowrap for td, th ---------------------------------------------
13269 new HTMLPurifier_AttrTransform_BoolToCSS(
13271 'white-space:nowrap;'
13274 // @size for hr --------------------------------------------------
13275 $r['hr@size'] = new HTMLPurifier_AttrTransform_Length('size', 'height');
13277 // @type for li, ol, ul -------------------------------------------
13280 'disc' => 'list-style-type:disc;',
13281 'square' => 'list-style-type:square;',
13282 'circle' => 'list-style-type:circle;'
13285 '1' => 'list-style-type:decimal;',
13286 'i' => 'list-style-type:lower-roman;',
13287 'I' => 'list-style-type:upper-roman;',
13288 'a' => 'list-style-type:lower-alpha;',
13289 'A' => 'list-style-type:upper-alpha;'
13291 $li_types = $ul_types + $ol_types;
13294 $r['ul@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ul_types);
13295 $r['ol@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ol_types, true);
13296 $r['li@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $li_types, true);
13298 // @vspace for img ------------------------------------------------
13299 $r['img@vspace'] = new HTMLPurifier_AttrTransform_ImgSpace('vspace');
13301 // @width for hr, td, th ------------------------------------------
13304 $r['hr@width'] = new HTMLPurifier_AttrTransform_Length('width');
13316 class HTMLPurifier_HTMLModule_Tidy_Strict extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
13318 public $name = 'Tidy_Strict';
13319 public $defaultLevel = 'light';
13321 public function makeFixes() {
13322 $r = parent::makeFixes();
13323 $r['blockquote#content_model_type'] = 'strictblockquote';
13327 public $defines_child_def = true;
/**
 * Supplies the child definition for the custom 'strictblockquote' content
 * model type; any other content model type is delegated to the parent.
 * @param $def Element definition; its content_model_type and
 *             content_model properties are read.
 * @return HTMLPurifier_ChildDef_StrictBlockquote for 'strictblockquote',
 *         otherwise whatever parent::getChildDef() returns.
 */
public function getChildDef($def) {
    if ($def->content_model_type == 'strictblockquote') {
        return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model);
    }
    return parent::getChildDef($def);
}
13338 class HTMLPurifier_HTMLModule_Tidy_Transitional extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
13340 public $name = 'Tidy_Transitional';
13341 public $defaultLevel = 'heavy';
13348 class HTMLPurifier_HTMLModule_Tidy_XHTML extends HTMLPurifier_HTMLModule_Tidy
13351 public $name = 'Tidy_XHTML';
13352 public $defaultLevel = 'medium';
13354 public function makeFixes() {
13356 $r['@lang'] = new HTMLPurifier_AttrTransform_Lang();
13367 * Injector that auto paragraphs text in the root node based on
13369 * @todo Ensure all states are unit tested, including variations as well.
13370 * @todo Make a graph of the flow control for this Injector.
13372 class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
13375 public $name = 'AutoParagraph';
13376 public $needed = array('p');
13378 private function _pStart() {
13379 $par = new HTMLPurifier_Token_Start('p');
13380 $par->armor['MakeWellFormed_TagClosedError'] = true;
13384 public function handleText(&$token) {
13385 $text = $token->data;
13386 // Does the current parent allow <p> tags?
13387 if ($this->allowsElement('p')) {
13388 if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
13389 // Note that we have differing behavior when dealing with text
13390 // in the anonymous root node, or a node inside the document.
13391 // If the text has a double-newline, the treatment is the same;
13392 // if it doesn't, see the next if-block if you're in the document.
13394 $i = $nesting = null;
13395 if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
13396 // State 1.1: ... ^ (whitespace, then document end)
13398 // This is a degenerate case
13400 if (!$token->is_whitespace || $this->_isInline($current)) {
13404 // State 1.3: PAR1\n\nPAR2
13407 // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
13409 $token = array($this->_pStart());
13410 $this->_splitText($text, $token);
13412 // State 1.5: \n<hr />
13417 // State 2: <div>PAR1... (similar to 1.4)
13420 // We're in an element that allows paragraph tags, but we're not
13421 // sure if we're going to need them.
13422 if ($this->_pLookAhead()) {
13423 // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
13425 // Note: This will always be the first child, since any
13426 // previous inline element would have triggered this very
13427 // same routine, and found the double newline. One possible
13428 // exception would be a comment.
13429 $token = array($this->_pStart(), $token);
13431 // State 2.2.1: <div>PAR1<div>
13434 // State 2.2.2: <div>PAR1<b>PAR1</b></div>
13438 // Is the current parent a <p> tag?
13440 !empty($this->currentNesting) &&
13441 $this->currentNesting[count($this->currentNesting)-1]->name == 'p'
13443 // State 3.1: ...<p>PAR1
13446 // State 3.2: ...<p>PAR1\n\nPAR2
13449 $this->_splitText($text, $token);
13452 // State 4.1: ...<b>PAR1
13455 // State 4.2: ...<b>PAR1\n\nPAR2
13460 public function handleElement(&$token) {
13461 // We don't have to check if we're already in a <p> tag for block
13462 // tokens, because the tag would have been autoclosed by MakeWellFormed.
13463 if ($this->allowsElement('p')) {
13464 if (!empty($this->currentNesting)) {
13465 if ($this->_isInline($token)) {
13466 // State 1: <div>...<b>
13469 // Check if this token is adjacent to the parent token
13470 // (seek backwards until token isn't whitespace)
13472 $this->backward($i, $prev);
13474 if (!$prev instanceof HTMLPurifier_Token_Start) {
13475 // Token wasn't adjacent
13478 $prev instanceof HTMLPurifier_Token_Text &&
13479 substr($prev->data, -2) === "\n\n"
13481 // State 1.1.4: <div><p>PAR1</p>\n\n<b>
13484 // Quite frankly, this should be handled by splitText
13485 $token = array($this->_pStart(), $token);
13487 // State 1.1.1: <div><p>PAR1</p><b>
13490 // State 1.1.2: <div><br /><b>
13493 // State 1.1.3: <div>PAR<b>
13498 // State 1.2.1: <div><b>
13501 // Lookahead to see if <p> is needed.
13502 if ($this->_pLookAhead()) {
13503 // State 1.3.1: <div><b>PAR1\n\nPAR2
13505 $token = array($this->_pStart(), $token);
13507 // State 1.3.2: <div><b>PAR1</b></div>
13510 // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
13515 // State 2.3: ...<div>
13519 if ($this->_isInline($token)) {
13522 // This is where the {p} tag is inserted, not reflected in
13523 // inputTokens yet, however.
13524 $token = array($this->_pStart(), $token);
13526 // State 3.2: <div>
13531 if ($this->backward($i, $prev)) {
13533 !$prev instanceof HTMLPurifier_Token_Text
13535 // State 3.1.1: ...</p>{p}<b>
13538 // State 3.2.1: ...</p><div>
13541 if (!is_array($token)) $token = array($token);
13542 array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
13544 // State 3.1.2: ...</p>\n\n{p}<b>
13547 // State 3.2.2: ...</p>\n\n<div>
13550 // Note: PAR<ELEM> cannot occur because PAR would have been
13551 // wrapped in <p> tags.
13556 // State 2.2: <ul><li>
13559 // State 2.4: <p><b>
13565 * Splits up a text in paragraph tokens and appends them
13566 * to the result stream that will replace the original
13567 * @param $data String text data that will be processed
13569 * @param $result Reference to array of tokens that the
13570 * tags will be appended onto
13571 * @param $config Instance of HTMLPurifier_Config
13572 * @param $context Instance of HTMLPurifier_Context
13574 private function _splitText($data, &$result) {
13575 $raw_paragraphs = explode("\n\n", $data);
13576 $paragraphs = array(); // without empty paragraphs
13577 $needs_start = false;
13578 $needs_end = false;
13580 $c = count($raw_paragraphs);
13582 // There were no double-newlines, abort quickly. In theory this
13583 // should never happen.
13584 $result[] = new HTMLPurifier_Token_Text($data);
13587 for ($i = 0; $i < $c; $i++) {
13588 $par = $raw_paragraphs[$i];
13589 if (trim($par) !== '') {
13590 $paragraphs[] = $par;
13593 // Double newline at the front
13594 if (empty($result)) {
13595 // The empty result indicates that the AutoParagraph
13596 // injector did not add any start paragraph tokens.
13597 // This means that we have been in a paragraph for
13598 // a while, and the newline means we should start a new one.
13599 $result[] = new HTMLPurifier_Token_End('p');
13600 $result[] = new HTMLPurifier_Token_Text("\n\n");
13601 // However, the start token should only be added if
13602 // there is more processing to be done (i.e. there are
13603 // real paragraphs in here). If there are none, the
13604 // next start paragraph tag will be handled by the
13605 // next call to the injector
13606 $needs_start = true;
13608 // We just started a new paragraph!
13609 // Reinstate a double-newline for presentation's sake, since
13610 // it was in the source code.
13611 array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
13613 } elseif ($i + 1 == $c) {
13614 // Double newline at the end
13615 // There should be a trailing </p> when we're finally done.
13621 // Check if this was just a giant blob of whitespace. Move this earlier,
13623 if (empty($paragraphs)) {
13627 // Add the start tag indicated by \n\n at the beginning of $data
13628 if ($needs_start) {
13629 $result[] = $this->_pStart();
13632 // Append the paragraphs onto the result
13633 foreach ($paragraphs as $par) {
13634 $result[] = new HTMLPurifier_Token_Text($par);
13635 $result[] = new HTMLPurifier_Token_End('p');
13636 $result[] = new HTMLPurifier_Token_Text("\n\n");
13637 $result[] = $this->_pStart();
13640 // Remove trailing start token; Injector will handle this later if
13641 // it was indeed needed. This prevents from needing to do a lookahead,
13642 // at the cost of a lookbehind later.
13643 array_pop($result);
13645 // If there is no need for an end tag, remove all of it and let
13646 // MakeWellFormed close it later.
13648 array_pop($result); // removes \n\n
13649 array_pop($result); // removes </p>
/**
 * Returns true if the passed token is inline, i.e. its name is listed
 * among the elements permitted as children of <p>.
 * @param $token Token whose ->name is looked up.
 */
private function _isInline($token) {
    $name = $token->name;
    // Single isset() over the whole chain so a missing link yields false.
    return isset($this->htmlDefinition->info['p']->child->elements[$name]);
}
13663 * Looks ahead in the token list and determines whether or not we need
13664 * to insert a <p> tag.
13666 private function _pLookAhead() {
13667 $this->current($i, $current);
13668 if ($current instanceof HTMLPurifier_Token_Start) $nesting = 1;
13671 while ($this->forwardUntilEndToken($i, $current, $nesting)) {
13672 $result = $this->_checkNeedsP($current);
13673 if ($result !== null) {
13682 * Determines if a particular token requires an earlier inline token
13683 * to get a paragraph. This should be used with forwardUntilEndToken
13685 private function _checkNeedsP($current) {
13686 if ($current instanceof HTMLPurifier_Token_Start){
13687 if (!$this->_isInline($current)) {
13690 // Terminate early, since we hit a block element
13693 } elseif ($current instanceof HTMLPurifier_Token_Text) {
13694 if (strpos($current->data, "\n\n") !== false) {
13695 // <div>PAR1<b>PAR1\n\nPAR2
13699 // <div>PAR1<b>PAR1...
13713 * Injector that displays the URL of an anchor instead of linking to it, in addition to showing the text of the link.
13715 class HTMLPurifier_Injector_DisplayLinkURI extends HTMLPurifier_Injector
13718 public $name = 'DisplayLinkURI';
13719 public $needed = array('a');
13721 public function handleElement(&$token) {
13724 public function handleEnd(&$token) {
13725 if (isset($token->start->attr['href'])){
13726 $url = $token->start->attr['href'];
13727 unset($token->start->attr['href']);
13728 $token = array($token, new HTMLPurifier_Token_Text(" ($url)"));
13730 // nothing to display
13740 * Injector that converts http, https and ftp text URLs to actual links.
13742 class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector
13745 public $name = 'Linkify';
13746 public $needed = array('a' => array('href'));
13748 public function handleText(&$token) {
13749 if (!$this->allowsElement('a')) return;
13751 if (strpos($token->data, '://') === false) {
13752 // our really quick heuristic failed, abort
13753 // this may not work so well if we want to match things like
13754 // "google.com", but then again, most people don't
13758 // there is/are URL(s). Let's split the string:
13759 // Note: this regex is extremely permissive
13760 $bits = preg_split('#((?:https?|ftp)://[^\s\'"<>()]+)#S', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);
13767 for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) {
13769 if ($bits[$i] === '') continue;
13770 $token[] = new HTMLPurifier_Token_Text($bits[$i]);
13772 $token[] = new HTMLPurifier_Token_Start('a', array('href' => $bits[$i]));
13773 $token[] = new HTMLPurifier_Token_Text($bits[$i]);
13774 $token[] = new HTMLPurifier_Token_End('a');
13787 * Injector that converts configuration directive syntax %Namespace.Directive
13790 class HTMLPurifier_Injector_PurifierLinkify extends HTMLPurifier_Injector
13793 public $name = 'PurifierLinkify';
13795 public $needed = array('a' => array('href'));
/**
 * Reads the documentation URL template from the configuration, then
 * defers to the base injector's preparation.
 * @param $config  Configuration object; AutoFormat.PurifierLinkify.DocURL is read.
 * @param $context Context object, passed through to the parent.
 * @return Whatever parent::prepare() returns.
 */
public function prepare($config, $context) {
    $docUrlTemplate = $config->get('AutoFormat.PurifierLinkify.DocURL');
    $this->docURL = $docUrlTemplate;
    return parent::prepare($config, $context);
}
13802 public function handleText(&$token) {
13803 if (!$this->allowsElement('a')) return;
13804 if (strpos($token->data, '%') === false) return;
13806 $bits = preg_split('#%([a-z0-9]+\.[a-z0-9]+)#Si', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);
13812 for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) {
13814 if ($bits[$i] === '') continue;
13815 $token[] = new HTMLPurifier_Token_Text($bits[$i]);
13817 $token[] = new HTMLPurifier_Token_Start('a',
13818 array('href' => str_replace('%s', $bits[$i], $this->docURL)));
13819 $token[] = new HTMLPurifier_Token_Text('%' . $bits[$i]);
13820 $token[] = new HTMLPurifier_Token_End('a');
13832 class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
13835 private $context, $config, $attrValidator, $removeNbsp, $removeNbspExceptions;
/**
 * Prepares the injector: runs the base preparation, then caches the
 * configuration, the context, the nbsp-removal settings and an attribute
 * validator for use while handling elements.
 * @param $config  Configuration object; the AutoFormat.RemoveEmpty.RemoveNbsp
 *                 and AutoFormat.RemoveEmpty.RemoveNbsp.Exceptions directives are read.
 * @param $context Context object, stored for later attribute validation.
 * @return Whatever parent::prepare() returns. Previously this result was
 *         silently dropped, unlike the other injectors' prepare() methods.
 */
public function prepare($config, $context) {
    $status = parent::prepare($config, $context);
    $this->config = $config;
    $this->context = $context;
    $this->removeNbsp = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp');
    // Used as a lookup keyed by element name.
    $this->removeNbspExceptions = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp.Exceptions');
    $this->attrValidator = new HTMLPurifier_AttrValidator();
    return $status;
}
13846 public function handleElement(&$token) {
13847 if (!$token instanceof HTMLPurifier_Token_Start) return;
13849 for ($i = $this->inputIndex + 1, $c = count($this->inputTokens); $i < $c; $i++) {
13850 $next = $this->inputTokens[$i];
13851 if ($next instanceof HTMLPurifier_Token_Text) {
13852 if ($next->is_whitespace) continue;
13853 if ($this->removeNbsp && !isset($this->removeNbspExceptions[$token->name])) {
13854 $plain = str_replace("\xC2\xA0", "", $next->data);
13855 $isWsOrNbsp = $plain === '' || ctype_space($plain);
13856 if ($isWsOrNbsp) continue;
13861 if (!$next || ($next instanceof HTMLPurifier_Token_End && $next->name == $token->name)) {
13862 if ($token->name == 'colgroup') return;
13863 $this->attrValidator->validateToken($token, $this->config, $this->context);
13864 $token->armor['ValidateAttributes'] = true;
13865 if (isset($token->attr['id']) || isset($token->attr['name'])) return;
13866 $token = $i - $this->inputIndex + 1;
13867 for ($b = $this->inputIndex - 1; $b > 0; $b--) {
13868 $prev = $this->inputTokens[$b];
13869 if ($prev instanceof HTMLPurifier_Token_Text && $prev->is_whitespace) continue;
13872 // This is safe because we removed the token that triggered this.
13873 $this->rewind($b - 1);
13885 * Injector that removes spans with no attributes
13887 class HTMLPurifier_Injector_RemoveSpansWithoutAttributes extends HTMLPurifier_Injector
13889 public $name = 'RemoveSpansWithoutAttributes';
13890 public $needed = array('span');
13892 private $attrValidator;
13895 * Used by AttrValidator
/**
 * Stores the configuration and context, creates the attribute validator
 * used by handleElement(), and defers to the base injector's preparation.
 * @param $config  Configuration object, kept for attribute validation.
 * @param $context Context object, kept for attribute validation.
 * @return Whatever parent::prepare() returns.
 */
public function prepare($config, $context) {
    $this->config = $config;
    $this->context = $context;
    $this->attrValidator = new HTMLPurifier_AttrValidator();
    return parent::prepare($config, $context);
}
13907 public function handleElement(&$token) {
13908 if ($token->name !== 'span' || !$token instanceof HTMLPurifier_Token_Start) {
13912 // We need to validate the attributes now since this doesn't normally
13913 // happen until after MakeWellFormed. If all the attributes are removed
13914 // the span needs to be removed too.
13915 $this->attrValidator->validateToken($token, $this->config, $this->context);
13916 $token->armor['ValidateAttributes'] = true;
13918 if (!empty($token->attr)) {
13923 $spanContentTokens = array();
13924 while ($this->forwardUntilEndToken($i, $current, $nesting)) {}
13926 if ($current instanceof HTMLPurifier_Token_End && $current->name === 'span') {
13927 // Mark closing span tag for deletion
13928 $current->markForDeletion = true;
13929 // Delete open span tag
13934 public function handleEnd(&$token) {
13935 if ($token->markForDeletion) {
13946 * Adds important param elements to inside of object in order to make
13949 class HTMLPurifier_Injector_SafeObject extends HTMLPurifier_Injector
13951 public $name = 'SafeObject';
13952 public $needed = array('object', 'param');
13954 protected $objectStack = array();
13955 protected $paramStack = array();
13957 // Keep this synchronized with AttrTransform/SafeParam.php
13958 protected $addParam = array(
13959 'allowScriptAccess' => 'never',
13960 'allowNetworking' => 'internal',
13962 protected $allowedParam = array(
13965 'flashvars' => true,
13967 'allowFullScreen' => true, // if omitted, assume to be 'false'
/**
 * Prepares the injector by deferring to the base injector's preparation.
 * @param $config  Configuration object, passed through to the parent.
 * @param $context Context object, passed through to the parent.
 * @return Whatever parent::prepare() returns. The result used to be
 *         discarded here, unlike the other injectors' prepare() methods.
 *         NOTE(review): this class declares $needed elements, so the
 *         parent's result presumably reports whether they are available;
 *         confirm against HTMLPurifier_Injector::prepare().
 */
public function prepare($config, $context) {
    return parent::prepare($config, $context);
}
13974 public function handleElement(&$token) {
13975 if ($token->name == 'object') {
13976 $this->objectStack[] = $token;
13977 $this->paramStack[] = array();
13978 $new = array($token);
13979 foreach ($this->addParam as $name => $value) {
13980 $new[] = new HTMLPurifier_Token_Empty('param', array('name' => $name, 'value' => $value));
13983 } elseif ($token->name == 'param') {
13984 $nest = count($this->currentNesting) - 1;
13985 if ($nest >= 0 && $this->currentNesting[$nest]->name === 'object') {
13986 $i = count($this->objectStack) - 1;
13987 if (!isset($token->attr['name'])) {
13991 $n = $token->attr['name'];
13992 // We need this fix because YouTube doesn't supply a data
13993 // attribute, which we need if a type is specified. This is
13994 // *very* Flash specific.
13995 if (!isset($this->objectStack[$i]->attr['data']) &&
13996 ($token->attr['name'] == 'movie' || $token->attr['name'] == 'src')) {
13997 $this->objectStack[$i]->attr['data'] = $token->attr['value'];
13999 // Check if the parameter is the correct value but has not
14000 // already been added
14002 !isset($this->paramStack[$i][$n]) &&
14003 isset($this->addParam[$n]) &&
14004 $token->attr['name'] === $this->addParam[$n]
14006 // keep token, and add to param stack
14007 $this->paramStack[$i][$n] = true;
14008 } elseif (isset($this->allowedParam[$n])) {
14009 // keep token, don't do anything to it
14010 // (could possibly check for duplicates here)
14015 // not directly inside an object, DENY!
/**
 * Pops the bookkeeping stacks when an </object> end tag is seen.
 * @param $token End token being processed; only its ->name is read.
 */
public function handleEnd(&$token) {
    // This is the WRONG way of handling the object and param stacks;
    // the data should live on the relevant object tokens so that the
    // global stack handling takes care of it.
    if ($token->name != 'object') {
        return;
    }
    array_pop($this->paramStack);
    array_pop($this->objectStack);
}
14038 * Parser that uses PHP 5's DOM extension (part of the core).
14040 * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
14041 * It gives us a forgiving HTML parser, which we use to transform the HTML
14042 * into a DOM, and then into the tokens. It is blazingly fast (for large
14043 * documents, it performs twenty times faster than
14044 * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
14046 * @note Any empty elements will have empty tokens associated with them, even if
14047 * this is prohibited by the spec. This cannot be fixed until the spec
14050 * @note PHP's DOM extension does not actually parse any entities, we use
14051 * our own function to do that.
14053 * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
14054 * If this is a huge problem, due to the fact that HTML is hand
14055 * edited and you are unable to get a parser cache that caches the
14056 * output of HTML Purifier while keeping the original HTML lying
14057 * around, you may want to run Tidy on the resulting output or use
14058 * HTMLPurifier_Lexer_DirectLex.
14061 class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
/**
 * Runs the base lexer constructor, then sets up the token factory that
 * the create*Node() methods use to build tokens.
 */
public function __construct() {
    parent::__construct();
    $tokenFactory = new HTMLPurifier_TokenFactory();
    $this->factory = $tokenFactory;
}
14072 public function tokenizeHTML($html, $config, $context) {
14074 $html = $this->normalize($html, $config, $context);
14076 // attempt to armor stray angled brackets that cannot possibly
14077 // form tags and thus are probably being used as emoticons
14078 if ($config->get('Core.AggressivelyFixLt')) {
14079 $char = '[^a-z!\/]';
14080 $comment = "/<!--(.*?)(-->|\z)/is";
14081 $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
14084 $html = preg_replace("/<($char)/i", '<\\1', $html);
14085 } while ($html !== $old);
14086 $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
14089 // preprocess html, essential for UTF-8
14090 $html = $this->wrapHTML($html, $config, $context);
14092 $doc = new DOMDocument();
14093 $doc->encoding = 'UTF-8'; // theoretically, the above has this covered
14095 set_error_handler(array($this, 'muteErrorHandler'));
14096 $doc->loadHTML($html);
14097 restore_error_handler();
14100 $this->tokenizeDOM(
14101 $doc->getElementsByTagName('html')->item(0)-> // <html>
14102 getElementsByTagName('body')->item(0)-> // <body>
14103 getElementsByTagName('div')->item(0) // <div>
14109 * Iterative function that tokenizes a node, putting it into an accumulator.
14110 * To iterate is human, to recurse divine - L. Peter Deutsch
14111 * @param $node DOMNode to be tokenized.
14112 * @param $tokens Array-list of already tokenized tokens.
14113 * @returns Tokens of node appended to previously passed tokens.
14115 protected function tokenizeDOM($node, &$tokens) {
14118 $nodes = array($level => array($node));
14119 $closingNodes = array();
14121 while (!empty($nodes[$level])) {
14122 $node = array_shift($nodes[$level]); // FIFO
14123 $collect = $level > 0 ? true : false;
14124 $needEndingTag = $this->createStartNode($node, $tokens, $collect);
14125 if ($needEndingTag) {
14126 $closingNodes[$level][] = $node;
14128 if ($node->childNodes && $node->childNodes->length) {
14130 $nodes[$level] = array();
14131 foreach ($node->childNodes as $childNode) {
14132 array_push($nodes[$level], $childNode);
14137 if ($level && isset($closingNodes[$level])) {
14138 while($node = array_pop($closingNodes[$level])) {
14139 $this->createEndNode($node, $tokens);
14142 } while ($level > 0);
14146 * @param $node DOMNode to be tokenized.
14147 * @param $tokens Array-list of already tokenized tokens.
14148 * @param $collect Says whether or not start and close are collected, set to
14149 * false at first recursion because it's the implicit DIV
14150 * tag you're dealing with.
14151 * @returns bool if the token needs an endtoken
14153 protected function createStartNode($node, &$tokens, $collect) {
14154 // intercept non element nodes. WE MUST catch all of them,
14155 // but we're not getting the character reference nodes because
14156 // those should have been preprocessed
14157 if ($node->nodeType === XML_TEXT_NODE) {
14158 $tokens[] = $this->factory->createText($node->data);
14160 } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
14161 // undo libxml's special treatment of <script> and <style> tags
14162 $last = end($tokens);
14163 $data = $node->data;
14164 // (note $node->tagname is already normalized)
14165 if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
14166 $new_data = trim($data);
14167 if (substr($new_data, 0, 4) === '<!--') {
14168 $data = substr($new_data, 4);
14169 if (substr($data, -3) === '-->') {
14170 $data = substr($data, 0, -3);
14172 // Highly suspicious! Not sure what to do...
14176 $tokens[] = $this->factory->createText($this->parseData($data));
14178 } elseif ($node->nodeType === XML_COMMENT_NODE) {
14179 // this code is only invoked for comments in script/style in versions
14180 // of libxml pre-2.6.28 (regular comments, of course, are still
14181 // handled regularly)
14182 $tokens[] = $this->factory->createComment($node->data);
14185 // not-well tested: there may be other nodes we have to grab
14186 $node->nodeType !== XML_ELEMENT_NODE
14191 $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
14193 // We still have to make sure that the element actually IS empty
14194 if (!$node->childNodes->length) {
14196 $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
14201 $tokens[] = $this->factory->createStart(
14202 $tag_name = $node->tagName, // somehow, it get's dropped
/**
 * Appends the end token for an element node to the token list.
 * @param $node   Node whose ->tagName names the end token.
 * @param $tokens Array-list of tokens, modified in place.
 */
protected function createEndNode($node, &$tokens) {
    $endToken = $this->factory->createEnd($node->tagName);
    $tokens[] = $endToken;
}
14216 * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
14218 * @param $node_map DOMNamedNodeMap of DOMAttr objects.
14219 * @returns Associative array of attributes.
14221 protected function transformAttrToAssoc($node_map) {
14222 // NamedNodeMap is not documented very well, so we're using undocumented
14223 // features, namely, the fact that it implements Iterator and
14224 // has a ->length attribute
14225 if ($node_map->length === 0) return array();
14227 foreach ($node_map as $attr) {
14228 $array[$attr->name] = $attr->value;
14234 * An error handler that mutes all errors
// Deliberately empty: tokenizeHTML() installs this with set_error_handler()
// around DOMDocument::loadHTML() and restores the previous handler right after.
14236 public function muteErrorHandler($errno, $errstr) {}
/**
 * Callback function for undoing escaping of stray angled brackets
 * in comments. Inverse of callbackArmorCommentEntities(): turns
 * "&amp;" back into "&" and "&lt;" back into "<" inside the comment body.
 *
 * The entity names had been decoded in the string literals, which made
 * both callbacks no-ops; they are restored here.
 *
 * @param $matches Regex match: [1] is the comment body, [2] is the
 *                 terminator ("-->" or empty at end of input).
 * @return Rebuilt comment string.
 */
public function callbackUndoCommentSubst($matches) {
    // strtr() with an array does not rescan its output, so an armored
    // "&amp;lt;" comes back as "&lt;", not "<".
    return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
}

/**
 * Callback function that entity-izes ampersands in comments so that
 * callbackUndoCommentSubst doesn't clobber them.
 *
 * @param $matches Regex match: [1] is the comment body, [2] is the
 *                 terminator ("-->" or empty at end of input).
 * @return Rebuilt comment string with every "&" written as "&amp;".
 */
public function callbackArmorCommentEntities($matches) {
    return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
}
14255 * Wraps an HTML fragment in the necessary HTML
14257 protected function wrapHTML($html, $config, $context) {
14258 $def = $config->getDefinition('HTML');
14261 if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
14262 $ret .= '<!DOCTYPE html ';
14263 if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
14264 if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
14268 $ret .= '<html><head>';
14269 $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
14270 // No protection if $html contains a stray </div>!
14271 $ret .= '</head><body><div>'.$html.'</div></body></html>';
14282 * Our in-house implementation of a parser.
14284 * A pure PHP parser, DirectLex has absolutely no dependencies, making
14285 * it a reasonably good default for PHP4. Written with efficiency in mind,
14286 * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
14287 * pales in comparison to HTMLPurifier_Lexer_DOMLex.
14289 * @todo Reread XML spec and document differences.
14291 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
14294 public $tracksLineNumbers = true;
14297 * Whitespace characters for str(c)spn.
14299 protected $_whitespace = "\x20\x09\x0D\x0A";
/**
 * Callback function for script CDATA fudge: HTML-escapes the contents of
 * a script element while leaving its tags untouched.
 * @param $matches Regex match: [1] opening tag, [2] contents, [3] closing tag.
 * @return Opening tag, escaped contents and closing tag, concatenated.
 */
protected function scriptCallback($matches) {
    list(, $openTag, $contents, $closeTag) = $matches;
    $escaped = htmlspecialchars($contents, ENT_COMPAT, 'UTF-8');
    return $openTag . $escaped . $closeTag;
}
14309 public function tokenizeHTML($html, $config, $context) {
14311 // special normalization for script tags without any armor
14312 // our "armor" heurstic is a < sign any number of whitespaces after
14313 // the first script tag
14314 if ($config->get('HTML.Trusted')) {
14315 $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
14316 array($this, 'scriptCallback'), $html);
14319 $html = $this->normalize($html, $config, $context);
14321 $cursor = 0; // our location in the text
14322 $inside_tag = false; // whether or not we're parsing the inside of a tag
14323 $array = array(); // result array
14325 // This is also treated to mean maintain *column* numbers too
14326 $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
14328 if ($maintain_line_numbers === null) {
14329 // automatically determine line numbering by checking
14330 // if error collection is on
14331 $maintain_line_numbers = $config->get('Core.CollectErrors');
14334 if ($maintain_line_numbers) {
14337 $length = strlen($html);
14339 $current_line = false;
14340 $current_col = false;
14343 $context->register('CurrentLine', $current_line);
14344 $context->register('CurrentCol', $current_col);
14346 // how often to manually recalculate. This will ALWAYS be right,
14347 // but it's pretty wasteful. Set to 0 to turn off
14348 $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
14351 if ($config->get('Core.CollectErrors')) {
14352 $e =& $context->get('ErrorCollector');
14355 // for testing synchronization
14360 // $cursor is either at the start of a token, or inside of
14361 // a tag (i.e. there was a < immediately before it), as indicated
14364 if ($maintain_line_numbers) {
14366 // $rcursor, however, is always at the start of a token.
14367 $rcursor = $cursor - (int) $inside_tag;
14369 // Column number is cheap, so we calculate it every round.
14370 // We're interested at the *end* of the newline string, so
14371 // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
14372 // from our "rcursor" position.
14373 $nl_pos = strrpos($html, $nl, $rcursor - $length);
14374 $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
14376 // recalculate lines
14378 $synchronize_interval && // synchronization is on
14379 $cursor > 0 && // cursor is further than zero
14380 $loops % $synchronize_interval === 0 // time to synchronize!
14382 $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
14387 $position_next_lt = strpos($html, '<', $cursor);
14388 $position_next_gt = strpos($html, '>', $cursor);
14390 // triggers on "<b>asdf</b>" but not "asdf <b></b>"
14391 // special case to set up context
14392 if ($position_next_lt === $cursor) {
14393 $inside_tag = true;
14397 if (!$inside_tag && $position_next_lt !== false) {
14398 // We are not inside tag and there still is another tag to parse
14400 HTMLPurifier_Token_Text(
14403 $html, $cursor, $position_next_lt - $cursor
14407 if ($maintain_line_numbers) {
14408 $token->rawPosition($current_line, $current_col);
14409 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
14412 $cursor = $position_next_lt + 1;
14413 $inside_tag = true;
14415 } elseif (!$inside_tag) {
14416 // We are not inside tag but there are no more tags
14417 // If we're already at the end, break
14418 if ($cursor === strlen($html)) break;
14419 // Create Text of rest of string
14421 HTMLPurifier_Token_Text(
14428 if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
14431 } elseif ($inside_tag && $position_next_gt !== false) {
14432 // We are in tag and it is well formed
14433 // Grab the internals of the tag
14434 $strlen_segment = $position_next_gt - $cursor;
14436 if ($strlen_segment < 1) {
14437 // there's nothing to process!
14438 $token = new HTMLPurifier_Token_Text('<');
14443 $segment = substr($html, $cursor, $strlen_segment);
14445 if ($segment === false) {
14446 // somehow, we attempted to access beyond the end of
14447 // the string, defense-in-depth, reported by Nate Abele
14451 // Check if it's a comment
14453 substr($segment, 0, 3) === '!--'
14455 // re-determine segment length, looking for -->
14456 $position_comment_end = strpos($html, '-->', $cursor);
14457 if ($position_comment_end === false) {
14458 // uh oh, we have a comment that extends to
14459 // infinity. Can't be helped: set comment
14460 // end position to end of string
14461 if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
14462 $position_comment_end = strlen($html);
14467 $strlen_segment = $position_comment_end - $cursor;
14468 $segment = substr($html, $cursor, $strlen_segment);
14470 HTMLPurifier_Token_Comment(
14472 $segment, 3, $strlen_segment - 3
14475 if ($maintain_line_numbers) {
14476 $token->rawPosition($current_line, $current_col);
14477 $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
14480 $cursor = $end ? $position_comment_end : $position_comment_end + 3;
14481 $inside_tag = false;
14485 // Check if it's an end tag
14486 $is_end_tag = (strpos($segment,'/') === 0);
14488 $type = substr($segment, 1);
14489 $token = new HTMLPurifier_Token_End($type);
14490 if ($maintain_line_numbers) {
14491 $token->rawPosition($current_line, $current_col);
14492 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
14495 $inside_tag = false;
14496 $cursor = $position_next_gt + 1;
14500 // Check leading character is alnum, if not, we may
14501 // have accidently grabbed an emoticon. Translate into
14502 // text and go our merry way
14503 if (!ctype_alpha($segment[0])) {
14504 // XML: $segment[0] !== '_' && $segment[0] !== ':'
14505 if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
14506 $token = new HTMLPurifier_Token_Text('<');
14507 if ($maintain_line_numbers) {
14508 $token->rawPosition($current_line, $current_col);
14509 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
14512 $inside_tag = false;
14516 // Check if it is explicitly self closing, if so, remove
14517 // trailing slash. Remember, we could have a tag like <br>, so
14518 // any later token processing scripts must convert improperly
14519 // classified EmptyTags from StartTags.
14520 $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
14521 if ($is_self_closing) {
14523 $segment = substr($segment, 0, $strlen_segment);
14526 // Check if there are any attributes
14527 $position_first_space = strcspn($segment, $this->_whitespace);
14529 if ($position_first_space >= $strlen_segment) {
14530 if ($is_self_closing) {
14531 $token = new HTMLPurifier_Token_Empty($segment);
14533 $token = new HTMLPurifier_Token_Start($segment);
14535 if ($maintain_line_numbers) {
14536 $token->rawPosition($current_line, $current_col);
14537 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
14540 $inside_tag = false;
14541 $cursor = $position_next_gt + 1;
14545 // Grab out all the data
14546 $type = substr($segment, 0, $position_first_space);
14547 $attribute_string =
14550 $segment, $position_first_space
14553 if ($attribute_string) {
14554 $attr = $this->parseAttributeString(
14556 , $config, $context
14562 if ($is_self_closing) {
14563 $token = new HTMLPurifier_Token_Empty($type, $attr);
14565 $token = new HTMLPurifier_Token_Start($type, $attr);
14567 if ($maintain_line_numbers) {
14568 $token->rawPosition($current_line, $current_col);
14569 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
14572 $cursor = $position_next_gt + 1;
14573 $inside_tag = false;
14576 // inside tag, but there's no ending > sign
14577 if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
14579 HTMLPurifier_Token_Text(
14582 substr($html, $cursor)
14585 if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
14586 // no cursor scroll? Hmm...
14593 $context->destroy('CurrentLine');
14594 $context->destroy('CurrentCol');
14599 * PHP 5.0.x compatible substr_count that implements offset and length
14601 protected function substrCount($haystack, $needle, $offset, $length) {
14602 static $oldVersion;
14603 if ($oldVersion === null) {
14604 $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
14607 $haystack = substr($haystack, $offset, $length);
14608 return substr_count($haystack, $needle);
14610 return substr_count($haystack, $needle, $offset, $length);
14615 * Takes the inside of an HTML tag and makes an assoc array of attributes.
14617 * @param $string Inside of tag excluding name.
14618 * @returns Assoc array of attributes.
14620 public function parseAttributeString($string, $config, $context) {
14621 $string = (string) $string; // quick typecast
14623 if ($string == '') return array(); // no attributes
14626 if ($config->get('Core.CollectErrors')) {
14627 $e =& $context->get('ErrorCollector');
14630 // let's see if we can abort as quickly as possible
14631 // one equal sign, no spaces => one attribute
14632 $num_equal = substr_count($string, '=');
14633 $has_space = strpos($string, ' ');
14634 if ($num_equal === 0 && !$has_space) {
14636 return array($string => $string);
14637 } elseif ($num_equal === 1 && !$has_space) {
14638 // only one attribute
14639 list($key, $quoted_value) = explode('=', $string);
14640 $quoted_value = trim($quoted_value);
14642 if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
14645 if (!$quoted_value) return array($key => '');
14646 $first_char = @$quoted_value[0];
14647 $last_char = @$quoted_value[strlen($quoted_value)-1];
14649 $same_quote = ($first_char == $last_char);
14650 $open_quote = ($first_char == '"' || $first_char == "'");
14652 if ( $same_quote && $open_quote) {
14654 $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
14656 // not well behaved
14658 if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
14659 $value = substr($quoted_value, 1);
14661 $value = $quoted_value;
14664 if ($value === false) $value = '';
14665 return array($key => $this->parseData($value));
14668 // setup loop environment
14669 $array = array(); // return assoc array of attributes
14670 $cursor = 0; // current position in string (moves forward)
14671 $size = strlen($string); // size of the string (stays the same)
14673 // if we have unquoted attributes, the parser expects a terminating
14674 // space, so let's guarantee that there's always a terminating space.
14679 if ($cursor >= $size) {
14683 $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
14686 $key_begin = $cursor; //we're currently at the start of the key
14688 // scroll past all characters that are the key (not whitespace or =)
14689 $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
14691 $key_end = $cursor; // now at the end of the key
14693 $key = substr($string, $key_begin, $key_end - $key_begin);
14696 if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
14697 $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
14698 continue; // empty key
14701 // scroll past all whitespace
14702 $cursor += strspn($string, $this->_whitespace, $cursor);
14704 if ($cursor >= $size) {
14705 $array[$key] = $key;
14709 // if the next character is an equal sign, we've got a regular
14710 // pair, otherwise, it's a bool attribute
14711 $first_char = @$string[$cursor];
14713 if ($first_char == '=') {
14717 $cursor += strspn($string, $this->_whitespace, $cursor);
14719 if ($cursor === false) {
14724 // we might be in front of a quote right now
14726 $char = @$string[$cursor];
14728 if ($char == '"' || $char == "'") {
14729 // it's quoted, end bound is $char
14731 $value_begin = $cursor;
14732 $cursor = strpos($string, $char, $cursor);
14733 $value_end = $cursor;
14735 // it's not quoted, end bound is whitespace
14736 $value_begin = $cursor;
14737 $cursor += strcspn($string, $this->_whitespace, $cursor);
14738 $value_end = $cursor;
14741 // we reached a premature end
14742 if ($cursor === false) {
14744 $value_end = $cursor;
14747 $value = substr($string, $value_begin, $value_end - $value_begin);
14748 if ($value === false) $value = '';
14749 $array[$key] = $this->parseData($value);
14755 $array[$key] = $key;
14757 // purely theoretical
14758 if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
/**
 * Strategy that delegates its work to an ordered list of other strategies.
 */
abstract class HTMLPurifier_Strategy_Composite extends HTMLPurifier_Strategy
{

    /**
     * Strategies that execute() applies, in array order.
     */
    protected $strategies = array();

    /**
     * Every concrete composite has to provide its own constructor
     * (presumably to fill $strategies).
     */
    abstract public function __construct();

    /**
     * Passes the tokens through each registered strategy in turn; the
     * output of one strategy is the input of the next.
     *
     * @param $tokens  Array of tokens to process.
     * @param $config  Configuration object, handed on unchanged.
     * @param $context Context object, handed on unchanged.
     * @return Whatever the last strategy returned, or $tokens itself when
     *         no strategy is registered.
     */
    public function execute($tokens, $config, $context) {
        // snapshot of the list, re-indexed so it can be walked by position
        $pipeline = array_values($this->strategies);
        $total = count($pipeline);
        $current = $tokens;
        for ($n = 0; $n < $total; $n++) {
            $current = $pipeline[$n]->execute($current, $config, $context);
        }
        return $current;
    }

}
/**
 * Core strategy composed of the big four strategies.
 */
class HTMLPurifier_Strategy_Core extends HTMLPurifier_Strategy_Composite
{

    /**
     * Appends the four core strategies to the list, in this order:
     * RemoveForeignElements, MakeWellFormed, FixNesting, ValidateAttributes.
     */
    public function __construct() {
        array_push(
            $this->strategies,
            new HTMLPurifier_Strategy_RemoveForeignElements(),
            new HTMLPurifier_Strategy_MakeWellFormed(),
            new HTMLPurifier_Strategy_FixNesting(),
            new HTMLPurifier_Strategy_ValidateAttributes()
        );
    }

}
/**
 * Takes a well formed list of tokens and fixes their nesting.
 *
 * HTML elements dictate which elements are allowed to be their children,
 * for example, you can't have a p tag in a span tag.  Other elements have
 * much more rigorous definitions: tables, for instance, require a specific
 * order for their elements.  There are also constraints not expressible by
 * document type definitions, such as the chameleon nature of ins/del
 * tags and global child exclusions.
 *
 * The first major objective of this strategy is to iterate through all the
 * nodes (not tokens) of the list of tokens and determine whether or not
 * their children conform to the element's definition.  If they do not, the
 * child definition may optionally supply an amended list of elements that
 * is valid or require that the entire node be deleted (and the previous
 * node rescanned).
 *
 * The second objective is to ensure that explicitly excluded elements of
 * an element do not appear in its children.  Code that accomplishes this
 * task is pervasive through the strategy, though the two are distinct tasks
 * and could, theoretically, be separated (although it's not recommended).
 *
 * @note Whether or not unrecognized children are silently dropped or
 *       translated into text depends on the child definitions.
 *
 * @todo Enable nodes to be bubbled out of the structure.
 */
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Validates the children of every node against its child definition
     * and against the exclusions of its ancestors, removing or rewriting
     * nodes that do not conform.
     *
     * @param $tokens  Well formed array of tokens.
     * @param $config  Configuration object; supplies the HTML definition.
     * @param $context Context object. IsInline and CurrentToken are
     *                 registered for the duration of the call, and
     *                 ErrorCollector is fetched (missing collector ignored).
     * @return Array of tokens with the nesting fixed.
     */
    public function execute($tokens, $config, $context) {
        //####################################################################//
        // Pre-processing

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();

        // insert implicit "parent" node, will be removed at end.
        // DEFINITION CALL
        $parent_name = $definition->info_parent;
        array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name));
        $tokens[] = new HTMLPurifier_Token_End($parent_name);

        // setup the context variable 'IsInline', for chameleon processing
        // is 'false' when we are not inline, 'true' when it must always
        // be inline, and an integer when it is inline for a certain
        // branch of the document tree
        $is_inline = $definition->info_parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // stack that contains the indexes of all parents,
        // $stack[count($stack)-1] being the current parent
        $stack = array();

        // stack that contains all elements that are excluded
        // it is organized by parent elements, similar to $stack,
        // but it is only populated when an element with exclusions is
        // processed, i.e. there won't be empty exclusions.
        $exclude_stack = array();

        // variable that contains the start token while we are processing
        // nodes. This enables error reporting to do its job
        $start_token = false;
        $context->register('CurrentToken', $start_token);

        //####################################################################//
        // Loop

        // iterate through all start nodes. Determining the start node
        // is complicated so it has been omitted from the loop construct
        for ($i = 0, $size = count($tokens) ; $i < $size; ) {

            //################################################################//
            // Gather information on children

            // child token accumulator
            $child_tokens = array();

            // scroll to the end of this node, report number, and collect
            // all children
            for ($j = $i, $depth = 0; ; $j++) {
                if ($tokens[$j] instanceof HTMLPurifier_Token_Start) {
                    $depth++;
                    // skip token assignment on first iteration, this is the
                    // token we currently are on
                    if ($depth == 1) continue;
                } elseif ($tokens[$j] instanceof HTMLPurifier_Token_End) {
                    $depth--;
                    // skip token assignment on last iteration, this is the
                    // end token of the token we're currently on
                    if ($depth == 0) break;
                }
                $child_tokens[] = $tokens[$j];
            }

            // $i is index of start token
            // $j is index of end token

            $start_token = $tokens[$i]; // to make token available via CurrentToken

            //################################################################//
            // Gather information on parent

            // calculate parent information
            if ($count = count($stack)) {
                $parent_index = $stack[$count-1];
                $parent_name  = $tokens[$parent_index]->name;
                if ($parent_index == 0) {
                    $parent_def   = $definition->info_parent_def;
                } else {
                    $parent_def   = $definition->info[$parent_name];
                }
            } else {
                // processing as if the parent were the "root" node
                // unknown info, it won't be used anyway, in the future,
                // we may want to enforce one element only (this is
                // necessary for HTML Purifier to clean entire documents)
                $parent_index = $parent_name = $parent_def = null;
            }

            // calculate context
            if ($is_inline === false) {
                // check if conditions make it inline
                if (!empty($parent_def) && $parent_def->descendants_are_inline) {
                    $is_inline = $count - 1;
                }
            } else {
                // check if we're out of inline
                if ($count === $is_inline) {
                    $is_inline = false;
                }
            }

            //################################################################//
            // Determine whether element is explicitly excluded SGML-style

            // determine whether or not element is excluded by checking all
            // parent exclusions. The array should not be very large, two
            // elements at most.
            $excluded = false;
            if (!empty($exclude_stack)) {
                foreach ($exclude_stack as $lookup) {
                    if (isset($lookup[$tokens[$i]->name])) {
                        $excluded = true;
                        // no need to continue processing
                        break;
                    }
                }
            }

            //################################################################//
            // Perform child validation

            if ($excluded) {
                // there is an exclusion, remove the entire node
                $result = false;
                $excludes = array(); // not used, but good to initialize anyway
            } else {

                if ($i === 0) {
                    // special processing for the first node
                    $def = $definition->info_parent_def;
                } else {
                    $def = $definition->info[$tokens[$i]->name];

                }

                if (!empty($def->child)) {
                    // have DTD child def validate children
                    $result = $def->child->validateChildren(
                        $child_tokens, $config, $context);
                } else {
                    // weird, no child definition, get rid of everything
                    $result = false;
                }

                // determine whether or not this element has any exclusions
                $excludes = $def->excludes;
            }

            // $result is now a bool or array

            //################################################################//
            // Process result by interpreting $result

            if ($result === true || $child_tokens === $result) {
                // leave the node as is

                // register start token as a parental node start
                $stack[] = $i;

                // register exclusions if there are any
                if (!empty($excludes)) $exclude_stack[] = $excludes;

                // move cursor to next possible start node
                $i++;

            } elseif ($result === false) {
                // remove entire node

                if ($e) {
                    if ($excluded) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                    } else {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                    }
                }

                // calculate length of inner tokens and current tokens
                $length = $j - $i + 1;

                // perform removal
                array_splice($tokens, $i, $length);

                // update size
                $size -= $length;

                // there is no start token to register,
                // current node is now the next possible start node
                // unless it turns out that we need to do a double-check

                // this is a rough heuristic that covers 100% of HTML's
                // cases and 99% of all other cases. A child definition
                // that would be tricked by this would be something like:
                // ( | a b c) where it's all or nothing. Fortunately,
                // our current implementation claims that that case would
                // not allow empty, even if it did
                //
                // $parent_def is null when the removed node had no parent
                // on the stack (the implicit root itself was removed).
                // There is nothing to re-check then, and dereferencing
                // null would only raise diagnostics and turn $i into null.
                if ($parent_def !== null && !$parent_def->child->allow_empty) {
                    // we need to do a double-check
                    $i = $parent_index;
                    array_pop($stack);
                }

                // PROJECTED OPTIMIZATION: Process all children elements before
                // reprocessing parent node.

            } else {
                // replace node with $result

                // calculate length of inner tokens
                $length = $j - $i - 1;

                if ($e) {
                    if (empty($result) && $length) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                    } else {
                        $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                    }
                }

                // perform replacement
                array_splice($tokens, $i + 1, $length, $result);

                // update size
                $size -= $length;
                $size += count($result);

                // register start token as a parental node start
                $stack[] = $i;

                // register exclusions if there are any
                if (!empty($excludes)) $exclude_stack[] = $excludes;

                // move cursor to next possible start node
                $i++;

            }

            //################################################################//
            // Scroll to next start node

            // We assume, at this point, that $i is the index of the token
            // that is the first possible new start point for a node.

            // Test if the token indeed is a start tag, if not, move forward
            // and test again.
            $size = count($tokens);
            while ($i < $size and !$tokens[$i] instanceof HTMLPurifier_Token_Start) {
                if ($tokens[$i] instanceof HTMLPurifier_Token_End) {
                    // pop a token index off the stack if we ended a node
                    array_pop($stack);
                    // pop an exclusion lookup off exclusion stack if
                    // we ended node and that node had exclusions
                    if ($i == 0 || $i == $size - 1) {
                        // use specialized var if it's the super-parent
                        $s_excludes = $definition->info_parent_def->excludes;
                    } else {
                        $s_excludes = $definition->info[$tokens[$i]->name]->excludes;
                    }
                    if ($s_excludes) {
                        array_pop($exclude_stack);
                    }
                }
                $i++;
            }

        }

        //####################################################################//
        // Post-processing

        // remove implicit parent tokens at the beginning and end
        array_shift($tokens);
        array_pop($tokens);

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        return $tokens;

    }

}
15147 * Takes tokens makes them well-formed (balance end tags, etc.)
15149 * Specification of the armor attributes this strategy uses:
15151 * - MakeWellFormed_TagClosedError: This armor field is used to
15152 * suppress tag closed errors for certain tokens [TagClosedSuppress],
15153 * in particular, if a tag was generated automatically by HTML
15154 * Purifier, we may rely on our infrastructure to close it for us
15155 * and shouldn't report an error to the user [TagClosedAuto].
15157 class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
15161 * Array stream of tokens being processed.
15166 * Current index in $tokens.
15171 * Current nesting of elements.
15176 * Injectors active in this stream processing.
15178 protected $injectors;
15181 * Current instance of HTMLPurifier_Config.
15186 * Current instance of HTMLPurifier_Context.
15188 protected $context;
15190 public function execute($tokens, $config, $context) {
15192 $definition = $config->getHTMLDefinition();
15195 $generator = new HTMLPurifier_Generator($config, $context);
15196 $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
15197 // used for autoclose early abortion
15198 $global_parent_allowed_elements = array();
15199 if (isset($definition->info[$definition->info_parent])) {
15200 // may be unset under testing circumstances
15201 $global_parent_allowed_elements = $definition->info[$definition->info_parent]->child->getAllowedElements($config);
15203 $e = $context->get('ErrorCollector', true);
15204 $t = false; // token index
15205 $i = false; // injector index
15206 $token = false; // the current token
15207 $reprocess = false; // whether or not to reprocess the same token
15210 // member variables
15211 $this->stack =& $stack;
15213 $this->tokens =& $tokens;
15214 $this->config = $config;
15215 $this->context = $context;
15217 // context variables
15218 $context->register('CurrentNesting', $stack);
15219 $context->register('InputIndex', $t);
15220 $context->register('InputTokens', $tokens);
15221 $context->register('CurrentToken', $token);
15223 // -- begin INJECTOR --
15225 $this->injectors = array();
15227 $injectors = $config->getBatch('AutoFormat');
15228 $def_injectors = $definition->info_injector;
15229 $custom_injectors = $injectors['Custom'];
15230 unset($injectors['Custom']); // special case
15231 foreach ($injectors as $injector => $b) {
15232 // XXX: Fix with a legitimate lookup table of enabled filters
15233 if (strpos($injector, '.') !== false) continue;
15234 $injector = "HTMLPurifier_Injector_$injector";
15236 $this->injectors[] = new $injector;
15238 foreach ($def_injectors as $injector) {
15239 // assumed to be objects
15240 $this->injectors[] = $injector;
15242 foreach ($custom_injectors as $injector) {
15243 if (!$injector) continue;
15244 if (is_string($injector)) {
15245 $injector = "HTMLPurifier_Injector_$injector";
15246 $injector = new $injector;
15248 $this->injectors[] = $injector;
15251 // give the injectors references to the definition and context
15252 // variables for performance reasons
15253 foreach ($this->injectors as $ix => $injector) {
15254 $error = $injector->prepare($config, $context);
15255 if (!$error) continue;
15256 array_splice($this->injectors, $ix, 1); // rm the injector
15257 trigger_error("Cannot enable {$injector->name} injector because $error is not allowed", E_USER_WARNING);
15260 // -- end INJECTOR --
15262 // a note on reprocessing:
15263 // In order to reduce code duplication, whenever some code needs
15264 // to make HTML changes in order to make things "correct", the
15265 // new HTML gets sent through the purifier, regardless of its
15266 // status. This means that if we add a start token, because it
15267 // was totally necessary, we don't have to update nesting; we just
15268 // punt ($reprocess = true; continue;) and it does that for us.
15270 // isset is in loop because $tokens size changes during loop exec
15273 $t == 0 || isset($tokens[$t - 1]);
15274 // only increment if we don't need to reprocess
15275 $reprocess ? $reprocess = false : $t++
15278 // check for a rewind
15279 if (is_int($i) && $i >= 0) {
15280 // possibility: disable rewinding if the current token has a
15281 // rewind set on it already. This would offer protection from
15282 // infinite loop, but might hinder some advanced rewinding.
15283 $rewind_to = $this->injectors[$i]->getRewind();
15284 if (is_int($rewind_to) && $rewind_to < $t) {
15285 if ($rewind_to < 0) $rewind_to = 0;
15286 while ($t > $rewind_to) {
15288 $prev = $tokens[$t];
15289 // indicate that other injectors should not process this token,
15290 // but we need to reprocess it
15291 unset($prev->skip[$i]);
15292 $prev->rewind = $i;
15293 if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->stack);
15294 elseif ($prev instanceof HTMLPurifier_Token_End) $this->stack[] = $prev->start;
15300 // handle case of document end
15301 if (!isset($tokens[$t])) {
15302 // kill processing if stack is empty
15303 if (empty($this->stack)) break;
15306 $top_nesting = array_pop($this->stack);
15307 $this->stack[] = $top_nesting;
15309 // send error [TagClosedSuppress]
15310 if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
15311 $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
15314 // append, don't splice, since this is the end
15315 $tokens[] = new HTMLPurifier_Token_End($top_nesting->name);
15322 $token = $tokens[$t];
15324 //echo '<br>'; printTokens($tokens, $t); printTokens($this->stack);
15327 // quick-check: if it's not a tag, no need to process
15328 if (empty($token->is_tag)) {
15329 if ($token instanceof HTMLPurifier_Token_Text) {
15330 foreach ($this->injectors as $i => $injector) {
15331 if (isset($token->skip[$i])) continue;
15332 if ($token->rewind !== null && $token->rewind !== $i) continue;
15333 $injector->handleText($token);
15334 $this->processToken($token, $i);
15339 // another possibility is a comment
15343 if (isset($definition->info[$token->name])) {
15344 $type = $definition->info[$token->name]->child->type;
15346 $type = false; // Type is unknown, treat accordingly
15349 // quick tag checks: anything that's *not* an end tag
15351 if ($type === 'empty' && $token instanceof HTMLPurifier_Token_Start) {
15352 // claims to be a start tag but is empty
15353 $token = new HTMLPurifier_Token_Empty($token->name, $token->attr, $token->line, $token->col, $token->armor);
15355 } elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
15356 // claims to be empty but really is a start tag
15357 $this->swap(new HTMLPurifier_Token_End($token->name));
15358 $this->insertBefore(new HTMLPurifier_Token_Start($token->name, $token->attr, $token->line, $token->col, $token->armor));
15359 // punt (since we had to modify the input stream in a non-trivial way)
15362 } elseif ($token instanceof HTMLPurifier_Token_Empty) {
15363 // real empty token
15365 } elseif ($token instanceof HTMLPurifier_Token_Start) {
15368 // ...unless they also have to close their parent
15369 if (!empty($this->stack)) {
15371 // Performance note: you might think that it's rather
15372 // inefficient, recalculating the autoclose information
15373 // for every tag that a token closes (since when we
15374 // do an autoclose, we push a new token into the
15375 // stream and then /process/ that, before
15376 // re-processing this token.) But this is
15377 // necessary, because an injector can make an
15378 // arbitrary transformations to the autoclosing
15379 // tokens we introduce, so things may have changed
15380 // in the meantime. Also, doing the inefficient thing is
15381 // "easy" to reason about (for certain perverse definitions
15384 $parent = array_pop($this->stack);
15385 $this->stack[] = $parent;
15387 if (isset($definition->info[$parent->name])) {
15388 $elements = $definition->info[$parent->name]->child->getAllowedElements($config);
15389 $autoclose = !isset($elements[$token->name]);
15391 $autoclose = false;
15394 if ($autoclose && $definition->info[$token->name]->wrap) {
15395 // Check if an element can be wrapped by another
15396 // element to make it valid in a context (for
15397 // example, <ul><ul> needs a <li> in between)
15398 $wrapname = $definition->info[$token->name]->wrap;
15399 $wrapdef = $definition->info[$wrapname];
15400 $elements = $wrapdef->child->getAllowedElements($config);
15401 $parent_elements = $definition->info[$parent->name]->child->getAllowedElements($config);
15402 if (isset($elements[$token->name]) && isset($parent_elements[$wrapname])) {
15403 $newtoken = new HTMLPurifier_Token_Start($wrapname);
15404 $this->insertBefore($newtoken);
15410 $carryover = false;
15411 if ($autoclose && $definition->info[$parent->name]->formatting) {
15416 // check if this autoclose is doomed to fail
15417 // (this rechecks $parent, which his harmless)
15418 $autoclose_ok = isset($global_parent_allowed_elements[$token->name]);
15419 if (!$autoclose_ok) {
15420 foreach ($this->stack as $ancestor) {
15421 $elements = $definition->info[$ancestor->name]->child->getAllowedElements($config);
15422 if (isset($elements[$token->name])) {
15423 $autoclose_ok = true;
15426 if ($definition->info[$token->name]->wrap) {
15427 $wrapname = $definition->info[$token->name]->wrap;
15428 $wrapdef = $definition->info[$wrapname];
15429 $wrap_elements = $wrapdef->child->getAllowedElements($config);
15430 if (isset($wrap_elements[$token->name]) && isset($elements[$wrapname])) {
15431 $autoclose_ok = true;
15437 if ($autoclose_ok) {
15438 // errors need to be updated
15439 $new_token = new HTMLPurifier_Token_End($parent->name);
15440 $new_token->start = $parent;
15442 $element = clone $parent;
15444 $element->armor['MakeWellFormed_TagClosedError'] = true;
15445 $element->carryover = true;
15446 $this->processToken(array($new_token, $token, $element));
15448 $this->insertBefore($new_token);
15450 // [TagClosedSuppress]
15451 if ($e && !isset($parent->armor['MakeWellFormed_TagClosedError'])) {
15453 $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
15455 $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag carryover', $parent);
15470 foreach ($this->injectors as $i => $injector) {
15471 if (isset($token->skip[$i])) continue;
15472 if ($token->rewind !== null && $token->rewind !== $i) continue;
15473 $injector->handleElement($token);
15474 $this->processToken($token, $i);
15479 // ah, nothing interesting happened; do normal processing
15480 $this->swap($token);
15481 if ($token instanceof HTMLPurifier_Token_Start) {
15482 $this->stack[] = $token;
15483 } elseif ($token instanceof HTMLPurifier_Token_End) {
15484 throw new HTMLPurifier_Exception('Improper handling of end tag in start code; possible error in MakeWellFormed');
15490 // sanity check: we should be dealing with a closing tag
15491 if (!$token instanceof HTMLPurifier_Token_End) {
15492 throw new HTMLPurifier_Exception('Unaccounted for tag token in input stream, bug in HTML Purifier');
15495 // make sure that we have something open
15496 if (empty($this->stack)) {
15497 if ($escape_invalid_tags) {
15498 if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
15499 $this->swap(new HTMLPurifier_Token_Text(
15500 $generator->generateFromToken($token)
15504 if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
15510 // first, check for the simplest case: everything closes neatly.
15511 // Eventually, everything passes through here; if there are problems
15512 // we modify the input stream accordingly and then punt, so that
15513 // the tokens get processed again.
15514 $current_parent = array_pop($this->stack);
15515 if ($current_parent->name == $token->name) {
15516 $token->start = $current_parent;
15517 foreach ($this->injectors as $i => $injector) {
15518 if (isset($token->skip[$i])) continue;
15519 if ($token->rewind !== null && $token->rewind !== $i) continue;
15520 $injector->handleEnd($token);
15521 $this->processToken($token, $i);
15522 $this->stack[] = $current_parent;
15529 // okay, so we're trying to close the wrong tag
15531 // undo the previous pop
15532 $this->stack[] = $current_parent;
15534 // scroll back the entire nest, trying to find our tag.
15535 // (feature could be to specify how far you'd like to go)
15536 $size = count($this->stack);
15537 // -2 because -1 is the last element, but we already checked that
15538 $skipped_tags = false;
15539 for ($j = $size - 2; $j >= 0; $j--) {
15540 if ($this->stack[$j]->name == $token->name) {
15541 $skipped_tags = array_slice($this->stack, $j);
15546 // we didn't find the tag, so remove
15547 if ($skipped_tags === false) {
15548 if ($escape_invalid_tags) {
15549 $this->swap(new HTMLPurifier_Token_Text(
15550 $generator->generateFromToken($token)
15552 if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
15555 if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
15561 // do errors, in REVERSE $j order: a,b,c with </a></b></c>
15562 $c = count($skipped_tags);
15564 for ($j = $c - 1; $j > 0; $j--) {
15565 // notice we exclude $j == 0, i.e. the current ending tag, from
15566 // the errors... [TagClosedSuppress]
15567 if (!isset($skipped_tags[$j]->armor['MakeWellFormed_TagClosedError'])) {
15568 $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$j]);
15573 // insert tags, in FORWARD $j order: c,b,a with </a></b></c>
15574 $replace = array($token);
15575 for ($j = 1; $j < $c; $j++) {
15576 // ...as well as from the insertions
15577 $new_token = new HTMLPurifier_Token_End($skipped_tags[$j]->name);
15578 $new_token->start = $skipped_tags[$j];
15579 array_unshift($replace, $new_token);
15580 if (isset($definition->info[$new_token->name]) && $definition->info[$new_token->name]->formatting) {
15582 $element = clone $skipped_tags[$j];
15583 $element->carryover = true;
15584 $element->armor['MakeWellFormed_TagClosedError'] = true;
15585 $replace[] = $element;
15588 $this->processToken($replace);
15593 $context->destroy('CurrentNesting');
15594 $context->destroy('InputTokens');
15595 $context->destroy('InputIndex');
15596 $context->destroy('CurrentToken');
15598 unset($this->injectors, $this->stack, $this->tokens, $this->t);
/**
 * Processes arbitrary token values for complicated substitution patterns.
 * In general:
 *
 * If $token is an array, it is a list of tokens to substitute for the
 * current token. These tokens then get individually processed. If there
 * is a leading integer in the list, that integer determines how many
 * tokens from the stream should be removed.
 *
 * If $token is a regular token, it is swapped with the current token.
 *
 * If $token is false, the current token is deleted.
 *
 * If $token is an integer, that number of tokens (with the first token
 * being the current one) will be deleted.
 *
 * @param $token Token substitution value
 * @param $injector Injector that performed the substitution; default is if
 *        this is not an injector related operation.
 * @throws HTMLPurifier_Exception if $token is of an unsupported type or
 *         asks for zero tokens to be deleted.
 */
protected function processToken($token, $injector = -1) {

    // normalize forms of token
    if (is_object($token)) {
        $token = array(1, $token);
    } elseif (is_int($token)) {
        $token = array($token);
    } elseif ($token === false) {
        $token = array(1);
    }
    if (!is_array($token)) {
        throw new HTMLPurifier_Exception('Invalid token type from injector');
    }
    // An empty list has no offset 0; check for it explicitly so that
    // reading $token[0] cannot raise an "undefined offset" notice. The
    // outcome is unchanged: a delete count of 1 is prepended.
    if (!isset($token[0]) || !is_int($token[0])) {
        array_unshift($token, 1);
    }
    if ($token[0] === 0) {
        throw new HTMLPurifier_Exception('Deleting zero tokens is not valid');
    }

    // $token is now an array with the following form:
    // array(number nodes to delete, new node 1, new node 2, ...)

    $delete = array_shift($token);
    $old = array_splice($this->tokens, $this->t, $delete, $token);

    if ($injector > -1) {
        // determine appropriate skips: every new token inherits the skip
        // list of the first replaced token, plus the acting injector
        $oldskip = isset($old[0]) ? $old[0]->skip : array();
        foreach ($token as $object) {
            $object->skip = $oldskip;
            $object->skip[$injector] = true;
        }
    }

}
/**
 * Inserts a token before the current token. Cursor now points to
 * this token. You must reprocess after this.
 *
 * @param $token Token to splice into the stream at the cursor position.
 */
private function insertBefore($token) {
    // zero-length splice: nothing is removed, $token lands at index $this->t
    $inserted = array($token);
    array_splice($this->tokens, $this->t, 0, $inserted);
}
/**
 * Removes current token. Cursor now points to new token occupying previously
 * occupied space. You must reprocess after this.
 */
private function remove() {
    $cursor = $this->t;
    array_splice($this->tokens, $cursor, 1);
}
/**
 * Swap current token with new token. Cursor points to new token (no
 * change). You must reprocess after this.
 *
 * @param $token Token that overwrites the one at the cursor position.
 */
private function swap($token) {
    $cursor = $this->t;
    $this->tokens[$cursor] = $token;
}
/**
 * Removes all unrecognized tags from the list of tokens.
 *
 * This strategy iterates through all the tokens and removes unrecognized
 * tokens. If a token is not recognized but a TagTransform is defined for
 * that element, the element will be transformed accordingly.
 */
class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
{

    /**
     * Filters the token list down to tokens of elements known to the
     * HTML definition.
     *
     * @param $tokens List of tokens to filter.
     * @param $config Configuration object; the Core.EscapeInvalidTags,
     *        Core.RemoveInvalidImg, HTML.Trusted, Core.RemoveScriptContents,
     *        Core.HiddenElements and Core.CollectErrors directives are read.
     * @param $context Context object; 'CurrentToken' is registered for the
     *        duration of the call.
     * @return List of the tokens that were kept.
     */
    public function execute($tokens, $config, $context) {
        $definition = $config->getHTMLDefinition();
        $generator = new HTMLPurifier_Generator($config, $context);
        $result = array();

        $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
        $remove_invalid_img  = $config->get('Core.RemoveInvalidImg');

        // currently only used to determine if comments should be kept
        $trusted = $config->get('HTML.Trusted');

        $remove_script_contents = $config->get('Core.RemoveScriptContents');
        $hidden_elements        = $config->get('Core.HiddenElements');

        // remove script contents compatibility
        if ($remove_script_contents === true) {
            $hidden_elements['script'] = true;
        } elseif ($remove_script_contents === false && isset($hidden_elements['script'])) {
            unset($hidden_elements['script']);
        }

        $attr_validator = new HTMLPurifier_AttrValidator();

        // name of the hidden element being skipped: tokens are dropped
        // until a tag carrying this name shows up
        $remove_until = false;

        // name of the hidden (but allowed) element we are inside of;
        // comments are converted into text tokens while this is set
        $textify_comments = false;

        // $token is registered by reference, so the context always sees
        // the token currently being looked at
        $token = false;
        $context->register('CurrentToken', $token);

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        foreach ($tokens as $token) {
            if ($remove_until && (empty($token->is_tag) || $token->name !== $remove_until)) {
                continue;
            }

            if (!empty($token->is_tag)) {
                // DEFINITION CALL

                // before any processing, try to transform the element
                if (isset($definition->info_tag_transform[$token->name])) {
                    $original_name = $token->name;
                    // there is a transformation for this tag
                    // DEFINITION CALL
                    $transform = $definition->info_tag_transform[$original_name];
                    $token = $transform->transform($token, $config, $context);
                    if ($e) {
                        $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Tag transform', $original_name);
                    }
                }

                if (isset($definition->info[$token->name])) {

                    // mostly everything's good, but
                    // we need to make sure required attributes are in order
                    $carries_attr =
                        $token instanceof HTMLPurifier_Token_Start ||
                        $token instanceof HTMLPurifier_Token_Empty;
                    if (
                        $carries_attr &&
                        $definition->info[$token->name]->required_attr &&
                        ($token->name != 'img' || $remove_invalid_img) // ensure config option still works
                    ) {
                        $attr_validator->validateToken($token, $config, $context);
                        $missing_attr = false;
                        foreach ($definition->info[$token->name]->required_attr as $name) {
                            if (!isset($token->attr[$name])) {
                                $missing_attr = $name;
                                break;
                            }
                        }
                        if ($missing_attr !== false) {
                            if ($e) {
                                $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Missing required attribute', $missing_attr);
                            }
                            continue;
                        }
                        // attributes were validated above; flag the token
                        // so ValidateAttributes skips it
                        $token->armor['ValidateAttributes'] = true;
                    }

                    if ($token instanceof HTMLPurifier_Token_Start && isset($hidden_elements[$token->name])) {
                        $textify_comments = $token->name;
                    } elseif ($token instanceof HTMLPurifier_Token_End && $token->name === $textify_comments) {
                        $textify_comments = false;
                    }

                } elseif ($escape_invalid_tags) {
                    // invalid tag, generate HTML representation and insert in
                    if ($e) {
                        $e->send(E_WARNING, 'Strategy_RemoveForeignElements: Foreign element to text');
                    }
                    $escaped = $generator->generateFromToken($token);
                    $token = new HTMLPurifier_Token_Text($escaped);
                } else {
                    // check if we need to destroy all of the tag's children
                    // CAN BE GENERICIZED
                    if (!isset($hidden_elements[$token->name])) {
                        if ($e) {
                            $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed');
                        }
                    } else {
                        if ($token instanceof HTMLPurifier_Token_Start) {
                            $remove_until = $token->name;
                        } elseif (!($token instanceof HTMLPurifier_Token_Empty)) {
                            // an empty tag changes nothing: we're still looking
                            $remove_until = false;
                        }
                        if ($e) {
                            $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign meta element removed');
                        }
                    }
                    continue;
                }
            } elseif ($token instanceof HTMLPurifier_Token_Comment) {
                if ($textify_comments !== false) {
                    // textify comments in script tags when they are allowed
                    $data = $token->data;
                    $token = new HTMLPurifier_Token_Text($data);
                } elseif ($trusted) {
                    // keep, but perform comment cleaning
                    if ($e && substr($token->data, -1) == '-') {
                        // there is a trailing hyphen
                        $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Trailing hyphen in comment removed');
                    }
                    $token->data = rtrim($token->data, '-');
                    $found_double_hyphen = false;
                    while (strpos($token->data, '--') !== false) {
                        if ($e && !$found_double_hyphen) {
                            $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed');
                        }
                        $found_double_hyphen = true; // prevent double-erroring
                        $token->data = str_replace('--', '-', $token->data);
                    }
                } else {
                    // strip comments
                    if ($e) {
                        $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed');
                    }
                    continue;
                }
            } elseif (!($token instanceof HTMLPurifier_Token_Text)) {
                // anything that is neither tag, comment nor text is dropped
                continue;
            }

            $result[] = $token;
        }

        if ($remove_until && $e) {
            // we removed tokens until the end, throw error
            $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Token removed to end', $remove_until);
        }

        $context->destroy('CurrentToken');

        return $result;
    }

}
/**
 * Validate all attributes in the tokens.
 */
class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
{

    /**
     * Runs the attribute validator over every start and empty tag that
     * has not been armored against validation.
     *
     * @param $tokens List of tokens to process.
     * @param $config Configuration object, handed to the validator.
     * @param $context Context object; 'CurrentToken' is registered for the
     *        duration of the call.
     * @return The list of tokens, with validated tokens written back.
     */
    public function execute($tokens, $config, $context) {

        // setup validator
        $validator = new HTMLPurifier_AttrValidator();

        // registered by reference: the context follows the loop variable
        $token = false;
        $context->register('CurrentToken', $token);

        foreach ($tokens as $key => $token) {

            // only start and empty tags have attributes
            $has_attributes =
                $token instanceof HTMLPurifier_Token_Start ||
                $token instanceof HTMLPurifier_Token_Empty;
            if (!$has_attributes) {
                continue;
            }

            // skip tokens that are armored
            if (!empty($token->armor['ValidateAttributes'])) {
                continue;
            }

            // note that we have no facilities here for removing tokens
            $validator->validateToken($token, $config, $context);

            $tokens[$key] = $token; // for PHP 4
        }

        $context->destroy('CurrentToken');

        return $tokens;
    }

}
15892 * Transforms FONT tags to the proper form (SPAN with CSS styling)
15894 * This transformation takes the three proprietary attributes of FONT and
15895 * transforms them into their corresponding CSS attributes. These are color,
15898 * @note Size is an interesting case because it doesn't map cleanly to CSS.
15900 * http://style.cleverchimp.com/font_size_intervals/altintervals.html
15901 * for reasonable mappings.
15902 * @warning This doesn't work completely correctly; specifically, this
15903 * TagTransform operates before well-formedness is enforced, so
15904 * the "active formatting elements" algorithm doesn't get applied.
15906 class HTMLPurifier_TagTransform_Font extends HTMLPurifier_TagTransform
15909 public $transform_to = 'span';
15911 protected $_size_lookup = array(
/**
 * Rewrites a FONT tag into the tag named by $this->transform_to,
 * moving the color, face and size attributes into a CSS prefix of the
 * style attribute.
 *
 * @param $tag Tag token to transform; it is cloned, not modified.
 * @param $config Configuration object (not used here).
 * @param $context Context object (not used here).
 * @return Clone of $tag carrying the new name and, for non-end tags,
 *         the rewritten attributes.
 */
public function transform($tag, $config, $context) {

    if ($tag instanceof HTMLPurifier_Token_End) {
        // end tags only need renaming
        $new_tag = clone $tag;
        $new_tag->name = $this->transform_to;
        return $new_tag;
    }

    $attr = $tag->attr;
    $prepend_style = '';

    // handle color transform
    if (isset($attr['color'])) {
        $prepend_style .= 'color:' . $attr['color'] . ';';
        unset($attr['color']);
    }

    // handle face transform
    if (isset($attr['face'])) {
        $prepend_style .= 'font-family:' . $attr['face'] . ';';
        unset($attr['face']);
    }

    // handle size transform
    if (isset($attr['size'])) {
        // normalize large numbers
        if ($attr['size'] !== '') {
            // Square brackets instead of the {0} string-offset form:
            // the curly-brace syntax is deprecated in PHP 7.4 and is a
            // parse error in PHP 8.
            $first_char = $attr['size'][0];
            if ($first_char == '+' || $first_char == '-') {
                // relative size: clamp to the range -2 .. +4
                $size = (int) $attr['size'];
                if ($size < -2) $attr['size'] = '-2';
                if ($size > 4)  $attr['size'] = '+4';
            } else {
                // absolute size: cap at 7
                $size = (int) $attr['size'];
                if ($size > 7) $attr['size'] = '7';
            }
        }
        // sizes without a lookup entry are dropped silently
        if (isset($this->_size_lookup[$attr['size']])) {
            $prepend_style .= 'font-size:' .
                $this->_size_lookup[$attr['size']] . ';';
        }
        unset($attr['size']);
    }

    if ($prepend_style) {
        // generated declarations go first, any existing style follows
        $attr['style'] = isset($attr['style']) ?
            $prepend_style . $attr['style'] :
            $prepend_style;
    }

    $new_tag = clone $tag;
    $new_tag->name = $this->transform_to;
    $new_tag->attr = $attr;

    return $new_tag;

}
/**
 * Simple transformation, just change tag name to something else,
 * and possibly add some styling. This will cover most of the deprecated
 * tag cases.
 */
class HTMLPurifier_TagTransform_Simple extends HTMLPurifier_TagTransform
{

    /** CSS added to transformed start/empty tags, or null for none. */
    protected $style;

    /**
     * @param $transform_to Tag name to transform to.
     * @param $style CSS style to add to the tag
     */
    public function __construct($transform_to, $style = null) {
        $this->style = $style;
        $this->transform_to = $transform_to;
    }

    /**
     * Clones the tag under its new name and, for start and empty tags,
     * prepends the configured CSS to the attributes.
     *
     * @param $tag Tag token to transform; it is cloned, not modified.
     * @param $config Configuration object (not used here).
     * @param $context Context object (not used here).
     * @return The renamed clone.
     */
    public function transform($tag, $config, $context) {
        $new_tag = clone $tag;
        $new_tag->name = $this->transform_to;

        if (is_null($this->style)) {
            return $new_tag;
        }

        // end tags carry no attributes, so only start/empty tags are styled
        $takes_style =
            $new_tag instanceof HTMLPurifier_Token_Start ||
            $new_tag instanceof HTMLPurifier_Token_Empty;
        if ($takes_style) {
            $this->prependCSS($new_tag->attr, $this->style);
        }
        return $new_tag;
    }

}
/**
 * Concrete comment token class. Generally will be ignored.
 */
class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
{
    public $data; /**< Character data within comment. */
    public $is_whitespace = true;

    /**
     * Transparent constructor.
     *
     * @param $data String comment data.
     * @param $line Line value stored on the token; defaults to null.
     * @param $col Column value stored on the token; defaults to null.
     */
    public function __construct($data, $line = null, $col = null) {
        $this->line = $line;
        $this->col  = $col;
        $this->data = $data;
    }
}
/**
 * Abstract class of a tag token (start, end or empty), and its behavior.
 */
class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
{
    /**
     * Static bool marker that indicates the class is a tag.
     *
     * This allows us to check objects with <tt>!empty($obj->is_tag)</tt>
     * without having to use a function call <tt>is_a()</tt>.
     */
    public $is_tag = true;

    /**
     * The lower-case name of the tag, like 'a', 'b' or 'blockquote'.
     *
     * @note Strictly speaking, XML tags are case sensitive, so we shouldn't
     * be lower-casing them, but these tokens cater to HTML tags, which are
     * insensitive.
     */
    public $name;

    /**
     * Associative array of the tag's attributes.
     */
    public $attr = array();

    /**
     * Non-overloaded constructor, which lower-cases passed tag name.
     *
     * @param $name String name.
     * @param $attr Associative array of attributes.
     * @param $line Line value stored on the token; defaults to null.
     * @param $col Column value stored on the token; defaults to null.
     * @param $armor Armor array stored on the token; defaults to empty.
     */
    public function __construct($name, $attr = array(), $line = null, $col = null, $armor = array()) {
        $this->name = ctype_lower($name) ? $name : strtolower($name);

        // foreach walks a copy, so $attr can be rewritten while looping
        foreach ($attr as $key => $value) {
            // normalization only necessary when key is not lowercase
            if (ctype_lower($key)) {
                continue;
            }
            $lowered = strtolower($key);
            // an attribute already stored under the lowercase key wins
            if (!isset($attr[$lowered])) {
                $attr[$lowered] = $attr[$key];
            }
            if ($lowered !== $key) {
                unset($attr[$key]);
            }
        }

        $this->attr  = $attr;
        $this->line  = $line;
        $this->col   = $col;
        $this->armor = $armor;
    }
}
/**
 * Concrete empty token class.
 *
 * Adds nothing of its own to HTMLPurifier_Token_Tag; the class only
 * exists so tokens can be told apart with instanceof.
 */
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
{
}
/**
 * Concrete end token class.
 *
 * @warning This class accepts attributes even though end tags cannot. This
 * is for optimization reasons, as under normal circumstances, the Lexers
 * do not pass attributes.
 */
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
    /**
     * Token that started this node. Added by MakeWellFormed. Please
     * do not edit this!
     */
    public $start;
}
/**
 * Concrete start token class.
 *
 * Adds nothing of its own to HTMLPurifier_Token_Tag; the class only
 * exists so tokens can be told apart with instanceof.
 */
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
{
}
/**
 * Concrete text token class.
 *
 * Text tokens comprise of regular parsed character data (PCDATA) and raw
 * character data (from the CDATA sections). Internally, their
 * data is parsed with all entities expanded. Surprisingly, the text token
 * does have a "tag name" called #PCDATA, which is how the DTD represents it
 * in permissible child nodes.
 */
class HTMLPurifier_Token_Text extends HTMLPurifier_Token
{

    public $name = '#PCDATA'; /**< PCDATA tag name compatible with DTD. */
    public $data; /**< Parsed character data of text. */
    public $is_whitespace; /**< Bool indicating if node is whitespace. */

    /**
     * Constructor, accepts data and determines if it is whitespace.
     *
     * @param $data String parsed character data.
     * @param $line Line value stored on the token; defaults to null.
     * @param $col Column value stored on the token; defaults to null.
     */
    public function __construct($data, $line = null, $col = null) {
        $this->line = $line;
        $this->col  = $col;
        $this->data = $data;
        // whitespace flag is whatever ctype_space() reports for the data
        $this->is_whitespace = ctype_space($data);
    }

}
/**
 * URI filter that rejects URIs whose host is not our own host (as given
 * by the URI definition) or a subdomain of it.
 */
class HTMLPurifier_URIFilter_DisableExternal extends HTMLPurifier_URIFilter
{
    public $name = 'DisableExternal';

    /**
     * Labels of our host in reverse order (top-level label first), or
     * false when the URI definition has no host.
     */
    protected $ourHostParts = false;

    /**
     * Caches the reversed labels of the configured host.
     *
     * @param $config Configuration object; the host is read from the URI
     *        definition.
     */
    public function prepare($config) {
        $our_host = $config->getDefinition('URI')->host;
        if ($our_host !== null) $this->ourHostParts = array_reverse(explode('.', $our_host));
    }

    /**
     * @param $uri URI object to check (not modified).
     * @param $config Configuration object (not used here).
     * @param $context Context object (not used here).
     * @return True when the URI has no host or its host ends with all of
     *         our host's labels; false otherwise, including when our own
     *         host is unknown.
     */
    public function filter(&$uri, $config, $context) {
        if (is_null($uri->host)) return true;
        if ($this->ourHostParts === false) return false;
        $host_parts = array_reverse(explode('.', $uri->host));
        foreach ($this->ourHostParts as $i => $our_part) {
            if (!isset($host_parts[$i])) return false;
            // Strict comparison: with != two numeric-looking labels such
            // as "1e3" and "1000" (or "00" and "0") compare as numbers and
            // count as equal, letting a foreign host pass as our own.
            if ($host_parts[$i] !== $our_part) return false;
        }
        return true;
    }
}
/**
 * Applies the DisableExternal host check to embedded URIs only.
 */
class HTMLPurifier_URIFilter_DisableExternalResources extends HTMLPurifier_URIFilter_DisableExternal
{
    public $name = 'DisableExternalResources';

    /**
     * @param $uri URI object to check.
     * @param $config Configuration object.
     * @param $context Context object; 'EmbeddedURI' is read from it.
     * @return True when 'EmbeddedURI' is not set to a truthy value;
     *         otherwise the result of the parent host check.
     */
    public function filter(&$uri, $config, $context) {
        $is_embedded = $context->get('EmbeddedURI', true);
        if ($is_embedded) {
            return parent::filter($uri, $config, $context);
        }
        return true;
    }
}
/**
 * URI filter that rejects every embedded URI.
 */
class HTMLPurifier_URIFilter_DisableResources extends HTMLPurifier_URIFilter
{
    public $name = 'DisableResources';

    /**
     * @param $uri URI object (not inspected).
     * @param $config Configuration object (not used here).
     * @param $context Context object; 'EmbeddedURI' is read from it.
     * @return False when 'EmbeddedURI' is set to a truthy value, true
     *         otherwise.
     */
    public function filter(&$uri, $config, $context) {
        $is_embedded = $context->get('EmbeddedURI', true);
        return !$is_embedded;
    }
}
/**
 * URI filter that rejects URIs whose host contains any of the fragments
 * listed in URI.HostBlacklist.
 */
class HTMLPurifier_URIFilter_HostBlacklist extends HTMLPurifier_URIFilter
{
    public $name = 'HostBlacklist';

    /** Host fragments loaded from URI.HostBlacklist. */
    protected $blacklist = array();

    /**
     * @param $config Configuration object; URI.HostBlacklist is read.
     * @return Always true.
     */
    public function prepare($config) {
        $this->blacklist = $config->get('URI.HostBlacklist');
        return true;
    }

    /**
     * @param $uri URI object to check (not modified).
     * @param $config Configuration object (not used here).
     * @param $context Context object (not used here).
     * @return False when the host contains a blacklisted fragment, true
     *         otherwise. URIs without a host always pass.
     */
    public function filter(&$uri, $config, $context) {
        // A URI without a host cannot match any host fragment. Returning
        // early also keeps null out of strpos().
        if (is_null($uri->host)) {
            return true;
        }
        foreach ($this->blacklist as $blacklisted_host_fragment) {
            if (strpos($uri->host, $blacklisted_host_fragment) !== false) {
                return false;
            }
        }
        return true;
    }
}
// does not support network paths

/**
 * URI filter that resolves relative URIs against the base URI taken from
 * the URI definition.
 */
class HTMLPurifier_URIFilter_MakeAbsolute extends HTMLPurifier_URIFilter
{
    public $name = 'MakeAbsolute';

    /** Base URI object from the URI definition, or null if none is set. */
    protected $base;

    /** Collapsed path segments of the base URI, minus its last segment. */
    protected $basePathStack = array();

    /**
     * Loads the base URI and pre-computes its path stack.
     *
     * @param $config Configuration object; the base is read from the URI
     *        definition.
     * @return False (after raising an E_USER_WARNING) when no base is
     *         configured, true otherwise.
     */
    public function prepare($config) {
        $this->base = $config->getDefinition('URI')->base;
        if (is_null($this->base)) {
            trigger_error('URI.MakeAbsolute is being ignored due to lack of value for URI.Base configuration', E_USER_WARNING);
            return false;
        }
        $this->base->fragment = null; // fragment is invalid for base URI
        $segments = explode('/', $this->base->path);
        array_pop($segments); // discard last segment
        $this->basePathStack = $this->_collapseStack($segments); // do pre-parsing
        return true;
    }

    /**
     * @param $uri URI object; rewritten in place (or replaced by a clone
     *        of the base) when it is relative.
     * @param $config Configuration object, used to look up the scheme.
     * @param $context Context object, used to look up the scheme.
     * @return False only when the URI names a scheme that cannot be
     *         resolved; true otherwise.
     */
    public function filter(&$uri, $config, $context) {
        if (is_null($this->base)) return true; // abort early

        $is_blank =
            $uri->path === '' && is_null($uri->scheme) &&
            is_null($uri->host) && is_null($uri->query) && is_null($uri->fragment);
        if ($is_blank) {
            // reference to current document
            $uri = clone $this->base;
            return true;
        }

        if (!is_null($uri->scheme)) {
            // absolute URI already: don't change
            if (!is_null($uri->host)) return true;
            $scheme_obj = $uri->getSchemeObj($config, $context);
            if (!$scheme_obj) {
                // scheme not recognized
                return false;
            }
            if (!$scheme_obj->hierarchical) {
                // non-hierarchal URI with explicit scheme, don't change
                return true;
            }
            // special case: had a scheme but always is hierarchical and had no authority
        }

        if (!is_null($uri->host)) {
            // network path, don't bother
            return true;
        }

        if ($uri->path === '') {
            $uri->path = $this->base->path;
        } elseif ($uri->path[0] === '/') {
            // absolute path, but still we should collapse
            $segments = explode('/', $uri->path);
            $uri->path = implode('/', $this->_collapseStack($segments));
        } else {
            // relative path, needs more complicated processing
            $segments = array_merge($this->basePathStack, explode('/', $uri->path));
            if ($segments[0] !== '' && !is_null($this->base->host)) {
                // keep the leading slash when the base has a host
                array_unshift($segments, '');
            }
            $uri->path = implode('/', $this->_collapseStack($segments));
        }

        // re-combine
        $uri->scheme = $this->base->scheme;
        if (is_null($uri->userinfo)) $uri->userinfo = $this->base->userinfo;
        if (is_null($uri->host))     $uri->host     = $this->base->host;
        if (is_null($uri->port))     $uri->port     = $this->base->port;
        return true;
    }

    /**
     * Resolve dots and double-dots in a path stack
     *
     * @param $stack List of path segments, as produced by explode('/').
     * @return List of segments with '.' and '..' resolved; a trailing ''
     *         is appended when the last segment handled was '.' or '..'.
     */
    private function _collapseStack($stack) {
        $collapsed = array();
        $is_folder = false;
        for ($i = 0; isset($stack[$i]); $i++) {
            $is_folder = false;
            $part = $stack[$i];

            // absorb an internally duplicated slash
            if ($part == '' && $i && isset($stack[$i + 1])) {
                continue;
            }

            if ($part == '.') {
                // silently absorb
                $is_folder = true;
                continue;
            }

            if ($part != '..') {
                $collapsed[] = $part;
                continue;
            }

            // double-dot: step back one segment where possible
            if (empty($collapsed)) {
                // relative path, preserve the double-dots
                $collapsed[] = '..';
            } else {
                $segment = array_pop($collapsed);
                if ($segment === '' && empty($collapsed)) {
                    // error case: attempted to back out too far:
                    // restore the leading slash
                    $collapsed[] = '';
                } elseif ($segment === '..') {
                    $collapsed[] = '..'; // cannot remove .. with ..
                }
            }
            $is_folder = true;
        }
        if ($is_folder) $collapsed[] = '';
        return $collapsed;
    }
}
/**
 * Post-filter that replaces a URI with the URI.Munge target template,
 * filled in with details of the original URI.
 */
class HTMLPurifier_URIFilter_Munge extends HTMLPurifier_URIFilter
{
    public $name = 'Munge';
    public $post = true;
    private $target, $parser, $doEmbed, $secretKey;

    /** Placeholder => value map substituted into the target template. */
    protected $replace = array();

    /**
     * @param $config Configuration object; URI.Munge (via $this->name),
     *        URI.MungeResources and URI.MungeSecretKey are read.
     * @return Always true.
     */
    public function prepare($config) {
        $this->target    = $config->get('URI.' . $this->name);
        $this->parser    = new HTMLPurifier_URIParser();
        $this->doEmbed   = $config->get('URI.MungeResources');
        $this->secretKey = $config->get('URI.MungeSecretKey');
        return true;
    }

    /**
     * @param $uri URI object; overwritten with the munged URI when
     *        munging applies.
     * @param $config Configuration object.
     * @param $context Context object; 'EmbeddedURI' is read from it.
     * @return Always true.
     */
    public function filter(&$uri, $config, $context) {
        // embedded URIs are only munged when URI.MungeResources is on
        $is_embedded = $context->get('EmbeddedURI', true);
        if ($is_embedded && !$this->doEmbed) {
            return true;
        }

        $scheme_obj = $uri->getSchemeObj($config, $context);
        if (!$scheme_obj) {
            return true; // ignore unknown schemes, maybe another postfilter did it
        }
        if (is_null($uri->host) || empty($scheme_obj->browsable)) {
            return true;
        }
        // don't redirect if target host is our host
        $our_host = $config->getDefinition('URI')->host;
        if ($uri->host === $our_host) {
            return true;
        }

        $this->makeReplace($uri, $config, $context);
        $this->replace = array_map('rawurlencode', $this->replace);

        $munged = $this->parser->parse(strtr($this->target, $this->replace));
        // don't redirect if the target host is the same as the
        // starting host
        if ($uri->host === $munged->host) {
            return true;
        }
        $uri = $munged; // overwrite
        return true;
    }

    /**
     * Fills $this->replace with the placeholder values for one URI.
     *
     * @param $uri URI object being munged.
     * @param $config Configuration object (not used here).
     * @param $context Context object; 'EmbeddedURI', 'CurrentToken',
     *        'CurrentAttr' and 'CurrentCSSProperty' are read from it.
     */
    protected function makeReplace($uri, $config, $context) {
        $string = $uri->toString();
        $token  = $context->get('CurrentToken', true);

        // always available
        $this->replace['%s'] = $string;
        $this->replace['%r'] = $context->get('EmbeddedURI', true);
        $this->replace['%n'] = $token ? $token->name : null;
        $this->replace['%m'] = $context->get('CurrentAttr', true);
        $this->replace['%p'] = $context->get('CurrentCSSProperty', true);

        // not always available
        if ($this->secretKey) {
            $this->replace['%t'] = sha1($this->secretKey . ':' . $string);
        }
    }

}
/**
 * Implements data: URI for base64 encoded images supported by GD.
 */
class HTMLPurifier_URIScheme_data extends HTMLPurifier_URIScheme {

    public $browsable = true;
    public $allowed_types = array(
        // you better write validation code for other types if you
        // decide to allow them
        'image/jpeg' => true,
        'image/gif' => true,
        'image/png' => true,
    );
    // this is actually irrelevant since we only write out the path
    // component
    public $may_omit_host = true;

    /**
     * Validates a data: URI and rewrites it into canonical
     * "<content-type>;base64,<data>" form.
     *
     * The payload is written to a temporary file so that its real image
     * type can be sniffed; the file is removed again on every path.
     *
     * @param $uri URI object; userinfo, host, port, fragment and query are
     *        cleared and the path is rewritten on success.
     * @param $config Configuration object (not used here).
     * @param $context Context object (not used here).
     * @return True when the payload is an image of an allowed type, false
     *         otherwise.
     */
    public function doValidate(&$uri, $config, $context) {
        $result = explode(',', $uri->path, 2);
        $is_base64 = false;
        $charset = null;
        $content_type = null;
        if (count($result) == 2) {
            list($metadata, $data) = $result;
            // do some legwork on the metadata
            $metas = explode(';', $metadata);
            while (!empty($metas)) {
                $cur = array_shift($metas);
                if ($cur == 'base64') {
                    $is_base64 = true;
                    break;
                }
                if (substr($cur, 0, 8) == 'charset=') {
                    // doesn't match if there are arbitrary spaces, but
                    // whatever dude
                    if ($charset !== null) continue; // garbage
                    $charset = substr($cur, 8); // not used
                } else {
                    if ($content_type !== null) continue; // garbage
                    $content_type = $cur;
                }
            }
        } else {
            $data = $result[0];
        }
        if ($content_type !== null && empty($this->allowed_types[$content_type])) {
            return false;
        }
        if ($charset !== null) {
            // error; we don't allow plaintext stuff
            $charset = null;
        }
        $data = rawurldecode($data);
        if ($is_base64) {
            $raw_data = base64_decode($data);
        } else {
            $raw_data = $data;
        }
        // XXX probably want to refactor this into a general mechanism
        // for filtering arbitrary content types
        $file = tempnam("/tmp", "");
        if ($file === false) {
            // no temporary file could be created, so the payload cannot
            // be inspected; reject rather than accept it unchecked
            return false;
        }
        file_put_contents($file, $raw_data);
        if (function_exists('exif_imagetype')) {
            $image_code = exif_imagetype($file);
            unlink($file); // don't leave the temporary file behind
        } elseif (function_exists('getimagesize')) {
            set_error_handler(array($this, 'muteErrorHandler'));
            $info = getimagesize($file);
            restore_error_handler();
            unlink($file); // remove before any early return below
            if ($info == false) return false;
            $image_code = $info[2];
        } else {
            unlink($file);
            trigger_error("could not find exif_imagetype or getimagesize functions", E_USER_ERROR);
        }
        $real_content_type = image_type_to_mime_type($image_code);
        if ($real_content_type != $content_type) {
            // we're nice guys; if the content type is something else we
            // support, change it over
            if (empty($this->allowed_types[$real_content_type])) return false;
            $content_type = $real_content_type;
        }
        // ok, it's kosher, rewrite what we need
        $uri->userinfo = null;
        $uri->host = null;
        $uri->port = null;
        $uri->fragment = null;
        $uri->query = null;
        $uri->path = "$content_type;base64," . base64_encode($raw_data);
        return true;
    }

    /**
     * No-op error handler, installed around getimagesize() to silence
     * whatever it raises for unreadable data.
     */
    public function muteErrorHandler($errno, $errstr) {}

}
/**
 * Validates file as defined by RFC 1630 and RFC 1738.
 */
class HTMLPurifier_URIScheme_file extends HTMLPurifier_URIScheme {

    // Generally file:// URLs are not accessible from most
    // machines, so placing them as an img src is incorrect.
    public $browsable = false;

    // Basically the *only* URI scheme for which this is true, since
    // accessing files on the local machine is very common.  In fact,
    // browsers on some operating systems don't understand the
    // authority, though I hear it is used on Windows to refer to
    // network shares.
    public $may_omit_host = true;

    /**
     * Strips the components that have no meaning for a file: URI.
     *
     * @param $uri URI object; userinfo, port and query are cleared.
     * @param $config Configuration object (not used here).
     * @param $context Context object (not used here).
     * @return Always true.
     */
    public function doValidate(&$uri, $config, $context) {
        // While it seems to work on Firefox, the querystring has
        // no possible effect and is thus stripped.
        $uri->query = null;
        // file:// makes no provisions for accessing the resource
        $uri->port = null;
        // Authentication method is not supported
        $uri->userinfo = null;
        return true;
    }

}
/**
 * Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738.
 */
class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {

    public $default_port = 21;
    public $browsable = true; // usually
    public $hierarchical = true;

    /**
     * Clears the query and normalizes a trailing ";type=<code>" suffix.
     *
     * Only ";type=a", ";type=i" and ";type=d" survive as a suffix; every
     * other semicolon left in the path is encoded as %3B.
     *
     * @param $uri URI object; query is cleared and path may be rewritten.
     * @param $config Configuration object (not used here).
     * @param $context Context object (not used here).
     * @return Always true.
     */
    public function doValidate(&$uri, $config, $context) {
        $uri->query = null;

        // typecode check
        $semicolon_pos = strrpos($uri->path, ';'); // reverse
        if ($semicolon_pos === false) {
            return true;
        }

        $type = substr($uri->path, $semicolon_pos + 1); // no semicolon
        $uri->path = substr($uri->path, 0, $semicolon_pos);
        $type_ret = '';

        if (strpos($type, '=') === false) {
            // not a key=value declaration at all: keep it, encoded
            $uri->path .= '%3B' . $type;
        } else {
            // figure out whether or not the declaration is correct
            list($key, $typecode) = explode('=', $type, 2);
            if ($key !== 'type') {
                // invalid key, tack it back on encoded
                $uri->path .= '%3B' . $type;
            } elseif ($typecode === 'a' || $typecode === 'i' || $typecode === 'd') {
                $type_ret = ";type=$typecode";
            }
            // a "type" key with any other code is dropped
        }

        $uri->path = str_replace(';', '%3B', $uri->path) . $type_ret;

        return true;
    }

}
/**
 * Validates http (HyperText Transfer Protocol) as defined by RFC 2616
 */
class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme {

    public $hierarchical = true;
    public $browsable = true;
    public $default_port = 80;

    /**
     * @param $uri URI object; userinfo is cleared.
     * @param $config Configuration object (not used here).
     * @param $context Context object (not used here).
     * @return Always true.
     */
    public function doValidate(&$uri, $config, $context) {
        // drop any user:password@ part
        $uri->userinfo = null;
        return true;
    }

}
/**
 * Validates https (Secure HTTP) according to http scheme.
 *
 * Only the default port differs from the http scheme it extends.
 */
class HTMLPurifier_URIScheme_https extends HTMLPurifier_URIScheme_http {

    public $default_port = 443;

}
// VERY RELAXED! Shouldn't cause problems, not even Firefox checks if the
// email is valid, but be careful!

/**
 * Validates mailto (for E-mail) according to RFC 2368
 * @todo Validate the email address
 * @todo Filter allowed query parameters
 */
class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme {

    public $browsable = false;
    public $may_omit_host = true;

    /**
     * @param $uri URI object; userinfo, host and port are cleared. The
     *        path is not checked.
     * @param $config Configuration object (not used here).
     * @param $context Context object (not used here).
     * @return Always true.
     */
    public function doValidate(&$uri, $config, $context) {
        $uri->port     = null;
        $uri->host     = null;
        $uri->userinfo = null;
        // we need to validate path against RFC 2368's addr-spec
        return true;
    }

}
/**
 * Validates news (Usenet) as defined by generic RFC 1738
 */
class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme {

    // NOTE(review): neither flag is read inside this class; presumably the
    // base class / URI filters consume them -- confirm.
    public $browsable = false;
    public $may_omit_host = true;

    /**
     * Scheme-specific clean-up for news URIs; mutates $uri in place.
     *
     * RFC 1738 news URLs are "news:<newsgroup-name>" or "news:<message-id>":
     * there is no authority and no query, so userinfo, host, port and query
     * are all removed. The path itself is not inspected yet.
     *
     * @param object $uri     URI object with writable "userinfo", "host",
     *                        "port" and "query" properties (presumably
     *                        HTMLPurifier_URI -- confirm)
     * @param mixed  $config  Not used by this implementation
     * @param mixed  $context Not used by this implementation
     * @return bool Always true
     */
    public function doValidate(&$uri, $config, $context) {
        $uri->userinfo = null;
        $uri->host     = null;
        $uri->port     = null;
        $uri->query    = null;
        // typecode check needed on path
        return true;
    }

}
/**
 * Validates nntp (Network News Transfer Protocol) as defined by generic RFC 1738
 */
class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme {

    // 119 is the well-known port for nntp.
    public $default_port = 119;

    // NOTE(review): not read inside this class; presumably consumed by the
    // base class / URI filters -- confirm.
    public $browsable = false;

    /**
     * Scheme-specific clean-up for nntp URIs; mutates $uri in place.
     *
     * The userinfo and query components are removed; host, port and path
     * are left as they are.
     *
     * @param object $uri     URI object with writable "userinfo" and "query"
     *                        properties (presumably HTMLPurifier_URI -- confirm)
     * @param mixed  $config  Not used by this implementation
     * @param mixed  $context Not used by this implementation
     * @return bool Always true
     */
    public function doValidate(&$uri, $config, $context) {
        $uri->userinfo = null;
        $uri->query    = null;
        // report success explicitly instead of falling off the end (null)
        return true;
    }

}
/**
 * Performs safe variable parsing based on types which can be used by
 * users. This may not be able to represent all possible data inputs,
 * however.
 */
class HTMLPurifier_VarParser_Flexible extends HTMLPurifier_VarParser
{

    /**
     * Leniently coerces $var towards the requested $type.
     *
     * - String-like and mixed types: returned untouched.
     * - INT: an all-digit string becomes an int; anything else is returned
     *   as is.
     * - FLOAT: a numeric string or an int becomes a float; anything else is
     *   returned as is.
     * - BOOL: the ints 0/1 and the strings on/true/1 and off/false/0 become
     *   booleans; any other string raises an exception; other values are
     *   returned as is.
     * - ALIST / HASH / LOOKUP: a string is split on commas (and on line
     *   breaks when it contains any) and each item trimmed; for HASH every
     *   "key:value" item becomes an entry and items without a colon are
     *   ignored. Arrays are then normalised as described inline below.
     *
     * NOTE(review): the constants MIXED, TEXT, ITEXT, INT, FLOAT and BOOL are
     * expected to be declared on HTMLPurifier_VarParser next to STRING,
     * ISTRING, ALIST, HASH and LOOKUP -- confirm against the base class.
     * NOTE(review): errorInconsistent() and errorGeneric() are defined on the
     * base class and presumably throw -- confirm.
     *
     * @param mixed $var        Raw value to convert
     * @param mixed $type       Type constant, e.g. self::STRING
     * @param bool  $allow_null When true, a null $var is passed through
     * @return mixed The converted value
     * @throws HTMLPurifier_VarParserException For a string that is not a
     *         recognised boolean word when $type is BOOL
     */
    protected function parseImplementation($var, $type, $allow_null) {
        if ($allow_null && $var === null) {
            return null;
        }

        switch ($type) {
            // Note: if code "breaks" from the switch, it triggers a generic
            // exception to be thrown. Specific errors can be specifically
            // done here.
            case self::MIXED :
            case self::ISTRING :
            case self::STRING :
            case self::TEXT :
            case self::ITEXT :
                return $var;

            case self::INT :
                if (is_string($var) && ctype_digit($var)) {
                    $var = (int) $var;
                }
                return $var;

            case self::FLOAT :
                if ((is_string($var) && is_numeric($var)) || is_int($var)) {
                    $var = (float) $var;
                }
                return $var;

            case self::BOOL :
                if (is_int($var) && ($var === 0 || $var === 1)) {
                    $var = (bool) $var;
                } elseif (is_string($var)) {
                    if ($var == 'on' || $var == 'true' || $var == '1') {
                        $var = true;
                    } elseif ($var == 'off' || $var == 'false' || $var == '0') {
                        $var = false;
                    } else {
                        throw new HTMLPurifier_VarParserException("Unrecognized value '$var' for $type");
                    }
                }
                return $var;

            case self::ALIST :
            case self::HASH :
            case self::LOOKUP :
                if (is_string($var)) {
                    // special case: technically, this is an array with
                    // a single empty string item, but having an empty
                    // array is more intuitive
                    if ($var == '') {
                        return array();
                    }
                    if (strpos($var, "\n") === false && strpos($var, "\r") === false) {
                        // simplistic string to array method that only works
                        // for simple lists of tag names or alphanumeric characters
                        $var = explode(',', $var);
                    } else {
                        // multi-line input: commas and line breaks both separate items
                        $var = preg_split('/(,|[\n\r]+)/', $var);
                    }
                    // remove spaces
                    foreach ($var as $i => $j) {
                        $var[$i] = trim($j);
                    }
                    if ($type === self::HASH) {
                        // key:value,key2:value2
                        $nvar = array();
                        foreach ($var as $keypair) {
                            $c = explode(':', $keypair, 2);
                            if (!isset($c[1])) {
                                // no colon in this item: nothing to map
                                continue;
                            }
                            $nvar[trim($c[0])] = trim($c[1]);
                        }
                        $var = $nvar;
                    }
                }
                if (!is_array($var)) {
                    break;
                }
                $keys = array_keys($var);
                if ($keys === array_keys($keys)) {
                    // keys are exactly 0..n-1, i.e. a plain list
                    if ($type == self::ALIST) {
                        return $var;
                    } elseif ($type == self::LOOKUP) {
                        // turn the list of names into name => true
                        $new = array();
                        foreach ($var as $key) {
                            $new[$key] = true;
                        }
                        return $new;
                    } else {
                        // a plain list is not acceptable for HASH
                        break;
                    }
                }
                if ($type === self::ALIST) {
                    trigger_error("Array list did not have consecutive integer indexes", E_USER_WARNING);
                    return array_values($var);
                }
                if ($type === self::LOOKUP) {
                    foreach ($var as $key => $value) {
                        if ($value !== true) {
                            trigger_error("Lookup array has non-true value at key '$key'; maybe your input array was not indexed numerically", E_USER_WARNING);
                        }
                        $var[$key] = true;
                    }
                }
                return $var;

            default:
                $this->errorInconsistent(__CLASS__, $type);
        }

        $this->errorGeneric($var, $type);
    }

}
16812 * This variable parser uses PHP's internal code engine. Because it does
16813 * this, it can represent all inputs; however, it is dangerous and cannot
16814 * be used by users.
16816 class HTMLPurifier_VarParser_Native extends HTMLPurifier_VarParser
/**
 * Hands $var to evalExpression() and returns whatever that yields.
 *
 * $type and $allow_null are accepted but not consulted. evalExpression()
 * runs its argument through eval(), so this must never receive untrusted
 * input.
 *
 * @param string $var        PHP expression source
 * @param mixed  $type       Ignored
 * @param bool   $allow_null Ignored
 * @return mixed Result of evalExpression()
 */
protected function parseImplementation($var, $type, $allow_null) {
    $evaluated = $this->evalExpression($var);
    return $evaluated;
}
16823 protected function evalExpression($expr) {
16825 $result = eval("\$var = $expr;");
16826 if ($result === false) {
16827 throw new HTMLPurifier_VarParserException("Fatal error in evaluated code");