2 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
11 * @author Roman Ivanov <thingol@mail.ru>
12 * @author Miguel Vazquez Gocobachi <demrit@mx.gnu.org>
13 * @copyright 2004-2009 Roman Ivanov, Miguel Vazquez Gocobachi
14 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
16 * @link http://pear.php.net/package/HTML_Safe
20 * This package requires HTMLSax3 package
22 require_once 'include/Pear/XML_HTMLSax3/HTMLSax3.php';
27 * This parser strips down all potentially dangerous content within HTML:
29 * <li>opening tag without its closing tag</li>
30 * <li>closing tag without its opening tag</li>
31 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
32 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
33 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
34 * <li>any of these attributes: on*, data*, dynsrc</li>
35 * <li>javascript:/vbscript:/about: etc. protocols</li>
36 * <li>expression/behavior etc. in styles</li>
37 * <li>any other active content</li>
39 * It also tries to convert the code to valid XHTML, but htmltidy is a far better
40 * solution for this task.
44 * $parser = new HTML_Safe;
45 * $result = $parser->parse($doc);
50 * @author Roman Ivanov <thingol@mail.ru>
51 * @author Miguel Vazquez Gocobachi <demrit@mx.gnu.org>
52 * @copyright 2004-2009 Roman Ivanov, Miguel Vazquez Gocobachi
53 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
54 * @version Release: @package_version@
55 * @link http://pear.php.net/package/HTML_Safe
60 * Storage for resulting HTML output
64 protected $xhtml = '';
67 * Array of counters for each tag
71 protected $counter = array();
74 * Stack of unclosed tags
78 protected $stack = array();
81 * Array of counters for tags that must be deleted with all content
85 protected $dcCounter = array();
88 * Stack of unclosed tags that must be deleted with all content
92 protected $dcStack = array();
95 * Stores level of list (ol/ul) nesting
99 protected $listScope = 0;
102 * Stack of unclosed list tags
106 protected $liStack = array();
109 * Array of prepared regular expressions for protocols (schemas) matching
113 protected $protoRegexps = array();
116 * Array of prepared regular expressions for CSS matching
120 protected $cssRegexps = array();
127 protected $allowTags = array();
131 * List of single tags ("<tag />")
135 public $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );
138 * List of dangerous tags (such tags will be deleted)
142 public $deleteTags = array(
143 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
144 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
145 'iframe', 'layer', 'link', 'meta', 'object', 'style',
150 * List of dangerous tags (such tags will be deleted, and all content
151 * inside these tags will also be removed)
155 public $deleteTagsContent = array('script', 'style', 'title', 'xml', );
158 * Type of protocols filtering ('white' or 'black')
162 public $protocolFiltering = 'white';
165 * List of "dangerous" protocols (used for blacklist-filtering)
169 public $blackProtocols = array(
170 'about', 'chrome', 'data', 'disk', 'hcp',
171 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
172 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
173 'res', 'resource', 'shell', 'vbscript', 'view-source',
174 'vnd.ms.radio', 'wysiwyg',
178 * List of "safe" protocols (used for whitelist-filtering)
182 public $whiteProtocols = array(
183 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
184 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
189 * List of attributes that can contain protocols
193 public $protocolAttributes = array(
194 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
198 * List of dangerous CSS keywords
200 * Whole style="" attribute will be removed, if parser will find one of
205 public $cssKeywords = array(
206 'absolute', 'behavior', 'behaviour', 'content', 'expression',
207 'fixed', 'include-source', 'moz-binding',
211 * List of tags that can have no "closing tag"
214 * @deprecated XHTML does not allow such tags
216 public $noClose = array();
219 * List of block-level tags that terminate a paragraph
221 * The paragraph will be closed when one of these tags is opened
225 public $closeParagraph = array(
226 'address', 'blockquote', 'center', 'dd', 'dir', 'div',
227 'dl', 'dt', 'h1', 'h2', 'h3', 'h4',
228 'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee',
229 'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre',
230 'table', 'ul', 'xmp',
234 * List of table tags, all table tags outside a table will be removed
238 public $tableTags = array(
239 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
248 public $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );
251 * List of dangerous attributes
255 public $attributes = array('dynsrc', 'id', 'name', );
258 * List of allowed "namespaced" attributes
262 public $attributesNS = array('xml:lang', );
265 * URL validation callback
269 protected $urlCallback;
/**
 * Constructs the parser and precompiles the matching patterns.
 *
 * For every entry of $blackProtocols a case-insensitive pattern is built
 * that tolerates whitespace and control characters (0x01-0x1F) before,
 * between and after the characters of the scheme name, followed by a
 * colon. For every entry of $cssKeywords a plain case-insensitive pattern
 * is built.
 *
 * @access public
 */
public function __construct()
{
    // Run of whitespace / control characters that may be interleaved with
    // the characters of a scheme name.
    $gap = "[\s\x01-\x1F]*";

    // making regular expressions based on Proto & CSS arrays
    foreach ($this->blackProtocols as $proto) {
        $preg   = '/' . $gap;
        $length = strlen($proto);

        for ($i = 0; $i < $length; $i++) {
            // Square-bracket offsets: the $proto{$i} form is a parse error
            // since PHP 8. Quoting keeps "." and "-" in scheme names
            // ("vnd.ms.radio", "ms-help") literal instead of wildcards.
            $preg .= preg_quote($proto[$i], '/') . $gap;
        }

        $this->protoRegexps[] = $preg . ':/i';
    }

    foreach ($this->cssKeywords as $css) {
        $this->cssRegexps[] = '/' . $css . '/i';
    }

    return true;
}
/**
 * Handles the writing of attributes - called from $this->openHandler()
 *
 * Every attribute that survives filtering is appended to $this->xhtml as
 * ` name="value"`. An attribute is skipped when:
 *  - its name starts with "on" or "data", or is listed in $attributes;
 *  - its name is not purely alphanumeric and is not listed in
 *    $attributesNS;
 *  - it is a style attribute whose value matches one of the CSS keyword
 *    or protocol patterns;
 *  - it is listed in $protocolAttributes and is rejected by the URL
 *    callback or by the black/white protocol filter.
 *
 * @param array $attrs array of attributes $name => $value
 *
 * @return boolean always true
 */
protected function writeAttrs($attrs)
{
    if (!is_array($attrs)) {
        return true;
    }

    foreach ($attrs as $name => $value) {
        $name = strtolower($name);

        // event handlers (on*) and data* attributes are never written
        if (strpos($name, 'on') === 0 || strpos($name, 'data') === 0) {
            continue;
        }

        if (in_array($name, $this->attributes)) {
            continue;
        }

        if (!preg_match('/^[a-z0-9]+$/i', $name)
            && !in_array($name, $this->attributesNS)
        ) {
            continue;
        }

        // attribute without a value: use its own name as the value
        if (($value === true) || is_null($value)) {
            $value = $name;
        }

        if ($name == 'style') {
            // removes insignificant backslashes
            $value = str_replace("\\", '', $value);

            // removes CSS comments; repeated until nothing changes so that
            // a comment formed by a previous removal is stripped as well
            do {
                $previous = $value;
                $value    = preg_replace('!/\*.*?\*/!s', '', $previous);
            } while ($value != $previous);

            // replace all & to &amp; (decoding first, so that ampersands
            // which are already encoded are not encoded twice)
            $value = str_replace('&amp;', '&', $value);
            $value = str_replace('&', '&amp;', $value);

            foreach ($this->cssRegexps as $css) {
                if (preg_match($css, $value)) {
                    continue 2;
                }
            }

            foreach ($this->protoRegexps as $proto) {
                if (preg_match($proto, $value)) {
                    continue 2;
                }
            }
        }

        // Decode numeric character references so that an encoded scheme is
        // seen by the protocol checks below. Callbacks replace the /e
        // modifier, which PHP 7 no longer supports (preg_replace() then
        // returns NULL and the checks below would be skipped).
        $tempval = preg_replace_callback(
            '/&#(\d+);?/m',
            array($this, 'decodeDecimalEntity'),
            $value
        );
        $tempval = (string) preg_replace_callback(
            '/&#x([0-9a-f]+);?/mi',
            array($this, 'decodeHexEntity'),
            $tempval
        );

        $isProtocolAttribute = in_array($name, $this->protocolAttributes);

        // CUSTOMIZATION: check URL against validator callback
        if ($isProtocolAttribute
            && is_callable($this->urlCallback)
            && !call_user_func($this->urlCallback, $name, $tempval)
        ) {
            continue;
        }

        if ($isProtocolAttribute && (strpos($tempval, ':') !== false)) {
            if ($this->protocolFiltering == 'black') {
                foreach ($this->protoRegexps as $proto) {
                    if (preg_match($proto, $tempval)) {
                        continue 2;
                    }
                }
            } else {
                $_tempval = explode(':', $tempval);
                $proto    = $_tempval[0];

                if (!in_array($proto, $this->whiteProtocols)) {
                    continue;
                }
            }
        }

        // a double quote must not terminate the attribute value early
        $value = str_replace("\"", '&quot;', $value);
        $this->xhtml .= ' ' . $name . '="' . $value . '"';
    }

    return true;
}

/**
 * preg_replace_callback() helper for writeAttrs(): converts the match of
 * a decimal character reference into the corresponding single byte.
 *
 * @param array $match match array, $match[1] holds the decimal digits
 *
 * @return string one-byte string
 */
protected function decodeDecimalEntity($match)
{
    return chr(((int) $match[1]) & 0xFF);
}

/**
 * preg_replace_callback() helper for writeAttrs(): converts the match of
 * a hexadecimal character reference into the corresponding single byte.
 *
 * @param array $match match array, $match[1] holds the hex digits
 *
 * @return string one-byte string
 */
protected function decodeHexEntity($match)
{
    return chr(((int) hexdec($match[1])) & 0xFF);
}
/**
 * Opening tag handler - called from HTMLSax
 *
 * Decides whether the tag is written to $this->xhtml. Tags listed in
 * $deleteTagsContent are pushed on $dcStack and suppress all output until
 * they are closed; tags listed in $deleteTags (and not in $allowTags) are
 * dropped; tags listed in $singleTags are written as "<tag ... />"; every
 * other accepted tag is written, pushed on $stack and counted.
 *
 * @param object &$parser HTML Parser
 * @param string $name    tag name
 * @param array  $attrs   tag attributes
 *
 * @return boolean always true
 */
public function openHandler(&$parser, $name, $attrs)
{
    $name = strtolower($name);

    if (in_array($name, $this->deleteTagsContent)) {
        array_push($this->dcStack, $name);
        $this->dcCounter[$name] = isset($this->dcCounter[$name])
            ? $this->dcCounter[$name]+1 : 1;
    }
    // inside a tag whose content is deleted nothing is written
    if (count($this->dcStack) != 0) {
        return true;
    }

    if (in_array($name, $this->deleteTags)
        && !in_array($name, $this->allowTags)
    ) {
        return true;
    }

    if (!preg_match("/^[a-z0-9]+$/i", $name)) {
        // Not a real tag name. Things that look like an e-mail address or
        // an URL in angle brackets are kept as text, so the brackets must
        // be written as entities, never as markup.
        if (preg_match("!(?:\@|://)!i", $name)) {
            $text = str_replace(
                array('<', '>'),
                array('&lt;', '&gt;'),
                $name
            );
            $this->xhtml .= '&lt;' . $text . '&gt;';
        }
        return true;
    }

    if (in_array($name, $this->singleTags)) {
        $this->xhtml .= '<' . $name;
        $this->writeAttrs($attrs);
        $this->xhtml .= ' />';
        return true;
    }

    // TABLES: cannot open table elements when we are not inside table
    // (only enforced once a counter for "table" exists)
    if ((isset($this->counter['table']))
        && ($this->counter['table'] <= 0)
        && (in_array($name, $this->tableTags))
    ) {
        return true;
    }

    // PARAGRAPHS: close paragraph when closeParagraph tags opening
    if ((in_array($name, $this->closeParagraph))
        && (in_array('p', $this->stack))
    ) {
        $this->closeHandler($parser, 'p');
    }

    // LISTS: we should close <li> if <li> of the same level opening
    if (($name == 'li') && count($this->liStack)
        && ($this->listScope == $this->liStack[count($this->liStack) - 1])
    ) {
        $this->closeHandler($parser, 'li');
    }

    // LISTS: we want to know on what nesting level of lists we are
    if (in_array($name, $this->listTags)) {
        ++$this->listScope;
    }

    if ($name == 'li') {
        array_push($this->liStack, $this->listScope);
    }

    $this->xhtml .= '<' . $name;
    $this->writeAttrs($attrs);
    $this->xhtml .= '>';
    array_push($this->stack, $name);
    $this->counter[$name] = isset($this->counter[$name])
        ? ($this->counter[$name] + 1) : 1;

    return true;
}
/**
 * Closing tag handler - called from HTMLSax
 *
 * Unwinds $dcStack when a tag from $deleteTagsContent is closed, and
 * otherwise closes the tag together with every tag opened after it that
 * is still on $stack.
 *
 * @param object &$parser HTML parser
 * @param string $name    tag name
 *
 * @return boolean always true
 */
public function closeHandler(&$parser, $name)
{
    $name = strtolower($name);

    $closesDeletedContent = isset($this->dcCounter[$name])
        && ($this->dcCounter[$name] > 0)
        && in_array($name, $this->deleteTagsContent);

    if ($closesDeletedContent) {
        // pop everything opened after $name, then $name itself
        for ($tag = array_pop($this->dcStack);
            $name != $tag;
            $tag = array_pop($this->dcStack)
        ) {
            --$this->dcCounter[$tag];
        }

        --$this->dcCounter[$name];
    }

    // still inside a tag whose content is deleted: write nothing
    if (count($this->dcStack) != 0) {
        return true;
    }

    $isOpen = isset($this->counter[$name]) && ($this->counter[$name] > 0);

    if ($isOpen) {
        // close the tags that were opened after $name and left unclosed
        for ($tag = array_pop($this->stack);
            $name != $tag;
            $tag = array_pop($this->stack)
        ) {
            $this->closeTag($tag);
        }

        $this->closeTag($name);
    }

    return true;
}
/**
 * Closes a tag: writes the closing markup and updates the bookkeeping.
 *
 * @param string $tag tag name
 *
 * @return boolean always true
 */
protected function closeTag($tag)
{
    $writesClosingTag = !in_array($tag, $this->noClose);

    if ($writesClosingTag) {
        $this->xhtml .= '</' . $tag . '>';
    }

    --$this->counter[$tag];

    // leaving a list container lowers the list nesting level
    if (in_array($tag, $this->listTags)) {
        --$this->listScope;
    }

    // leaving a list item discards its recorded nesting level
    if ($tag == 'li') {
        array_pop($this->liStack);
    }

    return true;
}
/**
 * Character data handler - called from HTMLSax
 *
 * Text is appended to the output unless a tag whose content is deleted
 * is currently open.
 *
 * @param object &$parser HTML parser
 * @param string $data    textual data
 *
 * @return boolean always true
 */
public function dataHandler(&$parser, $data)
{
    if (count($this->dcStack) != 0) {
        return true;
    }

    $this->xhtml .= $data;

    return true;
}
/**
 * Escape handler - called from HTMLSax
 *
 * The escaped data is discarded; nothing is written to the output.
 *
 * @param object &$parser HTML parser
 * @param string $data    comments or other type of data
 *
 * @return boolean always true
 */
public function escapeHandler(&$parser, $data)
{
    return true;
}
/**
 * Sets the tags that are allowed even though they are listed in
 * $deleteTags. Anything that is not an array is ignored.
 *
 * Example:
 * <pre>
 * $safe = new HTML_Safe;
 * $safe->setAllowTags(array('body'));
 * </pre>
 *
 * @param array $tags Tags to allow
 *
 * @return void
 */
public function setAllowTags($tags = array())
{
    if (!is_array($tags)) {
        return;
    }

    $this->allowTags = $tags;
}
/**
 * Returns the list of tags currently allowed through setAllowTags().
 *
 * @return array
 */
public function getAllowTags()
{
    $allowed = $this->allowTags;

    return $allowed;
}
/**
 * Empties the list of allowed tags.
 *
 * @return void
 */
public function resetAllowTags()
{
    $none = array();

    $this->allowTags = $none;
}
/**
 * Set URL validation callback
 * CUSTOMIZATION: check URL against validator callback
 *
 * An empty value removes the callback; a value that is not callable is
 * ignored and the current callback is kept.
 *
 * @param callback $callback
 *
 * @return void
 */
public function setUrlCallback($callback)
{
    if (empty($callback)) {
        $this->urlCallback = null;
        return;
    }

    if (is_callable($callback)) {
        $this->urlCallback = $callback;
    }
}
/**
 * Returns the XHTML document
 *
 * Tags that are still open are closed first, most recently opened first.
 *
 * @return string Processed (X)HTML document
 */
public function getXHTML()
{
    for ($tag = array_pop($this->stack);
        $tag;
        $tag = array_pop($this->stack)
    ) {
        $this->closeTag($tag);
    }

    return $this->xhtml;
}
/**
 * Clears current document data
 *
 * Resets the output buffer and all per-document parser state, so that a
 * following parse() call starts from the same state as a fresh instance.
 * In particular $dcStack must be emptied: an unclosed tag from
 * $deleteTagsContent left on it would make the handlers suppress the
 * whole output of every later document.
 *
 * @return boolean always true
 */
public function clear()
{
    $this->xhtml     = '';
    $this->counter   = array();
    $this->stack     = array();
    $this->dcCounter = array();
    $this->dcStack   = array();
    $this->listScope = 0;
    $this->liStack   = array();

    return true;
}
/**
 * Main parsing function
 *
 * @param string  $doc       HTML document for processing
 * @param boolean $checkUTF7 when true, the document is passed through
 *                           repackUTF7() before parsing
 *
 * @return string Processed (X)HTML document
 */
public function parse($doc, $checkUTF7 = true)
{
    $result = '';

    // Save all '<' symbols: a "<" that cannot start a tag, a closing tag,
    // a comment/declaration or a processing instruction is plain text and
    // has to be written as an entity.
    $doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '&lt;', $doc);

    if ($checkUTF7) {
        $doc = $this->repackUTF7($doc);
    }

    // Instantiate the parser
    $parser = new XML_HTMLSax3;

    // Route the parser events to the handlers of this object
    $parser->set_object($this);

    $parser->set_element_handler('openHandler', 'closeHandler');
    $parser->set_data_handler('dataHandler');
    $parser->set_escape_handler('escapeHandler');

    $parser->parse($doc);

    $result = $this->getXHTML();

    $this->clear();

    return $result;
}
/**
 * UTF-7 decoding function
 *
 * Every "+...-" run of base64 characters is handed to
 * repackUTF7Callback() and replaced by its result.
 *
 * @param string $str HTML document for recode ASCII part of UTF-7 back to ASCII
 * @return string Decoded document
 */
function repackUTF7($str)
{
    $decoder = array($this, 'repackUTF7Callback');

    return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', $decoder, $str);
}
/**
 * Additional UTF-7 decoding function
 *
 * Base64-decodes one matched run, passes the first stretch of byte pairs
 * whose first byte is not NUL to repackUTF7Back(), and finally drops the
 * NUL byte of every remaining NUL-prefixed pair.
 *
 * @param array $str match array from repackUTF7(), $str[1] holds the
 *                   base64 characters
 * @return string Recoded string
 */
function repackUTF7Callback($str)
{
    $decoded  = base64_decode($str[1]);
    $repacker = array($this, 'repackUTF7Back');
    $decoded  = preg_replace_callback(
        '/^((?:\x00.)*)((?:[^\x00].)+)/',
        $repacker,
        $decoded
    );

    return preg_replace('/\x00(.)/', '$1', $decoded);
}
/**
 * Additional UTF-7 encoding function
 *
 * Keeps the first captured group as it is and wraps the second one back
 * into a "+...-" sequence of base64 characters without "=" padding.
 *
 * @param array $str match array from repackUTF7Callback()
 * @return string Recoded string
 */
function repackUTF7Back($str)
{
    $encoded = rtrim(base64_encode($str[2]), '=');

    return $str[1] . '+' . $encoded . '-';
}
738 * c-hanging-comment-ender-p: nil