2 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
11 * @author Roman Ivanov <thingol@mail.ru>
12 * @copyright 2004-2005 Roman Ivanov
13 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
15 * @link http://pear.php.net/package/HTML_Safe
20 * This package requires HTMLSax3 package
22 require_once 'include/Pear/XML_HTMLSax3/HTMLSax3.php';
29 * This parser strips down all potentially dangerous content within HTML:
31 * <li>opening tag without its closing tag</li>
32 * <li>closing tag without its opening tag</li>
33 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
34 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
35 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
36 * <li>any of these attributes: on*, data*, dynsrc</li>
37 * <li>javascript:/vbscript:/about: etc. protocols</li>
38 * <li>expression/behavior etc. in styles</li>
39 * <li>any other active content</li>
41 * It also tries to convert the code to valid XHTML, but htmltidy is a far better
42 * solution for this task.
46 * $parser = new HTML_Safe();
47 * $result = $parser->parse($doc);
52 * @author Roman Ivanov <thingol@mail.ru>
53 * @copyright 1997-2005 Roman Ivanov
54 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
55 * @version Release: @package_version@
56 * @link http://pear.php.net/package/HTML_Safe
61 * Storage for resulting HTML output
69 * Array of counters for each tag
74 var $_counter = array();
77 * Stack of unclosed tags
82 var $_stack = array();
85 * Array of counters for tags that must be deleted with all content
90 var $_dcCounter = array();
93 * Stack of unclosed tags that must be deleted with all content
98 var $_dcStack = array();
101 * Stores level of list (ol/ul) nesting
109 * Stack of unclosed list tags
114 var $_liStack = array();
117 * Array of prepared regular expressions for protocols (schemas) matching
122 var $_protoRegexps = array();
125 * Array of prepared regular expressions for CSS matching
130 var $_cssRegexps = array();
133 * List of single tags ("<tag />")
138 var $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );
141 * List of dangerous tags (such tags will be deleted)
146 var $deleteTags = array(
147 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
148 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
149 'iframe', 'layer', 'link', 'meta', 'object', 'style',
154 * List of dangerous tags (such tags will be deleted, and all content
155 * inside these tags will also be removed)
160 var $deleteTagsContent = array('script', 'style', 'title', 'xml', );
163 * Type of protocols filtering ('white' or 'black')
168 var $protocolFiltering = 'white';
171 * List of "dangerous" protocols (used for blacklist-filtering)
176 var $blackProtocols = array(
177 'about', 'chrome', 'data', 'disk', 'hcp',
178 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
179 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
180 'res', 'resource', 'shell', 'vbscript', 'view-source',
181 'vnd.ms.radio', 'wysiwyg',
185 * List of "safe" protocols (used for whitelist-filtering)
190 var $whiteProtocols = array(
191 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
192 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
197 * List of attributes that can contain protocols
202 var $protocolAttributes = array(
203 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
207 * List of dangerous CSS keywords
209 * Whole style="" attribute will be removed, if parser will find one of
215 var $cssKeywords = array(
216 'absolute', 'behavior', 'behaviour', 'content', 'expression',
217 'fixed', 'include-source', 'moz-binding',
221 * List of tags that can have no "closing tag"
225 * @deprecated XHTML does not allow such tags
227 var $noClose = array();
230 * List of block-level tags that terminate a paragraph
232 * A paragraph will be closed when one of these tags is opened
237 var $closeParagraph = array(
238 'address', 'blockquote', 'center', 'dd', 'dir', 'div',
239 'dl', 'dt', 'h1', 'h2', 'h3', 'h4',
240 'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee',
241 'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre',
242 'table', 'ul', 'xmp',
246 * List of table tags, all table tags outside a table will be removed
251 var $tableTags = array(
252 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
262 var $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );
265 * List of dangerous attributes
270 var $attributes = array('dynsrc', 'id', 'name', );
273 * List of allowed "namespaced" attributes
278 var $attributesNS = array('xml:lang', );
// Build one case-insensitive regular expression per blacklisted protocol.
// Whitespace and control characters (\x01-\x1F) are tolerated before and
// between the letters, so obfuscated spellings such as "java\tscript:"
// are still recognised.
$separator = "[\s\x01-\x1F]*";
foreach ($this->blackProtocols as $proto) {
    $preg = '/' . $separator;
    $length = strlen($proto);
    for ($i = 0; $i < $length; $i++) {
        // Square-bracket offsets: the "$proto{$i}" form is a parse error on
        // PHP 8. preg_quote() keeps characters such as "." and "-" (present
        // in "vnd.ms.radio" and "ms-help") literal and delimiter-safe.
        $preg .= preg_quote($proto[$i], '/') . $separator;
    }
    // NOTE(review): the line that terminated the pattern is missing from this
    // listing. The protocol colon, the closing delimiter and the "i" flag are
    // restored here -- confirm against the original file.
    $preg .= ':/i';
    $this->_protoRegexps[] = $preg;
}

// CSS keywords are matched anywhere inside a style value, ignoring case.
foreach ($this->cssKeywords as $css) {
    $this->_cssRegexps[] = '/' . $css . '/i';
}
304 * Handles the writing of attributes - called from $this->_openHandler()
306 * @param array $attrs array of attributes $name => $value
/**
 * Handles the writing of attributes - called from $this->_openHandler()
 *
 * Appends every attribute that survives filtering to $this->_xhtml as
 * ` name="value"`. An attribute is dropped when:
 *  - its name starts with "on" or "data";
 *  - its name is listed in $this->attributes;
 *  - its name is not purely alphanumeric and not listed in $this->attributesNS;
 *  - it is a style attribute matching a CSS keyword or blacklisted protocol;
 *  - it is a protocol attribute (see $this->protocolAttributes) whose value
 *    contains ":" and fails the configured black/white protocol filter.
 *
 * @param array $attrs array of attributes $name => $value
 * @return boolean always true
 */
function _writeAttrs ($attrs)
{
    if (!is_array($attrs)) {
        return true;
    }

    foreach ($attrs as $name => $value) {
        $name = strtolower($name);

        // Event handlers (on*) and data* attributes are never written.
        if (strpos($name, 'on') === 0 || strpos($name, 'data') === 0) {
            continue;
        }
        if (in_array($name, $this->attributes)) {
            continue;
        }
        // Only plain alphanumeric names pass, plus the explicitly allowed
        // namespaced ones.
        if (!preg_match("/^[a-z0-9]+$/i", $name) && !in_array($name, $this->attributesNS)) {
            continue;
        }

        // NOTE(review): the statement inside this branch is missing from the
        // listing; expanding a value-less attribute to name="name" is the
        // presumed original behaviour -- confirm.
        if (($value === true) || is_null($value)) {
            $value = $name;
        }

        if ($name == 'style') {
            // Remove insignificant backslashes.
            $value = str_replace("\\", '', $value);

            // Remove CSS comments; repeat until nothing changes, because
            // stripping one comment can reveal another.
            while (true) {
                $stripped = preg_replace("!/\*.*?\*/!s", '', $value);
                if ($stripped == $value) {
                    break;
                }
                $value = $stripped;
            }

            // Normalise ampersands to &amp;. Existing entities are decoded
            // first so they are not double-encoded. (Both replacements appear
            // as no-ops in the listing because the entities were decoded.)
            $value = str_replace('&amp;', '&', $value);
            $value = str_replace('&', '&amp;', $value);

            // Any dangerous CSS keyword or blacklisted protocol discards the
            // whole style attribute.
            foreach ($this->_cssRegexps as $css) {
                if (preg_match($css, $value)) {
                    continue 2;
                }
            }
            foreach ($this->_protoRegexps as $proto) {
                if (preg_match($proto, $value)) {
                    continue 2;
                }
            }
        }

        // Decode numeric character references before the protocol check so
        // that "&#106;avascript:" cannot slip through. Callbacks replace the
        // "e" pattern modifier, which PHP 7 and later no longer support.
        $tempval = preg_replace_callback(
            '/&#(\d+);?/m',
            function ($m) {
                return chr((int) $m[1]);
            },
            $value
        );
        $tempval = preg_replace_callback(
            '/&#x([0-9a-f]+);?/mi',
            function ($m) {
                return chr((int) hexdec($m[1]));
            },
            $tempval
        );

        if (in_array($name, $this->protocolAttributes) && (strpos($tempval, ':') !== false)) {
            if ($this->protocolFiltering == 'black') {
                foreach ($this->_protoRegexps as $proto) {
                    if (preg_match($proto, $tempval)) {
                        continue 2;
                    }
                }
            } else {
                // Whitelist mode: the text before the first ":" must be an
                // allowed protocol.
                $_tempval = explode(':', $tempval);
                $proto = $_tempval[0];
                if (!in_array($proto, $this->whiteProtocols)) {
                    continue;
                }
            }
        }

        // The value is written inside double quotes, so escape them.
        $value = str_replace("\"", "&quot;", $value);
        $this->_xhtml .= ' ' . $name . '="' . $value . '"';
    }

    return true;
}
392 * Opening tag handler - called from HTMLSax
394 * @param object $parser HTML Parser
395 * @param string $name tag name
396 * @param array $attrs tag attributes
/**
 * Opening tag handler - called from HTMLSax
 *
 * Decides whether an opening tag is discarded, written as escaped text, or
 * written to $this->_xhtml (with filtered attributes) and pushed onto the
 * stack of unclosed tags.
 *
 * @param object $parser HTML Parser
 * @param string $name   tag name
 * @param array  $attrs  tag attributes
 * @return boolean always true
 */
function _openHandler(&$parser, $name, $attrs)
{
    $name = strtolower($name);

    // Tags that are removed together with their content: record that we are
    // now inside one of them.
    if (in_array($name, $this->deleteTagsContent)) {
        array_push($this->_dcStack, $name);
        $this->_dcCounter[$name] = isset($this->_dcCounter[$name]) ? $this->_dcCounter[$name] + 1 : 1;
    }
    // Everything nested inside such a tag is dropped.
    if (count($this->_dcStack) != 0) {
        return true;
    }

    // Dangerous tags are dropped; their content is kept.
    if (in_array($name, $this->deleteTags)) {
        return true;
    }

    if (!preg_match("/^[a-z0-9]+$/i", $name)) {
        // Not a valid tag name. Pseudo-tags that look like an e-mail address
        // or URL (<user@example.com>, <http://...>) are kept, but only as
        // escaped text. Emitting raw angle brackets here would put an
        // unvalidated tag into the output.
        if (preg_match("!(?:\@|://)!i", $name)) {
            $this->_xhtml .= '&lt;' . $name . '&gt;';
        }
        return true;
    }

    // Single tags are written self-closed and never pushed on the stack.
    if (in_array($name, $this->singleTags)) {
        $this->_xhtml .= '<' . $name;
        $this->_writeAttrs($attrs);
        $this->_xhtml .= ' />';
        return true;
    }

    // TABLES: cannot open table elements when we are not inside table
    // NOTE(review): because of isset(), table elements are only rejected
    // after some table has been opened and closed again; before the first
    // <table> they pass through. Confirm whether that is intended.
    if ((isset($this->_counter['table'])) && ($this->_counter['table'] <= 0)
        && (in_array($name, $this->tableTags)))
    {
        return true;
    }

    // PARAGRAPHS: close paragraph when closeParagraph tags opening
    if ((in_array($name, $this->closeParagraph)) && (in_array('p', $this->_stack))) {
        $this->_closeHandler($parser, 'p');
    }

    // LISTS: we should close <li> if <li> of the same level opening
    if ($name == 'li' && count($this->_liStack) &&
        $this->_listScope == $this->_liStack[count($this->_liStack) - 1])
    {
        $this->_closeHandler($parser, 'li');
    }

    // LISTS: we want to know on what nesting level of lists we are
    // NOTE(review): the increment and the 'li' condition below are missing
    // from the listing and are reconstructed -- confirm.
    if (in_array($name, $this->listTags)) {
        $this->_listScope++;
    }
    if ($name == 'li') {
        array_push($this->_liStack, $this->_listScope);
    }

    $this->_xhtml .= '<' . $name;
    $this->_writeAttrs($attrs);
    $this->_xhtml .= '>';
    array_push($this->_stack, $name);
    $this->_counter[$name] = isset($this->_counter[$name]) ? $this->_counter[$name] + 1 : 1;
    return true;
}
466 * Closing tag handler - called from HTMLSax
468 * @param object $parser HTML parser
469 * @param string $name tag name
/**
 * Closing tag handler - called from HTMLSax
 *
 * Leaves a "delete with content" region when its closing tag arrives, and
 * otherwise closes the named tag along with every tag still open above it
 * on the stack.
 *
 * @param object $parser HTML parser
 * @param string $name   tag name
 * @return boolean always true
 */
function _closeHandler(&$parser, $name)
{
    $name = strtolower($name);

    $closesDeletedRegion = isset($this->_dcCounter[$name])
        && ($this->_dcCounter[$name] > 0)
        && in_array($name, $this->deleteTagsContent);

    if ($closesDeletedRegion) {
        // Unwind the deleted-content stack down to (and including) $name.
        for ($tag = array_pop($this->_dcStack); $name != $tag; $tag = array_pop($this->_dcStack)) {
            $this->_dcCounter[$tag]--;
        }
        $this->_dcCounter[$name]--;
    }

    // Still inside deleted content: nothing is written.
    if (count($this->_dcStack) != 0) {
        return true;
    }

    $isOpen = isset($this->_counter[$name]) && ($this->_counter[$name] > 0);
    if ($isOpen) {
        // Close every tag opened after $name first, then $name itself.
        for ($tag = array_pop($this->_stack); $name != $tag; $tag = array_pop($this->_stack)) {
            $this->_closeTag($tag);
        }
        $this->_closeTag($name);
    }

    return true;
}
505 * @param string $tag tag name
/**
 * Writes the closing tag for $tag and updates the bookkeeping.
 *
 * @param string $tag tag name
 * @return boolean always true
 */
function _closeTag($tag)
{
    // Tags listed in $this->noClose get no closing tag in the output.
    $writeClosingTag = !in_array($tag, $this->noClose);
    if ($writeClosingTag) {
        $this->_xhtml .= '</' . $tag . '>';
    }

    $this->_counter[$tag]--;

    // NOTE(review): the list-scope decrement and the 'li' condition are
    // missing from the listing and are reconstructed -- confirm.
    if (in_array($tag, $this->listTags)) {
        $this->_listScope--;
    }
    if ($tag == 'li') {
        array_pop($this->_liStack);
    }

    return true;
}
528 * Character data handler - called from HTMLSax
530 * @param object $parser HTML parser
531 * @param string $data textual data
/**
 * Character data handler - called from HTMLSax
 *
 * Text is appended to the output unless a "delete with content" tag is
 * currently open.
 *
 * @param object $parser HTML parser
 * @param string $data   textual data
 * @return boolean always true
 */
function _dataHandler(&$parser, $data)
{
    $insideDeletedContent = (count($this->_dcStack) != 0);
    if (!$insideDeletedContent) {
        $this->_xhtml .= $data;
    }
    return true;
}
544 * Escape handler - called from HTMLSax
546 * @param object $parser HTML parser
547 * @param string $data comments or other type of data
551 function _escapeHandler(&$parser, $data)
557 * Returns the XHTML document
559 * @return string Processed (X)HTML document
// Close whatever tags are still open, innermost first, then hand back the
// accumulated output.
$tag = array_pop($this->_stack);
while ($tag) {
    $this->_closeTag($tag);
    $tag = array_pop($this->_stack);
}

return $this->_xhtml;
572 * Clears current document data
580 $this->_dcStack = array();
581 $this->_dcCounter = array();
586 * Main parsing function
588 * @param string $doc HTML document for processing
589 * @return string Processed (X)HTML document
// Save all '<' symbols: a '<' that is not followed by a letter, '/', '!',
// '?' or '%' cannot start markup, so it is escaped and survives as text.
// (The replacement must be the entity; replacing '<' with '<' does nothing.)
$doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '&lt;', $doc);

// Web documents should not contain the \x00 symbol
$doc = str_replace("\x00", '', $doc);

// Opera6 bug workaround: the byte pair C0 BC is replaced by an escaped '<'
// so it can never act as a tag opener.
$doc = str_replace("\xC0\xBC", '&lt;', $doc);

// UTF-7 encoding ASCII decode (currently disabled)
//$doc = $this->repackUTF7($doc);

// Instantiate the parser
$parser = new XML_HTMLSax3();

// Route the parser callbacks to this object
$parser->set_object($this);

$parser->set_element_handler('_openHandler', '_closeHandler');
$parser->set_data_handler('_dataHandler');
$parser->set_escape_handler('_escapeHandler');

$parser->parse($doc);

return $this->getXHTML();
626 * UTF-7 decoding function
628 * @param string $str HTML document for recode ASCII part of UTF-7 back to ASCII
629 * @return string Decoded document
/**
 * Rewrites every "+...-" block of $str through repackUTF7Callback().
 *
 * @param string $str HTML document to process
 * @return string Document with each matched block replaced by the callback result
 */
function repackUTF7($str)
{
    // "+", one or more characters from [0-9a-zA-Z/], then "-".
    $shiftedBlock = '!\+([0-9a-zA-Z/]+)\-!';
    $decoder = array($this, 'repackUTF7Callback');

    return preg_replace_callback($shiftedBlock, $decoder, $str);
}
638 * Additional UTF-7 decoding function
640 * @param string $str String for recode ASCII part of UTF-7 back to ASCII
641 * @return string Recoded string
/**
 * Callback for repackUTF7(): decodes one matched block.
 *
 * The captured text is base64-decoded. A leading run of "\x00 + byte" pairs
 * followed by pairs whose first byte is not \x00 is handed to
 * repackUTF7Back(), and finally every remaining "\x00 + byte" pair is
 * reduced to the byte alone.
 *
 * @param array $str Match array from preg_replace_callback(); index 1 holds the block body
 * @return string Recoded string
 */
function repackUTF7Callback($str)
{
    $decoded = base64_decode($str[1]);

    $reEncoder = array($this, 'repackUTF7Back');
    $decoded = preg_replace_callback('/^((?:\x00.)*)((?:[^\x00].)+)/', $reEncoder, $decoded);

    return preg_replace('/\x00(.)/', '$1', $decoded);
}
652 * Additional UTF-7 encoding function
654 * @param string $str String for recode ASCII part of UTF-7 back to ASCII
655 * @return string Recoded string
/**
 * Callback for repackUTF7Callback(): re-encodes the second captured group.
 *
 * Group 1 is returned unchanged; group 2 is base64-encoded without "="
 * padding and wrapped as "+...-".
 *
 * @param array $str Match array; index 1 is kept as is, index 2 is re-encoded
 * @return string Recoded string
 */
function repackUTF7Back($str)
{
    $untouched = $str[1];
    $reEncoded = rtrim(base64_encode($str[2]), '=');

    return $untouched . '+' . $reEncoded . '-';
}
668 * c-hanging-comment-ender-p: nil