5 * This set of functions allows you to filter html in order to remove
6 * any malicious tags from it. Useful in cases when you need to filter
7 * user input for any cross-site-scripting attempts.
9 * Copyright (C) 2002-2004 by Duke University
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this library; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 * @Author Konstantin Riabitsev <icon@linux.duke.edu>
27 * @Version 1.1 ($Date: 2011-07-04 14:02:23 -0400 (Mon, 04 Jul 2011) $)
31 * @Author Jim Jagielski <jim@jaguNET.com / jimjag@gmail.com>
/**
 * Builds the textual form of a tag out of the tag name, an array of
 * attributes, and the type of the tag. Called by tln_sanitize internally.
 *
 * @param  $tagname the name of the tag.
 * @param  $attary  the array of attributes (name => value). Values are
 *                  emitted verbatim, so they must already carry their
 *                  quotes. Anything other than a non-empty array is
 *                  ignored.
 * @param  $tagtype The type of the tag: 1 for an opening tag, 2 for a
 *                  closing tag, 3 for an XHTML-style content-less tag.
 * @return a string with the final tag representation.
 */
function tln_tagprint($tagname, $attary, $tagtype){
    // A closing tag never carries attributes.
    if ($tagtype == 2){
        return '</' . $tagname . '>';
    }
    $fulltag = '<' . $tagname;
    if (is_array($attary) && count($attary) > 0){
        $atts = array();
        // foreach rather than each(): each() no longer exists in PHP 8.
        foreach ($attary as $attname => $attvalue){
            $atts[] = "$attname=$attvalue";
        }
        $fulltag .= ' ' . implode(' ', $atts);
    }
    if ($tagtype == 3){
        // Content-less tags are closed XHTML-style.
        $fulltag .= ' /';
    }
    return $fulltag . '>';
}
/**
 * A small helper function to use with array_walk. Lowercases the value
 * it receives, in place.
 *
 * @param  $val a value passed by-ref.
 * @return void since it modifies a by-ref value.
 */
function tln_casenormalize(&$val){
    $val = strtolower($val);
}
/**
 * Skips any whitespace from the given position within a string and
 * reports where the next non-whitespace character is.
 *
 * @param  $body   the string
 * @param  $offset the offset within the string where we should start
 *                 looking for the next non-whitespace character.
 * @return the location within $body where the next non-whitespace
 *         char is located. This is $offset itself when there is no
 *         whitespace at $offset, or when $offset is at or past the end
 *         of $body.
 */
function tln_skipspace($body, $offset){
    if ($offset >= strlen($body)){
        // Nothing left to scan.
        return $offset;
    }
    // \s* also matches the empty string, so $matches[1] is always set.
    preg_match('/^(\s*)/s', substr($body, $offset), $matches);
    // Measure with strlen(): calling sizeof()/count() on a string is a
    // TypeError as of PHP 8.
    return $offset + strlen($matches[1]);
}
/**
 * Looks for the next occurrence of a string within another string.
 * It's really just a glorified "strpos", except it never reports a
 * failure: "not found" is expressed as the length of $body.
 *
 * @param  $body   The string to look for needle in.
 * @param  $offset Start looking from this position.
 * @param  $needle The character/string to look for.
 * @return location of the next occurrence of the needle, or
 *         strlen($body) if needle wasn't found or $offset lies
 *         outside of $body.
 */
function tln_findnxstr($body, $offset, $needle){
    $length = strlen($body);
    // strpos() rejects an offset outside the string (a ValueError as of
    // PHP 8); treat that the same as "not found".
    if ($offset > $length || $offset < -$length){
        return $length;
    }
    $pos = strpos($body, $needle, $offset);
    return ($pos === false) ? $length : $pos;
}
/**
 * Takes a PCRE-style regexp and tries to match it within the string,
 * starting at the given offset.
 *
 * @param  $body   The string to look for needle in.
 * @param  $offset Start looking from here.
 * @param  $reg    A PCRE-style regex to match. It is embedded between
 *                 '%' delimiters, so it must not contain an unescaped
 *                 '%'.
 * @return Returns a false if no matches found, or an array
 *         with the following members:
 *         - integer with the location of the match within $body
 *         - string with whatever content between offset and the match
 *         - string with whatever it is we matched
 */
function tln_findnxreg($body, $offset, $reg){
    $matches = array();
    // Group 1 lazily swallows everything in front of the first match of
    // $reg; group 2 is the match itself.
    $preg_rule = '%^(.*?)(' . $reg . ')%s';
    if (!preg_match($preg_rule, substr($body, $offset), $matches)){
        return false;
    }
    return array(
        $offset + strlen($matches[1]),
        $matches[1],
        $matches[2]
    );
}
/**
 * Looks for the next tag.
 *
 * @param  $body   String where to look for the next tag.
 * @param  $offset Start looking from here.
 * @return false if no more tags exist in the body, or
 *         an array with the following members:
 *         - string with the name of the tag (lowercased)
 *         - array with attributes and their values (names lowercased,
 *           values wrapped in quotes), or false when the tag ends
 *           right after its name
 *         - integer with tag type (1, 2, or 3)
 *         - integer where the tag starts (starting "<")
 *         - integer where the tag ends (ending ">")
 *         first three members will be false, if the tag is invalid.
 */
function tln_getnxtag($body, $offset){
    $bodylen = strlen($body);
    if ($offset > $bodylen){
        return false;
    }
    $lt = tln_findnxstr($body, $offset, '<');
    if ($lt == $bodylen){
        return false;
    }
    /*
     * We are here:
     * blah blah <tag attribute="value">
     * \---------^
     */
    $pos = tln_skipspace($body, $lt + 1);
    if ($pos >= $bodylen){
        return array(false, false, false, $lt, $bodylen);
    }
    /*
     * There are 3 kinds of tags:
     * 1. Opening tag, e.g.:
     *    <a href="blah">
     * 2. Closing tag, e.g.:
     *    </a>
     * 3. XHTML-style content-less tag, e.g.:
     *    <img src="blah"/>
     */
    $tagtype = false;
    switch (substr($body, $pos, 1)){
    case '/':
        $tagtype = 2;
        $pos++;
        break;
    case '!':
        /*
         * A comment or an SGML declaration. Either way it is reported
         * as an invalid tag spanning up to its terminator.
         */
        if (substr($body, $pos+1, 2) == '--'){
            $gt = strpos($body, '-->', $pos);
            if ($gt === false){
                $gt = $bodylen;
            } else {
                // Point at the ">" of "-->".
                $gt += 2;
            }
            return array(false, false, false, $lt, $gt);
        }
        $gt = tln_findnxstr($body, $pos, '>');
        return array(false, false, false, $lt, $gt);
    default:
        /*
         * Assume tagtype 1 for now. If it's type 3, we'll switch values
         * later.
         */
        $tagtype = 1;
        break;
    }

    /*
     * Look for next [\W-_], which will indicate the end of the tag name.
     */
    $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
    if ($regary == false){
        return array(false, false, false, $lt, $bodylen);
    }
    list($pos, $tagname, $match) = $regary;
    $tagname = strtolower($tagname);

    /*
     * $match can be either of these:
     * '>'  indicating the end of the tag entirely.
     * '\s' indicating the end of the tag name.
     * '/'  indicating that this is type-3 xhtml tag.
     *
     * Whatever else we find there indicates an invalid tag.
     */
    switch ($match){
    case '/':
        /*
         * This is an xhtml-style tag with a closing / at the
         * end, like so: <img src="blah"/>. Check if it's followed
         * by the closing bracket. If not, then this tag is invalid
         */
        if (substr($body, $pos, 2) != '/>'){
            $gt = tln_findnxstr($body, $pos, '>');
            return array(false, false, false, $lt, $gt);
        }
        $pos++;
        $tagtype = 3;
        // Intentional fall-through: the tag ends here.
    case '>':
        return array($tagname, false, $tagtype, $lt, $pos);
    default:
        /*
         * Anything but whitespace makes this an invalid tag: look for
         * the next closing ">".
         */
        if (!preg_match('/\s/', $match)){
            $gt = tln_findnxstr($body, $lt, '>');
            return array(false, false, false, $lt, $gt);
        }
    }

    /*
     * At this point we're here:
     * <tagname  attribute='blah'>
     * \-------^
     *
     * At this point we loop in order to find all attributes.
     */
    $attary = array();

    while ($pos <= $bodylen){
        $pos = tln_skipspace($body, $pos);
        if ($pos == $bodylen){
            // Non-closed tag.
            return array(false, false, false, $lt, $pos);
        }
        /*
         * See if we arrived at a ">" or "/>", which means that we reached
         * the end of the tag.
         */
        $matches = array();
        if (preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches)){
            $pos += strlen($matches[1]);
            if ($matches[2] == '/>'){
                $tagtype = 3;
                $pos++;
            }
            return array($tagname, $attary, $tagtype, $lt, $pos);
        }

        /*
         * There are several types of attributes, with optional
         * [:space:] between members.
         * Type 1:
         *   attrname[:space:]=[:space:]'CDATA'
         * Type 2:
         *   attrname[:space:]=[:space:]"CDATA"
         * Type 3:
         *   attr[:space:]=[:space:]CDATA
         * Type 4:
         *   attrname
         *
         * We leave types 1 and 2 the same, type 3 we check for
         * '"' and convert to "&quot;" if needed, then wrap in
         * double quotes. Type 4 we convert into:
         * attrname="yes".
         */
        $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
        if ($regary == false){
            // Looks like body ended before the end of tag.
            return array(false, false, false, $lt, $bodylen);
        }
        list($pos, $attname, $match) = $regary;
        $attname = strtolower($attname);
        /*
         * We arrived at the end of attribute name. Several things possible
         * here:
         * '>'  means the end of the tag and this is attribute type 4
         * '/'  if followed by '>' means the same thing as above
         * '\s' means a lot of things -- look what it's followed by.
         *      anything else means the attribute is invalid.
         */
        switch ($match){
        case '/':
            /*
             * This is an xhtml-style tag with a closing / at the
             * end, like so: <img src="blah"/>. Check if it's followed
             * by the closing bracket. If not, then this tag is invalid
             */
            if (substr($body, $pos, 2) != '/>'){
                $gt = tln_findnxstr($body, $pos, '>');
                return array(false, false, false, $lt, $gt);
            }
            $pos++;
            $tagtype = 3;
            // Intentional fall-through: the tag ends here.
        case '>':
            $attary[$attname] = '"yes"';
            return array($tagname, $attary, $tagtype, $lt, $pos);
        default:
            /*
             * Skip whitespace and see what we arrive at.
             */
            $pos = tln_skipspace($body, $pos);
            $char = substr($body, $pos, 1);
            /*
             * Two things are valid here:
             * '=' means this is attribute type 1 2 or 3.
             * \w means this was attribute type 4.
             * anything else we ignore and re-loop. End of tag and
             * invalid stuff will be caught by our checks at the beginning
             * of the loop.
             */
            if ($char == '='){
                $pos = tln_skipspace($body, $pos + 1);
                /*
                 * Here are 3 possibilities:
                 * "'"  attribute type 1
                 * '"'  attribute type 2
                 * everything else is the content of tag type 3
                 */
                $quot = substr($body, $pos, 1);
                if ($quot == '\''){
                    $regary = tln_findnxreg($body, $pos+1, '\'');
                    if ($regary == false){
                        return array(false, false, false, $lt, $bodylen);
                    }
                    list($pos, $attval, $match) = $regary;
                    // Step over the closing quote.
                    $pos++;
                    $attary[$attname] = '\'' . $attval . '\'';
                } else if ($quot == '"'){
                    $regary = tln_findnxreg($body, $pos+1, '\"');
                    if ($regary == false){
                        return array(false, false, false, $lt, $bodylen);
                    }
                    list($pos, $attval, $match) = $regary;
                    // Step over the closing quote.
                    $pos++;
                    $attary[$attname] = '"' . $attval . '"';
                } else {
                    /*
                     * These are hateful. Look for \s, or >.
                     */
                    $regary = tln_findnxreg($body, $pos, '[\s>]');
                    if ($regary == false){
                        return array(false, false, false, $lt, $bodylen);
                    }
                    list($pos, $attval, $match) = $regary;
                    /*
                     * If it's ">" it will be caught at the top.
                     * The value gets wrapped in double quotes, so any
                     * double quote inside it has to be escaped first.
                     */
                    $attval = str_replace('"', '&quot;', $attval);
                    $attary[$attname] = '"' . $attval . '"';
                }
            } else if (preg_match('|[\w/>]|', $char)) {
                /*
                 * That was attribute type 4.
                 */
                $attary[$attname] = '"yes"';
            } else {
                /*
                 * An illegal character. Find next '>' and return.
                 */
                $gt = tln_findnxstr($body, $pos, '>');
                return array(false, false, false, $lt, $gt);
            }
        }
    }
    /*
     * The fact that we got here indicates that the tag end was never
     * found. Return invalid tag indication so it gets stripped.
     */
    return array(false, false, false, $lt, $bodylen);
}
/**
 * Translates entities into literal values so they can be checked.
 * Every match of $regex is replaced by the single byte whose code is
 * the number captured in the first group.
 *
 * @param  $attvalue the by-ref value to check.
 * @param  $regex    the regular expression to check against; its first
 *                   capture group must hold the numeric code.
 * @param  $hex      whether the entities are hexadecimal.
 * @return True or False depending on whether there were matches.
 */
function tln_deent(&$attvalue, $regex, $hex=false){
    $matches = array();
    if (!preg_match_all($regex, $attvalue, $matches)){
        return false;
    }
    $repl = array();
    foreach ($matches[0] as $i => $entity){
        // Both conversions yield a float once the number no longer fits
        // in an integer.
        $numval = $hex ? hexdec($matches[1][$i]) : $matches[1][$i] + 0;
        if (is_float($numval)){
            // chr() refuses out-of-range floats as of PHP 8; reduce the
            // value to a byte ourselves, the way chr() wraps integers.
            $numval = fmod($numval, 256);
        }
        $repl[$entity] = chr((int) $numval);
    }
    $attvalue = strtr($attvalue, $repl);
    return true;
}
/**
 * Checks an attribute value for entity-encoded characters and
 * translates them into 8-bit strings so the value can be run through
 * the other checks. Handles decimal entities (&#106;), hexadecimal
 * entities (&#x6a;) and backslash-number escapes, repeating until
 * nothing more can be decoded; leftover backslashes are then stripped.
 *
 * @param  $attvalue A string to run entity check against.
 * @return Nothing, modifies a reference value.
 */
function tln_defang(&$attvalue){
    /*
     * Skip this if there aren't ampersands or backslashes.
     */
    if (strpos($attvalue, '&') === false
        && strpos($attvalue, '\\') === false){
        return;
    }
    do {
        // One decoder per pass (|| short-circuits); keep looping until
        // none of them finds anything, so nested encodings unravel too.
        $m = tln_deent($attvalue, '/\&#0*(\d+);*/s')
            || tln_deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true)
            || tln_deent($attvalue, '/\\\\(\d+)/s', true);
    } while ($m == true);
    $attvalue = stripslashes($attvalue);
}
/**
 * Kills any tabs, newlines, carriage returns, NUL bytes and spaces.
 * Browsers have been known to treat "java[tab]script" just like
 * "javascript", so these have to go before the value is checked.
 *
 * @param  $attvalue The attribute value before extraneous spaces removed.
 * @return Nothing, modifies a reference value.
 */
function tln_unspace(&$attvalue){
    // Only touch the value when it actually holds one of the characters.
    if (strcspn($attvalue, "\t\r\n\0 ") != strlen($attvalue)){
        $attvalue = str_replace(array("\t", "\r", "\n", "\0", " "),
                                '', $attvalue);
    }
}
/**
 * Runs various checks against the attributes of a tag: drops unwanted
 * attributes, rewrites dangerous values and appends forced attributes.
 *
 * @param  $tagname         String with the name of the tag.
 * @param  $attary          Array with all tag attributes (name => value).
 * @param  $rm_attnames     Array of tag-name regex => list of
 *                          attribute-name regexes; matching attributes
 *                          are removed.
 * @param  $bad_attvals     Array of tag-name regex => array of
 *                          attribute-name regex => array(patterns,
 *                          replacements) applied to the value with
 *                          preg_replace.
 * @param  $add_attr_to_tag Array of tag-name regex => array of
 *                          attributes merged into matching tags.
 * @return Array with modified attributes.
 */
function tln_fixatts($tagname,
                 $attary,
                 $rm_attnames,
                 $bad_attvals,
                 $add_attr_to_tag
                 ){
    // foreach iterates over a snapshot, so $attary can be changed safely
    // inside the loop (each() no longer exists in PHP 8).
    foreach ($attary as $attname => $attvalue){
        /*
         * See if this attribute should be removed.
         */
        $remove = false;
        foreach ($rm_attnames as $matchtag => $matchattrs){
            if (!preg_match($matchtag, $tagname)){
                continue;
            }
            foreach ($matchattrs as $matchattr){
                if (preg_match($matchattr, $attname)){
                    $remove = true;
                    break 2;
                }
            }
        }
        if ($remove){
            // Skip the value checks entirely, so that a value rewrite
            // below can never put a removed attribute back.
            unset($attary[$attname]);
            continue;
        }
        /*
         * Remove any backslashes, entities, or extraneous whitespace.
         * This only affects the local copy used for matching.
         */
        tln_defang($attvalue);
        tln_unspace($attvalue);

        /*
         * Now let's run checks on the attvalues.
         */
        foreach ($bad_attvals as $matchtag => $matchattrs){
            if (!preg_match($matchtag, $tagname)){
                continue;
            }
            foreach ($matchattrs as $matchattr => $valary){
                if (!preg_match($matchattr, $attname)){
                    continue;
                }
                /*
                 * There are two arrays in valary.
                 * First is matches.
                 * Second one is replacements
                 */
                list($valmatch, $valrepl) = $valary;
                $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                // Only store the value when a rule actually changed it;
                // otherwise the attribute keeps its original text.
                if ($newvalue != $attvalue){
                    $attary[$attname] = $newvalue;
                }
            }
        }
    }
    /*
     * See if we need to append any attributes to this tag.
     */
    foreach ($add_attr_to_tag as $matchtag => $addattary){
        if (preg_match($matchtag, $tagname)){
            $attary = array_merge($attary, $addattary);
        }
    }
    return $attary;
}
/**
 * Filters a chunk of HTML: walks it tag by tag, drops or keeps tags
 * according to the lists given, cleans the attributes of the tags that
 * are kept, and wraps the result in begin/end marker comments.
 *
 * @param  $body                 the string with HTML you wish to filter
 * @param  $tag_list             array whose first element is a flag and
 *                               whose remaining elements are tag names:
 *                               a false flag means remove these tags,
 *                               a true flag means allow only these tags
 * @param  $rm_tags_with_content tags to remove together with everything
 *                               up to their closing tag
 * @param  $self_closing_tags    tags to print as content-less tags
 * @param  $force_tag_closing    when true, closing tags are appended for
 *                               every tag still open at the end
 * @param  $rm_attnames          passed on to tln_fixatts
 * @param  $bad_attvals          passed on to tln_fixatts
 * @param  $add_attr_to_tag      passed on to tln_fixatts
 * @return tln_sanitized html safe to show on your pages.
 */
function tln_sanitize($body,
                  $tag_list,
                  $rm_tags_with_content,
                  $self_closing_tags,
                  $force_tag_closing,
                  $rm_attnames,
                  $bad_attvals,
                  $add_attr_to_tag
                  ){
    /*
     * Normalize rm_tags and rm_tags_with_content.
     * The first element of $tag_list says whether the list is of tags
     * to remove or tags to allow:
     * false  means remove these tags
     * true   means allow these tags
     */
    $rm_tags = array_shift($tag_list);
    @array_walk($tag_list, 'tln_casenormalize');
    @array_walk($rm_tags_with_content, 'tln_casenormalize');
    @array_walk($self_closing_tags, 'tln_casenormalize');

    $curpos = 0;
    $open_tags = array();
    $trusted = "<!-- begin tln_sanitized html -->\n";
    // Either false, or the name of the tag whose content is being dropped.
    $skip_content = false;
    /*
     * Take care of netscape's stupid javascript entities like
     * &{alert('boo')}; by escaping their leading ampersand.
     */
    $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);
    while (($curtag = tln_getnxtag($body, $curpos)) != false){
        list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
        if ($skip_content === false){
            // Text between the previous tag and this one.
            $trusted .= substr($body, $curpos, $lt - $curpos);
        }
        // Loose check on purpose: invalid tags come back as false, and
        // an empty tag name is treated as invalid too.
        if ($tagname != false){
            if ($tagtype == 2){
                if ($skip_content === $tagname){
                    /*
                     * Got to the end of tag we needed to remove.
                     */
                    $tagname = false;
                    $skip_content = false;
                } else if ($skip_content === false){
                    if (isset($open_tags[$tagname]) &&
                        $open_tags[$tagname] > 0){
                        $open_tags[$tagname]--;
                    } else {
                        // A closing tag without a matching open tag.
                        $tagname = false;
                    }
                }
            } else if ($skip_content === false){
                /*
                 * See if this is a self-closing type and change
                 * tagtype appropriately.
                 */
                if ($tagtype == 1
                    && in_array($tagname, $self_closing_tags)){
                    $tagtype = 3;
                }
                /*
                 * See if we should skip this tag and any content
                 * inside it.
                 */
                if ($tagtype == 1
                    && in_array($tagname, $rm_tags_with_content)){
                    $skip_content = $tagname;
                } else {
                    $listed = in_array($tagname, $tag_list);
                    if ($rm_tags ? !$listed : $listed){
                        $tagname = false;
                    } else {
                        if ($tagtype == 1){
                            if (isset($open_tags[$tagname])){
                                $open_tags[$tagname]++;
                            } else {
                                $open_tags[$tagname] = 1;
                            }
                        }
                        /*
                         * This is where we run other checks.
                         */
                        if (is_array($attary) && count($attary) > 0){
                            $attary = tln_fixatts($tagname,
                                              $attary,
                                              $rm_attnames,
                                              $bad_attvals,
                                              $add_attr_to_tag);
                        }
                    }
                }
            }
            if ($tagname != false && $skip_content === false){
                $trusted .= tln_tagprint($tagname, $attary, $tagtype);
            }
        }
        $curpos = $gt + 1;
    }
    // Whatever follows the last tag.
    $trusted .= substr($body, $curpos, strlen($body) - $curpos);
    if ($force_tag_closing == true){
        foreach ($open_tags as $tagname => $opentimes){
            while ($opentimes > 0){
                $trusted .= '</' . $tagname . '>';
                $opentimes--;
            }
        }
        $trusted .= "\n";
    }
    $trusted .= "<!-- end tln_sanitized html -->\n";
    return $trusted;
}
721 // Use the nifty htmlfilter library
725 function HTMLFilter($body, $trans_image_path, $block_external_images = false) {
741 $rm_tags_with_content = Array(
751 $self_closing_tags = Array(
759 $force_tag_closing = true;
761 $rm_attnames = Array(
772 $bad_attvals = Array(
775 "/^src|background/i" =>
778 "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si",
779 "/^([\'\"])\s*mocha\s*:*.*([\'\"])/si",
780 "/^([\'\"])\s*about\s*:.*([\'\"])/si"
783 "\\1$trans_image_path\\2",
784 "\\1$trans_image_path\\2",
785 "\\1$trans_image_path\\2",
786 "\\1$trans_image_path\\2"
792 "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si",
793 "/^([\'\"])\s*mocha\s*:*.*([\'\"])/si",
794 "/^([\'\"])\s*about\s*:.*([\'\"])/si"
810 "/position\s*:\s*absolute/i",
811 "/url\s*\(\s*([\'\"])\s*\S+script\s*:.*([\'\"])\s*\)/si",
812 "/url\s*\(\s*([\'\"])\s*mocha\s*:.*([\'\"])\s*\)/si",
813 "/url\s*\(\s*([\'\"])\s*about\s*:.*([\'\"])\s*\)/si",
814 "/(.*)\s*:\s*url\s*\(\s*([\'\"]*)\s*\S+script\s*:.*([\'\"]*)\s*\)/si"
833 if ($block_external_images){
834 array_push($bad_attvals{'/.*/'}{'/^src|background/i'}[0],
835 '/^([\'\"])\s*https*:.*([\'\"])/si');
836 array_push($bad_attvals{'/.*/'}{'/^src|background/i'}[1],
837 "\\1$trans_image_path\\1");
838 array_push($bad_attvals{'/.*/'}{'/^style/i'}[0],
839 '/url\(([\'\"])\s*https*:.*([\'\"])\)/si');
840 array_push($bad_attvals{'/.*/'}{'/^style/i'}[1],
841 "url(\\1$trans_image_path\\1)");
844 $add_attr_to_tag = Array(
846 Array('target'=>'"_blank"')
849 $trusted = tln_sanitize($body,
851 $rm_tags_with_content,