include/phpmailer/extras/htmlfilter.php

   1 <?php
   2 /**
   3  * htmlfilter.inc
   4  * ---------------
   5  * This set of functions allows you to filter html in order to remove
   6  * any malicious tags from it. Useful in cases when you need to filter
   7  * user input for any cross-site-scripting attempts.
   8  *
   9  * Copyright (C) 2002-2004 by Duke University
  10  *
  11  * This library is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License as published by the Free Software Foundation; either
  14  * version 2.1 of the License, or (at your option) any later version.
  15  *
  16  * This library is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with this library; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
  24  * 02110-1301  USA
  25  *
  26  * @Author      Konstantin Riabitsev <icon@linux.duke.edu>
  27  * @Version 1.1 ($Date: 2011-07-04 14:02:23 -0400 (Mon, 04 Jul 2011) $)
  28  */
  29
  30 /**
  31  * @Author  Jim Jagielski <jim@jaguNET.com / jimjag@gmail.com>
  32  */
  33
  34 /**
  35  * This function returns the final tag out of the tag name, an array
  36  * of attributes, and the type of the tag. This function is called by
  37  * tln_sanitize internally.
  38  *
  39  * @param  $tagname      the name of the tag.
  40  * @param  $attary       the array of attributes and their values
  41  * @param  $tagtype      The type of the tag (see in comments).
  42  * @return                       a string with the final tag representation.
  43  */
  44 function tln_tagprint($tagname, $attary, $tagtype){
  45         $me = 'tln_tagprint';
  46         if ($tagtype == 2){
  47                 $fulltag = '</' . $tagname . '>';
  48         } else {
  49                 $fulltag = '<' . $tagname;
  50                 if (is_array($attary) && sizeof($attary)){
  51                         $atts = Array();
  52                         while (list($attname, $attvalue) = each($attary)){
  53                                 array_push($atts, "$attname=$attvalue");
  54                         }
  55                         $fulltag .= ' ' . join(' ', $atts);
  56                 }
  57                 if ($tagtype == 3){
  58                         $fulltag .= ' /';
  59                 }
  60                 $fulltag .= '>';
  61         }
  62         return $fulltag;
  63 }
  64
  65 /**
  66  * A small helper function to use with array_walk. Modifies a by-ref
  67  * value and makes it lowercase.
  68  *
  69  * @param  $val a value passed by-ref.
  70  * @return              void since it modifies a by-ref value.
  71  */
  72 function tln_casenormalize(&$val){
  73         $val = strtolower($val);
  74 }
  75
  76 /**
  77  * This function skips any whitespace from the current position within
  78  * a string and to the next non-whitespace value.
  79  *
  80  * @param  $body   the string
  81  * @param  $offset the offset within the string where we should start
  82  *                                 looking for the next non-whitespace character.
  83  * @return                 the location within the $body where the next
  84  *                                 non-whitespace char is located.
  85  */
  86 function tln_skipspace($body, $offset){
  87         $me = 'tln_skipspace';
  88         preg_match('/^(\s*)/s', substr($body, $offset), $matches);
  89         if (sizeof($matches[1])){
  90                 $count = strlen($matches[1]);
  91                 $offset += $count;
  92         }
  93         return $offset;
  94 }
  95
  96 /**
  97  * This function looks for the next character within a string.  It's
  98  * really just a glorified "strpos", except it catches the failures
  99  * nicely.
 100  *
 101  * @param  $body   The string to look for needle in.
 102  * @param  $offset Start looking from this position.
 103  * @param  $needle The character/string to look for.
 104  * @return                 location of the next occurance of the needle, or
 105  *                                 strlen($body) if needle wasn't found.
 106  */
 107 function tln_findnxstr($body, $offset, $needle){
 108         $me = 'tln_findnxstr';
 109         $pos = strpos($body, $needle, $offset);
 110         if ($pos === FALSE){
 111                 $pos = strlen($body);
 112         }
 113         return $pos;
 114 }
 115
 116 /**
 117  * This function takes a PCRE-style regexp and tries to match it
 118  * within the string.
 119  *
 120  * @param  $body   The string to look for needle in.
 121  * @param  $offset Start looking from here.
 122  * @param  $reg    A PCRE-style regex to match.
 123  * @return                 Returns a false if no matches found, or an array
 124  *                                 with the following members:
 125  *                                 - integer with the location of the match within $body
 126  *                                 - string with whatever content between offset and the match
 127  *                                 - string with whatever it is we matched
 128  */
 129 function tln_findnxreg($body, $offset, $reg){
 130         $me = 'tln_findnxreg';
 131         $matches = Array();
 132         $retarr = Array();
 133         $preg_rule = '%^(.*?)(' . $reg . ')%s';
 134         preg_match($preg_rule, substr($body, $offset), $matches);
 135         if (!isset($matches[0])){
 136                 $retarr = false;
 137         } else {
 138                 $retarr[0] = $offset + strlen($matches[1]);
 139                 $retarr[1] = $matches[1];
 140                 $retarr[2] = $matches[2];
 141         }
 142         return $retarr;
 143 }
 144
 145 /**
 146  * This function looks for the next tag.
 147  *
 148  * @param  $body   String where to look for the next tag.
 149  * @param  $offset Start looking from here.
 150  * @return                 false if no more tags exist in the body, or
 151  *                                 an array with the following members:
 152  *                                 - string with the name of the tag
 153  *                                 - array with attributes and their values
 154  *                                 - integer with tag type (1, 2, or 3)
 155  *                                 - integer where the tag starts (starting "<")
 156  *                                 - integer where the tag ends (ending ">")
 157  *                                 first three members will be false, if the tag is invalid.
 158  */
 159 function tln_getnxtag($body, $offset){
 160         $me = 'tln_getnxtag';
 161         if ($offset > strlen($body)){
 162                 return false;
 163         }
 164         $lt = tln_findnxstr($body, $offset, '<');
 165         if ($lt == strlen($body)){
 166                 return false;
 167         }
 168         /**
 169          * We are here:
 170          * blah blah <tag attribute="value">
 171          * \---------^
 172          */
 173         $pos = tln_skipspace($body, $lt + 1);
 174         if ($pos >= strlen($body)){
 175                 return Array(false, false, false, $lt, strlen($body));
 176         }
 177         /**
 178          * There are 3 kinds of tags:
 179          * 1. Opening tag, e.g.:
 180          *        <a href="blah">
 181          * 2. Closing tag, e.g.:
 182          *        </a>
 183          * 3. XHTML-style content-less tag, e.g.:
 184          *        <img src="blah"/>
 185          */
 186         $tagtype = false;
 187         switch (substr($body, $pos, 1)){
 188         case '/':
 189                 $tagtype = 2;
 190                 $pos++;
 191                 break;
 192         case '!':
 193                 /**
 194                  * A comment or an SGML declaration.
 195                  */
 196                 if (substr($body, $pos+1, 2) == '--'){
 197                         $gt = strpos($body, '-->', $pos);
 198                         if ($gt === false){
 199                                 $gt = strlen($body);
 200                         } else {
 201                                 $gt += 2;
 202                         }
 203                         return Array(false, false, false, $lt, $gt);
 204                 } else {
 205                         $gt = tln_findnxstr($body, $pos, '>');
 206                         return Array(false, false, false, $lt, $gt);
 207                 }
 208                 break;
 209         default:
 210                 /**
 211                  * Assume tagtype 1 for now. If it's type 3, we'll switch values
 212                  * later.
 213                  */
 214                 $tagtype = 1;
 215                 break;
 216         }
 217
 218         $tag_start = $pos;
 219         $tagname = '';
 220         /**
 221          * Look for next [\W-_], which will indicate the end of the tag name.
 222          */
 223         $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
 224         if ($regary == false){
 225                 return Array(false, false, false, $lt, strlen($body));
 226         }
 227         list($pos, $tagname, $match) = $regary;
 228         $tagname = strtolower($tagname);
 229
 230         /**
 231          * $match can be either of these:
 232          * '>'  indicating the end of the tag entirely.
 233          * '\s' indicating the end of the tag name.
 234          * '/'  indicating that this is type-3 xhtml tag.
 235          *
 236          * Whatever else we find there indicates an invalid tag.
 237          */
 238         switch ($match){
 239         case '/':
 240                 /**
 241                  * This is an xhtml-style tag with a closing / at the
 242                  * end, like so: <img src="blah"/>. Check if it's followed
 243                  * by the closing bracket. If not, then this tag is invalid
 244                  */
 245                 if (substr($body, $pos, 2) == '/>'){
 246                         $pos++;
 247                         $tagtype = 3;
 248                 } else {
 249                         $gt = tln_findnxstr($body, $pos, '>');
 250                         $retary = Array(false, false, false, $lt, $gt);
 251                         return $retary;
 252                 }
 253         case '>':
 254                 return Array($tagname, false, $tagtype, $lt, $pos);
 255                 break;
 256         default:
 257                 /**
 258                  * Check if it's whitespace
 259                  */
 260                 if (preg_match('/\s/', $match)){
 261                 } else {
 262                         /**
 263                          * This is an invalid tag! Look for the next closing ">".
 264                          */
 265                         $gt = tln_findnxstr($body, $lt, '>');
 266                         return Array(false, false, false, $lt, $gt);
 267                 }
 268         }
 269
 270         /**
 271          * At this point we're here:
 272          * <tagname      attribute='blah'>
 273          * \-------^
 274          *
 275          * At this point we loop in order to find all attributes.
 276          */
 277         $attname = '';
 278         $atttype = false;
 279         $attary = Array();
 280
 281         while ($pos <= strlen($body)){
 282                 $pos = tln_skipspace($body, $pos);
 283                 if ($pos == strlen($body)){
 284                         /**
 285                          * Non-closed tag.
 286                          */
 287                         return Array(false, false, false, $lt, $pos);
 288                 }
 289                 /**
 290                  * See if we arrived at a ">" or "/>", which means that we reached
 291                  * the end of the tag.
 292                  */
 293                 $matches = Array();
 294                 preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches);
 295                 if (isset($matches[0]) && $matches[0]){
 296                         /**
 297                          * Yep. So we did.
 298                          */
 299                         $pos += strlen($matches[1]);
 300                         if ($matches[2] == '/>'){
 301                                 $tagtype = 3;
 302                                 $pos++;
 303                         }
 304                         return Array($tagname, $attary, $tagtype, $lt, $pos);
 305                 }
 306
 307                 /**
 308                  * There are several types of attributes, with optional
 309                  * [:space:] between members.
 310                  * Type 1:
 311                  *       attrname[:space:]=[:space:]'CDATA'
 312                  * Type 2:
 313                  *       attrname[:space:]=[:space:]"CDATA"
 314                  * Type 3:
 315                  *       attr[:space:]=[:space:]CDATA
 316                  * Type 4:
 317                  *       attrname
 318                  *
 319                  * We leave types 1 and 2 the same, type 3 we check for
 320                  * '"' and convert to "&quot" if needed, then wrap in
 321                  * double quotes. Type 4 we convert into:
 322                  * attrname="yes".
 323                  */
 324                 $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
 325                 if ($regary == false){
 326                         /**
 327                          * Looks like body ended before the end of tag.
 328                          */
 329                         return Array(false, false, false, $lt, strlen($body));
 330                 }
 331                 list($pos, $attname, $match) = $regary;
 332                 $attname = strtolower($attname);
 333                 /**
 334                  * We arrived at the end of attribute name. Several things possible
 335                  * here:
 336                  * '>'  means the end of the tag and this is attribute type 4
 337                  * '/'  if followed by '>' means the same thing as above
 338                  * '\s' means a lot of things -- look what it's followed by.
 339                  *              anything else means the attribute is invalid.
 340                  */
 341                 switch($match){
 342                 case '/':
 343                         /**
 344                          * This is an xhtml-style tag with a closing / at the
 345                          * end, like so: <img src="blah"/>. Check if it's followed
 346                          * by the closing bracket. If not, then this tag is invalid
 347                          */
 348                         if (substr($body, $pos, 2) == '/>'){
 349                                 $pos++;
 350                                 $tagtype = 3;
 351                         } else {
 352                                 $gt = tln_findnxstr($body, $pos, '>');
 353                                 $retary = Array(false, false, false, $lt, $gt);
 354                                 return $retary;
 355                         }
 356                 case '>':
 357                         $attary{$attname} = '"yes"';
 358                         return Array($tagname, $attary, $tagtype, $lt, $pos);
 359                         break;
 360                 default:
 361                         /**
 362                          * Skip whitespace and see what we arrive at.
 363                          */
 364                         $pos = tln_skipspace($body, $pos);
 365                         $char = substr($body, $pos, 1);
 366                         /**
 367                          * Two things are valid here:
 368                          * '=' means this is attribute type 1 2 or 3.
 369                          * \w means this was attribute type 4.
 370                          * anything else we ignore and re-loop. End of tag and
 371                          * invalid stuff will be caught by our checks at the beginning
 372                          * of the loop.
 373                          */
 374                         if ($char == '='){
 375                                 $pos++;
 376                                 $pos = tln_skipspace($body, $pos);
 377                                 /**
 378                                  * Here are 3 possibilities:
 379                                  * "'"  attribute type 1
 380                                  * '"'  attribute type 2
 381                                  * everything else is the content of tag type 3
 382                                  */
 383                                 $quot = substr($body, $pos, 1);
 384                                 if ($quot == '\''){
 385                                         $regary = tln_findnxreg($body, $pos+1, '\'');
 386                                         if ($regary == false){
 387                                                 return Array(false, false, false, $lt, strlen($body));
 388                                         }
 389                                         list($pos, $attval, $match) = $regary;
 390                                         $pos++;
 391                                         $attary{$attname} = '\'' . $attval . '\'';
 392                                 } else if ($quot == '"'){
 393                                         $regary = tln_findnxreg($body, $pos+1, '\"');
 394                                         if ($regary == false){
 395                                                 return Array(false, false, false, $lt, strlen($body));
 396                                         }
 397                                         list($pos, $attval, $match) = $regary;
 398                                         $pos++;
 399                                         $attary{$attname} = '"' . $attval . '"';
 400                                 } else {
 401                                         /**
 402                                          * These are hateful. Look for \s, or >.
 403                                          */
 404                                         $regary = tln_findnxreg($body, $pos, '[\s>]');
 405                                         if ($regary == false){
 406                                                 return Array(false, false, false, $lt, strlen($body));
 407                                         }
 408                                         list($pos, $attval, $match) = $regary;
 409                                         /**
 410                                          * If it's ">" it will be caught at the top.
 411                                          */
 412                                         $attval = preg_replace('/\"/s', '&quot;', $attval);
 413                                         $attary{$attname} = '"' . $attval . '"';
 414                                 }
 415                         } else if (preg_match('|[\w/>]|', $char)) {
 416                                 /**
 417                                  * That was attribute type 4.
 418                                  */
 419                                 $attary{$attname} = '"yes"';
 420                         } else {
 421                                 /**
 422                                  * An illegal character. Find next '>' and return.
 423                                  */
 424                                 $gt = tln_findnxstr($body, $pos, '>');
 425                                 return Array(false, false, false, $lt, $gt);
 426                         }
 427                 }
 428         }
 429         /**
 430          * The fact that we got here indicates that the tag end was never
 431          * found. Return invalid tag indication so it gets stripped.
 432          */
 433         return Array(false, false, false, $lt, strlen($body));
 434 }
 435
 436 /**
 437  * Translates entities into literal values so they can be checked.
 438  *
 439  * @param $attvalue the by-ref value to check.
 440  * @param $regex        the regular expression to check against.
 441  * @param $hex          whether the entites are hexadecimal.
 442  * @return                      True or False depending on whether there were matches.
 443  */
 444 function tln_deent(&$attvalue, $regex, $hex=false){
 445         $me = 'tln_deent';
 446         $ret_match = false;
 447         preg_match_all($regex, $attvalue, $matches);
 448         if (is_array($matches) && sizeof($matches[0]) > 0){
 449                 $repl = Array();
 450                 for ($i = 0; $i < sizeof($matches[0]); $i++){
 451                         $numval = $matches[1][$i];
 452                         if ($hex){
 453                                 $numval = hexdec($numval);
 454                         }
 455                         $repl{$matches[0][$i]} = chr($numval);
 456                 }
 457                 $attvalue = strtr($attvalue, $repl);
 458                 return true;
 459         } else {
 460                 return false;
 461         }
 462 }
 463
 464 /**
 465  * This function checks attribute values for entity-encoded values
 466  * and returns them translated into 8-bit strings so we can run
 467  * checks on them.
 468  *
 469  * @param  $attvalue A string to run entity check against.
 470  * @return                       Nothing, modifies a reference value.
 471  */
 472 function tln_defang(&$attvalue){
 473         $me = 'tln_defang';
 474         /**
 475          * Skip this if there aren't ampersands or backslashes.
 476          */
 477         if (strpos($attvalue, '&') === false
 478                 && strpos($attvalue, '\\') === false){
 479                 return;
 480         }
 481         $m = false;
 482         do {
 483                 $m = false;
 484                 $m = $m || tln_deent($attvalue, '/\&#0*(\d+);*/s');
 485                 $m = $m || tln_deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true);
 486                 $m = $m || tln_deent($attvalue, '/\\\\(\d+)/s', true);
 487         } while ($m == true);
 488         $attvalue = stripslashes($attvalue);
 489 }
 490
 491 /**
 492  * Kill any tabs, newlines, or carriage returns. Our friends the
 493  * makers of the browser with 95% market value decided that it'd
 494  * be funny to make "java[tab]script" be just as good as "javascript".
 495  *
 496  * @param  attvalue      The attribute value before extraneous spaces removed.
 497  * @return attvalue      Nothing, modifies a reference value.
 498  */
 499 function tln_unspace(&$attvalue){
 500         $me = 'tln_unspace';
 501         if (strcspn($attvalue, "\t\r\n\0 ") != strlen($attvalue)){
 502                 $attvalue = str_replace(Array("\t", "\r", "\n", "\0", " "),
 503                                                                 Array('',       '',       '',   '',       ''), $attvalue);
 504         }
 505 }
 506
 507 /**
 508  * This function runs various checks against the attributes.
 509  *
 510  * @param  $tagname                     String with the name of the tag.
 511  * @param  $attary                      Array with all tag attributes.
 512  * @param  $rm_attnames         See description for tln_sanitize
 513  * @param  $bad_attvals         See description for tln_sanitize
 514  * @param  $add_attr_to_tag See description for tln_sanitize
 515  * @return                                      Array with modified attributes.
 516  */
 517 function tln_fixatts($tagname,
 518                                  $attary,
 519                                  $rm_attnames,
 520                                  $bad_attvals,
 521                                  $add_attr_to_tag
 522                                  ){
 523         $me = 'tln_fixatts';
 524         while (list($attname, $attvalue) = each($attary)){
 525                 /**
 526                  * See if this attribute should be removed.
 527                  */
 528                 foreach ($rm_attnames as $matchtag=>$matchattrs){
 529                         if (preg_match($matchtag, $tagname)){
 530                                 foreach ($matchattrs as $matchattr){
 531                                         if (preg_match($matchattr, $attname)){
 532                                                 unset($attary{$attname});
 533                                                 continue;
 534                                         }
 535                                 }
 536                         }
 537                 }
 538                 /**
 539                  * Remove any backslashes, entities, or extraneous whitespace.
 540                  */
 541                 tln_defang($attvalue);
 542                 tln_unspace($attvalue);
 543
 544                 /**
 545                  * Now let's run checks on the attvalues.
 546                  * I don't expect anyone to comprehend this. If you do,
 547                  * get in touch with me so I can drive to where you live and
 548                  * shake your hand personally. :)
 549                  */
 550                 foreach ($bad_attvals as $matchtag=>$matchattrs){
 551                         if (preg_match($matchtag, $tagname)){
 552                                 foreach ($matchattrs as $matchattr=>$valary){
 553                                         if (preg_match($matchattr, $attname)){
 554                                                 /**
 555                                                  * There are two arrays in valary.
 556                                                  * First is matches.
 557                                                  * Second one is replacements
 558                                                  */
 559                                                 list($valmatch, $valrepl) = $valary;
 560                                                 $newvalue = preg_replace($valmatch,$valrepl,$attvalue);
 561                                                 if ($newvalue != $attvalue){
 562                                                         $attary{$attname} = $newvalue;
 563                                                 }
 564                                         }
 565                                 }
 566                         }
 567                 }
 568         }
 569         /**
 570          * See if we need to append any attributes to this tag.
 571          */
 572         foreach ($add_attr_to_tag as $matchtag=>$addattary){
 573                 if (preg_match($matchtag, $tagname)){
 574                         $attary = array_merge($attary, $addattary);
 575                 }
 576         }
 577         return $attary;
 578 }
 579
 580 /**
 581  *
 582  * @param $body                                 the string with HTML you wish to filter
 583  * @param $tag_list                             see description above
 584  * @param $rm_tags_with_content see description above
 585  * @param $self_closing_tags    see description above
 586  * @param $force_tag_closing    see description above
 587  * @param $rm_attnames                  see description above
 588  * @param $bad_attvals                  see description above
 589  * @param $add_attr_to_tag              see description above
 590  * @return                                              tln_sanitized html safe to show on your pages.
 591  */
 592 function tln_sanitize($body,
 593                                   $tag_list,
 594                                   $rm_tags_with_content,
 595                                   $self_closing_tags,
 596                                   $force_tag_closing,
 597                                   $rm_attnames,
 598                                   $bad_attvals,
 599                                   $add_attr_to_tag
 600                                   )
 601 {
 602         $me = 'tln_sanitize';
 603         /**
 604          * Normalize rm_tags and rm_tags_with_content.
 605          */
 606         $rm_tags = array_shift($tag_list);
 607         @array_walk($tag_list, 'tln_casenormalize');
 608         @array_walk($rm_tags_with_content, 'tln_casenormalize');
 609         @array_walk($self_closing_tags, 'tln_casenormalize');
 610         /**
 611          * See if tag_list is of tags to remove or tags to allow.
 612          * false  means remove these tags
 613          * true   means allow these tags
 614          */
 615         $curpos = 0;
 616         $open_tags = Array();
 617         $trusted = "<!-- begin tln_sanitized html -->\n";
 618         $skip_content = false;
 619         /**
 620          * Take care of netscape's stupid javascript entities like
 621          * &{alert('boo')};
 622          */
 623         $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);
 624         while (($curtag = tln_getnxtag($body, $curpos)) != FALSE){
 625                 list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
 626                 $free_content = substr($body, $curpos, $lt - $curpos);
 627                 if ($skip_content == false){
 628                         $trusted .= $free_content;
 629                 } else {
 630                 }
 631                 if ($tagname != FALSE){
 632                         if ($tagtype == 2){
 633                                 if ($skip_content == $tagname){
 634                                         /**
 635                                          * Got to the end of tag we needed to remove.
 636                                          */
 637                                         $tagname = false;
 638                                         $skip_content = false;
 639                                 } else {
 640                                         if ($skip_content == false){
 641                                                 if (isset($open_tags{$tagname}) &&
 642                                                         $open_tags{$tagname} > 0){
 643                                                         $open_tags{$tagname}--;
 644                                                 } else {
 645                                                         $tagname = false;
 646                                                 }
 647                                         } else {
 648                                         }
 649                                 }
 650                         } else {
 651                                 /**
 652                                  * $rm_tags_with_content
 653                                  */
 654                                 if ($skip_content == false){
 655                                         /**
 656                                          * See if this is a self-closing type and change
 657                                          * tagtype appropriately.
 658                                          */
 659                                         if ($tagtype == 1
 660                                                 && in_array($tagname, $self_closing_tags)){
 661                                                 $tagtype = 3;
 662                                         }
 663                                         /**
 664                                          * See if we should skip this tag and any content
 665                                          * inside it.
 666                                          */
 667                                         if ($tagtype == 1
 668                                                 && in_array($tagname, $rm_tags_with_content)){
 669                                                 $skip_content = $tagname;
 670                                         } else {
 671                                                 if (($rm_tags == false
 672                                                          && in_array($tagname, $tag_list)) ||
 673                                                         ($rm_tags == true
 674                                                          && !in_array($tagname, $tag_list))){
 675                                                         $tagname = false;
 676                                                 } else {
 677                                                         if ($tagtype == 1){
 678                                                                 if (isset($open_tags{$tagname})){
 679                                                                         $open_tags{$tagname}++;
 680                                                                 } else {
 681                                                                         $open_tags{$tagname} = 1;
 682                                                                 }
 683                                                         }
 684                                                         /**
 685                                                          * This is where we run other checks.
 686                                                          */
 687                                                         if (is_array($attary) && sizeof($attary) > 0){
 688                                                                 $attary = tln_fixatts($tagname,
 689                                                                                                   $attary,
 690                                                                                                   $rm_attnames,
 691                                                                                                   $bad_attvals,
 692                                                                                                   $add_attr_to_tag);
 693                                                         }
 694                                                 }
 695                                         }
 696                                 } else {
 697                                 }
 698                         }
 699                         if ($tagname != false && $skip_content == false){
 700                                 $trusted .= tln_tagprint($tagname, $attary, $tagtype);
 701                         }
 702                 } else {
 703                 }
 704                 $curpos = $gt + 1;
 705         }
 706         $trusted .= substr($body, $curpos, strlen($body) - $curpos);
 707         if ($force_tag_closing == true){
 708                 foreach ($open_tags as $tagname=>$opentimes){
 709                         while ($opentimes > 0){
 710                                 $trusted .= '</' . $tagname . '>';
 711                                 $opentimes--;
 712                         }
 713                 }
 714                 $trusted .= "\n";
 715         }
 716         $trusted .= "<!-- end tln_sanitized html -->\n";
 717         return $trusted;
 718 }
 719
 720 //
 721 // Use the nifty htmlfilter library
 722 //
 723
 724
 725 function HTMLFilter($body, $trans_image_path, $block_external_images = false) {
 726
 727         $tag_list = Array(
 728                 false,
 729                 "object",
 730                 "meta",
 731                 "html",
 732                 "head",
 733                 "base",
 734                 "link",
 735                 "frame",
 736                 "iframe",
 737                 "plaintext",
 738                 "marquee"
 739         );
 740
 741         $rm_tags_with_content = Array(
 742                 "script",
 743                 "applet",
 744                 "embed",
 745                 "title",
 746                 "frameset",
 747                 "xmp",
 748                 "xml"
 749         );
 750
 751         $self_closing_tags =  Array(
 752                 "img",
 753                 "br",
 754                 "hr",
 755                 "input",
 756                 "outbind"
 757         );
 758
 759         $force_tag_closing = true;
 760
 761         $rm_attnames = Array(
 762                 "/.*/" =>
 763                         Array(
 764                                 // "/target/i",
 765                                 "/^on.*/i",
 766                                 "/^dynsrc/i",
 767                                 "/^data.*/i",
 768                                 "/^lowsrc.*/i"
 769                         )
 770         );
 771
 772         $bad_attvals = Array(
 773                 "/.*/" =>
 774                 Array(
 775                         "/^src|background/i" =>
 776                         Array(
 777                                 Array(
 778                                         "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si",
 779                                         "/^([\'\"])\s*mocha\s*:*.*([\'\"])/si",
 780                                         "/^([\'\"])\s*about\s*:.*([\'\"])/si"
 781                                 ),
 782                                 Array(
 783                                         "\\1$trans_image_path\\2",
 784                                         "\\1$trans_image_path\\2",
 785                                         "\\1$trans_image_path\\2",
 786                                         "\\1$trans_image_path\\2"
 787                                 )
 788                         ),
 789                         "/^href|action/i" =>
 790                         Array(
 791                                 Array(
 792                                         "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si",
 793                                         "/^([\'\"])\s*mocha\s*:*.*([\'\"])/si",
 794                                         "/^([\'\"])\s*about\s*:.*([\'\"])/si"
 795                                 ),
 796                                 Array(
 797                                         "\\1#\\1",
 798                                         "\\1#\\1",
 799                                         "\\1#\\1",
 800                                         "\\1#\\1"
 801                                 )
 802                         ),
 803                         "/^style/i" =>
 804                         Array(
 805                                 Array(
 806                                         "/expression/i",
 807                                         "/binding/i",
 808                                         "/behaviou*r/i",
 809                                         "/include-source/i",
 810                                         "/position\s*:\s*absolute/i",
 811                                         "/url\s*\(\s*([\'\"])\s*\S+script\s*:.*([\'\"])\s*\)/si",
 812                                         "/url\s*\(\s*([\'\"])\s*mocha\s*:.*([\'\"])\s*\)/si",
 813                                         "/url\s*\(\s*([\'\"])\s*about\s*:.*([\'\"])\s*\)/si",
 814                                         "/(.*)\s*:\s*url\s*\(\s*([\'\"]*)\s*\S+script\s*:.*([\'\"]*)\s*\)/si"
 815                                 ),
 816                                 Array(
 817                                         "idiocy",
 818                                         "idiocy",
 819                                         "idiocy",
 820                                         "idiocy",
 821                                         "",
 822                                         "url(\\1#\\1)",
 823                                         "url(\\1#\\1)",
 824                                         "url(\\1#\\1)",
 825                                         "url(\\1#\\1)",
 826                                         "url(\\1#\\1)",
 827                                         "\\1:url(\\2#\\3)"
 828                                 )
 829                         )
 830                 )
 831         );
 832
 833         if ($block_external_images){
 834                 array_push($bad_attvals{'/.*/'}{'/^src|background/i'}[0],
 835                                 '/^([\'\"])\s*https*:.*([\'\"])/si');
 836                 array_push($bad_attvals{'/.*/'}{'/^src|background/i'}[1],
 837                                 "\\1$trans_image_path\\1");
 838                 array_push($bad_attvals{'/.*/'}{'/^style/i'}[0],
 839                                 '/url\(([\'\"])\s*https*:.*([\'\"])\)/si');
 840                 array_push($bad_attvals{'/.*/'}{'/^style/i'}[1],
 841                                 "url(\\1$trans_image_path\\1)");
 842         }
 843
 844         $add_attr_to_tag = Array(
 845                 "/^a$/i" =>
 846                         Array('target'=>'"_blank"')
 847         );
 848
 849         $trusted = tln_sanitize($body,
 850                         $tag_list,
 851                         $rm_tags_with_content,
 852                         $self_closing_tags,
 853                         $force_tag_closing,
 854                         $rm_attnames,
 855                         $bad_attvals,
 856                         $add_attr_to_tag
 857                         );
 858         return $trusted;
 859 }
 860
 861 ?>