3 * YOURLS modification of a small subset from WordPress' KSES implementation.
4 * Straight from the Let's Not Reinvent The Wheel department.
8 * kses 0.2.2 - HTML/XHTML filter that only allows some elements and attributes
9 * Copyright (C) 2002, 2003, 2005 Ulf Harnhammar
11 * This program is free software and open source software; you can redistribute
12 * it and/or modify it under the terms of the GNU General Public License as
13 * published by the Free Software Foundation; either version 2 of the License,
14 * or (at your option) any later version.
16 * This program is distributed in the hope that it will be useful, but WITHOUT
17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write to the Free Software Foundation, Inc.,
23 * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
24 * http://www.gnu.org/licenses/gpl.html
26 * [kses strips evil scripts!]
29 * @copyright (C) 2002, 2003, 2005
30 * @author Ulf Harnhammar <http://advogato.org/person/metaur/>
38 * Two globals are defined: $yourls_allowedentitynames and $yourls_allowedprotocols
39 * - $yourls_allowedentitynames is used internally in KSES functions to sanitize HTML entities
40 * - $yourls_allowedprotocols is used in various parts of YOURLS, not just in KSES, albeit being defined here
41 * Two globals are not defined and unused at this moment: $yourls_allowedtags_all and $yourls_allowedtags
42 * The code for these vars is here and ready for any future use
// Populate the KSES globals only once plugins have loaded, so user defined values can take precedence
yourls_add_action( 'plugins_loaded', 'yourls_kses_init' );
/**
 * Init KSES globals if not already defined (by a plugin)
 *
 * Each global is only populated when it is still empty, so a value set
 * earlier (e.g. by a plugin) is left untouched. Defaults are passed through
 * the 'kses_allowed_entities' and 'kses_allowed_protocols' filters.
 *
 * @return void
 */
function yourls_kses_init() {
    global $yourls_allowedentitynames, $yourls_allowedprotocols;

    if( ! $yourls_allowedentitynames ) {
        $yourls_allowedentitynames = yourls_apply_filter( 'kses_allowed_entities', yourls_kses_allowed_entities() );
    }

    if( ! $yourls_allowedprotocols ) {
        $yourls_allowedprotocols = yourls_apply_filter( 'kses_allowed_protocols', yourls_kses_allowed_protocols() );
    }

    /* See NOTE ABOUT GLOBALS: the code below is intentionally disabled, kept ready for future use.

    if( ! $yourls_allowedtags_all ) {
        $yourls_allowedtags_all = yourls_kses_allowed_tags_all();
        $yourls_allowedtags_all = array_map( '_yourls_add_global_attributes', $yourls_allowedtags_all );
        $yourls_allowedtags_all = yourls_apply_filter( 'kses_allowed_tags_all', $yourls_allowedtags_all );
    } else {
        // User defined: let's sanitize
        $yourls_allowedtags_all = yourls_kses_array_lc( $yourls_allowedtags_all );
    }

    if( ! $yourls_allowedtags ) {
        $yourls_allowedtags = yourls_kses_allowed_tags();
        $yourls_allowedtags = array_map( '_yourls_add_global_attributes', $yourls_allowedtags );
        $yourls_allowedtags = yourls_apply_filter( 'kses_allowed_tags', $yourls_allowedtags );
    } else {
        // User defined: let's sanitize
        $yourls_allowedtags = yourls_kses_array_lc( $yourls_allowedtags );
    }

    */
}
89 * Kses global for all allowable HTML tags.
91 * Complete (?) list of HTML tags. Keep this function available for any plugin or
92 * future feature that will want to display lots of HTML.
96 * @return array All tags
98 function yourls_kses_allowed_tags_all() {
100 'address' => array(),
109 'acronym' => array(),
132 'blockquote' => array(
181 'fieldset' => array(),
188 'figcaption' => array(
208 'accept-charset' => true,
329 'cellpadding' => true,
330 'cellspacing' => true,
416 * Kses global for default allowable HTML tags. TODO: trim down to necessary only.
418 * Short list of HTML tags used in YOURLS core for display
422 * @return array Allowed tags
424 function yourls_kses_allowed_tags() {
437 'blockquote' => array(
/**
 * Kses global for allowable HTML entities.
 *
 * Entity names are listed without the leading '&' and trailing ';'.
 * Names are case-sensitive ('Agrave' and 'agrave' are distinct entries).
 *
 * @return array Allowed entities
 */
function yourls_kses_allowed_entities() {
    return array(
        'nbsp',    'iexcl',  'cent',    'pound',  'curren', 'yen',
        'brvbar',  'sect',   'uml',     'copy',   'ordf',   'laquo',
        'not',     'shy',    'reg',     'macr',   'deg',    'plusmn',
        'acute',   'micro',  'para',    'middot', 'cedil',  'ordm',
        'raquo',   'iquest', 'Agrave',  'Aacute', 'Acirc',  'Atilde',
        'Auml',    'Aring',  'AElig',   'Ccedil', 'Egrave', 'Eacute',
        'Ecirc',   'Euml',   'Igrave',  'Iacute', 'Icirc',  'Iuml',
        'ETH',     'Ntilde', 'Ograve',  'Oacute', 'Ocirc',  'Otilde',
        'Ouml',    'times',  'Oslash',  'Ugrave', 'Uacute', 'Ucirc',
        'Uuml',    'Yacute', 'THORN',   'szlig',  'agrave', 'aacute',
        'acirc',   'atilde', 'auml',    'aring',  'aelig',  'ccedil',
        'egrave',  'eacute', 'ecirc',   'euml',   'igrave', 'iacute',
        'icirc',   'iuml',   'eth',     'ntilde', 'ograve', 'oacute',
        'ocirc',   'otilde', 'ouml',    'divide', 'oslash', 'ugrave',
        'uacute',  'ucirc',  'uuml',    'yacute', 'thorn',  'yuml',
        'quot',    'amp',    'lt',      'gt',     'apos',   'OElig',
        'oelig',   'Scaron', 'scaron',  'Yuml',   'circ',   'tilde',
        'ensp',    'emsp',   'thinsp',  'zwnj',   'zwj',    'lrm',
        'rlm',     'ndash',  'mdash',   'lsquo',  'rsquo',  'sbquo',
        'ldquo',   'rdquo',  'bdquo',   'dagger', 'Dagger', 'permil',
        'lsaquo',  'rsaquo', 'euro',    'fnof',   'Alpha',  'Beta',
        'Gamma',   'Delta',  'Epsilon', 'Zeta',   'Eta',    'Theta',
        'Iota',    'Kappa',  'Lambda',  'Mu',     'Nu',     'Xi',
        'Omicron', 'Pi',     'Rho',     'Sigma',  'Tau',    'Upsilon',
        'Phi',     'Chi',    'Psi',     'Omega',  'alpha',  'beta',
        'gamma',   'delta',  'epsilon', 'zeta',   'eta',    'theta',
        'iota',    'kappa',  'lambda',  'mu',     'nu',     'xi',
        'omicron', 'pi',     'rho',     'sigmaf', 'sigma',  'tau',
        'upsilon', 'phi',    'chi',     'psi',    'omega',  'thetasym',
        'upsih',   'piv',    'bull',    'hellip', 'prime',  'Prime',
        'oline',   'frasl',  'weierp',  'image',  'real',   'trade',
        'alefsym', 'larr',   'uarr',    'rarr',   'darr',   'harr',
        'crarr',   'lArr',   'uArr',    'rArr',   'dArr',   'hArr',
        'forall',  'part',   'exist',   'empty',  'nabla',  'isin',
        'notin',   'ni',     'prod',    'sum',    'minus',  'lowast',
        'radic',   'prop',   'infin',   'ang',    'and',    'or',
        'cap',     'cup',    'int',     'sim',    'cong',   'asymp',
        'ne',      'equiv',  'le',      'ge',     'sub',    'sup',
        'nsub',    'sube',   'supe',    'oplus',  'otimes', 'perp',
        'sdot',    'lceil',  'rceil',   'lfloor', 'rfloor', 'lang',
        'rang',    'loz',    'spades',  'clubs',  'hearts', 'diams',
    );
}
509 * Kses global for allowable protocols.
513 * @return array Allowed protocols
515 function yourls_kses_allowed_protocols() {
516 // More or less common stuff in links. From http://en.wikipedia.org/wiki/URI_scheme
519 'http://', 'https://', 'ftp://',
526 // Old school bearded geek
527 'gopher://', 'telnet://', 'finger://',
528 'nntp://', 'worldwind://',
531 'ssh://', 'svn://', 'svn+ssh://', 'git://', 'cvs://',
533 'market://', // Google Play
537 'ed2k://', 'magnet:', 'udp://',
540 'mms://', 'lastfm://', 'spotify:', 'rtsp://',
543 'aim:', 'facetime://', 'gtalk:', 'xmpp:',
544 'irc://', 'ircs://', 'mumble://',
545 'callto:', 'skype:', 'sip:',
546 'teamspeak://', 'ventrilo://', 'xfire:',
550 'steam:', 'steam://',
552 'ldap://', 'ldaps://',
554 // Purposely removed for security
556 'about:', 'chrome://', 'chrome-extension://',
/**
 * Converts and fixes HTML entities.
 *
 * This function normalizes HTML entities. It will convert "AT&T" to the correct
 * "AT&amp;T", "&#00058;" to "&#58;", "&#XYZZY;" to "&amp;#XYZZY;" and so on.
 *
 * @param string $string Content to normalize entities
 * @return string Content with normalized entities
 */
function yourls_kses_normalize_entities($string) {
    # Disarm all entities by converting & to &amp;
    $string = str_replace('&', '&amp;', $string);

    # Change back the allowed entities in our entity whitelist.
    # Every pattern matches the disarmed form ("&amp;...") produced above;
    # the callbacks decide whether to restore the bare "&" or keep it escaped.
    $string = preg_replace_callback('/&amp;([A-Za-z]{2,8});/', 'yourls_kses_named_entities', $string);
    $string = preg_replace_callback('/&amp;#(0*[0-9]{1,7});/', 'yourls_kses_normalize_entities2', $string);
    $string = preg_replace_callback('/&amp;#[Xx](0*[0-9A-Fa-f]{1,6});/', 'yourls_kses_normalize_entities3', $string);

    return $string;
}
/**
 * Callback for yourls_kses_normalize_entities() regular expression.
 *
 * This function only accepts valid named entity references, which are finite,
 * case-sensitive, and highly scrutinized by HTML and XML validators.
 *
 * @param array $matches preg_replace_callback() matches array
 * @return string Correctly encoded entity
 */
function yourls_kses_named_entities($matches) {
    global $yourls_allowedentitynames;

    if ( empty($matches[1]) ) {
        return '';
    }

    $i = $matches[1];

    // Cast to array so an uninitialized global does not break in_array();
    // strict comparison because entity names are case-sensitive strings.
    $is_allowed = in_array($i, (array) $yourls_allowedentitynames, true);

    // Unknown names stay disarmed ("&amp;name;"), known names become a real entity ("&name;")
    return $is_allowed ? "&$i;" : "&amp;$i;";
}
/**
 * Callback for yourls_kses_normalize_entities() regular expression.
 *
 * This function helps yourls_kses_normalize_entities() to only accept decimal
 * &#number; entities whose value passes yourls_valid_unicode().
 *
 * @param array $matches preg_replace_callback() matches array
 * @return string Correctly encoded entity
 */
function yourls_kses_normalize_entities2($matches) {
    if ( empty($matches[1]) ) {
        return '';
    }

    $i = $matches[1];

    if (yourls_valid_unicode($i)) {
        // Strip leading zeros, then left-pad to at least 3 digits
        $i = str_pad(ltrim($i,'0'), 3, '0', STR_PAD_LEFT);
        $i = "&#$i;";
    } else {
        // Not a valid code point: keep the ampersand disarmed
        $i = "&amp;#$i;";
    }

    return $i;
}
/**
 * Callback for yourls_kses_normalize_entities() for regular expression.
 *
 * This function helps yourls_kses_normalize_entities() to only accept valid Unicode
 * numeric entities in hex form.
 *
 * @param array $matches preg_replace_callback() matches array
 * @return string Correctly encoded entity
 */
function yourls_kses_normalize_entities3($matches) {
    if ( empty($matches[1]) ) {
        return '';
    }

    $hexchars = $matches[1];

    if ( ! yourls_valid_unicode(hexdec($hexchars)) ) {
        // Not a valid code point: keep the ampersand disarmed
        return "&amp;#x$hexchars;";
    }

    // Valid code point: emit the entity without leading zeros
    return '&#x'.ltrim($hexchars,'0').';';
}
658 * Helper function to add global attributes to a tag in the allowed html list.
663 * @param array $value An array of attributes.
664 * @return array The array of attributes with global attributes added.
666 function _yourls_add_global_attributes( $value ) {
667 $global_attributes = array(
674 if ( true === $value )
677 if ( is_array( $value ) )
678 return array_merge( $value, $global_attributes );
/**
 * Helper function to determine if a Unicode value is valid.
 *
 * Accepts tab, line feed and carriage return, then the ranges
 * 0x20-0xd7ff, 0xe000-0xfffd and 0x10000-0x10ffff.
 *
 * Comparisons are deliberately loose: callers in this file pass numeric
 * strings captured by a regular expression as well as integers.
 *
 * @param int $i Unicode value
 * @return bool True if the value was a valid Unicode number
 */
function yourls_valid_unicode($i) {
    return ( $i == 0x9 || $i == 0xa || $i == 0xd ||
            ($i >= 0x20 && $i <= 0xd7ff) ||
            ($i >= 0xe000 && $i <= 0xfffd) ||
            ($i >= 0x10000 && $i <= 0x10ffff) );
}
/**
 * Goes through an array and changes the keys to all lower case.
 *
 * Works on two levels: the outer keys and the keys of each inner array.
 * Inner values are copied as-is.
 *
 * @param array $inarray Unfiltered array
 * @return array Fixed array with all lowercase keys
 */
function yourls_kses_array_lc($inarray) {
    $outarray = array ();

    foreach ( (array) $inarray as $inkey => $inval) {
        $outkey = strtolower($inkey);
        $outarray[$outkey] = array ();

        foreach ( (array) $inval as $inkey2 => $inval2) {
            $outkey2 = strtolower($inkey2);
            $outarray[$outkey][$outkey2] = $inval2;
        }
    }

    return $outarray;
}
/**
 * Convert all entities to their character counterparts.
 *
 * This function decodes numeric HTML entities (&#65; and &#x41;). It doesn't do
 * anything with other entities like &auml;, but we don't need them in the URL
 * protocol whitelisting system anyway.
 *
 * @param string $string Content to change entities
 * @return string Content after decoded entities
 */
function yourls_kses_decode_entities($string) {
    // Decimal form first, then hexadecimal form
    $string = preg_replace_callback('/&#([0-9]+);/', '_yourls_kses_decode_entities_chr', $string);
    $string = preg_replace_callback('/&#[Xx]([0-9A-Fa-f]+);/', '_yourls_kses_decode_entities_chr_hexdec', $string);

    return $string;
}
/**
 * Regex callback for yourls_kses_decode_entities()
 *
 * Decodes the decimal value captured from a "&#NNN;" entity into a single byte.
 *
 * @param array $match preg match
 * @return string Single character
 */
function _yourls_kses_decode_entities_chr( $match ) {
    // Explicit cast: the capture is a digit string of unbounded length, and
    // chr() rejects a numeric string that does not fit in an integer.
    // chr() itself reduces its argument modulo 256.
    return chr( (int) $match[1] );
}
/**
 * Regex callback for yourls_kses_decode_entities()
 *
 * Decodes the hexadecimal value captured from a "&#xHH;" entity into a single byte.
 *
 * @param array $match preg match
 * @return string Single character
 */
function _yourls_kses_decode_entities_chr_hexdec( $match ) {
    // chr() reduces its argument modulo 256, so only the last two hex digits
    // matter. Decoding just those gives the same byte while avoiding the float
    // that hexdec() returns for very long inputs, which chr() would reject.
    return chr( hexdec( substr( $match[1], -2 ) ) );
}
766 * Removes any null characters in $string.
770 * @param string $string
773 function yourls_kses_no_null($string) {
774 $string = preg_replace( '/\0+/', '', $string );
775 $string = preg_replace( '/(\\\\0)+/', '', $string );