3 * YOURLS modification of a small subset from WordPress' KSES implementation.
4 * Straight from the Let's Not Reinvent The Wheel department.
8 * kses 0.2.2 - HTML/XHTML filter that only allows some elements and attributes
9 * Copyright (C) 2002, 2003, 2005 Ulf Harnhammar
11 * This program is free software and open source software; you can redistribute
12 * it and/or modify it under the terms of the GNU General Public License as
13 * published by the Free Software Foundation; either version 2 of the License,
14 * or (at your option) any later version.
16 * This program is distributed in the hope that it will be useful, but WITHOUT
17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write to the Free Software Foundation, Inc.,
23 * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
24 * http://www.gnu.org/licenses/gpl.html
26 * [kses strips evil scripts!]
29 * @copyright (C) 2002, 2003, 2005
30 * @author Ulf Harnhammar <http://advogato.org/person/metaur/>
38 * Two globals are defined: $yourls_allowedentitynames and $yourls_allowedprotocols
39 * - $yourls_allowedentitynames is used internally in KSES functions to sanitize HTML entities
40 * - $yourls_allowedprotocols is used in various parts of YOURLS, not just in KSES, albeit being defined here
41 * Two globals are not defined and unused at this moment: $yourls_allowedtags_all and $yourls_allowedtags
42 * The code for these vars is here and ready for any future use
// Start with empty whitelists. They are filled in by yourls_kses_init() on the
// 'plugins_loaded' action, so that plugins get a chance to define their own values first.
$yourls_allowedprotocols   = array();
$yourls_allowedentitynames = array();
yourls_add_action( 'plugins_loaded', 'yourls_kses_init' );
/**
 * Init KSES globals if not already defined (by a plugin)
 *
 * Each global is populated only while it is still empty, so a value assigned
 * earlier (for instance by a plugin) is left untouched. Default lists are passed
 * through the 'kses_allowed_entities' and 'kses_allowed_protocols' filters
 * before being stored.
 *
 * @return void
 */
function yourls_kses_init() {
	global $yourls_allowedentitynames, $yourls_allowedprotocols;

	if( ! $yourls_allowedentitynames ) {
		$yourls_allowedentitynames = yourls_apply_filter( 'kses_allowed_entities', yourls_kses_allowed_entities() );
	}

	if( ! $yourls_allowedprotocols ) {
		$yourls_allowedprotocols = yourls_apply_filter( 'kses_allowed_protocols', yourls_kses_allowed_protocols() );
	}

	// Everything down to the closing "/**/" marker is deliberately commented out:
	// the tag whitelists are not used by YOURLS at the moment, the code is only kept
	// ready for future use.

	/** See NOTE ABOUT GLOBALS **

	if( ! $yourls_allowedtags_all ) {
		$yourls_allowedtags_all = yourls_kses_allowed_tags_all();
		$yourls_allowedtags_all = array_map( '_yourls_add_global_attributes', $yourls_allowedtags_all );
		$yourls_allowedtags_all = yourls_apply_filter( 'kses_allowed_tags_all', $yourls_allowedtags_all );
	} else {
		// User defined: let's sanitize
		$yourls_allowedtags_all = yourls_kses_array_lc( $yourls_allowedtags_all );
	}

	if( ! $yourls_allowedtags ) {
		$yourls_allowedtags = yourls_kses_allowed_tags();
		$yourls_allowedtags = array_map( '_yourls_add_global_attributes', $yourls_allowedtags );
		$yourls_allowedtags = yourls_apply_filter( 'kses_allowed_tags', $yourls_allowedtags );
	} else {
		// User defined: let's sanitize
		$yourls_allowedtags = yourls_kses_array_lc( $yourls_allowedtags );
	}

	/**/
}
90 * Kses global for all allowable HTML tags.
92 * Complete (?) list of HTML tags. Keep this function available for any plugin or
93 * future feature that will want to display lots of HTML.
97 * @return array All tags
// Returns the long whitelist of HTML tags. Judging by the visible entries, each key is a
// tag name mapped to an array of permitted attributes (attribute name => true); an empty
// array means the tag carries no tag-specific attributes.
// NOTE(review): only a handful of lines of this table are visible in this excerpt — the
// opening `return array(`, most entries and the closing lines must be checked in the full file.
99 function yourls_kses_allowed_tags_all() {
101 'address' => array(),
110 'acronym' => array(),
133 'blockquote' => array(
182 'fieldset' => array(),
189 'figcaption' => array(
209 'accept-charset' => true,
330 'cellpadding' => true,
331 'cellspacing' => true,
417 * Kses global for default allowable HTML tags. TODO: trim down to necessary only.
419 * Short list of HTML tags used in YOURLS core for display
423 * @return array Allowed tags
// Returns the short whitelist of HTML tags, described above as the tags used by YOURLS
// core for display.
// NOTE(review): the body is almost entirely missing from this excerpt; only the
// 'blockquote' entry is visible — verify the table against the full file.
425 function yourls_kses_allowed_tags() {
438 'blockquote' => array(
/**
 * Kses global for allowable HTML entities.
 *
 * The names are listed without the leading '&' and trailing ';'. They are
 * case-sensitive ('Prime' and 'prime' are two distinct entities).
 *
 * @return array Allowed entities
 */
function yourls_kses_allowed_entities() {
	return array(
		'nbsp',    'iexcl',  'cent',    'pound',  'curren', 'yen',
		'brvbar',  'sect',   'uml',     'copy',   'ordf',   'laquo',
		'not',     'shy',    'reg',     'macr',   'deg',    'plusmn',
		'acute',   'micro',  'para',    'middot', 'cedil',  'ordm',
		'raquo',   'iquest', 'Agrave',  'Aacute', 'Acirc',  'Atilde',
		'Auml',    'Aring',  'AElig',   'Ccedil', 'Egrave', 'Eacute',
		'Ecirc',   'Euml',   'Igrave',  'Iacute', 'Icirc',  'Iuml',
		'ETH',     'Ntilde', 'Ograve',  'Oacute', 'Ocirc',  'Otilde',
		'Ouml',    'times',  'Oslash',  'Ugrave', 'Uacute', 'Ucirc',
		'Uuml',    'Yacute', 'THORN',   'szlig',  'agrave', 'aacute',
		'acirc',   'atilde', 'auml',    'aring',  'aelig',  'ccedil',
		'egrave',  'eacute', 'ecirc',   'euml',   'igrave', 'iacute',
		'icirc',   'iuml',   'eth',     'ntilde', 'ograve', 'oacute',
		'ocirc',   'otilde', 'ouml',    'divide', 'oslash', 'ugrave',
		'uacute',  'ucirc',  'uuml',    'yacute', 'thorn',  'yuml',
		'quot',    'amp',    'lt',      'gt',     'apos',   'OElig',
		'oelig',   'Scaron', 'scaron',  'Yuml',   'circ',   'tilde',
		'ensp',    'emsp',   'thinsp',  'zwnj',   'zwj',    'lrm',
		'rlm',     'ndash',  'mdash',   'lsquo',  'rsquo',  'sbquo',
		'ldquo',   'rdquo',  'bdquo',   'dagger', 'Dagger', 'permil',
		'lsaquo',  'rsaquo', 'euro',    'fnof',   'Alpha',  'Beta',
		'Gamma',   'Delta',  'Epsilon', 'Zeta',   'Eta',    'Theta',
		'Iota',    'Kappa',  'Lambda',  'Mu',     'Nu',     'Xi',
		'Omicron', 'Pi',     'Rho',     'Sigma',  'Tau',    'Upsilon',
		'Phi',     'Chi',    'Psi',     'Omega',  'alpha',  'beta',
		'gamma',   'delta',  'epsilon', 'zeta',   'eta',    'theta',
		'iota',    'kappa',  'lambda',  'mu',     'nu',     'xi',
		'omicron', 'pi',     'rho',     'sigmaf', 'sigma',  'tau',
		'upsilon', 'phi',    'chi',     'psi',    'omega',  'thetasym',
		'upsih',   'piv',    'bull',    'hellip', 'prime',  'Prime',
		'oline',   'frasl',  'weierp',  'image',  'real',   'trade',
		'alefsym', 'larr',   'uarr',    'rarr',   'darr',   'harr',
		'crarr',   'lArr',   'uArr',    'rArr',   'dArr',   'hArr',
		'forall',  'part',   'exist',   'empty',  'nabla',  'isin',
		'notin',   'ni',     'prod',    'sum',    'minus',  'lowast',
		'radic',   'prop',   'infin',   'ang',    'and',    'or',
		'cap',     'cup',    'int',     'sim',    'cong',   'asymp',
		'ne',      'equiv',  'le',      'ge',     'sub',    'sup',
		'nsub',    'sube',   'supe',    'oplus',  'otimes', 'perp',
		'sdot',    'lceil',  'rceil',   'lfloor', 'rfloor', 'lang',
		'rang',    'loz',    'spades',  'clubs',  'hearts', 'diams',
	);
}
510 * Kses global for allowable protocols.
514 * @return array Allowed protocols
// Returns the list of URI scheme prefixes accepted as link protocols. Entries are stored
// exactly as they are expected at the start of a URL: with '://' for schemes that use an
// authority part, with a bare ':' for the others.
// NOTE(review): several lines of this list are missing from this excerpt (including the
// opening `return array(` and the closing lines) — check the full file before editing.
516 function yourls_kses_allowed_protocols() {
517 // More or less common stuff in links. From http://en.wikipedia.org/wiki/URI_scheme
520 'http://', 'https://', 'ftp://',
527 // Old school bearded geek
528 'gopher://', 'telnet://', 'finger://',
529 'nntp://', 'worldwind://',
532 'ssh://', 'svn://', 'svn+ssh://', 'git://', 'cvs://',
534 'market://', // Google Play
539 'ed2k://', 'magnet:', 'udp://',
542 'mms://', 'lastfm://', 'spotify:', 'rtsp://',
545 'aim:', 'facetime://', 'gtalk:', 'xmpp:',
546 'irc://', 'ircs://', 'mumble://',
547 'callto:', 'skype:', 'sip:',
548 'teamspeak://', 'ventrilo://', 'xfire:',
552 'steam:', 'steam://',
554 'ldap://', 'ldaps://',
556 // Purposely removed for security
// NOTE(review): the heading above says these schemes are removed, yet the next line reads
// as a live array entry in this excerpt. Presumably it sits inside a block comment whose
// delimiters are not visible here — confirm, since whitelisting 'about:' or 'chrome://'
// would defeat the purpose.
558 'about:', 'chrome://', 'chrome-extension://',
/**
 * Converts and fixes HTML entities.
 *
 * This function normalizes HTML entities. It will convert "AT&T" to the correct
 * "AT&amp;T", "&#00058;" to "&#058;", "&#XYZZY;" to "&amp;#XYZZY;" and so on.
 *
 * It works in two passes: every ampersand is first escaped, then the callbacks
 * restore only the entities that are whitelisted (named) or that map to a valid
 * Unicode code point (numeric).
 *
 * @param string $string Content to normalize entities
 * @return string Content with normalized entities
 */
function yourls_kses_normalize_entities($string) {
	# Disarm all entities by converting & to &amp;

	$string = str_replace('&', '&amp;', $string);

	# Change back the allowed entities in our entity whitelist.
	# The patterns look for the *escaped* form produced just above.

	$string = preg_replace_callback('/&amp;([A-Za-z]{2,8});/', 'yourls_kses_named_entities', $string);
	$string = preg_replace_callback('/&amp;#(0*[0-9]{1,7});/', 'yourls_kses_normalize_entities2', $string);
	$string = preg_replace_callback('/&amp;#[Xx](0*[0-9A-Fa-f]{1,6});/', 'yourls_kses_normalize_entities3', $string);

	return $string;
}
/**
 * Callback for yourls_kses_normalize_entities() regular expression.
 *
 * This function only accepts valid named entity references, which are finite,
 * case-sensitive, and highly scrutinized by HTML and XML validators.
 *
 * A name found in the global whitelist is returned as a real entity ("&name;");
 * any other name is returned with its ampersand escaped ("&amp;name;") so that it
 * shows up as plain text.
 *
 * @param array $matches preg_replace_callback() matches array
 * @return string Correctly encoded entity
 */
function yourls_kses_named_entities($matches) {
	global $yourls_allowedentitynames;

	if ( empty($matches[1]) )
		return '';

	$i = $matches[1];

	// Cast so that a whitelist that has not been initialised yet is treated as empty;
	// strict comparison keeps the lookup case- and type-exact.
	return ( ( ! in_array($i, (array) $yourls_allowedentitynames, true) ) ? "&amp;$i;" : "&$i;" );
}
/**
 * Callback for yourls_kses_normalize_entities() regular expression.
 *
 * This function helps yourls_kses_normalize_entities() to only accept decimal
 * numeric entities (&#number;) whose value passes yourls_valid_unicode().
 *
 * An accepted value has its leading zeros removed and is then left-padded to at
 * least three digits ("&#00058;" becomes "&#058;"). A rejected value is returned
 * with its ampersand escaped.
 *
 * @param array $matches preg_replace_callback() matches array
 * @return string Correctly encoded entity
 */
function yourls_kses_normalize_entities2($matches) {
	if ( empty($matches[1]) )
		return '';

	$i = $matches[1];
	if (yourls_valid_unicode($i)) {
		$i = str_pad(ltrim($i,'0'), 3, '0', STR_PAD_LEFT);
		$i = "&#$i;";
	} else {
		$i = "&amp;#$i;";
	}

	return $i;
}
/**
 * Callback for yourls_kses_normalize_entities() for regular expression.
 *
 * This function helps yourls_kses_normalize_entities() to only accept valid Unicode
 * numeric entities in hex form.
 *
 * A valid value is returned as "&#x...;" with leading zeros removed; an invalid
 * one is returned with its ampersand escaped so it is displayed as plain text.
 *
 * @param array $matches preg_replace_callback() matches array
 * @return string Correctly encoded entity
 */
function yourls_kses_normalize_entities3($matches) {
	if ( empty($matches[1]) )
		return '';

	$hexchars = $matches[1];

	if ( ! yourls_valid_unicode(hexdec($hexchars)) )
		return "&amp;#x$hexchars;";

	return '&#x'.ltrim($hexchars,'0').';';
}
/**
 * Helper function to add global attributes to a tag in the allowed html list.
 *
 * A value of true (meaning "tag allowed, no specific attributes") is treated as an
 * empty attribute list. Anything that is neither true nor an array is returned
 * unchanged.
 *
 * @param array $value An array of attributes.
 * @return array The array of attributes with global attributes added.
 */
function _yourls_add_global_attributes( $value ) {
	$global_attributes = array(
		'class' => true,
		'id'    => true,
		'style' => true,
		'title' => true,
	);

	if ( true === $value )
		$value = array();

	if ( is_array( $value ) )
		return array_merge( $value, $global_attributes );

	return $value;
}
/**
 * Helper function to determine if a Unicode value is valid.
 *
 * Accepted values are tab (0x9), line feed (0xa), carriage return (0xd) and the
 * ranges 0x20-0xd7ff, 0xe000-0xfffd and 0x10000-0x10ffff.
 *
 * Comparisons are intentionally loose: callers in this file pass digit strings
 * taken from a regular expression match as well as hexdec() results.
 *
 * @param int $i Unicode value
 * @return bool True if the value was a valid Unicode number
 */
function yourls_valid_unicode($i) {
	return ( $i == 0x9 || $i == 0xa || $i == 0xd ||
			($i >= 0x20 && $i <= 0xd7ff) ||
			($i >= 0xe000 && $i <= 0xfffd) ||
			($i >= 0x10000 && $i <= 0x10ffff) );
}
/**
 * Goes through an array and changes the keys to all lower case.
 *
 * Two levels are processed: the outer keys and the keys of each inner value.
 * Inner values themselves are copied as they are. A non-array inner value is
 * cast to an array first.
 *
 * @param array $inarray Unfiltered array
 * @return array Fixed array with all lowercase keys
 */
function yourls_kses_array_lc($inarray) {
	$outarray = array ();

	foreach ( (array) $inarray as $inkey => $inval) {
		$outkey = strtolower($inkey);
		$outarray[$outkey] = array ();

		foreach ( (array) $inval as $inkey2 => $inval2) {
			$outkey2 = strtolower($inkey2);
			$outarray[$outkey][$outkey2] = $inval2;
		} # foreach $inval
	} # foreach $inarray

	return $outarray;
}
/**
 * Convert all entities to their character counterparts.
 *
 * This function decodes numeric HTML entities (&#65; and &#x41;). It doesn't do
 * anything with other entities like &auml;, but we don't need them in the URL
 * protocol whitelisting system anyway.
 *
 * @param string $string Content to change entities
 * @return string Content after decoded entities
 */
function yourls_kses_decode_entities($string) {
	// Decimal form first, then hexadecimal; each match is handed to its own callback
	$string = preg_replace_callback('/&#([0-9]+);/', '_yourls_kses_decode_entities_chr', $string);
	$string = preg_replace_callback('/&#[Xx]([0-9A-Fa-f]+);/', '_yourls_kses_decode_entities_chr_hexdec', $string);

	return $string;
}
/**
 * Regex callback for yourls_kses_decode_entities()
 *
 * Turns the decimal digits captured from "&#NNN;" into a single byte. The value
 * is reduced modulo 256, which is what chr() did implicitly. Doing it explicitly
 * keeps over-long digit strings (which cannot be converted to an integer
 * argument) from raising an error.
 *
 * @param array $match preg match
 * @return string Single-byte string
 */
function _yourls_kses_decode_entities_chr( $match ) {
	// An explicit cast never fails: values beyond the integer range are clamped
	$codepoint = (int) $match[1];

	return chr( $codepoint % 256 );
}
/**
 * Regex callback for yourls_kses_decode_entities()
 *
 * Turns the hexadecimal digits captured from "&#xHH;" into a single byte. The
 * value is reduced modulo 256, which is what chr() did implicitly. hexdec()
 * returns a float for very long inputs, so the reduction is done with fmod()
 * before converting to an integer.
 *
 * @param array $match preg match
 * @return string Single-byte string
 */
function _yourls_kses_decode_entities_chr_hexdec( $match ) {
	$codepoint = (int) fmod( hexdec( $match[1] ), 256 );

	return chr( $codepoint );
}
/**
 * Removes any null characters in $string.
 *
 * Two forms are stripped: actual NUL bytes, and the two-character sequence made
 * of a backslash followed by the digit zero.
 *
 * @param string $string
 * @return string String without null characters
 */
function yourls_kses_no_null($string) {
	// Real NUL bytes
	$string = preg_replace( '/\0+/', '', $string );
	// Literal backslash followed by 0
	$string = preg_replace( '/(\\\\0)+/', '', $string );

	return $string;
}