lib/InlineParser.php

   1 <?php
   2
   3 /* Copyright (C) 2002 Geoffrey T. Dairiki <dairiki@dairiki.org>
   4  * Copyright (C) 2004-2010 Reini Urban
   5  * Copyright (C) 2008-2010 Marc-Etienne Vargenau, Alcatel-Lucent
   6  *
   7  * This file is part of PhpWiki.
   8  *
   9  * PhpWiki is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * PhpWiki is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with PhpWiki; if not, write to the Free Software Foundation, Inc.,
  21  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  22  */
  23 /**
  24  * This is the code which deals with the inline part of the
  25  * wiki-markup.
  26  *
  27  * @package Markup
  28  * @author Geoffrey T. Dairiki, Reini Urban
  29  */
  30
/**
 * The character used in wiki markup to escape characters that would
 * otherwise have a special meaning.
 */
define('ESCAPE_CHAR', '~');
  36
  37 require_once 'lib/CachedMarkup.php';
  38 require_once 'lib/stdlib.php';
  39
  40 function WikiEscape($text)
  41 {
  42     return str_replace('#', ESCAPE_CHAR . '#', $text);
  43 }
  44
  45 function UnWikiEscape($text)
  46 {
  47     return preg_replace('/' . ESCAPE_CHAR . '(.)/', '\1', $text);
  48 }
  49
/**
 * Return type from RegexpSet::match and RegexpSet::nextMatch.
 *
 * A plain value holder: the matcher fills in all four fields.
 *
 * @see RegexpSet
 */
class RegexpSet_match
{
    /**
     * The text leading up to the match.
     */
    public $prematch;
    /**
     * The matched text.
     */
    public $match;
    /**
     * The text following the matched text.
     */
    public $postmatch;
    /**
     * Index of the regular expression which matched.
     */
    public $regexp_ind;
}
  74
  75 /**
  76  * A set of regular expressions.
  77  *
  78  * This class is probably only useful for InlineTransformer.
  79  */
  80 class RegexpSet
  81 {
  82     /**
  83      * @param array $regexps A list of regular expressions.  The
  84      * regular expressions should not include any sub-pattern groups
  85      * "(...)".  (Anonymous groups, like "(?:...)", as well as
  86      * look-ahead and look-behind assertions are okay.)
  87      */
  88     function __construct($regexps)
  89     {
  90         assert($regexps);
  91         $this->_regexps = array_unique($regexps);
  92     }
  93
  94     /**
  95      * Search text for the next matching regexp from the Regexp Set.
  96      *
  97      * @param string $text The text to search.
  98      *
  99      * @return RegexpSet_match A RegexpSet_match object, or false if no match.
 100      */
 101     function match($text)
 102     {
 103         return $this->_match($text, $this->_regexps, '*?');
 104     }
 105
 106     /**
 107      * Search for next matching regexp.
 108      *
 109      * Here, 'next' has two meanings:
 110      *
 111      * Match the next regexp(s) in the set, at the same position as the last match.
 112      *
 113      * If that fails, match the whole RegexpSet, starting after the position of the
 114      * previous match.
 115      *
 116      * @param string $text Text to search.
 117      *
 118      * @param RegexpSet_match $prevMatch A RegexpSet_match object.
 119      * $prevMatch should be a match object obtained by a previous
 120      * match upon the same value of $text.
 121      *
 122      * @return RegexpSet_match A RegexpSet_match object, or false if no match.
 123      */
 124     function nextMatch($text, $prevMatch)
 125     {
 126         // Try to find match at same position.
 127         $pos = strlen($prevMatch->prematch);
 128         $regexps = array_slice($this->_regexps, $prevMatch->regexp_ind + 1);
 129         if ($regexps) {
 130             $repeat = sprintf('{%d}', $pos);
 131             if (($match = $this->_match($text, $regexps, $repeat))) {
 132                 $match->regexp_ind += $prevMatch->regexp_ind + 1;
 133                 return $match;
 134             }
 135
 136         }
 137
 138         // Failed.  Look for match after current position.
 139         $repeat = sprintf('{%d,}?', $pos + 1);
 140         return $this->_match($text, $this->_regexps, $repeat);
 141     }
 142
 143     // Syntax: http://www.pcre.org/pcre.txt
 144     //   x - EXTENDED, ignore whitespace
 145     //   s - DOTALL
 146     //   A - ANCHORED
 147     //   S - STUDY
 148     private function _match($text, $regexps, $repeat)
 149     {
 150         $match = new RegexpSet_match;
 151
 152         // Optimization: if the matches are only "$" and another, then omit "$"
 153         assert(!empty($repeat));
 154         assert(!empty($regexps));
 155         // We could do much better, if we would know the matching markup for the
 156         // longest regexp match:
 157         $hugepat = "/ ( . $repeat ) ( (" . join(')|(', $regexps) . ") ) /Asx";
 158         // Proposed premature optimization 1:
 159         //$hugepat= "/ ( . $repeat ) ( (" . join(')|(', array_values($matched)) . ") ) /Asx";
 160         if (!preg_match($hugepat, $text, $m)) {
 161             return false;
 162         }
 163         // Proposed premature optimization 1:
 164         //$match->regexp_ind = $matched_ind[count($m) - 4];
 165         $match->regexp_ind = count($m) - 4;
 166
 167         $match->postmatch = substr($text, strlen($m[0]));
 168         $match->prematch = $m[1];
 169         $match->match = $m[2];
 170
 171         return $match;
 172     }
 173 }
 174
 175 /**
 176  * A simple markup rule (i.e. terminal token).
 177  *
 178  * These are defined by a regexp.
 179  *
 180  * When a match is found for the regexp, the matching text is replaced.
 181  * The replacement content is obtained by calling the SimpleMarkup::markup method.
 182  */
 183 abstract class SimpleMarkup
 184 {
 185     public $_match_regexp;
 186
 187     /** Get regexp.
 188      *
 189      * @return string Regexp which matches this token.
 190      */
 191     function getMatchRegexp()
 192     {
 193         return $this->_match_regexp;
 194     }
 195
 196     /** Markup matching text.
 197      *
 198      * @param string $match The text which matched the regexp
 199      * (obtained from getMatchRegexp).
 200      *
 201      * @return mixed The expansion of the matched text.
 202      */
 203     abstract function markup($match /*, $body */);
 204 }
 205
 206 /**
 207  * A balanced markup rule.
 208  *
 209  * These are defined by a start regexp, and an end regexp.
 210  */
 211 abstract class BalancedMarkup
 212 {
 213     /** Get the starting regexp for this rule.
 214      *
 215      * @return string The starting regexp.
 216      */
 217     abstract function getStartRegexp();
 218
 219     /** Get the ending regexp for this rule.
 220      *
 221      * @param string $match The text which matched the starting regexp.
 222      *
 223      * @return string The ending regexp.
 224      */
 225     abstract function getEndRegexp($match);
 226
 227     /** Get expansion for matching input.
 228      *
 229      * @param string $match The text which matched the starting regexp.
 230      *
 231      * @param mixed $body Transformed text found between the starting
 232      * and ending regexps.
 233      *
 234      * @return mixed The expansion of the matched text.
 235      */
 236     abstract function markup($match, $body);
 237 }
 238
 239 class Markup_escape extends SimpleMarkup
 240 {
 241     function getMatchRegexp()
 242     {
 243         return ESCAPE_CHAR . '(?: [[:alnum:]]+ | .)';
 244     }
 245
 246     function markup($match)
 247     {
 248         assert(strlen($match) >= 2);
 249         return substr($match, 1);
 250     }
 251 }
 252
 253 /**
 254  * [image.jpg size=50% border=5], [image.jpg size=50x30]
 255  * Support for the following attributes: see stdlib.php:LinkImage()
 256  *   size=<percent>%, size=<width>x<height>
 257  *   border=n, align=\w+, hspace=n, vspace=n
 258  *   width=n, height=n
 259  *   title, lang, id, alt
 260  */
 261 function isImageLink($link)
 262 {
 263     if (!$link) return false;
 264     assert(defined('INLINE_IMAGES'));
 265     return preg_match("/\\.(" . INLINE_IMAGES . ")$/i", $link)
 266         or preg_match("/\\.(" . INLINE_IMAGES . ")\s+(size|border|align|hspace|vspace|type|data|width|height|title|lang|id|alt)=/i", $link);
 267 }
 268
 269 function LinkBracketLink($bracketlink)
 270 {
 271
 272     // $bracketlink will start and end with brackets; in between will
 273     // be either a page name, a URL or both separated by a pipe.
 274
 275     $wikicreolesyntax = false;
 276
 277     if (string_starts_with($bracketlink, "[[") or string_starts_with($bracketlink, "#[[")) {
 278         $wikicreolesyntax = true;
 279         $bracketlink = str_replace("[[", "[", $bracketlink);
 280         $bracketlink = str_replace("]]", "]", $bracketlink);
 281     }
 282
 283     // Strip brackets and leading space
 284     // bug#1904088  Some brackets links on 2 lines cause the parser to crash
 285     preg_match('/(\#?) \[\s* (?: (.*?) \s* (?<!' . ESCAPE_CHAR . ')(\|) )? \s* (.+?) \s*\]/x',
 286         str_replace("\n", " ", $bracketlink), $matches);
 287     if (count($matches) < 4) {
 288         return HTML::span(array('class' => 'error'),
 289             _("Invalid [] syntax ignored") . _(": ") . $bracketlink);
 290     }
 291     list (, $hash, $label, $bar, $rawlink) = $matches;
 292
 293     if ($wikicreolesyntax and $label) {
 294         $temp = $label;
 295         $label = $rawlink;
 296         $rawlink = $temp;
 297     }
 298
 299     // Mediawiki compatibility: allow "Image:" and "File:"
 300     // as synonyms of "Upload:"
 301     // Allow "upload:", "image:" and "file:" also
 302     // Remove spaces before and after ":", if any
 303     if (string_starts_with($rawlink, "Upload")) {
 304         $rawlink = preg_replace("/^Upload\\s*:\\s*/", "Upload:", $rawlink);
 305     } elseif (string_starts_with($rawlink, "upload")) {
 306         $rawlink = preg_replace("/^upload\\s*:\\s*/", "Upload:", $rawlink);
 307     } elseif (string_starts_with($rawlink, "Image")) {
 308         $rawlink = preg_replace("/^Image\\s*:\\s*/", "Upload:", $rawlink);
 309     } elseif (string_starts_with($rawlink, "image")) {
 310         $rawlink = preg_replace("/^image\\s*:\\s*/", "Upload:", $rawlink);
 311     } elseif (string_starts_with($rawlink, "File")) {
 312         $rawlink = preg_replace("/^File\\s*:\\s*/", "Upload:", $rawlink);
 313     } elseif (string_starts_with($rawlink, "file")) {
 314         $rawlink = preg_replace("/^file\\s*:\\s*/", "Upload:", $rawlink);
 315     }
 316
 317     $label = UnWikiEscape($label);
 318     /*
 319      * Check if the user has typed a explicit URL. This solves the
 320      * problem where the URLs have a ~ character, which would be stripped away.
 321      *   "[http:/server/~name/]" will work as expected
 322      *   "http:/server/~name/"   will NOT work as expected, will remove the ~
 323      */
 324     if (string_starts_with($rawlink, "http://")
 325         or string_starts_with($rawlink, "https://")
 326     ) {
 327         $link = $rawlink;
 328         // Mozilla Browser URI Obfuscation Weakness 2004-06-14
 329         //   http://www.securityfocus.com/bid/10532/
 330         //   goodurl+"%2F%20%20%20."+badurl
 331         if (preg_match("/%2F(%20)+\./i", $rawlink)) {
 332             $rawlink = preg_replace("/%2F(%20)+\./i", "%2F.", $rawlink);
 333         }
 334     } else {
 335         // Check page name lenght
 336         if (!string_starts_with($rawlink, "Upload:")) {
 337             if (strlen($rawlink) > MAX_PAGENAME_LENGTH) {
 338                 return HTML::span(array('class' => 'error'),
 339                     _('Page name too long'));
 340             }
 341         }
 342         // Check illegal characters in page names: <>[]{}|"
 343         if (preg_match("/[<\[\{\|\"\}\]>]/", $rawlink, $matches) > 0) {
 344             return HTML::span(array('class' => 'error'),
 345                 sprintf(_("Illegal character “%s” in page name."),
 346                     $matches[0]));
 347         }
 348         $link = UnWikiEscape($rawlink);
 349     }
 350
 351     /* Relatives links by Joel Schaubert.
 352      * Recognize [../bla] or [/bla] as relative links, without needing http://
 353      * Normally /Page links to the subpage /Page.
 354      */
 355     if (preg_match('/^\.\.\//', $link)) {
 356         return new Cached_ExternalLink($link, $label);
 357     }
 358
 359     // Handle "[[SandBox|{{image.jpg}}]]" and "[[SandBox|{{image.jpg|alt text}}]]"
 360     if (string_starts_with($label, "{{")) {
 361         $imgurl = substr($label, 2, -2); // Remove "{{" and "}}"
 362         $pipe = strpos($imgurl, '|');
 363         if ($pipe === false) {
 364             $label = LinkImage(getUploadDataPath() . $imgurl, $link);
 365         } else {
 366             list($img, $alt) = explode("|", $imgurl);
 367             $label = LinkImage(getUploadDataPath() . $img, $alt);
 368         }
 369     } else
 370
 371         // [label|link]
 372         // If label looks like a url to an image or object, we want an image link.
 373         if (isImageLink($label)) {
 374             $imgurl = $label;
 375             $intermap = getInterwikiMap();
 376             if (preg_match("/^" . $intermap->getRegexp() . ":/", $label)) {
 377                 $imgurl = $intermap->link($label);
 378                 $imgurl = $imgurl->getAttr('href');
 379             } elseif (!preg_match("#^(" . ALLOWED_PROTOCOLS . "):#", $imgurl)) {
 380                 // local theme linkname like 'images/next.gif'.
 381                 global $WikiTheme;
 382                 $imgurl = $WikiTheme->getImageURL($imgurl);
 383             }
 384             // for objects (non-images) the link is taken as alt tag,
 385             // which is in return taken as alternative img
 386             $label = LinkImage($imgurl, $link);
 387         }
 388
 389     if ($hash) {
 390         // It's an anchor, not a link...
 391         $id = MangleXmlIdentifier($link);
 392         return HTML::a(array('id' => $id), $bar ? $label : $link);
 393     }
 394
 395     if (preg_match("#^(" . ALLOWED_PROTOCOLS . "):#", $link)) {
 396         // if it's an image, embed it; otherwise, it's a regular link
 397         if (isImageLink($link) and empty($label)) // patch #1348996 by Robert Litwiniec
 398             return LinkImage($link, $label);
 399         else
 400             return new Cached_ExternalLink($link, $label);
 401     } elseif (substr($link, 0, 8) == 'phpwiki:')
 402         return new Cached_PhpwikiURL($link, $label); /* Semantic relations and attributes.
 403      * Relation and attribute names must be word chars only, no space.
 404      * Links and Attributes may contain everything. word, nums, units, space, groupsep, numsep, ...
 405      */
 406     elseif (preg_match("/^ (\w+) (:[:=]) (.*) $/x", $link) and !isImageLink($link))
 407         return new Cached_SemanticLink($link, $label); /* Do not store the link */
 408     elseif (substr($link, 0, 1) == ':')
 409         return new Cached_WikiLink($link, $label); /*
 410      * Inline images in Interwiki urls's:
 411      * [File:my_image.gif] inlines the image,
 412      * File:my_image.gif shows a plain inter-wiki link,
 413      * [what a pic|File:my_image.gif] shows a named inter-wiki link to the gif
 414      * [File:my_image.gif|what a pic] shows an inlined image linked to the page "what a pic"
 415      *
 416      * Note that for simplicity we will accept embedded object tags (non-images)
 417      * here also, and separate them later in LinkImage()
 418      */
 419     elseif (strstr($link, ':')
 420         and ($intermap = getInterwikiMap())
 421             and preg_match("/^" . $intermap->getRegexp() . ":/", $link)
 422     ) {
 423         // trigger_error("label: $label link: $link", E_USER_WARNING);
 424         if (empty($label) and isImageLink($link)) {
 425             // if without label => inlined image [File:xx.gif]
 426             $imgurl = $intermap->link($link);
 427             return LinkImage($imgurl->getAttr('href'));
 428         }
 429         return new Cached_InterwikiLink($link, $label);
 430     } else {
 431         // Split anchor off end of pagename.
 432         if (preg_match('/\A(.*)(?<!' . ESCAPE_CHAR . ')#(.*?)\Z/', $rawlink, $m)) {
 433             list(, $rawlink, $anchor) = $m;
 434             $pagename = UnWikiEscape($rawlink);
 435             $anchor = UnWikiEscape($anchor);
 436             if (!$label)
 437                 $label = $link;
 438         } else {
 439             $pagename = $link;
 440             $anchor = false;
 441         }
 442
 443         global $backlinks;
 444         $backlinks[] = array('linkto' => $pagename);
 445
 446         return new Cached_WikiLink($pagename, $label, $anchor);
 447     }
 448 }
 449
 450 class Markup_wikicreolebracketlink extends SimpleMarkup
 451 {
 452     public $_match_regexp = "\\#? \\[\\[ .*? [^]\\s] .*? \\]\\]";
 453
 454     function markup($match)
 455     {
 456         return LinkBracketLink($match);
 457     }
 458 }
 459
 460 class Markup_bracketlink extends SimpleMarkup
 461 {
 462     public $_match_regexp = "\\#? \\[ .*? [^]\\s] .*? \\]";
 463
 464     function markup($match)
 465     {
 466         return LinkBracketLink($match);
 467     }
 468 }
 469
 470 class Markup_spellcheck extends SimpleMarkup
 471 {
 472     function __construct()
 473     {
 474         /**
 475          * @var WikiRequest $request
 476          */
 477         global $request;
 478
 479         $this->suggestions = $request->getArg('suggestions');
 480     }
 481
 482     function getMatchRegexp()
 483     {
 484         if (empty($this->suggestions))
 485             return "(?# false )";
 486         $words = array_keys($this->suggestions);
 487         return "(?<= \W ) (?:" . join('|', $words) . ") (?= \W )";
 488     }
 489
 490     function markup($match)
 491     {
 492         if (empty($this->suggestions) or empty($this->suggestions[$match]))
 493             return $match;
 494         return new Cached_SpellCheck(UnWikiEscape($match), $this->suggestions[$match]);
 495     }
 496 }
 497
 498 class Markup_searchhighlight extends SimpleMarkup
 499 {
 500     function __construct()
 501     {
 502         /**
 503          * @var WikiRequest $request
 504          */
 505         global $request;
 506
 507         $result = $request->_searchhighlight;
 508         require_once 'lib/TextSearchQuery.php';
 509         $query = new TextSearchQuery($result['query']);
 510         $this->hilight_re = $query->getHighlightRegexp();
 511         $this->engine = $result['engine'];
 512     }
 513
 514     function getMatchRegexp()
 515     {
 516         return $this->hilight_re;
 517     }
 518
 519     function markup($match)
 520     {
 521         return new Cached_SearchHighlight(UnWikiEscape($match), $this->engine);
 522     }
 523 }
 524
 525 class Markup_url extends SimpleMarkup
 526 {
 527     function getMatchRegexp()
 528     {
 529         return "(?<![[:alnum:]]) (?:" . ALLOWED_PROTOCOLS . ") : [^\s<>\"']+ (?<![ ,.?; \] \) ])";
 530     }
 531
 532     function markup($match)
 533     {
 534         return new Cached_ExternalLink(UnWikiEscape($match));
 535     }
 536 }
 537
 538 class Markup_interwiki extends SimpleMarkup
 539 {
 540     function getMatchRegexp()
 541     {
 542         $map = getInterwikiMap();
 543         return "(?<! [[:alnum:]])" . $map->getRegexp() . ": [^:=]\S+ (?<![ ,.?;! \] \) \" \' ])";
 544     }
 545
 546     function markup($match)
 547     {
 548         return new Cached_InterwikiLink(UnWikiEscape($match));
 549     }
 550 }
 551
 552 class Markup_semanticlink extends SimpleMarkup
 553 {
 554     // No units separated by space allowed here
 555     // For :: (relations) only words, no comma,
 556     // but for := (attributes) comma and dots are allowed. Units with groupsep.
 557     // Ending dots or comma are not part of the link.
 558     public $_match_regexp = "(?: \w+:=\S+(?<![\.,]))|(?: \w+::[\w\.]+(?<!\.))";
 559
 560     function markup($match)
 561     {
 562         return new Cached_SemanticLink(UnWikiEscape($match));
 563     }
 564 }
 565
 566 class Markup_wikiword extends SimpleMarkup
 567 {
 568     function getMatchRegexp()
 569     {
 570         global $WikiNameRegexp;
 571         if (!trim($WikiNameRegexp))
 572             return " " . WIKI_NAME_REGEXP;
 573         return " $WikiNameRegexp";
 574     }
 575
 576     function markup($match)
 577     {
 578         if (!$match) return false;
 579         if ($this->_isWikiUserPage($match))
 580             return new Cached_UserLink($match); //$this->_UserLink($match);
 581         else
 582             return new Cached_WikiLink($match);
 583     }
 584
 585     // FIXME: there's probably a more useful place to put these two functions
 586     function _isWikiUserPage($page)
 587     {
 588         global $request;
 589         $dbi = $request->getDbh();
 590         $page_handle = $dbi->getPage($page);
 591         if ($page_handle and $page_handle->get('pref'))
 592             return true;
 593         else
 594             return false;
 595     }
 596
 597     function _UserLink($PageName)
 598     {
 599         $link = HTML::a(array('href' => $PageName));
 600         $link->pushContent(PossiblyGlueIconToText('wikiuser', $PageName));
 601         $link->setAttr('class', 'wikiuser');
 602         return $link;
 603     }
 604 }
 605
 606 class Markup_linebreak extends SimpleMarkup
 607 {
 608     public $_match_regexp = "(?: (?<! %) %%% (?! %) | \\\\\\\\ | <\s*(?:br|BR)\s*> | <\s*(?:br|BR)\s*\/\s*> )";
 609
 610     function markup($match)
 611     {
 612         return HTML::br();
 613     }
 614 }
 615
 616 class Markup_wikicreole_italics extends BalancedMarkup
 617 {
 618     function getStartRegexp()
 619     {
 620         return "\\/\\/";
 621     }
 622
 623     function getEndRegexp($match)
 624     {
 625         return "\\/\\/";
 626     }
 627
 628     function markup($match, $body)
 629     {
 630         $tag = 'em';
 631         return new HtmlElement($tag, $body);
 632     }
 633 }
 634
 635 class Markup_wikicreole_bold extends BalancedMarkup
 636 {
 637     function getStartRegexp()
 638     {
 639         return "\\*\\*";
 640     }
 641
 642     function getEndRegexp($match)
 643     {
 644         return "\\*\\*";
 645     }
 646
 647     function markup($match, $body)
 648     {
 649         $tag = 'strong';
 650         return new HtmlElement($tag, $body);
 651     }
 652 }
 653
 654 class Markup_wikicreole_monospace extends BalancedMarkup
 655 {
 656     function getStartRegexp()
 657     {
 658         return "\\#\\#";
 659     }
 660
 661     function getEndRegexp($match)
 662     {
 663         return "\\#\\#";
 664     }
 665
 666     function markup($match, $body)
 667     {
 668         return new HtmlElement('span', array('class' => 'tt'), $body);
 669     }
 670 }
 671
 672 class Markup_wikicreole_underline extends BalancedMarkup
 673 {
 674     function getStartRegexp()
 675     {
 676         return "\\_\\_";
 677     }
 678
 679     function getEndRegexp($match)
 680     {
 681         return "\\_\\_";
 682     }
 683
 684     function markup($match, $body)
 685     {
 686         $tag = 'u';
 687         return new HtmlElement($tag, $body);
 688     }
 689 }
 690
 691 class Markup_wikicreole_superscript extends BalancedMarkup
 692 {
 693     function getStartRegexp()
 694     {
 695         return "\\^\\^";
 696     }
 697
 698     function getEndRegexp($match)
 699     {
 700         return "\\^\\^";
 701     }
 702
 703     function markup($match, $body)
 704     {
 705         $tag = 'sup';
 706         return new HtmlElement($tag, $body);
 707     }
 708 }
 709
 710 class Markup_wikicreole_subscript extends BalancedMarkup
 711 {
 712     function getStartRegexp()
 713     {
 714         return ",,";
 715     }
 716
 717     function getEndRegexp($match)
 718     {
 719         return ",,";
 720     }
 721
 722     function markup($match, $body)
 723     {
 724         $tag = 'sub';
 725         return new HtmlElement($tag, $body);
 726     }
 727 }
 728
 729 class Markup_old_emphasis extends BalancedMarkup
 730 {
 731     function getStartRegexp()
 732     {
 733         return "''";
 734     }
 735
 736     function getEndRegexp($match)
 737     {
 738         return "''";
 739     }
 740
 741     function markup($match, $body)
 742     {
 743         $tag = 'em';
 744         return new HtmlElement($tag, $body);
 745     }
 746 }
 747
 748 class Markup_nestled_emphasis extends BalancedMarkup
 749 {
 750     function getStartRegexp()
 751     {
 752         static $start_regexp = false;
 753
 754         if (!$start_regexp) {
 755             // The three possible delimiters
 756             // (none of which can be followed by itself.)
 757             $i = "_ (?! _)";
 758             $b = "\\* (?! \\*)";
 759             $tt = "= (?! =)";
 760
 761             $any = "(?: ${i}|${b}|${tt})"; // any of the three.
 762
 763             // Any of [_*=] is okay if preceded by space or one of [-"'/:]
 764             $start[] = "(?<= \\s|^|[-\"'\\/:]) ${any}";
 765
 766             // _ or * is okay after = as long as not immediately followed by =
 767             $start[] = "(?<= =) (?: ${i}|${b}) (?! =)";
 768             // etc...
 769             $start[] = "(?<= _) (?: ${b}|${tt}) (?! _)";
 770             $start[] = "(?<= \\*) (?: ${i}|${tt}) (?! \\*)";
 771
 772             // any delimiter okay after an opening brace ( [{<(] )
 773             // as long as it's not immediately followed by the matching closing
 774             // brace.
 775             $start[] = "(?<= { ) ${any} (?! } )";
 776             $start[] = "(?<= < ) ${any} (?! > )";
 777             $start[] = "(?<= \\( ) ${any} (?! \\) )";
 778
 779             $start = "(?:" . join('|', $start) . ")";
 780
 781             // Any of the above must be immediately followed by non-whitespace.
 782             $start_regexp = $start . "(?= \S)";
 783         }
 784
 785         return $start_regexp;
 786     }
 787
 788     function getEndRegexp($match)
 789     {
 790         $chr = preg_quote($match);
 791         return "(?<= \S | ^ ) (?<! $chr) $chr (?! $chr) (?= \s | [-)}>\"'\\/:.,;!? _*=] | $)";
 792     }
 793
 794     function markup($match, $body)
 795     {
 796         switch ($match) {
 797             case '*':
 798                 return new HtmlElement('b', $body);
 799             case '=':
 800                 return new HtmlElement('span', array('class' => 'tt'), $body);
 801             case '_':
 802                 return new HtmlElement('i', $body);
 803         }
 804         return null;
 805     }
 806 }
 807
 808 class Markup_html_emphasis extends BalancedMarkup
 809 {
 810     function getStartRegexp()
 811     {
 812         return "<(?: b|big|i|small|tt|em|strong|cite|code|dfn|kbd|samp|s|strike|del|var|sup|sub )>";
 813     }
 814
 815     function getEndRegexp($match)
 816     {
 817         return "<\\/" . substr($match, 1);
 818     }
 819
 820     function markup($match, $body)
 821     {
 822         $tag = substr($match, 1, -1);
 823         if (($tag == 'big') || ($tag == 'strike') || ($tag == 'tt')) {
 824             return new HtmlElement('span', array('class' => $tag), $body);
 825         }
 826         return new HtmlElement($tag, $body);
 827     }
 828 }
 829
 830 class Markup_html_divspan extends BalancedMarkup
 831 {
 832     function getStartRegexp()
 833     {
 834         return "<(?: div|span )(?: \s[^>]*)?>";
 835     }
 836     function getEndRegexp($match)
 837     {
 838         if (substr($match, 1, 4) == 'span')
 839             $tag = 'span';
 840         else
 841             $tag = 'div';
 842         return "<\\/" . $tag . '>';
 843     }
 844
 845     function markup($match, $body)
 846     {
 847         if (substr($match, 1, 4) == 'span')
 848             $tag = 'span';
 849         else
 850             $tag = 'div';
 851         $rest = substr($match, 1 + strlen($tag), -1);
 852         if (!empty($rest)) {
 853             $args = parse_attributes($rest);
 854         } else {
 855             $args = array();
 856         }
 857         return new HtmlElement($tag, $args, $body);
 858     }
 859 }
 860
 861 class Markup_html_abbr extends BalancedMarkup
 862 {
 863     //rurban: abbr|acronym need an optional title tag.
 864     //sf.net bug #728595
 865
 866     function getStartRegexp()
 867     {
 868         return  "<(?: abbr|acronym )(?: [^>]*)?>";
 869     }
 870
 871     function getEndRegexp($match)
 872     {
 873         if (substr($match, 1, 4) == 'abbr')
 874             $tag = 'abbr';
 875         else
 876             $tag = 'acronym';
 877         return "<\\/" . $tag . '>';
 878     }
 879
 880     function markup($match, $body)
 881     {
 882         // 'acronym' is deprecated in HTML 5, replace by 'abbr'
 883         $tag = 'abbr';
 884         $rest = substr($match, 1 + strlen($tag), -1);
 885         $attrs = parse_attributes($rest);
 886         // Remove attributes other than title and lang
 887         $allowedargs = array();
 888         foreach ($attrs as $key => $value) {
 889             if (in_array($key, array("title", "lang"))) {
 890                 $allowedargs[$key] = $value;
 891             }
 892         }
 893         return new HtmlElement($tag, $allowedargs, $body);
 894     }
 895 }
 896
 897 /** ENABLE_MARKUP_COLOR
 898  *  See http://www.pmwiki.org/wiki/PmWiki/WikiStyles and
 899  *      http://www.flexwiki.com/default.aspx/FlexWiki/FormattingRules.html
 900  */
 901 class Markup_color extends BalancedMarkup
 902 {
 903     // %color=blue% blue text %% and back to normal
 904
 905     function getStartRegexp()
 906     {
 907         return  "%color=(?: [^%]*)%";
 908     }
 909
 910     function getEndRegexp($match)
 911     {
 912         return "%%";
 913     }
 914
 915     function markup($match, $body)
 916     {
 917         $color = strtolower(substr($match, 7, -1));
 918
 919         $morecolors = array('beige' => '#f5f5dc',
 920             'brown' => '#a52a2a',
 921             'chocolate' => '#d2691e',
 922             'cyan' => '#00ffff',
 923             'gold' => '#ffd700',
 924             'ivory' => '#fffff0',
 925             'indigo' => '#4b0082',
 926             'magenta' => '#ff00ff',
 927             'orange' => '#ffa500',
 928             'pink' => '#ffc0cb',
 929             'salmon' => '#fa8072',
 930             'snow' => '#fffafa',
 931             'turquoise' => '#40e0d0',
 932             'violet' => '#ee82ee',
 933         );
 934
 935         if (isset($morecolors[$color])) {
 936             $color = $morecolors[$color];
 937         }
 938
 939         // HTML 4 defines the following 16 colors
 940         if (in_array($color, array('aqua', 'black', 'blue', 'fuchsia',
 941             'gray', 'green', 'lime', 'maroon',
 942             'navy', 'olive', 'purple', 'red',
 943             'silver', 'teal', 'white', 'yellow'))
 944             or ((substr($color, 0, 1) == '#')
 945                 and ((strlen($color) == 4) or (strlen($color) == 7))
 946                     and (strspn(substr($color, 1), '0123456789abcdef') == strlen($color) - 1))
 947         ) {
 948             return new HtmlElement('span', array('style' => "color: $color"), $body);
 949         } else {
 950             return new HtmlElement('span', array('class' => 'error'),
 951                 sprintf(_("unknown color %s ignored"), substr($match, 7, -1)));
 952         }
 953     }
 954 }
 955
 956 // Wikicreole placeholder
 957 // <<<placeholder>>>
 958 class Markup_placeholder extends SimpleMarkup
 959 {
 960     public $_match_regexp = '<<<.*?>>>';
 961
 962     function markup($match)
 963     {
 964         return HTML::span($match);
 965     }
 966 }
 967
 968 // Single-line HTML comment
 969 // <!-- This is a comment -->
 970 class Markup_html_comment extends SimpleMarkup
 971 {
 972     public $_match_regexp = '<!--.*?-->';
 973
 974     function markup($match)
 975     {
 976         return HTML::raw('');
 977     }
 978 }
 979
 980 // Special version for single-line plugins formatting,
 981 //  like: '<small>< ?plugin PopularNearby ? ></small>'
 982 class Markup_plugin extends SimpleMarkup
 983 {
 984     public $_match_regexp = '<\?plugin(?:-form)?\s[^\n]+?\?>';
 985
 986     function markup($match)
 987     {
 988         return new Cached_PluginInvocation($match);
 989     }
 990 }
 991
 992 // Special version for single-line Wikicreole plugins formatting.
 993 class Markup_plugin_wikicreole extends SimpleMarkup
 994 {
 995     public $_match_regexp = '<<[^\n]+?>>';
 996
 997     function markup($match)
 998     {
 999         $pi = str_replace("<<", "<?plugin ", $match);
1000         $pi = str_replace(">>", " ?>", $pi);
1001         return new Cached_PluginInvocation($pi);
1002     }
1003 }
1004
/**
 *  Mediawiki-style nowiki section:
 *  <nowiki>...</nowiki>
 */
class Markup_nowiki extends SimpleMarkup
{
    public $_match_regexp = '<nowiki>.*?<\/nowiki>';

    /**
     * @param string $match The whole section, both tags included.
     * @return mixed The result of HTML::raw() on the text between the tags.
     */
    public function markup($match)
    {
        // Drop the 8-character "<nowiki>" and the 9-character "</nowiki>".
        $inner = substr($match, 8, -9);

        // NOTE(review): the inner text goes through HTML::raw(); if that emits
        // its argument unescaped, markup typed by the page author reaches the
        // output verbatim -- confirm this is intended.
        return HTML::raw($inner);
    }
}
1019
/**
 *  Wikicreole inline preformatted text:
 *  {{{ ... }}}
 */
class Markup_wikicreole_preformatted extends SimpleMarkup
{
    public $_match_regexp = '\{\{\{.*?\}\}\}';

    /**
     * @param string $match The whole section, braces included.
     * @return HtmlElement A span of class "tt" holding the text between the braces.
     */
    public function markup($match)
    {
        // Remove the three opening and the three closing braces.
        $inner = substr($match, 3, -3);
        $attrs = array('class' => 'tt');
        return new HtmlElement('span', $attrs, $inner);
    }
}
1035
/** ENABLE_MARKUP_TEMPLATE
 *  Template syntax similar to Mediawiki
 *    {{template}}
 *      => < ? plugin Template page=template ? >
 *    {{template|var1=value1|var2=value|...}}
 *      => < ? plugin Template page=template var=value ... ? >
 *
 *  The {{...}} syntax is also used for the following, which are
 *  checked in this order before falling back to a template:
 *   - predefined icons
 *   - Wikicreole images
 *   - videos
 */
class Markup_template_plugin extends SimpleMarkup
{
    // patch #1732793: allow \n, mult. {{ }} in one line, and single letters
    public $_match_regexp = '\{\{.*?\}\}';

    /**
     * @param string $match The whole "{{...}}" text.
     * @return mixed An image link (icons and images) or a
     *               Cached_PluginInvocation (videos and templates).
     */
    public function markup($match)
    {
        // Text between the double braces, without surrounding whitespace.
        $page = trim(substr($match, 2, -2));

        // Check for predefined icons.
        $predefinedicons = array(
            ":)" => "ic_smile.png",
            ":(" => "ic_sad.png",
            ":P" => "ic_tongue.png",
            ":D" => "ic_biggrin.png",
            ";)" => "ic_wink.png",
            "(y)" => "ic_handyes.png",
            "(n)" => "ic_handno.png",
            "(i)" => "ic_info.png",
            "(/)" => "ic_check.png",
            "(x)" => "ic_cross.png",
            "(!)" => "ic_danger.png",
            "(+)" => "ic_plus.png",
            "(-)" => "ic_minus.png",
            "(?)" => "ic_help.png",
            "(on)" => "ic_lighton.png",
            "(off)" => "ic_lightoff.png",
            "(*)" => "ic_yellowstar.png",
            "(*r)" => "ic_redstar.png",
            "(*g)" => "ic_greenstar.png",
            "(*b)" => "ic_bluestar.png",
            "(*y)" => "ic_yellowstar.png",
        );
        if (isset($predefinedicons[$page])) {
            $icon = $predefinedicons[$page];
            return LinkImage(DATA_PATH . "/themes/default/images/$icon", $page);
        }

        // Split "name|alternative text" at the first pipe, if there is one.
        $pipe = strpos($page, "|");
        if ($pipe === false) {
            $imagename = $page;
            $alt = "";
        } else {
            $imagename = substr($page, 0, $pipe);
            $alt = ltrim(strstr($page, "|"), "|");
        }

        // It's not a Mediawiki template, it's a Wikicreole image
        if (is_image($imagename)) {
            $is_absolute_url = (strpos($imagename, "http://") === 0)
                || (strpos($imagename, "https://") === 0);
            if ($is_absolute_url) {
                return LinkImage($imagename, $alt);
            }
            if ($imagename[0] == '/') {
                return LinkImage(DATA_PATH . '/' . $imagename, $alt);
            }
            return LinkImage(getUploadDataPath() . $imagename, $alt);
        }

        // It's a video
        if (is_video($imagename)) {
            return new Cached_PluginInvocation(
                '<' . '?plugin Video file="' . $imagename . '" ?' . '>'
            );
        }

        // From here on it is a template.  Newlines are removed, and because
        // an argument value might contain a double quote (") the text is
        // HTML-encoded before it is placed between quotes.
        $page = htmlspecialchars(str_replace("\n", "", $page));

        // "name|a=1|b" becomes page "name" plus the argument list a="1" "b".
        $vars = '';
        if (preg_match('/^(\S+?)\|(.*)$/', $page, $_m)) {
            $page = $_m[1];
            $vars = '"' . str_replace('|', '" "', $_m[2]) . '"';
            $vars = preg_replace('/"(\S+)=([^"]*)"/', '\\1="\\2"', $vars);
        }

        // page may contain a version number
        // {{foo?version=5}}
        // in that case, output is "page=foo rev=5"
        if (strpos($page, "?") !== false) {
            $page = str_replace("?version=", "\" rev=\"", $page);
        }

        $arguments = $vars ? ' ' . $vars : '';
        $s = '<' . '?plugin Template page="' . $page . '"' . $arguments . ' ?' . '>';
        return new Cached_PluginInvocation($s);
    }
}
1141
/**
 * Replaces a few plain-text sequences and named entities by HTML entities:
 * "..." => "&#133;", "---" => "&mdash;", "--" => "&ndash;",
 * "(C)" => "&copy;"; "&copy;" and "&trade;" are passed through.
 *
 * Both transformers visible in this file list 'html_entities' only inside
 * a comment, i.e. they do not enable this markup.
 */
class Markup_html_entities extends SimpleMarkup
{
    /**
     * Map from matched text to the raw HTML emitted for it.
     * Declared explicitly so that it is not created as a dynamic property.
     */
    public $_entities = array();

    function __construct()
    {
        // Order matters: the keys become regexp alternatives, and the first
        // alternative that matches wins.  '---' must therefore come before
        // '--', otherwise a triple dash is consumed as an ndash.
        $this->_entities = array('...' => '&#133;',
            '---' => '&mdash;',
            '--' => '&ndash;',
            '(C)' => '&copy;',
            '&copy;' => '&copy;',
            '&trade;' => '&trade;',
        );
        // Non-capturing group "(?:".  The former "(: " opened a capturing
        // group whose first alternative had to start with a literal colon.
        // No padding blanks are used, so the pattern does not depend on
        // the extended (x) modifier.
        $this->_match_regexp =
            '(?:' .
                join('|', array_map('preg_quote', array_keys($this->_entities))) .
                ')';
    }

    /**
     * @param string $match One of the keys of $_entities.
     * @return mixed The result of HTML::raw() on the mapped entity.
     */
    function markup($match)
    {
        return HTML::raw($this->_entities[$match]);
    }
}
1170
/**
 * Decimal numeric character references with two to five digits,
 * like &#164;
 */
class Markup_isonumchars extends SimpleMarkup
{
    public $_match_regexp = '\&\#\d{2,5};';

    /**
     * @param string $match The character reference as written in the text.
     * @return mixed The result of HTML::raw() on the unchanged reference.
     */
    public function markup($match)
    {
        $reference = $match;
        return HTML::raw($reference);
    }
}
1180
/**
 * Hexadecimal numeric character references with two to four hex digits:
 * hexnums, like &#x00A4; <=> &curren;
 */
class Markup_isohexchars extends SimpleMarkup
{
    public $_match_regexp = '\&\#x[0-9a-fA-F]{2,4};';

    /**
     * @param string $match The character reference as written in the text.
     * @return mixed The result of HTML::raw() on the unchanged reference.
     */
    public function markup($match)
    {
        $reference = $match;
        return HTML::raw($reference);
    }
}
1191
1192 // FIXME: Do away with magic phpwiki forms.  (Maybe phpwiki: links too?)
1193
/**
 * Applies a list of inline markup objects to a piece of wiki text.
 *
 * Every markup object contributes one regexp (its match regexp for
 * SimpleMarkup, its start regexp otherwise).  parse() repeatedly looks
 * for the next match of any of them and lets the owning markup object
 * produce the output for it.
 */
class InlineTransformer
{
    /** Regexps of the registered markups, in registration order. */
    public $_regexps = array();
    /** Markup objects, parallel to $_regexps. */
    public $_markup = array();

    /**
     * @param array $markup_types Markup type names; for each name the class
     *        "Markup_<name>" is instantiated.  When empty, the default list
     *        is used and the optional DIVSPAN/COLOR/TEMPLATE markups are
     *        added if their ENABLE_MARKUP_* constants are set.
     */
    function __construct($markup_types = array())
    {
        global $request;
        // We need to extend the inline parsers by certain actions, like SearchHighlight,
        // SpellCheck and maybe CreateToc.
        if (empty($markup_types)) {
            $non_default = false;
            $markup_types = array
            ('escape', 'wikicreolebracketlink', 'bracketlink', 'url',
                'html_comment', 'placeholder',
                'interwiki', 'semanticlink', 'wikiword', 'linebreak',
                'wikicreole_superscript',
                'wikicreole_subscript',
                'wikicreole_italics', 'wikicreole_bold',
                'wikicreole_monospace',
                'wikicreole_underline',
                'old_emphasis', 'nestled_emphasis',
                'html_emphasis', 'html_abbr', 'plugin', 'plugin_wikicreole',
                'isonumchars', 'isohexchars', /*'html_entities'*/
            );
            if (defined('DISABLE_MARKUP_WIKIWORD') and DISABLE_MARKUP_WIKIWORD)
                $markup_types = array_remove($markup_types, 'wikiword');

            // The extra markups are inserted directly after 'url'.  The
            // position of 'url' is looked up: a fixed index of 2 used to
            // replace 'bracketlink' and left 'url' in the list twice.
            $action = $request->getArg('action');
            if ($action == 'SpellCheck' and $request->getArg('suggestions')) {
                $markup_types = $this->_insertAfterUrl($markup_types, 'spellcheck');
            }
            if (isset($request->_searchhighlight)) {
                $markup_types = $this->_insertAfterUrl($markup_types, 'searchhighlight');
                //$request->setArg('searchhighlight', false);
            }
        } else {
            $non_default = true;
        }
        foreach ($markup_types as $mtype) {
            $class = "Markup_$mtype";
            $this->_addMarkup(new $class);
        }
        $this->_addMarkup(new Markup_nowiki);
        if (defined('ENABLE_MARKUP_DIVSPAN') and ENABLE_MARKUP_DIVSPAN and !$non_default)
            $this->_addMarkup(new Markup_html_divspan);
        if (defined('ENABLE_MARKUP_COLOR') and ENABLE_MARKUP_COLOR and !$non_default)
            $this->_addMarkup(new Markup_color);
        // Markup_wikicreole_preformatted must be before Markup_template_plugin
        $this->_addMarkup(new Markup_wikicreole_preformatted);
        if (defined('ENABLE_MARKUP_TEMPLATE') and ENABLE_MARKUP_TEMPLATE and !$non_default)
            $this->_addMarkup(new Markup_template_plugin);
    }

    /**
     * Return $markup_types with $mtype inserted directly after 'url'.
     *
     * @param array $markup_types List of markup type names.
     * @param string $mtype Markup type name to insert.
     * @return array The re-indexed list.  If 'url' is not in the list,
     *         $mtype is appended at the end.
     */
    function _insertAfterUrl($markup_types, $mtype)
    {
        // Re-index first so that the key found below is also the offset
        // array_splice() expects.
        $markup_types = array_values($markup_types);
        $pos = array_search('url', $markup_types, true);
        if ($pos === false) {
            $markup_types[] = $mtype;
        } else {
            array_splice($markup_types, $pos + 1, 0, array($mtype));
        }
        return $markup_types;
    }

    /**
     * Register a markup object together with its regexp.
     *
     * @param object $markup A SimpleMarkup (match regexp) or any other
     *        markup object providing getStartRegexp().
     */
    function _addMarkup($markup)
    {
        if (is_a($markup, 'SimpleMarkup'))
            $regexp = $markup->getMatchRegexp();
        else
            $regexp = $markup->getStartRegexp();

        // NOTE(review): $_markup is filled with numeric keys below, so this
        // isset() on a regexp key looks like it can never be true -- confirm
        // whether a duplicate-regexp check was intended.
        assert(!isset($this->_markup[$regexp]));
        assert(strlen(trim($regexp)) > 0);
        $this->_regexps[] = $regexp;
        $this->_markup[] = $markup;
    }

    /**
     * Parse $text up to the first of $end_regexps.
     *
     * @param string $text Input text.  On success it is replaced by the text
     *        following the matched end pattern.
     * @param array $end_regexps End patterns; only the first one is searched
     *        for here, the complete list is passed on when parsing the body
     *        of a nested markup.
     * @return XmlContent|false The parsed content, or false when not even
     *         the end pattern matched.
     */
    function parse(&$text, $end_regexps = array('$'))
    {
        $regexps = $this->_regexps;

        // $end_re takes precedence: "favor reduce over shift"
        array_unshift($regexps, $end_regexps[0]);
        //array_push($regexps, $end_regexps[0]);
        $regexps = new RegexpSet($regexps);

        $input = $text;
        $output = new XmlContent;

        $match = $regexps->match($input);

        while ($match) {
            if ($match->regexp_ind == 0) {
                // No start pattern found before end pattern.
                // We're all done!
                if (isset($markup) and is_object($markup)
                    and is_a($markup, 'Markup_plugin')
                ) {
                    $current =& $output->_content[count($output->_content) - 1];
                    $current->setTightness(true, true);
                }
                $output->pushContent($match->prematch);
                $text = $match->postmatch;
                return $output;
            }

            // Index 0 is the end pattern, hence the offset of one.
            $markup = $this->_markup[$match->regexp_ind - 1];
            $body = $this->_parse_markup_body($markup, $match->match,
                $match->postmatch, $end_regexps);
            if (!$body) {
                // Couldn't match balanced expression.
                // Ignore and look for next matching start regexp.
                $match = $regexps->nextMatch($input, $match);
                continue;
            }

            // Matched markup.  Eat input, push output.
            // FIXME: combine adjacent strings.
            if (is_a($markup, 'SimpleMarkup'))
                $current = $markup->markup($match->match);
            else
                $current = $markup->markup($match->match, $body);
            $input = $match->postmatch;
            if (isset($markup) and is_object($markup)
                and is_a($markup, 'Markup_plugin')
            ) {
                $current->setTightness(true, true);
            }
            $output->pushContent($match->prematch, $current);

            $match = $regexps->match($input);
        }

        // No pattern matched, not even the end pattern.
        // Parse fails.
        return false;
    }

    /**
     * Parse the body of a markup whose start pattern has just matched.
     *
     * @param object $markup The markup object owning the matched regexp.
     * @param string $match The text matched by the start pattern.
     * @param string $text Text following the match; passed by reference to
     *        parse(), which replaces it by the text after the end pattern.
     * @param array $end_regexps End patterns of the enclosing markups.
     * @return mixed true for a SimpleMarkup (it has no body), the result of
     *         parse() for other markups, or false when the body cannot
     *         be parsed.
     */
    function _parse_markup_body($markup, $match, &$text, $end_regexps)
    {
        if (is_a($markup, 'SimpleMarkup')) {
            return true; // Done. SimpleMarkup is simple.
        }

        if (!is_object($markup)) {
           return false; // Some error: Should assert
        }
        array_unshift($end_regexps, $markup->getEndRegexp($match));

        // Optimization: if no end pattern in text, we know the
        // parse will fail.  This is an important optimization,
        // e.g. when text is "*lots *of *start *delims *with
        // *no *matching *end *delims".
        $ends_pat = "/(?:" . join(").*(?:", $end_regexps) . ")/xs";
        if (!@preg_match($ends_pat, $text)) { // Add "@" to avoid warning with "{{(*y)}}"
            return false;
        }
        return $this->parse($text, $end_regexps);
    }
}
1344
/**
 * InlineTransformer restricted to the escape and link markups.
 */
class LinkTransformer extends InlineTransformer
{
    public function __construct()
    {
        $link_markups = array(
            'escape', 'wikicreolebracketlink', 'bracketlink', 'url',
            'semanticlink', 'interwiki', 'wikiword',
        );
        parent::__construct($link_markups);
    }
}
1354
/**
 * InlineTransformer restricted to line breaks, HTML-style markup,
 * plugins and numeric character references.
 */
class NowikiTransformer extends InlineTransformer
{
    public function __construct()
    {
        $nowiki_markups = array(
            'linebreak',
            'html_emphasis', 'html_abbr', 'plugin', 'plugin_wikicreole',
            'isonumchars', 'isohexchars', /*'html_entities',*/
        );
        parent::__construct($nowiki_markups);
    }
}
1366
/**
 * Parse $text with the default InlineTransformer.
 *
 * @param string $text Wiki text to transform.
 * @param mixed $basepage When truthy, the parse result is wrapped in a
 *        CacheableMarkup together with this value.
 * @return mixed The parse result, or a CacheableMarkup around it.
 */
function TransformInline($text, $basepage = false)
{
    /**
      * @var WikiRequest $request
      */
    global $request;

    static $trfm;

    // The transformer is built on first use and kept for later calls;
    // the SpellCheck action always gets a freshly built one.
    $is_spellcheck = ($request->getArg('action') == 'SpellCheck');
    if ($is_spellcheck or empty($trfm)) {
        $trfm = new InlineTransformer;
    }

    $parsed = $trfm->parse($text);
    return $basepage ? new CacheableMarkup($parsed, $basepage) : $parsed;
}
1385
/**
 * Parse $text with a LinkTransformer.
 *
 * @param string $text Wiki text to transform.
 * @param mixed $basepage When truthy, the parse result is wrapped in a
 *        CacheableMarkup together with this value.
 * @return mixed The parse result, or a CacheableMarkup around it.
 */
function TransformLinks($text, $basepage = false)
{
    // Built on first use and kept for later calls.
    static $trfm;
    if (empty($trfm)) {
        $trfm = new LinkTransformer;
    }

    $parsed = $trfm->parse($text);
    return $basepage ? new CacheableMarkup($parsed, $basepage) : $parsed;
}
1399
/**
 * Transform only html markup and entities, using a NowikiTransformer.
 *
 * @param string $text Text to transform.
 * @param mixed $basepage When truthy, the parse result is wrapped in a
 *        CacheableMarkup together with this value.
 * @return mixed The parse result, or a CacheableMarkup around it.
 */
function TransformInlineNowiki($text, $basepage = false)
{
    // Built on first use and kept for later calls.
    static $trfm;
    if (empty($trfm)) {
        $trfm = new NowikiTransformer;
    }

    $parsed = $trfm->parse($text);
    return $basepage ? new CacheableMarkup($parsed, $basepage) : $parsed;
}
1415
1416 // Local Variables:
1417 // mode: php
1418 // tab-width: 8
1419 // c-basic-offset: 4
1420 // c-hanging-comment-ender-p: nil
1421 // indent-tabs-mode: nil
1422 // End: