2 rcs_id('$Id: InlineParser.php,v 1.46 2004-05-08 19:55:29 rurban Exp $');
3 /* Copyright (C) 2002, Geoffrey T. Dairiki <dairiki@dairiki.org>
5 * This file is part of PhpWiki.
7 * PhpWiki is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * PhpWiki is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with PhpWiki; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 * This is the code which deals with the inline part of the (new-style)
26 * @author Geoffrey T. Dairiki
32 * This is the character used in wiki markup to escape characters with special meaning.
35 define('ESCAPE_CHAR', '~');
37 require_once('lib/HtmlElement.php');
38 require_once('lib/CachedMarkup.php');
39 //require_once('lib/interwiki.php');
40 require_once('lib/stdlib.php');
/**
 * Escape every '#' in a piece of text for use in wiki markup.
 *
 * @param string $text Raw text.
 * @return string $text with ESCAPE_CHAR inserted in front of each '#'.
 */
function WikiEscape($text)
{
    $hash = '#';
    $escaped_hash = ESCAPE_CHAR . $hash;

    return str_replace($hash, $escaped_hash, $text);
}
/**
 * Undo wiki escaping.
 *
 * Every occurrence of ESCAPE_CHAR followed by one character is replaced
 * by that character alone.  The '.' in the pattern does not match a
 * newline, so an escape character directly before a line break is kept.
 *
 * @param string $text Text containing escape sequences.
 * @return string Text with the escape characters removed.
 */
function UnWikiEscape($text)
{
    // Quote the escape character: it is spliced into a regexp and must be
    // taken literally even if it is ever redefined to a character that is
    // special in PCRE or equal to the '/' delimiter.
    $pattern = '/' . preg_quote(ESCAPE_CHAR, '/') . '(.)/';

    return preg_replace($pattern, '\1', $text);
}
52 * Return type from RegexpSet::match and RegexpSet::nextMatch.
56 class RegexpSet_match {
58 * The text leading up to the next match.
68 * The text following the matched text.
73 * Index of the regular expression which matched.
79 * A set of regular expressions.
81 * This class is probably only useful for InlineTransformer.
87 * @param array $regexps A list of regular expressions. The
88 * regular expressions should not include any sub-pattern groups
89 * "(...)". (Anonymous groups, like "(?:...)", as well as
90 * look-ahead and look-behind assertions are okay.)
92 function RegexpSet ($regexps) {
// Keep the list of patterns with exact duplicates removed.
// NOTE(review): array_unique() preserves the original keys, so dropping a
// duplicate would leave a gap in the numeric indexes which _match() walks
// with a counting loop -- confirm that callers never pass duplicates.
94 $this->_regexps = array_unique($regexps);
/**
 * Search text for the next matching regexp from the Regexp Set.
 *
 * @param string $text The text to search.
 *
 * @return RegexpSet_match A RegexpSet_match object, or false if no match.
 */
function match ($text)
{
    // '*?' is the quantifier _match() puts on the text in front of the
    // match: any number of characters, as few as possible.
    $shortest_prefix = '*?';
    $found = $this->_match($text, $this->_regexps, $shortest_prefix);

    return $found;
}
109 * Search for next matching regexp.
111 * Here, 'next' has two meanings:
113 * Match the next regexp(s) in the set, at the same position as the last match.
115 * If that fails, match the whole RegexpSet, starting after the position of the
118 * @param string $text Text to search.
120 * @param RegexpSet_match $prevMatch A RegexpSet_match object.
121 * $prevMatch should be a match object obtained by a previous
122 * match upon the same value of $text.
124 * @return RegexpSet_match A RegexpSet_match object, or false if no match.
126 function nextMatch ($text, $prevMatch) {
127 // Try to find match at same position.
// Byte length of the text in front of the previous match, i.e. the
// offset at which that match started.
128 $pos = strlen($prevMatch->prematch);
// Only the regexps listed after the one which matched last time are
// candidates at this offset.
129 $regexps = array_slice($this->_regexps, $prevMatch->regexp_ind + 1);
// '{N}' makes the leading '.' of the pattern built by _match() consume
// exactly $pos characters.
131 $repeat = sprintf('{%d}', $pos);
132 if ( ($match = $this->_match($text, $regexps, $repeat)) ) {
// _match() reports an index into the sliced list; shift it back so that
// it indexes the full set again.
133 $match->regexp_ind += $prevMatch->regexp_ind + 1;
139 // Failed. Look for match after current position.
// '{N,}?' requires at least $pos + 1 leading characters, as few as possible.
140 $repeat = sprintf('{%d,}?', $pos + 1);
141 return $this->_match($text, $this->_regexps, $repeat);
// Matcher shared by match() and nextMatch().  $repeat is the quantifier
// applied to the leading '.', i.e. to the text skipped before the match.
145 function _match ($text, $regexps, $repeat) {
146 // certain php builds crash here:
147 // sf.net: Fatal error: Allowed memory size of 8388608 bytes exhausted
148 // (tried to allocate 634 bytes)
150 // So we try to minimize memory usage, by looping explicitly
151 // and storing only those regexp which actually match.
152 // There may be more than one, so we have to find the longest,
153 // and match inside until the shortest is empty.
155 for ($i=0; $i<count($regexps); $i++) {
156 // Syntax: http://www.pcre.org/pcre.txt
157 // x - EXTENDED, ignore whitespace
// Group 1 captures the skipped prefix, group 2 the text matched by this regexp.
161 $pat= "/ ( . $repeat ) ( " . $regexps[$i] . " ) /x";
162 if (preg_match($pat, $text, $_m)) {
// Remember the regexps which matched, keyed by their index in $regexps.
164 $matched[$i] = $regexps[$i];
168 if (empty($matched)) return false;
169 $match = new RegexpSet_match;
171 // Optimization: if the matches are only "$" and another, then omit "$"
172 if (count($matched) > 2) {
173 // we could do much better, if we would know the matching markup for the longest regexp match
// One alternation over the whole set.  Modifiers: A anchors the match at
// the start of $text, s lets '.' match newlines, x ignores whitespace.
174 $hugepat= "/ ( . $repeat ) ( (" . join(')|(', $regexps) . ") ) /Asx";
175 //$hugepat= "/ ( . $repeat ) ( (" . join(')|(', array_values($matched)) . ") ) /Asx";
176 if (! preg_match($hugepat, $text, $m)) {
// $m[0..2] are the whole match, the prefix and the matched text; every
// regexp adds one group after them.  This relies on preg_match() omitting
// trailing groups which took no part in the match, and on the regexps
// containing no capturing groups of their own.
179 $match->regexp_ind = count($m) - 4; // TODO: Optimisation with matched only
181 $match->regexp_ind = $regexp_ind;
// Everything after the whole match (prefix plus matched text) is left over.
184 $match->postmatch = substr($text, strlen($m[0]));
185 $match->prematch = $m[1];
186 $match->match = $m[2];
// NOTE(review): dump of the match details; the condition guarding it is
// not visible here.
189 PrintXML(HTML::dl(HTML::dt("input"),
190 HTML::dd(HTML::pre($text)),
192 HTML::dd(HTML::pre($match->match)),
194 HTML::dd(HTML::pre($regexps[$match->regexp_ind])),
195 HTML::dt("prematch"),
196 HTML::dd(HTML::pre($match->prematch))));
205 * A simple markup rule (i.e. terminal token).
207 * These are defined by a regexp.
209 * When a match is found for the regexp, the matching text is replaced.
210 * The replacement content is obtained by calling the SimpleMarkup::markup method.
/**
 * Get the regexp which recognises this token.
 *
 * @return string Regexp which matches this token.
 */
function getMatchRegexp ()
{
    $regexp = $this->_match_regexp;

    return $regexp;
}
/**
 * Markup matching text.
 *
 * This base version only raises a "pure virtual" user error; a rule
 * such as Markup_escape supplies its own markup().
 *
 * @param string $match The text which matched the regexp
 * (obtained from getMatchRegexp).
 *
 * @return mixed The expansion of the matched text.
 */
function markup ($match /*, $body */)
{
    $message = "pure virtual";
    trigger_error($message, E_USER_ERROR);
}
237 * A balanced markup rule.
239 * These are defined by a start regexp, and an end regexp.
/**
 * Get the starting regexp for this rule.
 *
 * @return string The starting regexp, as held in $_start_regexp.
 */
function getStartRegexp ()
{
    $start = $this->_start_regexp;

    return $start;
}
/**
 * Get the ending regexp for this rule.
 *
 * This default version ignores $match and returns $_end_regexp.
 *
 * @param string $match The text which matched the starting regexp.
 *
 * @return string The ending regexp.
 */
function getEndRegexp ($match)
{
    $end = $this->_end_regexp;

    return $end;
}
/**
 * Get expansion for matching input.
 *
 * This base version only raises a "pure virtual" user error.
 *
 * @param string $match The text which matched the starting regexp.
 *
 * @param mixed $body Transformed text found between the starting
 * and ending regexps.
 *
 * @return mixed The expansion of the matched text.
 */
function markup ($match, $body)
{
    $message = "pure virtual";
    trigger_error($message, E_USER_ERROR);
}
/**
 * Markup rule for escaped text: ESCAPE_CHAR followed either by a run of
 * alphanumerics or by any single character.
 */
class Markup_escape extends SimpleMarkup
{
    /**
     * @return string Regexp which matches an escape sequence.
     */
    function getMatchRegexp ()
    {
        $escaped_part = '(?: [[:alnum:]]+ | .)';

        return ESCAPE_CHAR . $escaped_part;
    }

    /**
     * @param string $match The escape sequence, including the escape character.
     * @return string The sequence without its leading escape character.
     */
    function markup ($match)
    {
        // A match is the escape character plus at least one more character.
        assert(strlen($match) >= 2);
        $unescaped = substr($match, 1);

        return $unescaped;
    }
}
/**
 * Tell whether a link (or label) denotes an inline image.
 *
 * Examples: [image.jpg size=50% border=5], [image.jpg size=50x30]
 * Support for the following attributes: see stdlib.php:LinkImage()
 *   size=<percent>%, size=<width>x<height>
 *   border=n, align=\w+, hspace=n, vspace=n
 *
 * @param string $link The text to examine.
 * @return bool True if $link ends in one of the INLINE_IMAGES extensions,
 * or contains such an extension followed by whitespace and one of the
 * attributes listed above; false otherwise (also for an empty $link).
 */
function isImageLink($link)
{
    if (!$link) {
        return false;
    }

    // Plain image name: the extension is the very end of the text.
    if (preg_match("/\\.(" . INLINE_IMAGES . ")$/i", $link)) {
        return true;
    }

    // Image name followed by at least one attribute.
    $has_attribute = preg_match("/\\.(" . INLINE_IMAGES . ")\s+(size|border|align|hspace|vspace)=/i", $link);

    return (bool) $has_attribute;
}
// Turn the text of a bracket link into a link, anchor or inline image,
// depending on what the link target looks like.
301 function LinkBracketLink($bracketlink) {
303 // $bracketlink will start and end with brackets; in between will
304 // be either a page name, a URL or both separated by a pipe.
306 // strip brackets and leading space
// NOTE(review): the return value of preg_match() is not checked in the
// lines visible here before $matches is unpacked.
307 preg_match('/(\#?) \[\s* (?: (.*?) \s* (?<!' . ESCAPE_CHAR . ')(\|) )? \s* (.+?) \s*\]/x',
308 $bracketlink, $matches);
// $hash: optional leading '#'; $label: text before an unescaped '|';
// $bar: that '|' itself; $rawlink: the link target.
309 list (, $hash, $label, $bar, $rawlink) = $matches;
311 $label = UnWikiEscape($label);
313 * Check if the user has typed an explicit URL. This solves the
314 * problem where the URLs have a ~ character, which would be stripped away.
315 * "[http:/server/~name/]" will work as expected
316 * "http:/server/~name/" will NOT work as expected, will remove the ~
318 if (strstr($rawlink, "http://") or strstr($rawlink, "https://"))
321 $link = UnWikiEscape($rawlink);
324 // if label looks like a url to an image, we want an image link.
325 if (isImageLink($label)) {
327 if (preg_match("/^" . $intermap->getRegexp() . ":/", $label)) {
328 $imgurl = $intermap->link($label);
329 $imgurl = $imgurl->getAttr('href');
330 } elseif (! preg_match("#^(" . ALLOWED_PROTOCOLS . "):#", $imgurl)) {
331 // local theme linkname like 'images/next.gif'.
333 $imgurl = $Theme->getImageURL($imgurl);
// The label becomes the image, linked to $link.
335 $label = LinkImage($imgurl, $link);
339 // It's an anchor, not a link...
340 $id = MangleXmlIdentifier($link);
341 return HTML::a(array('name' => $id, 'id' => $id),
342 $bar ? $label : $link);
// Targets which start with one of the allowed protocols.
345 if (preg_match("#^(" . ALLOWED_PROTOCOLS . "):#", $link)) {
346 // if it's an image, embed it; otherwise, it's a regular link
347 if (isImageLink($link))
348 return LinkImage($link, $label);
350 return new Cached_ExternalLink($link, $label);
352 elseif (preg_match("/^phpwiki:/", $link))
353 return new Cached_PhpwikiURL($link, $label);
355 * Inline images in Interwiki URLs:
356 * [File:my_image.gif] inlines the image,
357 * File:my_image.gif shows a plain inter-wiki link,
358 * [what a pic|File:my_image.gif] shows a named inter-wiki link to the gif
359 * [File:my_image.gif|what a pic] shows an inlined image linked to the page "what a pic"
361 elseif (strstr($link,':') and
362 ($intermap = getInterwikiMap()) and
363 preg_match("/^" . $intermap->getRegexp() . ":/", $link)) {
364 if (empty($label) && isImageLink($link)) {
365 // if without label => inlined image [File:xx.gif]
366 $imgurl = $intermap->link($link);
367 return LinkImage($imgurl->getAttr('href'), $label);
369 return new Cached_InterwikiLink($link, $label);
371 // Split anchor off end of pagename.
// The '#' must not be preceded by the escape character; because (.*) is
// greedy, the last such '#' is the separator.
372 if (preg_match('/\A(.*)(?<!'.ESCAPE_CHAR.')#(.*?)\Z/', $rawlink, $m)) {
373 list(,$rawlink,$anchor) = $m;
374 $pagename = UnWikiEscape($rawlink);
375 $anchor = UnWikiEscape($anchor);
383 return new Cached_WikiLink($pagename, $label, $anchor);
// Markup rule for bracket links; the matched text is handed to LinkBracketLink().
387 class Markup_bracketlink extends SimpleMarkup
// An optional '#', then a bracketed span which contains at least one
// character that is neither ']' nor whitespace.
389 var $_match_regexp = "\\#? \\[ .*? [^]\\s] .*? \\]";
391 function markup ($match) {
392 $link = LinkBracketLink($match);
393 assert($link->isInlineElement());
/**
 * Markup rule for bare URLs: one of the ALLOWED_PROTOCOLS, a colon and
 * the characters following it.
 */
class Markup_url extends SimpleMarkup
{
    /**
     * @return string Regexp which matches a bare URL.
     */
    function getMatchRegexp ()
    {
        // Not directly after an alphanumeric character ...
        $before_protocols = "(?<![[:alnum:]]) (?:";
        // ... and running up to whitespace, '<', '>' or a quote, without
        // ending in one of the characters listed in the final look-behind.
        $after_protocols = ") : [^\s<>\"']+ (?<![ ,.?; \] \) ])";

        return $before_protocols . ALLOWED_PROTOCOLS . $after_protocols;
    }

    /**
     * @param string $match The URL as found in the text.
     * @return Cached_ExternalLink Link to the unescaped URL.
     */
    function markup ($match)
    {
        $url = UnWikiEscape($match);

        return new Cached_ExternalLink($url);
    }
}
// Markup rule for bare interwiki links.
410 class Markup_interwiki extends SimpleMarkup
412 function getMatchRegexp () {
414 $map = getInterwikiMap();
// The map's regexp (not directly after an alphanumeric character), a ':'
// and a run of non-whitespace which does not end in one of the characters
// listed in the final look-behind.
415 return "(?<! [[:alnum:]])" . $map->getRegexp(). ": \S+ (?<![ ,.?;! \] \) \" \' ])";
418 function markup ($match) {
419 //$map = getInterwikiMap();
// The link is built from the matched text with wiki escapes removed.
420 return new Cached_InterwikiLink(UnWikiEscape($match));
// Markup rule for bare WikiWords; the pattern comes from the global
// $WikiNameRegexp.
424 class Markup_wikiword extends SimpleMarkup
426 function getMatchRegexp () {
427 global $WikiNameRegexp;
428 return " $WikiNameRegexp";
431 function markup ($match) {
432 if (!$match) return false;
// Names accepted by _isWikiUserPage() become user links, every other
// match an ordinary wiki link.
433 if ($this->_isWikiUserPage($match))
434 return new Cached_UserLink($match); //$this->_UserLink($match);
436 return new Cached_WikiLink($match);
439 // FIXME: there's probably a more useful place to put these two functions
440 function _isWikiUserPage ($page) {
442 $dbi = $request->getDbh();
443 $page_handle = $dbi->getPage($page);
// The test: a page handle was returned and its 'pref' entry is non-empty.
444 if ($page_handle and $page_handle->get('pref'))
// Builds an <a> element with class 'wikiuser'.  In the lines visible here
// its only mention is the commented-out call in markup().
450 function _UserLink($PageName) {
451 $link = HTML::a(array('href' => $PageName));
452 $link->pushContent(PossiblyGlueIconToText('wikiuser', $PageName));
453 $link->setAttr('class', 'wikiuser');
// Markup rule for explicit line breaks.
458 class Markup_linebreak extends SimpleMarkup
460 //var $_match_regexp = "(?: (?<! %) %%% (?! %) | <(?:br|BR)> | <(?:br|BR) \/> )";
// Matches exactly three '%' (neither preceded nor followed by another
// '%'), or a literal <br> / <BR> tag.
461 var $_match_regexp = "(?: (?<! %) %%% (?! %) | <(?:br|BR)> )";
463 function markup ($match) {
// Markup rule for the old-style emphasis delimiters '' and __.
468 class Markup_old_emphasis extends BalancedMarkup
470 var $_start_regexp = "''|__";
472 function getEndRegexp ($match) {
476 function markup ($match, $body) {
// '' produces <em>, the other delimiter (__) produces <strong>.
477 $tag = $match == "''" ? 'em' : 'strong';
478 return new HtmlElement($tag, $body);
// Markup rule for the nestable emphasis delimiters *, = and _.
482 class Markup_nestled_emphasis extends BalancedMarkup
484 function getStartRegexp() {
// The pattern is built on the first call and kept in a static variable.
485 static $start_regexp = false;
487 if (!$start_regexp) {
488 // The three possible delimiters
489 // (none of which can be followed by itself.)
494 $any = "(?: ${i}|${b}|${tt})"; // any of the three.
496 // Any of [_*=] is okay if preceded by space or one of [-"'/:]
497 $start[] = "(?<= \\s|^|[-\"'\\/:]) ${any}";
499 // _ or * is okay after = as long as not immediately followed by =
500 $start[] = "(?<= =) (?: ${i}|${b}) (?! =)";
// The same rule for the two delimiters other than the preceding one,
// after _ and after *.
502 $start[] = "(?<= _) (?: ${b}|${tt}) (?! _)";
503 $start[] = "(?<= \\*) (?: ${i}|${tt}) (?! \\*)";
506 // any delimiter okay after an opening brace ( [{<(] )
507 // as long as it's not immediately followed by the matching closing
509 $start[] = "(?<= { ) ${any} (?! } )";
510 $start[] = "(?<= < ) ${any} (?! > )";
511 $start[] = "(?<= \\( ) ${any} (?! \\) )";
513 $start = "(?:" . join('|', $start) . ")";
515 // Any of the above must be immediately followed by non-whitespace.
516 $start_regexp = $start . "(?= \S)";
519 return $start_regexp;
522 function getEndRegexp ($match) {
// The closing delimiter is the opening one, quoted for literal use.
523 $chr = preg_quote($match);
// It must follow non-whitespace (or a line start), must not be doubled,
// and must be followed by whitespace, one of the listed characters or
// the end of the text.
524 return "(?<= \S | ^ ) (?<! $chr) $chr (?! $chr) (?= \s | [-)}>\"'\\/:.,;!? _*=] | $)";
527 function markup ($match, $body) {
// '*' gives <b>, '=' gives <tt>, '_' gives <i>.
529 case '*': return new HtmlElement('b', $body);
530 case '=': return new HtmlElement('tt', $body);
531 case '_': return new HtmlElement('i', $body);
// Markup rule for a fixed list of inline HTML tags.
536 class Markup_html_emphasis extends BalancedMarkup
539 "<(?: b|big|i|small|tt|em|strong|cite|code|dfn|kbd|samp|var|sup|sub )>";
541 function getEndRegexp ($match) {
// "<tag>" becomes the pattern for "</tag>": the leading '<' is replaced
// by '<\/'.
542 return "<\\/" . substr($match, 1);
545 function markup ($match, $body) {
// Drop the surrounding '<' and '>' to obtain the tag name.
546 $tag = substr($match, 1, -1);
547 return new HtmlElement($tag, $body);
// Markup rule for <abbr> and <acronym>, each with an optional title attribute.
551 class Markup_html_abbr extends BalancedMarkup
553 //rurban: abbr|acronym need an optional title tag.
555 var $_start_regexp = "<(?: abbr|acronym )(?: \stitle=[^>]*)?>";
557 function getEndRegexp ($match) {
// The four characters after '<' tell an abbr tag from an acronym tag.
558 if (substr($match,1,4) == 'abbr')
562 return "<\\/" . $tag . '>';
565 function markup ($match, $body) {
566 if (substr($match,1,4) == 'abbr')
// Whatever lies between the tag name and the closing '>'.
570 $rest = substr($match,1+strlen($tag),-1);
// NOTE(review): $rest is split at '=' and, in the lines visible here, not
// trimmed, so the key presumably keeps its leading whitespace and the
// value any quotes -- confirm this is intended.
572 list($key,$val) = explode("=",$rest);
573 $args = array($key => $val);
574 } else $args = array();
575 return new HtmlElement($tag, $args, $body);
/**
 * Markup rule for plugin invocations written on a single line,
 * like: '<small>< ?plugin PopularNearby ? ></small>'
 */
class Markup_plugin extends SimpleMarkup
{
    var $_match_regexp = '<\?plugin(?:-form)?\s[^\n]+?\?>';

    /**
     * @param string $match The complete plugin invocation.
     * @return Cached_PluginInvocation Object built from the matched text.
     */
    function markup ($match)
    {
        // The tightness of the invocation is not set here.
        $invocation = new Cached_PluginInvocation($match);

        return $invocation;
    }
}
593 // TODO: "..." => "…" browser specific display (not cached?)
594 // TODO: "--" => "&emdash;" browser specific display (not cached?)
596 // FIXME: Do away with magic phpwiki forms. (Maybe phpwiki: links too?)
597 // FIXME: Do away with plugin-links. They seem not to be used.
// Transforms inline wiki markup by means of a list of markup rules.
601 class InlineTransformer
// Parallel lists, filled together by _addMarkup(): $_regexps[i] is the
// match (or start) regexp of the rule object $_markup[i].
603 var $_regexps = array();
604 var $_markup = array();
606 function InlineTransformer ($markup_types = false) {
// The full list of inline markup types.
608 $markup_types = array('escape', 'bracketlink', 'url',
609 'interwiki', 'wikiword', 'linebreak',
610 'old_emphasis', 'nestled_emphasis',
611 'html_emphasis', 'html_abbr', 'plugin');
613 foreach ($markup_types as $mtype) {
// Each type name selects the class Markup_<type>, which is instantiated
// and registered.
614 $class = "Markup_$mtype";
615 /*if ($GLOBALS['HTTP_SERVER_VARS']['SERVER_NAME'] == 'phpwiki.sourceforge.net' and
616 in_array($mtype,array('interwiki','plugin')))
618 $this->_addMarkup(new $class);
/**
 * Register a markup rule.
 *
 * The rule's regexp (the match regexp of a SimpleMarkup, the start
 * regexp of any other rule) is appended to $_regexps and the rule
 * itself to $_markup, so that both lists stay index-aligned.
 *
 * @param object $markup The markup rule to add.
 */
function _addMarkup ($markup)
{
    if (isa($markup, 'SimpleMarkup')) {
        $regexp = $markup->getMatchRegexp();
    } else {
        $regexp = $markup->getStartRegexp();
    }

    // Each regexp may be registered only once.  $_markup is indexed by
    // number, not by regexp, so the duplicate check has to search
    // $_regexps; an isset() on $_markup[$regexp] could never fire.
    assert(!in_array($regexp, $this->_regexps, true));
    $this->_regexps[] = $regexp;
    $this->_markup[] = $markup;
}
// Transform $text up to the first end regexp ($end_regexps[0]).  $text is
// taken by reference; when the end pattern is reached it is set to the
// input which follows that pattern.
633 function parse (&$text, $end_regexps = array('$')) {
634 $regexps = $this->_regexps;
636 // $end_re takes precedence: "favor reduce over shift"
637 array_unshift($regexps, $end_regexps[0]);
638 //array_push($regexps, $end_regexps[0]);
639 $regexps = new RegexpSet($regexps);
642 $output = new XmlContent;
644 $match = $regexps->match($input);
// Index 0 is the end regexp which was put in front of the list above.
647 if ($match->regexp_ind == 0) {
648 // No start pattern found before end pattern.
650 if (isa($markup,'Markup_plugin')) {
651 $output->_content[count($output->_content)-1]->setTightness(!empty($match->prematch),false);
653 $output->pushContent($match->prematch);
654 $text = $match->postmatch;
// Indexes in the RegexpSet are one higher than in $this->_markup because
// of the prepended end regexp.
658 $markup = $this->_markup[$match->regexp_ind - 1];
// $match->postmatch is passed by reference (third parameter of
// _parse_markup_body()).
659 $body = $this->_parse_markup_body($markup, $match->match, $match->postmatch, $end_regexps);
661 // Couldn't match balanced expression.
662 // Ignore and look for next matching start regexp.
663 $match = $regexps->nextMatch($input, $match);
667 // Matched markup. Eat input, push output.
668 // FIXME: combine adjacent strings.
669 $current = $markup->markup($match->match, $body);
670 $input = $match->postmatch;
671 if (isa($markup,'Markup_plugin')) {
672 $current->setTightness(!empty($match->prematch),!empty($match->postmatch));
674 $output->pushContent($match->prematch, $current);
676 $match = $regexps->match($input);
679 // No pattern matched, not even the end pattern.
// Obtain the body of a markup rule which has just matched.  $text is taken
// by reference and handed on to parse().
684 function _parse_markup_body ($markup, $match, &$text, $end_regexps) {
685 if (isa($markup, 'SimpleMarkup'))
686 return true; // Done. SimpleMarkup is simple.
688 if (!is_object($markup)) return false; // Some error: Should assert
// The rule's own end regexp goes in front of the enclosing ones, making it
// $end_regexps[0] for the nested parse() call.
689 array_unshift($end_regexps, $markup->getEndRegexp($match));
691 // Optimization: if no end pattern in text, we know the
692 // parse will fail. This is an important optimization,
693 // e.g. when text is "*lots *of *start *delims *with
694 // *no *matching *end *delims".
// All end regexps must occur in $text, in this order, with anything
// (including newlines) in between.
695 $ends_pat = "/(?:" . join(").*(?:", $end_regexps) . ")/xs";
696 if (!preg_match($ends_pat, $text))
698 return $this->parse($text, $end_regexps);
/**
 * An InlineTransformer restricted to escapes and the link-producing
 * markup types.
 */
class LinkTransformer extends InlineTransformer
{
    function LinkTransformer ()
    {
        $link_markup_types = array('escape', 'bracketlink', 'url',
                                   'interwiki', 'wikiword');

        $this->InlineTransformer($link_markup_types);
    }
}
// Transform the inline markup in $text with an InlineTransformer.
// NOTE(review): the conditions which select between the statements below
// are not visible here; presumably an old $markup version triggers
// ConvertOldMarkup() and a non-false $basepage the CacheableMarkup
// wrapper -- confirm.
710 function TransformInline($text, $markup = 2.0, $basepage=false) {
714 $trfm = new InlineTransformer;
718 $text = ConvertOldMarkup($text, 'inline');
722 return new CacheableMarkup($trfm->parse($text), $basepage);
724 return $trfm->parse($text);
// Transform only the links in $text, using a LinkTransformer.
// NOTE(review): the conditions which select between the statements below
// are not visible here; presumably an old $markup version triggers
// ConvertOldMarkup() and a non-false $basepage the CacheableMarkup
// wrapper -- confirm.
727 function TransformLinks($text, $markup = 2.0, $basepage = false) {
731 $trfm = new LinkTransformer;
735 $text = ConvertOldMarkup($text, 'links');
739 return new CacheableMarkup($trfm->parse($text), $basepage);
741 return $trfm->parse($text);
744 // (c-file-style: "gnu")
749 // c-hanging-comment-ender-p: nil
750 // indent-tabs-mode: nil