1 <?php rcs_id('$Id: InlineParser.php,v 1.20 2003-01-28 21:07:16 zorloc Exp $');
2 /* Copyright (C) 2002, Geoffrey T. Dairiki <dairiki@dairiki.org>
4 * This file is part of PhpWiki.
6 * PhpWiki is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * PhpWiki is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with PhpWiki; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 * This is the code which deals with the inline part of the (new-style)
25 * @author Geoffrey T. Dairiki
30 require_once('lib/HtmlElement.php');
31 require_once('lib/interwiki.php');
33 //FIXME: intubate ESCAPE_CHAR into BlockParser.php.
// Escape character for inline markup: Markup_escape matches it followed by
// one character, and SimpleMarkup::markup() strips it from matched text.
34 define('ESCAPE_CHAR', '~');
37 * Return type from RegexpSet::match and RegexpSet::nextMatch.
// Plain result record. RegexpSet::_match() creates one and fills in its
// prematch, match, postmatch and regexp_ind properties.
41 class RegexpSet_match {
43 * The text leading up to the next match.
53 * The text following the matched text.
58 * Index of the regular expression which matched.
64 * A set of regular expressions.
66 * This class is probably only useful for InlineTransformer.
72 * @param array $regexps A list of regular expressions. The
73 * regular expressions should not include any sub-pattern groups
74 * "(...)". (Anonymous groups, like "(?:...)", as well as
75 * look-ahead and look-behind assertions are okay.)
// Constructor: keeps the list of pattern fragments that match() and
// nextMatch() later pass to _match().
77 function RegexpSet ($regexps) {
78 $this->_regexps = $regexps;
82 * Search text for the next matching regexp from the Regexp Set.
84 * @param string $text The text to search.
86 * @return RegexpSet_match A RegexpSet_match object, or false if no match.
// Find the earliest match in $text: the lazy '*?' quantifier makes the
// skipped prefix as short as possible.
88 function match ($text) {
89 return $this->_match($text, $this->_regexps, '*?');
93 * Search for next matching regexp.
95 * Here, 'next' has two meanings:
97 * Match the next regexp(s) in the set, at the same position as the last match.
99 * If that fails, match the whole RegexpSet, starting after the position of the
102 * @param string $text Text to search.
104 * @param RegexpSet_match $prevMatch A RegexpSet_match object.
105 * $prevMatch should be a match object obtained by a previous
106 * match upon the same value of $text.
108 * @return RegexpSet_match A RegexpSet_match object, or false if no match.
// Continue searching from a previous match on the same $text: first at the
// same offset with the remaining regexps, then further along with all of them.
110 function nextMatch ($text, $prevMatch) {
111 // Try to find match at same position.
// The previous match began right after its prematch, so the prematch
// length (in bytes) is that match's offset.
112 $pos = strlen($prevMatch->prematch);
// Only regexps listed after the one that matched last time are candidates.
113 $regexps = array_slice($this->_regexps, $prevMatch->regexp_ind + 1);
// '{N}' forces the skipped prefix to be exactly $pos characters long.
115 $repeat = sprintf('{%d}', $pos);
116 if ( ($match = $this->_match($text, $regexps, $repeat)) ) {
// _match() numbered the regexp within the sliced list; shift it back so
// it indexes the full set again.
117 $match->regexp_ind += $prevMatch->regexp_ind + 1;
123 // Failed. Look for match after current position.
// '{N,}?' requires at least $pos + 1 skipped characters, as few as possible.
124 $repeat = sprintf('{%d,}?', $pos + 1);
125 return $this->_match($text, $this->_regexps, $repeat);
// Worker shared by match() and nextMatch(). Builds a single pattern of the
// form "skipped prefix, then one of $regexps" and reports which alternative
// matched. $repeat is the quantifier applied to '.' for the skipped prefix.
129 function _match ($text, $regexps, $repeat) {
// Group 1 = prefix, group 2 = the whole alternation, groups 3.. = one per
// regexp. Modifiers: A anchors at the start of $text, x ignores whitespace
// in the pattern, s lets '.' match newlines.
130 $pat= "/ ( . $repeat ) ( (" . join(')|(', $regexps) . ") ) /Axs";
132 if (! preg_match($pat, $text, $m)) {
136 $match = new RegexpSet_match;
// $m[0] spans prefix plus matched text, so the remainder starts after it.
137 $match->postmatch = substr($text, strlen($m[0]));
138 $match->prematch = $m[1];
139 $match->match = $m[2];
// The i-th regexp owns group 3 + i. preg_match() omits trailing groups that
// did not participate, so the last index in $m belongs to the alternative
// that matched and count($m) - 4 is its position in $regexps. This only
// holds if the regexps contain no capturing groups of their own.
140 $match->regexp_ind = count($m) - 4;
// NOTE(review): the PrintXML(...) lines below look like a debugging dump;
// this listing does not show whether they are commented out or otherwise
// disabled -- confirm against the full file.
143 PrintXML(HTML::dl(HTML::dt("input"),
144 HTML::dd(HTML::pre($text)),
146 HTML::dd(HTML::pre($match->match)),
148 HTML::dd(HTML::pre($regexps[$match->regexp_ind])),
149 HTML::dt("prematch"),
150 HTML::dd(HTML::pre($match->prematch))));
159 * A simple markup rule (i.e. terminal token).
161 * These are defined by a regexp.
163 * When a match is found for the regexp, the matching text is replaced.
164 * The replacement content is obtained by calling the SimpleMarkup::markup method.
172 * @return string Regexp which matches this token.
// Default implementation: returns the $_match_regexp property. Subclasses in
// this file either set that property (Markup_bracketlink, Markup_linebreak)
// or override this method to build the regexp at run time.
174 function getMatchRegexp () {
175 return $this->_match_regexp;
178 /** Markup matching text.
180 * @param string $match The text which matched the regexp
181 * (obtained from getMatchRegexp).
183 * @return mixed The expansion of the matched text.
// Strips escapes from the matched text and hands the result to _markup().
185 function markup ($match /*, $body */) {
186 // De-escape matched text.
// Each ESCAPE_CHAR followed by a character is replaced by that character
// alone. The pattern has no 's' modifier, so an ESCAPE_CHAR directly before
// a newline is left untouched.
187 $str = preg_replace('/' . ESCAPE_CHAR . '(.)/', '\1', $match);
188 return $this->_markup($str);
191 /** Markup matching text.
193 * @param string $match The text which matched the regexp
194 * with escaped characters de-escaped.
196 * @return mixed The expansion of the matched text.
// Placeholder to be overridden by subclasses; the base version only raises
// an E_USER_ERROR.
198 function _markup ($match) {
199 trigger_error("pure virtual", E_USER_ERROR);
204 * A balanced markup rule.
206 * These are defined by a start regexp and an end regexp.
212 /** Get the starting regexp for this rule.
214 * @return string The starting regexp.
// Returns the $_start_regexp property, which the balanced-markup subclasses
// in this file define.
216 function getStartRegexp () {
217 return $this->_start_regexp;
220 /** Get the ending regexp for this rule.
222 * @param string $match The text which matched the starting regexp.
224 * @return string The ending regexp.
// Default: the same $_end_regexp whatever the opening text was. The
// subclasses in this file override this to derive the end pattern from $match.
226 function getEndRegexp ($match) {
227 return $this->_end_regexp;
230 /** Get expansion for matching input.
232 * @param string $match The text which matched the starting regexp.
234 * @param mixed $body Transformed text found between the starting
235 * and ending regexps.
237 * @return mixed The expansion of the matched text.
// Placeholder to be overridden by subclasses; the base version only raises
// an E_USER_ERROR.
239 function markup ($match, $body) {
240 trigger_error("pure virtual", E_USER_ERROR);
// Rule for an escaped character: ESCAPE_CHAR followed by any one character.
244 class Markup_escape extends SimpleMarkup
246 function getMatchRegexp () {
247 return ESCAPE_CHAR . ".";
// Called via SimpleMarkup::markup(), i.e. after the escape character has
// already been stripped from $match.
250 function _markup ($match) {
// Rule for bracketed links.
255 class Markup_bracketlink extends SimpleMarkup
// Optional '#', then '[' ... ']' whose content holds at least one
// non-whitespace character. Spaces in the pattern are insignificant because
// RegexpSet::_match() compiles it with the 'x' modifier.
257 var $_match_regexp = "\\#? \\[ .*?\S.*? \\]";
259 function _markup ($match) {
260 $link = LinkBracketLink($match);
// Sanity check: the link object is expected to report itself as inline.
261 assert($link->isInlineElement());
// Rule for bare URLs.
266 class Markup_url extends SimpleMarkup
268 function getMatchRegexp () {
// $AllowedProtocols is interpolated inside a (?:...) group -- presumably a
// '|'-separated list of scheme names; defined outside this file.
269 global $AllowedProtocols;
// Not preceded by an alphanumeric; scheme, ':', then a run of characters
// excluding whitespace, '<', '>' and quotes. The trailing look-behind keeps
// the match from ending in , . ? ; ] or ).
270 return "(?<![[:alnum:]]) (?:$AllowedProtocols) : [^\s<>\"']+ (?<![ ,.?; \] \) ])";
273 function _markup ($match) {
274 return LinkURL($match);
// Rule for interwiki links (a map-defined prefix, ':', then a target).
// NOTE(review): both methods use $request, but no declaration of it is
// visible in this listing -- presumably a `global $request;` line was
// dropped; confirm against the full file.
279 class Markup_interwiki extends SimpleMarkup
281 function getMatchRegexp () {
283 $map = InterWikiMap::GetMap($request);
// Not preceded by an alphanumeric; the map's own regexp, ':', then
// non-whitespace that does not end in , . ? ; ! ] ) or a quote character.
284 return "(?<! [[:alnum:]])" . $map->getRegexp(). ": \S+ (?<![ ,.?;! \] \) \" \' ])";
287 function _markup ($match) {
289 $map = InterWikiMap::GetMap($request);
290 return $map->link($match);
// Rule for automatically linked wiki words.
294 class Markup_wikiword extends SimpleMarkup
296 function getMatchRegexp () {
// Pattern comes from a global defined outside this file.
297 global $WikiNameRegexp;
// The leading space is harmless: RegexpSet::_match() compiles with 'x'.
298 return " $WikiNameRegexp";
301 function _markup ($match) {
302 return WikiLink($match, 'auto');
// Rule for explicit line breaks.
306 class Markup_linebreak extends SimpleMarkup
// Either exactly three '%' (not adjacent to a further '%'), or a literal
// <br> / <BR>.
308 var $_match_regexp = "(?: (?<! %) %%% (?! %) | <(?:br|BR)> )";
// NOTE(review): declared without the $match parameter that
// SimpleMarkup::_markup($match) declares and SimpleMarkup::markup() passes;
// newer PHP versions may reject this signature mismatch -- verify.
310 function _markup () {
// Rule for the older emphasis delimiters: two single quotes or two underscores.
315 class Markup_old_emphasis extends BalancedMarkup
317 var $_start_regexp = "''|__";
// Body not visible in this listing.
319 function getEndRegexp ($match) {
323 function markup ($match, $body) {
// '' produces <em>; the only other start delimiter, __, produces <strong>.
324 $tag = $match == "''" ? 'em' : 'strong';
325 return new HtmlElement($tag, $body);
// Rule for single-character emphasis delimiters ('*', '=', '_'), judging by
// the cases handled in markup(). The start regexp below is a multi-line
// string of which only part is visible in this listing.
329 class Markup_nestled_emphasis extends BalancedMarkup
331 //var $_start_regexp = "(?<! [[:alnum:]] ) [*_=] (?=[[:alnum:]])";
332 var $_start_regexp = "(?<= \\s | ^ | [_=*(] )
334 | (?<! \\*) \\* (?! \\*)
// The closing delimiter is the opening text itself (regexp-quoted). It must
// directly follow a non-whitespace character (or the start), must not be
// doubled, and must be followed by whitespace, one of . , : ; " ' ? _ * = )
// or the end of the text.
338 function getEndRegexp ($match) {
339 $chr = preg_quote($match);
340 return "(?<= \S | ^ ) (?<! $chr) $chr (?! $chr) (?= \s | [.,:;\"'?_*=)] | $)";
// Maps the delimiter to an element: '*' -> b, '=' -> tt, '_' -> i. The
// expression being switched on is on a line not shown here.
343 function markup ($match, $body) {
345 case '*': return new HtmlElement('b', $body);
346 case '=': return new HtmlElement('tt', $body);
347 case '_': return new HtmlElement('i', $body);
// Rule for a fixed set of inline HTML tags written literally in the text.
// The start regexp is a multi-line string; its end is not visible here.
352 class Markup_html_emphasis extends BalancedMarkup
354 var $_start_regexp = "<(?: b|big|i|small|tt|
356 abbr|acronym|cite|code|dfn|kbd|samp|var|
// End pattern = '<', an escaped '/', then everything after the first
// character of the opening text (assuming $match looks like "<tag>", this
// yields a pattern for "</tag>").
359 function getEndRegexp ($match) {
360 return "<\\/" . substr($match, 1);
363 function markup ($match, $body) {
// Drop the first and last character of the opening text to get the tag name.
364 $tag = substr($match, 1, -1);
365 return new HtmlElement($tag, $body);
369 // FIXME: Do away with magic phpwiki forms. (Maybe phpwiki: links too?)
370 // FIXME: Do away with plugin-links. They seem not to be used.
// Drives inline parsing. Holds the markup rule objects and, in a parallel
// array, the regexp that triggers each rule.
374 class InlineTransformer
376 var $_regexps = array();
377 var $_markup = array();
// $markup_types: names that are appended to "Markup_" to obtain the rule
// classes to instantiate. The array literal below appears to be the default
// list; its guarding condition and final entries are not visible here.
379 function InlineTransformer ($markup_types = false) {
381 $markup_types = array('escape', 'bracketlink', 'url',
382 'interwiki', 'wikiword', 'linebreak',
383 'old_emphasis', 'nestled_emphasis',
386 foreach ($markup_types as $mtype) {
387 $class = "Markup_$mtype";
388 $this->_addMarkup(new $class);
// Registers one rule together with its triggering regexp: the match regexp
// for SimpleMarkup rules, the start regexp otherwise (the branch keyword
// between the two assignments is not visible in this listing).
392 function _addMarkup ($markup) {
393 if (isa($markup, 'SimpleMarkup'))
394 $regexp = $markup->getMatchRegexp();
396 $regexp = $markup->getStartRegexp();
// NOTE(review): $_markup is only ever appended to with numeric keys (two
// lines below), so isset() on a regexp-string key is never true and this
// assertion cannot detect a duplicate regexp.
398 assert(!isset($this->_markup[$regexp]));
// Pushed at the same position in both arrays, so a regexp's index is also
// its rule's index.
399 $this->_regexps[] = $regexp;
400 $this->_markup[] = $markup;
// Parses text up to the first entry of $end_regexps and collects the result
// in an XmlContent. $text is taken by reference; on the end-pattern branch
// it is set to whatever follows the end pattern.
403 function parse (&$text, $end_regexps = array('$')) {
404 $regexps = $this->_regexps;
406 // $end_re takes precedence: "favor reduce over shift"
// The end pattern becomes index 0 of the set; rule i moves to index i + 1.
407 array_unshift($regexps, $end_regexps[0]);
408 $regexps = new RegexpSet($regexps);
411 $output = new XmlContent;
413 $match = $regexps->match($input);
416 if ($match->regexp_ind == 0) {
417 // No start pattern found before end pattern.
419 $output->pushContent($match->prematch);
420 $text = $match->postmatch;
// Undo the +1 shift caused by the end pattern at index 0.
424 $markup = $this->_markup[$match->regexp_ind - 1];
425 $body = $this->_parse_markup_body($markup, $match->match, $match->postmatch, $end_regexps);
427 // Couldn't match balanced expression.
428 // Ignore and look for next matching start regexp.
429 $match = $regexps->nextMatch($input, $match);
433 // Matched markup. Eat input, push output.
434 // FIXME: combine adjacent strings.
// _parse_markup_body() received $match->postmatch by reference, so for
// balanced rules it presumably already points past the closing delimiter.
435 $input = $match->postmatch;
436 $output->pushContent($match->prematch,
437 $markup->markup($match->match, $body));
439 $match = $regexps->match($input);
442 // No pattern matched, not even the end pattern.
// Produces the body for a matched rule. SimpleMarkup rules have none and
// yield true; for other rules the rule's end regexp is put in front of the
// enclosing end regexps and the body is parsed recursively. $text is passed
// by reference on to parse().
447 function _parse_markup_body ($markup, $match, &$text, $end_regexps) {
448 if (isa($markup, 'SimpleMarkup'))
449 return true; // Done. SimpleMarkup is simple.
451 array_unshift($end_regexps, $markup->getEndRegexp($match));
452 // Optimization: if no end pattern in text, we know the
453 // parse will fail. This is an important optimization,
454 // e.g. when text is "*lots *of *start *delims *with
455 // *no *matching *end *delims".
// Joined with '.*', so the end patterns must all occur in $text in order:
// this rule's end first, then each enclosing rule's end somewhere after it.
456 $ends_pat = "/(?:" . join(").*(?:", $end_regexps) . ")/xs";
457 if (!preg_match($ends_pat, $text))
459 return $this->parse($text, $end_regexps);
// InlineTransformer restricted to the escape rule and the four link rules
// (bracketlink, url, interwiki, wikiword); no emphasis or line-break rules.
463 class LinkTransformer extends InlineTransformer
465 function LinkTransformer () {
466 $this->InlineTransformer(array('escape', 'bracketlink', 'url',
467 'interwiki', 'wikiword'));
// Convenience entry point: parses $text with a full InlineTransformer and
// returns the parse result.
// NOTE(review): the ConvertOldMarkup() call is presumably guarded by a check
// on $markup (default 2.0) that is not visible in this listing -- confirm.
471 function TransformInline($text, $markup = 2.0) {
475 $trfm = new InlineTransformer;
479 $text = ConvertOldMarkup($text, 'inline');
482 return $trfm->parse($text);
// Convenience entry point: parses $text with a LinkTransformer (link rules
// only) and returns the parse result.
// NOTE(review): the ConvertOldMarkup() call is presumably guarded by a check
// on $markup (default 2.0) that is not visible in this listing -- confirm.
485 function TransformLinks($text, $markup = 2.0) {
489 $trfm = new LinkTransformer;
493 $text = ConvertOldMarkup($text, 'links');
496 return $trfm->parse($text);
499 // (c-file-style: "gnu")
504 // c-hanging-comment-ender-p: nil
505 // indent-tabs-mode: nil