1 <?php rcs_id('$Id: InlineParser.php,v 1.3 2002-01-29 19:28:16 dairiki Exp $');
2 /* Copyright (C) 2002, Geoffrey T. Dairiki <dairiki@dairiki.org>
4 * This file is part of PhpWiki.
6 * PhpWiki is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * PhpWiki is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with PhpWiki; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 require_once('lib/HtmlElement.php');
22 //FIXME: intubate ESCAPE_CHAR into BlockParser.php.
23 define('ESCAPE_CHAR', '~');
26 * Return type from RegexpSet::match and RegexpSet::nextMatch.
30 class RegexpSet_match {
32 * The text leading up to the next match.
42 * The text following the matched text.
47 * Index of the regular expression which matched.
53 * A set of regular expressions.
55 * This class is probably only useful for InlineTransformer.
61 * @param $regexps array A list of regular expressions. The
62 * regular expressions should not include any sub-pattern groups
63 * "(...)". (Anonymous groups, like "(?:...)", as well as
64 * look-ahead and look-behind assertions are fine.)
/**
 * Constructor.
 *
 * @param $regexps array A list of regular expressions.  The
 * regular expressions should not include any sub-pattern groups
 * "(...)".  (Anonymous groups, like "(?:...)", as well as
 * look-ahead and look-behind assertions are fine.)
 */
function __construct ($regexps) {
    $this->_regexps = $regexps;
}

/**
 * Old-style (class-named) constructor.
 *
 * PHP 8 no longer treats a method named after its class as a
 * constructor, so the initialisation lives in __construct() and this
 * method merely forwards to it.  It is kept so that explicit calls
 * to RegexpSet() keep working.
 *
 * @param $regexps array See __construct().
 */
function RegexpSet ($regexps) {
    $this->__construct($regexps);
}
71 * Search text for the next matching regexp from the Regexp Set.
73 * @param $text string The text to search.
75 * @return object A RegexpSet_match object, or false if no match.
/**
 * Find the first match of any regexp of the set within $text.
 *
 * @param $text string The text to search.
 *
 * @return object A RegexpSet_match object, or false if no match.
 */
function match ($text) {
    // A non-greedy "skip anything" prefix: the shortest possible
    // amount of leading text is consumed before a regexp matches.
    $lazy_prefix = '*?';
    $found = $this->_match($text, $this->_regexps, $lazy_prefix);
    return $found;
}
82 * Search for next matching regexp.
84 * Here, 'next' has two meanings:
86 * Match the next regexp(s) in the set, at the same position as the last match.
88 * If that fails, match the whole RegexpSet, starting after the position of the previous match.
91 * @param $text string Text to search.
93 * @param $prevMatch A RegexpSet_match object
95 * $prevMatch should be a match object obtained by a previous
96 * match upon the same value of $text.
98 * @return object A RegexpSet_match object, or false if no match.
100 function nextMatch ($text, $prevMatch) {
101 // Try to find match at same position.
102 $pos = strlen($prevMatch->prematch);
103 $regexps = array_slice($this->_regexps, $prevMatch->regexp_ind + 1);
105 $repeat = sprintf('{%d}', $pos);
106 if ( ($match = $this->_match($text, $regexps, $repeat)) )
110 // Failed. Look for match after current position.
111 $repeat = sprintf('{%d,}?', $pos + 1);
112 return $this->_match($text, $this->_regexps, $repeat);
116 function _match ($text, $regexps, $repeat) {
118 $pat= "/ ( . $repeat ) ( (" . join(')|(', $regexps) . ") ) /Axs";
120 if (! preg_match($pat, $text, $m))
123 $match = new RegexpSet_match;
124 $match->postmatch = substr($text, strlen($m[0]));
125 $match->prematch = $m[1];
126 $match->match = $m[2];
127 $match->regexp_ind = count($m) - 4;
130 PrintXML(HTML::dl(HTML::dt("input"),
131 HTML::dd(HTML::pre($text)),
133 HTML::dd(HTML::pre($match->match)),
135 HTML::dd(HTML::pre($regexps[$match->regexp_ind])),
136 HTML::dt("prematch"),
137 HTML::dd(HTML::pre($match->prematch))));
147 * A simple markup rule (i.e. terminal token).
149 * These are defined by a regexp.
151 * When a match is found for the regexp, the matching text is replaced.
152 * The replacement content is obtained by calling the SimpleMarkup::markup method.
160 * @return string Regexp which matches this token.
/**
 * Accessor for the pattern that identifies this token.
 *
 * @return string Regexp which matches this token.
 */
function getMatchRegexp () {
    $pattern = $this->_match_regexp;
    return $pattern;
}
166 /** Markup matching text.
168 * @param $match string The text which matched the regexp
169 * (obtained from getMatchRegexp).
171 * @return mixed The expansion of the matched text.
/**
 * Expand the matched text.
 *
 * This base implementation only raises a "pure virtual" user error;
 * the concrete markup rules in this file supply their own version.
 *
 * @param $match string The text which matched the regexp
 * (obtained from getMatchRegexp).
 *
 * @return mixed The expansion of the matched text.
 */
function markup ($match /*, $body */) {
    $complaint = "pure virtual";
    trigger_error($complaint, E_USER_ERROR);
}
179 * A balanced markup rule.
181 * These are defined by a start regexp, and an end regexp.
187 /** Get the starting regexp for this rule.
189 * @return string The starting regexp.
/**
 * Accessor for the pattern which opens this balanced rule.
 *
 * @return string The starting regexp.
 */
function getStartRegexp () {
    $opening = $this->_start_regexp;
    return $opening;
}
195 /** Get the ending regexp for this rule.
197 * @param $match string The text which matched the starting regexp.
199 * @return string The ending regexp.
/**
 * Accessor for the pattern which closes this balanced rule.
 *
 * This base version returns the stored end regexp and does not
 * look at $match.
 *
 * @param $match string The text which matched the starting regexp.
 *
 * @return string The ending regexp.
 */
function getEndRegexp ($match) {
    $closing = $this->_end_regexp;
    return $closing;
}
205 /** Get expansion for matching input.
207 * @param $match string The text which matched the starting regexp.
209 * @param $body mixed Transformed text found between the starting
210 * and ending regexps.
212 * @return mixed The expansion of the matched text.
/**
 * Produce the expansion for a matched start/end pair.
 *
 * This base implementation only raises a "pure virtual" user error;
 * the concrete balanced rules in this file supply their own version.
 *
 * @param $match string The text which matched the starting regexp.
 *
 * @param $body mixed Transformed text found between the starting
 * and ending regexps.
 *
 * @return mixed The expansion of the matched text.
 */
function markup ($match, $body) {
    $complaint = "pure virtual";
    trigger_error($complaint, E_USER_ERROR);
}
219 class Markup_escape extends SimpleMarkup
221 function getMatchRegexp () {
222 return ESCAPE_CHAR . ".";
225 function markup ($match) {
230 class Markup_bracketlink extends SimpleMarkup
232 var $_match_regexp = "\\[ .*?\S.*? \\]";
234 function markup ($match) {
235 $link = LinkBracketLink($match);
236 assert($link->isInlineElement());
/**
 * Markup rule for bare URLs: "<allowed protocol>:<rest>".
 */
class Markup_url extends SimpleMarkup
{
    /**
     * Build the URL regexp from the global $AllowedProtocols.
     *
     * The pattern refuses to start right after an alphanumeric
     * character, and its trailing look-behind keeps a final space,
     * ",", ".", "?", ";", "]" or ")" out of the match.
     *
     * @return string Regexp which matches this token.
     */
    function getMatchRegexp () {
        global $AllowedProtocols;

        $pattern = "(?<![[:alnum:]]) (?:$AllowedProtocols) : [^\s<>\"']+ (?<![ ,.?; \] \) ])";
        return $pattern;
    }

    /**
     * Hand the matched URL text to LinkURL().
     *
     * @param $match string The text which matched the regexp.
     *
     * @return mixed Whatever LinkURL() returns for $match.
     */
    function markup ($match) {
        $link = LinkURL($match);
        return $link;
    }
}
/**
 * Markup rule for inter-wiki links: "<moniker>:<non-blank text>".
 */
class Markup_interwiki extends SimpleMarkup
{
    /**
     * Build the inter-wiki regexp from the global $InterWikiLinkRegexp.
     *
     * The pattern refuses to start right after an alphanumeric
     * character, and its trailing look-behind keeps a final space,
     * ",", ".", "?", ";", "]", ")", double or single quote out of
     * the match.
     *
     * @return string Regexp which matches this token.
     */
    function getMatchRegexp () {
        global $InterWikiLinkRegexp;

        $pattern = "(?<! [[:alnum:]]) $InterWikiLinkRegexp : \S+ (?<![ ,.?; \] \) \" \' ])";
        return $pattern;
    }

    /**
     * Hand the matched text to LinkInterWikiLink().
     *
     * @param $match string The text which matched the regexp.
     *
     * @return mixed Whatever LinkInterWikiLink() returns for $match.
     */
    function markup ($match) {
        $link = LinkInterWikiLink($match);
        return $link;
    }
}
/**
 * Markup rule for bare WikiWords.
 */
class Markup_wikiword extends SimpleMarkup
{
    /**
     * Build the WikiWord regexp from the global $WikiNameRegexp.
     *
     * @return string Regexp which matches this token (the global
     * pattern preceded by a single space).
     */
    function getMatchRegexp () {
        global $WikiNameRegexp;

        $pattern = " $WikiNameRegexp";
        return $pattern;
    }

    /**
     * Hand the matched word to LinkWikiWord().
     *
     * @param $match string The text which matched the regexp.
     *
     * @return mixed Whatever LinkWikiWord() returns for $match.
     */
    function markup ($match) {
        $link = LinkWikiWord($match);
        return $link;
    }
}
278 class Markup_linebreak extends SimpleMarkup
280 var $_match_regexp = "(?: (?<! %) %%% (?! %) | <br> )";
287 class Markup_old_emphasis extends BalancedMarkup
289 var $_start_regexp = "''|__";
291 function getEndRegexp ($match) {
295 function markup ($match, $body) {
296 $tag = $match == "''" ? 'em' : 'strong';
297 return new HtmlElement($tag, $body);
301 class Markup_nestled_emphasis extends BalancedMarkup
303 var $_start_regexp = "(?<! [[:alnum:]] ) [*_=] (?=[[:alnum:]])";
305 function getEndRegexp ($match) {
306 return "(?<= [[:alnum:]]) \\$match (?![[:alnum:]])";
309 function markup ($match, $body) {
311 case '*': return new HtmlElement('b', $body);
312 case '=': return new HtmlElement('tt', $body);
313 default: return new HtmlElement('i', $body);
318 class Markup_html_emphasis extends BalancedMarkup
320 var $_start_regexp = "<(?: b|big|i|small|tt|
321 abbr|acronym|cite|code|dfn|kbd|samp|strong|var|
324 function getEndRegexp ($match) {
325 return "<\\/" . substr($match, 1);
328 function markup ($match, $body) {
329 $tag = substr($match, 1, -1);
330 return new HtmlElement($tag, $body);
334 // FIXME: Do away with magic phpwiki forms. (Maybe phpwiki: links too?)
335 // FIXME: Do away with plugin-links. They seem not to be used.
339 class InlineTransformer
341 var $_regexps = array();
342 var $_markup = array();
/**
 * Constructor: instantiate and register every built-in markup rule.
 *
 * Each name below is turned into the class name "Markup_<name>".
 * Rules are registered in the listed order, which is also the order
 * in which their regexps are appended to $this->_regexps.
 */
function __construct () {
    $markup_types = array('escape', 'bracketlink', 'url',
                          'interwiki', 'wikiword', 'linebreak',
                          'old_emphasis', 'nestled_emphasis',
                          'html_emphasis');

    foreach ($markup_types as $mtype) {
        $class = "Markup_$mtype";
        $this->_addMarkup(new $class);
    }
}

/**
 * Old-style (class-named) constructor.
 *
 * PHP 8 no longer treats a method named after its class as a
 * constructor, so the initialisation lives in __construct() and this
 * method merely forwards to it.  It is kept so that explicit calls
 * to InlineTransformer() keep working.
 */
function InlineTransformer () {
    $this->__construct();
}
/**
 * Register a markup rule with this transformer.
 *
 * The rule's trigger regexp (the match regexp of a SimpleMarkup,
 * otherwise the start regexp) is appended to $this->_regexps and the
 * rule object itself to $this->_markup, so both lists share the same
 * integer indexes.
 *
 * @param $markup object The markup rule to register.
 */
function _addMarkup ($markup) {
    if (isa($markup, 'SimpleMarkup'))
        $regexp = $markup->getMatchRegexp();
    else
        $regexp = $markup->getStartRegexp();

    // Guard against registering the same trigger regexp twice.
    // $this->_markup is indexed by integer, so the regexp has to be
    // looked up in $this->_regexps; testing
    // isset($this->_markup[$regexp]) could never detect a duplicate.
    assert(!in_array($regexp, $this->_regexps, true));
    $this->_regexps[] = $regexp;
    $this->_markup[] = $markup;
}
365 function parse (&$text, $end_re = '$') {
367 $regexps = $this->_regexps;
369 // $end_re takes precedence: "favor reduce over shift"
370 array_unshift($regexps, $end_re);
371 $regexps = new RegexpSet($regexps);
374 $output = new XmlContent;
376 $match = $regexps->match($input);
379 if ($match->regexp_ind == 0) {
380 // No start pattern found before end pattern.
382 $output->pushContent($match->prematch);
383 $text = $match->postmatch;
387 $markup = $this->_markup[$match->regexp_ind - 1];
388 $body = $this->_parse_markup_body($markup, $match->match, $match->postmatch);
390 // Couldn't match balanced expression.
391 // Ignore and look for next matching start regexp.
392 $match = $regexps->nextMatch($input, $match);
396 // Matched markup. Eat input, push output.
397 // FIXME: combine adjacent strings.
398 $input = $match->postmatch;
399 $output->pushContent($match->prematch,
400 $markup->markup($match->match, $body));
402 $match = $regexps->match($input);
405 // No pattern matched, not even the end pattern.
/**
 * Obtain the body belonging to a freshly matched markup rule.
 *
 * @param $markup object The rule whose regexp matched.
 *
 * @param $match string The text which matched the rule's regexp.
 *
 * @param $text string Input following the match; passed by
 * reference on to parse().
 *
 * @return mixed true for a SimpleMarkup (which has no body),
 * otherwise the result of parse() up to the rule's end regexp.
 */
function _parse_markup_body ($markup, $match, &$text) {
    if (!isa($markup, 'SimpleMarkup')) {
        // Balanced rule: parse recursively until its end regexp.
        $closing = $markup->getEndRegexp($match);
        return $this->parse($text, $closing);
    }

    // Done.  SimpleMarkup is simple.
    return true;
}
419 function TransformInline($text) {
422 $trfm = new InlineTransformer;
423 return $trfm->parse($text);
426 // (c-file-style: "gnu")
431 // c-hanging-comment-ender-p: nil
432 // indent-tabs-mode: nil