1 <?php rcs_id('$Id: transform.php,v 1.31 2001-12-18 18:47:29 dairiki Exp $');
2 require_once('lib/WikiPlugin.php');
// Transformer-function type flags, given as the first argument to register().
// do_transform() tests a registered flag with bitwise AND against
// WT_MODE_MARKUP and WT_TOKENIZER; WT_SIMPLE_MARKUP (0) sets neither bit.
4 define('WT_SIMPLE_MARKUP', 0);
5 define('WT_TOKENIZER', 1);
6 define('WT_MODE_MARKUP', 2);
// NOTE(review): ZERO_LEVEL and NESTED_LEVEL are not referenced anywhere in the
// visible part of this file -- confirm they are still used elsewhere.
8 define("ZERO_LEVEL", 0);
9 define("NESTED_LEVEL", 1);
// Member variables of class WikiTransform.
// NOTE(review): the "class WikiTransform" header (listing lines 10-12) is not
// present in this listing; the class name is taken from the constructor name
// and from "extends WikiTransform" further down.
13 // public variables (only meaningful during do_transform)
14 var $linenumber; // current linenumber
15 var $replacements; // storage for tokenized strings of current line
16 var $user_data; // can be used by the transformer functions
17 // to store miscellaneous data.
// The remaining members are only touched by the methods of this class in the
// visible code.
20 var $content; // wiki markup, array of lines
21 var $mode_set; // stores if a HTML mode for this line has been set
22 var $trfrm_func; // array of registered functions
23 var $stack; // stack for SetHTMLMode (keeping track of open tags)
/**
 * Constructor (PHP 4 style: named after the class).
 *
 * Puts every member into a defined state.  token() counts
 * $this->replacements, and SetHTMLMode() calls token(), so leaving
 * $replacements unset until do_transform() runs would make those helpers
 * fail when used on a fresh object.
 */
function WikiTransform()
{
    $this->linenumber = 0;
    $this->replacements = array();
    $this->user_data = array();
    $this->content = array();
    $this->mode_set = 0;
    $this->trfrm_func = array();
    $this->stack = new Stack;
}
/**
 * Register transformation functions
 *
 * This should be done *before* calling do_transform
 *
 * @param $type enum <dl>
 * <dt>WT_SIMPLE_MARKUP</dt>
 * <dd>The transformer function is called with the whole line and returns
 *     the (possibly modified) line.</dd>
 * <dt>WT_MODE_MARKUP</dt>
 * <dd>If one WT_MODE_MARKUP really sets the html mode, then
 *     all successive WT_MODE_MARKUP functions are skipped.</dd>
 * <dt>WT_TOKENIZER</dt>
 * <dd>The transformer function is called once for each match
 *     of the $regexp in the line.  The matched values are tokenized
 *     to protect them from further transformation.</dd>
 * </dl>
 *
 * @param $function string Function name
 * @param $regexp string Required for WT_TOKENIZER functions.
 * Optional for others.  If given, the transformer function will only be
 * called if the line matches the $regexp.  The pattern is wrapped in "/"
 * delimiters by the caller, so it must not contain an unescaped "/".
 */
function register($type, $function, $regexp = false)
{
    // Entries are kept in registration order; do_transform() applies them
    // in exactly that order.
    $this->trfrm_func[] = array($type, $function, $regexp);
}
57 * Sets current mode like list, preformatted text, plain text
59 * Takes care of closing (open) tags
61 * This is a helper function used to keep track of what HTML
62 * block-level element we are currently processing.
63 * Block-level elements are things like paragraphs "<p>",
64 * pre-formatted text "<pre>", and the various list elements:
65 * "<ul>", "<ol>" and "<dl>". Now, SetHTMLMode is also used to
66 * keep track of "<li>" and "<dd>" elements. Note that some of these elements
67 * can be nested, while others can not. (In particular, according to
68 * the HTML 4.01 specification, a paragraph "<p>" element is not
69 * allowed to contain any other block-level elements. Also <pre>,
70 * <li>, <dt>, <dd>, <h1> ... have this same restriction.)
72 * SetHTMLMode generates whatever HTML is necessary to get us into
73 * the requested element type at the requested nesting level.
75 * @param $tag string Type of HTML element to open.
77 * If $tag is an array, $tag[0] gives the element type,
78 * and $tag[1] should be a hash containing attribute-value
79 * pairs for the element.
81 * If $tag is the empty string, all open elements (down to the
82 * level requested by $level) are closed. Use
83 * SetHTMLMode('',0) to close all open block-level elements.
85 * @param $level integer Requested nesting level for current element.
86 * The nesting level for a top-level block is one (which is the default).
88 * Nesting is arbitrarily limited to 20 levels.
90 * @return string Returns the HTML markup to open the specified element.
// Switch the currently open block-level element to $tag at depth $level,
// using $this->stack to track the open elements, and return the generated
// close/open markup wrapped in a token (see the final return).
// NOTE(review): this listing skips lines 93-101 and 103-106.  $args and
// $retvar are used below but never assigned in the visible lines, and the
// condition guarding the ExitWiki() call is not visible either -- confirm
// against the complete file before editing this function.
92 function SetHTMLMode($tag, $level = 1)
102 $this->mode_set = 1; // in order to prevent other mode markup
107 // arbitrarily limit tag nesting
108 ExitWiki(gettext ("Lists nested too deep in SetHTMLOutputMode"));
// Case 1: the stack is at least as deep as requested -- unwind it.
111 if ($level <= $this->stack->cnt()) {
112 // $tag has fewer nestings (old: tabs) than stack,
113 // reduce stack to that tab count
114 while ($this->stack->cnt() > $level) {
115 $closetag = $this->stack->pop();
// NOTE(review): string-form assert() is deprecated since PHP 7.2 and is no
// longer evaluated as code in PHP 8.
116 assert('$closetag != false');
117 $retvar .= "</$closetag>\n";
120 // if list type isn't the same,
121 // back up one more and push new tag
// An empty $tag skips this branch, so only closing tags are emitted.
122 if ($tag && $tag != $this->stack->top()) {
123 $closetag = $this->stack->pop();
124 $retvar .= "</$closetag>" . StartTag($tag, $args) . "\n";
125 $this->stack->push($tag);
// Case 2: the requested depth is greater than the current stack depth.
129 else {// $level > $this->stack->cnt()
130 // Test for and close top level elements which are not allowed to contain
131 // other block-level elements.
132 if ($this->stack->cnt() == 1 and
133 preg_match('/^(p|pre|h\d)$/i', $this->stack->top()))
135 $closetag = $this->stack->pop();
136 $retvar .= "</$closetag>";
139 // we add the diff to the stack
140 // stack might be zero
141 if ($this->stack->cnt() < $level) {
// Pad the stack up to one below the requested level with filler elements.
142 while ($this->stack->cnt() < $level - 1) {
143 // This is a bit of a hack:
145 // We're not nested deep enough, and have to make up some kind of block
146 // element to nest within.
148 // Currently, this can only happen for nested list element
149 // (either <ul> <ol> or <dl>). What we used to do here is
150 // to open extra lists of whatever type was requested.
151 // This would result in invalid HTML, since and list is
152 // not allowed to contain another list without first containing
153 // a list item. ("<ul><ul><li>Item</ul></ul>" is invalid.)
155 // So now, when we need extra list elements, we use a <dl>, and
156 // open it with an empty <dd>.
// Even stack depth gets a 'dl', odd depth a 'dd'.
157 $el = $this->stack->cnt() % 2 == 0 ? 'dl' : 'dd';
// NOTE(review): listing line 158 is missing; no markup for $el is appended to
// $retvar in the visible lines -- presumably that happens on the missing line.
159 $this->stack->push($el);
162 $retvar .= StartTag($tag, $args) . "\n";
163 $this->stack->push($tag);
// The markup is returned as a token rather than as raw HTML.
167 return $this->token($retvar);
/**
 * Start new list item element.
 *
 * This closes any currently open list items at the specified level or deeper,
 * then opens a new list item element.
 *
 * @param $list_type string Type of list element to open.  This should
 * be one of 'dl', 'ol', or 'ul'.
 *
 * @param $level integer Nesting depth for list item.  Should be a positive
 * integer; values outside 1..10 are clamped into that range.
 *
 * @param $defn_term string Definition term.  Specifies the contents for the
 * <dt> element.  Only used if $list_type is 'dl'.
 *
 * @return string HTML
 */
function ListItem($list_type, $level, $defn_term = '')
{
    // Every list level uses two SetHTMLMode() levels: 2n-1 for the list
    // element and 2n for its item.  Capping at 10 keeps the item level at
    // 20; flooring at 1 keeps the list level from becoming negative.
    $level = max(1, min($level, 10));

    $retval = $this->SetHTMLMode($list_type, 2 * $level - 1);
    if ($list_type == 'dl') {
        $retval .= Element('dt', $defn_term);
        $retval .= $this->SetHTMLMode('dd', 2 * $level);
    }
    else {
        $retval .= $this->SetHTMLMode('li', 2 * $level);
    }
    return $retval;
}
202 /** Work horse and main loop.
204 * This function does the transform from wiki markup to HTML.
206 * Contains main-loop and calls transformer functions.
208 * @param $html string HTML header (if needed, otherwise '')
209 * (This string is prepended to the return value.)
211 * @param $content array Wiki markup as array of lines
213 * @return string HTML
// Main loop: runs every registered transformer over every line of $content,
// appends the results to $html and finally replaces the tokens again.
// NOTE(review): this listing skips several short lines (e.g. 234-238,
// 249-250, 252-253, 257).  The statements that follow the two bare "if"
// conditions inside the inner loop, and the alternative branch of the
// WT_TOKENIZER test, are therefore not visible -- confirm against the
// complete file before changing any control flow here.
215 function do_transform($html, $content)
// NOTE(review): $FieldSeparator is declared but not used in the visible lines.
217 global $FieldSeparator;
// Per-run state is reset here, so one object can be reused for several runs.
219 $this->content = $content;
220 $this->replacements = array();
221 $this->user_data = array();
223 // Loop over all lines of the page and apply transformation rules
224 $numlines = count($this->content);
225 for ($lnum = 0; $lnum < $numlines; $lnum++)
228 $this->linenumber = $lnum;
229 $line = $this->content[$lnum];
231 // blank lines clear the current mode (to force new paragraph)
232 if (!strlen($line) || $line == "\r") {
233 $html .= $this->SetHTMLMode('', 0);
// NOTE(review): $this->mode_set is read below, but no per-line reset of it is
// visible; presumably it is on one of the missing lines 234-238.
239 // main loop applying all registered functions
240 // tokenizers, markup, html mode, ...
241 // functions are executed in order of registering
242 for (reset($this->trfrm_func);
243 list($flags, $func, $regexp) = current($this->trfrm_func);
244 next($this->trfrm_func)) {
246 // if HTMLmode is already set then skip all following
247 // WT_MODE_MARKUP functions
248 if ($this->mode_set && ($flags & WT_MODE_MARKUP) != 0)
// A registered $regexp acts as a precondition on the line; it is wrapped in
// "/" delimiters here.
251 if (!empty($regexp) && !preg_match("/$regexp/", $line))
254 // call registered function
255 if (($flags & WT_TOKENIZER) != 0)
256 $line = $this->tokenize($line, $regexp, $func);
// Non-tokenizer functions get the whole line and this object.
258 $line = $func($line, $this);
261 $html .= $line . "\n";
// Close every block-level element that is still open.
264 $html .= $this->SetHTMLMode('', 0);
266 return $this->untokenize($html);
268 // end do_transfrom()
/**
 * Register a new token.
 *
 * Appends $repl to $this->replacements and returns a placeholder made of
 * the index of that entry enclosed in $FieldSeparator on both sides.
 * untokenize() later swaps the placeholder for the stored text.
 *
 * @param $repl string Text the placeholder stands for.
 * @return string The placeholder.
 */
function token($repl)
{
    global $FieldSeparator;

    // Allow use on an object whose replacement list was never set up.
    if (!is_array($this->replacements))
        $this->replacements = array();

    $index = count($this->replacements);
    $this->replacements[] = $repl;

    return $FieldSeparator . $index . $FieldSeparator;
}
/**
 * Helper function which does the actual tokenizing.
 *
 * Finds every substring of $str that matches $pattern, hands it to $func,
 * and replaces it with a token for the value $func returns.  Text between
 * matches is copied through unchanged.
 *
 * @param $str string Line to tokenize.
 * @param $pattern string PCRE pattern without delimiters ("/" is added here).
 * @param $func string Name of the tokenizer function; called as
 * $func($matched_text, $this).
 * @return string The tokenized line.
 */
function tokenize($str, $pattern, $func)
{
    $done = '';
    while (preg_match("/^(.*?)($pattern)/", $str, $matches)) {
        if ($matches[0] === '') {
            // The pattern matched the empty string at the very start, so
            // nothing would be consumed and the loop would never end.
            break;
        }
        // Drop the consumed prefix, then emit the untouched text followed
        // by a token for the transformed match.
        $str = substr($str, strlen($matches[0]));
        $done .= $matches[1] . $this->token($func($matches[2], $this));
    }
    return $done . $str;
}
// Replace the placeholders created by token() with their stored text.
// NOTE(review): this listing skips lines 293, 295, 297-298 and 300-303.  The
// assignment of $tok, the initial value of $line before the loop and the
// return statement are not visible -- confirm against the complete file.
291 function untokenize($line) {
292 global $FieldSeparator;
// A space is appended to $line before it is split on $FieldSeparator.
294 $chunks = explode ($FieldSeparator, "$line ");
// Steps through the odd-indexed chunks; each is followed by the text chunk
// at $i + 1.
296 for ($i = 1; $i < count($chunks); $i += 2)
299 $line .= $this->replacements[$tok] . $chunks[$i + 1];
304 // end class WikiTransform
307 //////////////////////////////////////////////////////////
/**
 * Transformer with the full rule set: plugins, link tokenizers, inline
 * markup and block-level (mode) markup.
 */
class WikiPageTransform
extends WikiTransform {
    /**
     * Constructor: registers all transformer functions.
     *
     * The order of the register() calls matters, because do_transform()
     * applies the functions in registration order.
     */
    function WikiPageTransform() {
        global $WikiNameRegexp, $AllowedProtocols, $InterWikiLinkRegexp;

        $this->WikiTransform();

        // register functions
        // functions are applied in order of registering

        $this->register(WT_SIMPLE_MARKUP, 'wtm_plugin_link');
        $this->register(WT_MODE_MARKUP, 'wtm_plugin');

        // Link tokenizers.  Each one protects what it matched, so later
        // rules cannot rewrite it again.
        $this->register(WT_TOKENIZER, 'wtt_doublebrackets', '\[\[');
        $this->register(WT_TOKENIZER, 'wtt_footnotes', '^\[\d+\]');
        $this->register(WT_TOKENIZER, 'wtt_footnoterefs', '\[\d+\]');
        $this->register(WT_TOKENIZER, 'wtt_bracketlinks', '\[.+?\]');
        $this->register(WT_TOKENIZER, 'wtt_urls',
                        "!?\b($AllowedProtocols):[^\s<>\[\]\"'()]*[^\s<>\[\]\"'(),.?]");

        // Interwiki links are only registered when their tokenizer is defined.
        if (function_exists('wtt_interwikilinks')) {
            $this->register(WT_TOKENIZER, 'wtt_interwikilinks',
                            pcre_fix_posix_classes("!?(?<![[:alnum:]])") .
                            "$InterWikiLinkRegexp:[^\\s.,;?()]+");
        }
        $this->register(WT_TOKENIZER, 'wtt_bumpylinks', "!?$WikiNameRegexp");

        // Tables are only registered when wtm_table() is defined.
        if (function_exists('wtm_table')) {
            $this->register(WT_MODE_MARKUP, 'wtm_table', '^\|');
        }

        // Inline markup.
        $this->register(WT_SIMPLE_MARKUP, 'wtm_htmlchars');
        $this->register(WT_SIMPLE_MARKUP, 'wtm_linebreak');
        $this->register(WT_SIMPLE_MARKUP, 'wtm_bold_italics');

        // Block-level markup.  The first function that sets the HTML mode
        // wins; wtm_paragraph comes last as the fallback.
        $this->register(WT_MODE_MARKUP, 'wtm_list_ul');
        $this->register(WT_MODE_MARKUP, 'wtm_list_ol');
        $this->register(WT_MODE_MARKUP, 'wtm_list_dl');
        $this->register(WT_MODE_MARKUP, 'wtm_preformatted');
        $this->register(WT_MODE_MARKUP, 'wtm_headings');
        $this->register(WT_MODE_MARKUP, 'wtm_hr');
        $this->register(WT_MODE_MARKUP, 'wtm_paragraph');
    }
}
/**
 * Convenience wrapper: transform wiki markup with a transformer class.
 *
 * @param $lines mixed Wiki markup, either as one string or as an array of
 * lines.  A string is trimmed and split at newlines; blanks, tabs and
 * carriage returns directly before a newline are dropped.
 * @param $class string Name of the transformer class to instantiate.
 * @return string Whatever $class's do_transform() method returns.
 */
function do_transform ($lines, $class = 'WikiPageTransform') {
    if (is_string($lines))
        $lines = preg_split('/[ \t\r]*\n/', trim($lines));

    // Instantiate the requested transformer; without this the call below
    // would be made on an undefined variable.
    $trfm = new $class;
    return $trfm->do_transform('', $lines);
}
/**
 * Transformer with a reduced rule set: only the link tokenizers and HTML
 * escaping are registered; no block-level markup.
 */
class LinkTransform
extends WikiTransform {
    /**
     * Constructor: registers the link tokenizers in the order in which they
     * are to be applied.
     */
    function LinkTransform() {
        global $WikiNameRegexp, $AllowedProtocols, $InterWikiLinkRegexp;

        $this->WikiTransform();

        // register functions
        // functions are applied in order of registering

        $this->register(WT_TOKENIZER, 'wtt_doublebrackets', '\[\[');
        // Bracketed numbers are only quoted by this transformer.
        $this->register(WT_TOKENIZER, 'wtt_quotetoken', '\[\d+\]');
        $this->register(WT_TOKENIZER, 'wtt_bracketlinks', '\[.+?\]');
        $this->register(WT_TOKENIZER, 'wtt_urls',
                        "!?\b($AllowedProtocols):[^\s<>\[\]\"'()]*[^\s<>\[\]\"'(),.?]");

        // Interwiki links are only registered when their tokenizer is defined.
        if (function_exists('wtt_interwikilinks')) {
            $this->register(WT_TOKENIZER, 'wtt_interwikilinks',
                            pcre_fix_posix_classes("!?(?<![[:alnum:]])") .
                            "$InterWikiLinkRegexp:[^\\s.,;?()]+");
        }
        $this->register(WT_TOKENIZER, 'wtt_bumpylinks', "!?$WikiNameRegexp");
        $this->register(WT_SIMPLE_MARKUP, 'wtm_htmlchars');
    }
}
388 Requirements for functions registered to WikiTransform:
390 Signature: function wtm_xxxx($line, &$transform)
392 $line ... current line containing wiki markup
393 (Note: it may already contain HTML from other transform functions)
394 &$transform ... WikiTransform object -- public variables of this
395 object and their use see above.
397 Functions have to return $line (doesn't matter if modified or not)
398 All conversion should take place inside $line.
400 Tokenizer functions (wtt_xxxx) receive the matched text instead of the
401 whole line and return its replacement; WikiTransform::tokenize() stores
402 that replacement via token(). See functions below. Back substitution
403 of tokenized strings is done by do_transform().
408 //////////////////////////////////////////////////////////
409 // Tokenizer functions
// Tokenizer callback registered for the pattern '\[\[' (two opening brackets).
// NOTE(review): only the signature survives in this listing; the body
// (listing lines 413-415) is missing, so the value it returns cannot be
// confirmed from here.
412 function wtt_doublebrackets($match, &$trfrm)
// Tokenizer callback registered for '^\[\d+\]': a bracketed number at the
// start of a line.
// NOTE(review): this listing skips lines 421, 423, 427, 429, 431 and 435-439.
// $fntext is used but never assigned in the visible lines, and the end of the
// Element("a", ...) call and the final return are missing.
417 function wtt_footnotes($match, &$trfrm)
419 // FIXME: should this set HTML mode?
// Strip the surrounding brackets and convert to a number.
420 $ftnt = trim(substr($match,1,-1)) + 0;
422 $html = Element('br');
// Anchor names collected by wtt_footnoterefs() for this footnote number.
424 $fnlist = $trfrm->user_data['footnotes'][$ftnt];
425 if (!is_array($fnlist))
426 return $html . $fntext;
// Replace the list with a marker; wtt_footnoterefs() tests for a non-array
// value here.
428 $trfrm->user_data['footnotes'][$ftnt] = 'footnote_seen';
// NOTE(review): each() is deprecated since PHP 7.2 and removed in PHP 8;
// foreach ($fnlist as $k => $anchor) is the usual replacement.
430 while (list($k, $anchor) = each($fnlist))
432 $html .= Element("a", array("name" => "footnote-$ftnt",
433 "href" => "#$anchor",
434 "class" => "footnote-rev"),
/**
 * Tokenizer for footnote references: "[n]" anywhere in a line.
 *
 * Returns a superscripted link to "#footnote-n".  Until the definition of
 * footnote n has been seen, the link also gets its own anchor name, and
 * that name is recorded in $trfrm->user_data['footnotes'][n] so that
 * wtt_footnotes() can link back to it.
 *
 * @param $match string The matched text, e.g. "[12]".
 * @param $trfrm object The WikiTransform object doing the transform.
 * @return string HTML
 */
function wtt_footnoterefs($match, &$trfrm)
{
    // Strip the brackets and convert to a number.
    $ftnt = trim(substr($match, 1, -1)) + 0;

    if (empty($trfrm->user_data['footnotes']))
        $trfrm->user_data['footnotes'] = array();

    // A non-array entry is the marker wtt_footnotes() stores once the
    // footnote definition has been processed.
    $footnote_definition_seen = false;
    if (empty($trfrm->user_data['footnotes'][$ftnt]))
        $trfrm->user_data['footnotes'][$ftnt] = array();
    else if (!is_array($trfrm->user_data['footnotes'][$ftnt]))
        $footnote_definition_seen = true;

    // NOTE(review): listing lines 453-454 are not visible; confirm nothing
    // else is added to $args there.
    $args = array('href' => "#footnote-$ftnt");
    if (!$footnote_definition_seen) {
        // The running count makes the anchor name unique per reference.
        $args['name'] = "footrev-$ftnt-" .
            count($trfrm->user_data['footnotes'][$ftnt]);
        $trfrm->user_data['footnotes'][$ftnt][] = $args['name'];
    }

    return Element('sup', array('class' => 'footnote'),
                   QElement("a", $args, "[$ftnt]"));
}
/**
 * Tokenizer for bracket links: "[...]".
 *
 * @param $match string The matched text including the brackets.
 * @param $trfrm object The WikiTransform object doing the transform (unused).
 * @return string HTML
 */
function wtt_bracketlinks($match, &$trfrm)
{
    // Brackets with nothing but whitespace inside are not links; output
    // them literally.
    if (preg_match('/^\[\s*\]$/', $match)) {
        return htmlspecialchars($match);
    }

    $link = ParseAndLink($match);
    if (strstr($link['link'], "</form>")) {
        // FIXME: BIG HACK: see note in wtm_plugin.
        // Link markup containing a form is wrapped so that it ends up
        // outside the surrounding paragraph.
        return "</p>" . $link['link'] . "<p>";
    }
    return $link["link"];
}
/**
 * Tokenizer for URLs.
 *
 * Replaces all URLs with tokens, so we don't confuse them with Wiki words
 * later.  (Wiki words in URLs break things.)
 * URLs preceded by a '!' are not linked.
 *
 * @param $match string The matched URL, possibly with a leading '!'.
 * @param $trfrm object The WikiTransform object doing the transform (unused).
 * @return string HTML
 */
function wtt_urls($match, &$trfrm)
{
    // substr() rather than $match[0], so an empty string is harmless.
    if (substr($match, 0, 1) == "!")
        return htmlspecialchars(substr($match, 1));
    return LinkURL($match);
}
/**
 * Tokenizer for Wiki words (BumpyText).
 *
 * Wiki words preceded by a '!' are not linked.
 *
 * @param $match string The matched Wiki word, possibly with a leading '!'.
 * @param $trfrm object The WikiTransform object doing the transform (unused).
 * @return string HTML
 */
function wtt_bumpylinks($match, &$trfrm)
{
    // substr() rather than $match[0], so an empty string is harmless.
    if (substr($match, 0, 1) == "!")
        return htmlspecialchars(substr($match, 1));
    return LinkWikiWord($match);
}
/**
 * Tokenizer which just quotes the matched text: it is HTML-escaped and
 * otherwise output as it stands.
 *
 * @param $match string The matched text.
 * @param $trfrm object The WikiTransform object doing the transform (unused).
 * @return string HTML
 */
function wtt_quotetoken($match, &$trfrm)
{
    return htmlspecialchars($match);
}
512 // end of tokenizer functions
513 //////////////////////////////////////////////////////////
516 //////////////////////////////////////////////////////////
517 // basic simple markup functions
/**
 * Escape HTML metachars.
 *
 * Only '&', '<' and '>' are escaped; quote characters are left alone.
 *
 * @param $line string Current line.
 * @param $transformer object The WikiTransform object (unused).
 * @return string The escaped line.
 */
function wtm_htmlchars($line, &$transformer)
{
    // '&' has to go first; otherwise the ampersands of the entities
    // produced for '<' and '>' would be escaped a second time.
    $line = str_replace('&', '&amp;', $line);
    $line = str_replace('>', '&gt;', $line);
    $line = str_replace('<', '&lt;', $line);
    return $line;
}
/**
 * "%%%" are linebreaks: each occurrence is replaced by a <br> element.
 *
 * @param $line string Current line.
 * @param $transformer object The WikiTransform object (unused).
 * @return string The line with the linebreak markup replaced.
 */
function wtm_linebreak($line, &$transformer) {
    return str_replace('%%%', Element('br'), $line);
}
/**
 * Inline emphasis: __text__ becomes <strong>, ''text'' becomes <em>.
 *
 * @param $line string Current line.
 * @param $transformer object The WikiTransform object (unused).
 * @return string The line with the emphasis markup replaced.
 */
function wtm_bold_italics($line, &$transformer) {
    // Non-greedy, so several marked-up spans on one line stay separate.
    $line = preg_replace('|(__)(.*?)(__)|', '<strong>\2</strong>', $line);
    $line = preg_replace("|('')(.*?)('')|", '<em>\2</em>', $line);
    return $line;
}
543 //////////////////////////////////////////////////////////
544 // some tokens to be replaced by (dynamic) content
546 // FIXME: some plugins are in-line (maybe?) and some are block level.
547 // Here we treat them all as inline, which will probably
548 // generate some minorly invalid HTML in some cases.
/**
 * Expand the first inline "<?plugin-link ... ?>" processing instruction of
 * the line.  Its output is tokenized so that later rules leave it alone.
 *
 * @param $line string Current line.
 * @param $transformer object The WikiTransform object doing the transform.
 * @return string The line, with the instruction replaced by a token.
 */
function wtm_plugin_link($line, &$transformer) {
    // FIXME: is this good syntax?
    global $dbi, $request;      // FIXME: make these non-global?

    if (preg_match('/^(.*?)(<\?plugin-link\s+.*?\?>)(.*)$/', $line, $m)) {
        list(, $prematch, $plugin_pi, $postmatch) = $m;
        $loader = new WikiPluginLoader;
        $html = $loader->expandPI($plugin_pi, $dbi, $request);
        $line = $prematch . $transformer->token($html) . $postmatch;
    }
    return $line;
}
/**
 * Expand a "<?plugin ... ?>" or "<?plugin-form ... ?>" processing
 * instruction which fills the whole line.
 *
 * All open block-level elements are closed first; the plugin output is
 * tokenized so that later rules leave it alone.
 *
 * @param $line string Current line.
 * @param $transformer object The WikiTransform object doing the transform.
 * @return string The line, replaced by tokens if it was a plugin line.
 */
function wtm_plugin($line, &$transformer) {
    // FIXME: is this good syntax?
    global $dbi, $request;      // FIXME: make these non-global?

    if (preg_match('/^<\?plugin(-form)?\s.*\?>\s*$/', $line)) {
        $loader = new WikiPluginLoader;
        $html = $loader->expandPI($line, $dbi, $request);
        $line = $transformer->SetHTMLMode('', 0) . $transformer->token($html);
    }
    return $line;
}
576 //////////////////////////////////////////////////////////
577 // mode markup functions
580 // tabless markup for unordered, ordered, and dictionary lists
581 // ul/ol list types can be mixed, so we only look at the last
582 // character. Changes e.g. from "**#*" to "###*" go unnoticed.
583 // and wouldn't make a difference to the HTML layout anyway.
/**
 * Unordered lists <UL>: "*"
 *
 * Has to be registered before wtm_list_ol.  The length of the list prefix
 * gives the nesting depth; only its last character selects the list type.
 *
 * @param $line string Current line.
 * @param $trfrm object The WikiTransform object doing the transform.
 * @return string The line, with the list prefix replaced by list markup.
 */
function wtm_list_ul($line, &$trfrm) {
    if (preg_match("/^([#*;]*\*)[^#]/", $line, $matches)) {
        $numtabs = strlen($matches[1]);
        // Strip with the same character class that was matched above, so
        // that a prefix containing ';' is removed as well.
        $line = preg_replace("/^([#*;]*\*)/", '', $line);
        $line = $trfrm->ListItem('ul', $numtabs) . $line;
    }
    return $line;
}
/**
 * Ordered lists <OL>: "#"
 *
 * The length of the list prefix gives the nesting depth; only its last
 * character selects the list type.
 *
 * @param $line string Current line.
 * @param $trfrm object The WikiTransform object doing the transform.
 * @return string The line, with the list prefix replaced by list markup.
 */
function wtm_list_ol($line, &$trfrm) {
    if (preg_match("/^([#*;]*\#)/", $line, $matches)) {
        $numtabs = strlen($matches[1]);
        // Strip with the same character class that was matched above, so
        // that a prefix containing ';' is removed as well.
        $line = preg_replace("/^([#*;]*\#)/", "", $line);
        $line = $trfrm->ListItem('ol', $numtabs) . $line;
    }
    return $line;
}
/**
 * Definition lists <DL>: ";text:text"
 *
 * The text up to the first ':' after the list prefix is the term, the rest
 * of the line is the definition.
 *
 * @param $line string Current line.
 * @param $trfrm object The WikiTransform object doing the transform.
 * @return string The line, converted to list markup if it matched.
 */
function wtm_list_dl($line, &$trfrm) {
    if (preg_match("/^([#*;]*;)(.*?):(.*$)/", $line, $matches)) {
        $numtabs = strlen($matches[1]);
        $line = $trfrm->ListItem('dl', $numtabs, $matches[2]) . $matches[3];
    }
    return $line;
}
/**
 * Mode: preformatted text, i.e. <pre>
 *
 * Any line which starts with whitespace is preformatted.  The leading
 * whitespace is kept.
 *
 * @param $line string Current line.
 * @param $trfrm object The WikiTransform object doing the transform.
 * @return string The line, prefixed with the mode markup if it matched.
 */
function wtm_preformatted($line, &$trfrm) {
    if (preg_match("/^\s+/", $line)) {
        $line = $trfrm->SetHTMLMode('pre') . $line;
    }
    return $line;
}
/**
 * Mode: headings.
 *
 * Lines starting with !, !! or !!! are headings.
 * Patch from steph/tara <tellme@climbtothestars.org>:
 * use <h2>, <h3>, <h4> since <h1> is page title.
 *
 * @param $line string Current line.
 * @param $trfrm object The WikiTransform object doing the transform.
 * @return string The line, converted to heading markup if it matched.
 */
function wtm_headings($line, &$trfrm) {
    if (preg_match("/^(!{1,3})[^!]/", $line, $whichheading)) {
        // '!' => h4, '!!' => h3, '!!!' => h2
        $heading = 'h' . (5 - strlen($whichheading[1]));
        $line = preg_replace("/^!+/", '', $line);
        $line = $trfrm->SetHTMLMode($heading) . $line;
    }
    return $line;
}
// Mode markup for table rows; registered for lines matching '^\|'.
// NOTE(review): this listing skips many lines (641-642, 644, 646-647,
// 652-653, 657, 659, 661, 663, 666 and everything after 668).  The
// initialisation of $row and $td, the test for the 'center' alignment, the
// place where the cell text $m[4] is added, and the end of the SetHTMLMode()
// call are not visible.
640 function wtm_table($line, &$trfrm)
// One iteration per cell: $m[1] = run of '|', $m[2] = run of 'v',
// $m[3] = optional alignment character, $m[4] = text up to the next '|'.
643 while (preg_match('/^(\|+)(v*)([<>^]?)([^|]*)/', $line, $m))
645 $line = substr($line, strlen($m[0]));
// More than one '|' makes the cell span that many columns.
648 if (strlen($m[1]) > 1)
649 $td['colspan'] = strlen($m[1]);
// Each 'v' adds one row to the row span.
650 if (strlen($m[2]) > 0)
651 $td['rowspan'] = strlen($m[2]) + 1;
654 $td['align'] = 'center';
655 else if ($m[3] == '>')
656 $td['align'] = 'right';
658 $td['align'] = 'left';
// The cell tags are tokenized so that later rules do not touch them.
660 $row .= $trfrm->token(StartTag('td', $td) . " ");
662 $row .= $trfrm->token(" </td>");
// The loop is expected to have consumed the whole line.
664 assert(empty($line));
665 $row = $trfrm->token("<tr>") . $row . $trfrm->token("</tr>");
667 return $trfrm->SetHTMLMode(array('table',
668 array(//'align' => 'left',
675 // four or more dashes to <hr>
676 // Note this is of type WT_MODE_MARKUP because <hr>'s aren't
677 // allowed within <p>'s. (e.g. "<p><hr></p>" is not valid HTML.)
// Mode markup for horizontal rules: a line starting with four or more dashes.
// NOTE(review): this listing skips lines 681 and 683-685.  Line 681 presumably
// guards the paragraph below so that it is only opened when text follows the
// dashes, and the return statement is not visible -- confirm against the
// complete file.
678 function wtm_hr($line, &$trfrm) {
// $m[1] receives whatever follows the dashes on the same line.
679 if (preg_match('/^-{4,}(.*)$/', $line, $m)) {
// Close all open block-level elements before emitting the rule.
680 $line = $trfrm->SetHTMLMode('', 0) . Element('hr');
682 $line .= $trfrm->SetHTMLMode('p') . $m[1];
/**
 * Default mode: simple text paragraph.
 *
 * Registered last among the mode markup functions, so it only runs for
 * lines on which no other function has set the HTML mode.
 *
 * @param $line string Current line.
 * @param $trfrm object The WikiTransform object doing the transform.
 * @return string The line, prefixed with the paragraph mode markup.
 */
function wtm_paragraph($line, &$trfrm) {
    $line = $trfrm->SetHTMLMode('p') . $line;
    return $line;
}
693 // (c-file-style: "gnu")
698 // c-hanging-comment-ender-p: nil
699 // indent-tabs-mode: nil