1 <?php rcs_id('$Id: transform.php,v 1.45 2002-08-22 23:28:31 rurban Exp $');
2 require_once('lib/WikiPlugin.php');
3 require_once('lib/HtmlElement.php');
4 require_once('lib/interwiki.php');
6 define('WT_SIMPLE_MARKUP', 0);
7 define('WT_TOKENIZER', 1);
8 define('WT_MODE_MARKUP', 2);
10 define("ZERO_LEVEL", 0);
11 define("NESTED_LEVEL", 1);
15 // public variables (only meaningful during do_transform)
16 var $linenumber; // current linenumber
17 var $replacements; // storage for tokenized strings of current line
18 var $user_data; // can be used by the transformer functions
19 // to store miscellaneous data.
22 var $content; // wiki markup, array of lines
23 var $mode_set; // stores if a HTML mode for this line has been set
24 var $trfrm_func; // array of registered functions
25 var $stack; // stack for SetHTMLMode (keeping track of open tags)
28 function WikiTransform()
30 $this->trfrm_func = array();
31 $this->stack = new Stack;
35 * Register transformation functions
37 * This should be done *before* calling do_transform
39 * @param $type enum <dl>
40 * <dt>WT_MODE_MARKUP</dt>
41 * <dd>If one WT_MODE_MARKUP really sets the html mode, then
42 * all successive WT_MODE_MARKUP functions are skipped.</dd>
43 * <dt>WT_TOKENIZER</dt>
44 * <dd> The transformer function is called once for each match
45 * of the $regexp in the line. The matched values are tokenized
46 * to protect them from further transformation.</dd>
48 * @param $function string Function name
49 * @param $regexp string Required for WT_TOKENIZER functions.
50 * Optional for others. If given, the transformer function will only be
51 * called if the line matches the $regexp.
53 function register($type, $function, $regexp = false)
55 $this->trfrm_func[] = array ($type, $function, $regexp);
59 * Sets current mode like list, preformatted text, plain text
61 * Takes care of closing (open) tags
63 * This is a helper function used to keep track of what HTML
64 * block-level element we are currently processing.
65 * Block-level elements are things like paragraphs "<p>",
66 * pre-formatted text "<pre>", and the various list elements:
67 * "<ul>", "<ol>" and "<dl>". Now, SetHTMLMode is also used to
68 * keep track of "<li>" and "<dd>" elements. Note that some of these elements
69 * can be nested, while others can not. (In particular, according to
70 * the HTML 4.01 specification, a paragraph "<p>" element is not
71 * allowed to contain any other block-level elements. Also <pre>,
72 * <li>, <dt>, <dd>, <h1> ... have this same restriction.)
74 * SetHTMLMode generates whatever HTML is necessary to get us into
75 * the requested element type at the requested nesting level.
77 * @param $tag string Type of HTML element to open.
79 * If $tag is an array, $tag[0] gives the element type,
80 * and $tag[1] should be a hash containing attribute-value
81 * pairs for the element.
83 * If $tag is the empty string, all open elements (down to the
84 * level requested by $level) are closed. Use
85 * SetHTMLMode('',0) to close all open block-level elements.
87 * @param $level string Requested nesting level for current element.
88 * The nesting level for top level block is one (which is
90 * Nesting is arbitrarily limited to 20 levels.
92 * @return string Returns the HTML markup to open the specified element.
94 function SetHTMLMode($tag, $level = 1)
97 $el = new HtmlElement($tag[0], $tag[1]);
101 $el = new HtmlElement($tag);
104 $this->mode_set = 1; // in order to prevent other mode markup
109 // arbitrarily limit tag nesting
111 $request->finish(_("Lists nested too deep in SetHTMLOutputMode"));
114 if ($level <= $this->stack->cnt()) {
115 // $tag has fewer nestings (old: tabs) than stack,
116 // reduce stack to that tab count
117 while ($this->stack->cnt() > $level) {
118 $closetag = $this->stack->pop();
119 assert('$closetag != false');
120 $retvar .= "</$closetag>\n";
123 // if list type isn't the same,
124 // back up one more and push new tag
125 if ($tag && $tag != $this->stack->top()) {
126 $closetag = $this->stack->pop();
127 $retvar .= "</$closetag>" . $el->startTag() . "\n";
128 $this->stack->push($tag);
132 else {// $level > $this->stack->cnt()
133 // Test for and close top level elements which are not allowed to contain
134 // other block-level elements.
135 if ($this->stack->cnt() == 1 and
136 preg_match('/^(p|pre|h\d)$/i', $this->stack->top()))
138 $closetag = $this->stack->pop();
139 $retvar .= "</$closetag>";
142 // we add the diff to the stack
143 // stack might be zero
144 if ($this->stack->cnt() < $level) {
145 while ($this->stack->cnt() < $level - 1) {
146 // This is a bit of a hack:
148 // We're not nested deep enough, and have to make up some kind of block
149 // element to nest within.
151 // Currently, this can only happen for nested list element
152 // (either <ul> <ol> or <dl>). What we used to do here is
153 // to open extra lists of whatever type was requested.
154 // This would result in invalid HTML, since a list is
155 // not allowed to contain another list without first containing
156 // a list item. ("<ul><ul><li>Item</ul></ul>" is invalid.)
158 // So now, when we need extra list elements, we use a <dl>, and
159 // open it with an empty <dd>.
160 $stuff = $this->stack->cnt() % 2 == 0 ? 'dl' : 'dd';
161 $retvar .= "<$stuff>";
162 $this->stack->push($stuff);
165 $retvar .= $el->startTag() . "\n";
166 $this->stack->push($tag);
170 return $this->rawtoken($retvar);
174 * Start new list item element.
176 * This closes any currently open list items at the specified level or deeper,
177 * then opens a new list item element.
179 * @param $list_type string Type of list element to open. This should
180 * be one of 'dl', 'ol', or 'ul'.
182 * @param $level integer Nesting depth for list item. Should be a positive integer.
184 * @param $defn_term string Definition term. Specifies the contents for the
185 * <dt> element. Only used if $list_type is 'dl'.
187 * @return string HTML
function ListItem($list_type, $level, $defn_term = '')
{
    // Item depth is capped at 10; every depth uses two HTML nesting
    // levels (the list element and the item element inside it).
    $level = min($level, 10);
    $item_depth = 2 * $level;

    // Get into the list element itself first.
    $html = $this->SetHTMLMode($list_type, $item_depth - 1);

    if ($list_type != 'dl') {
        // <ul> and <ol> items are plain <li> elements.
        return $html . $this->SetHTMLMode('li', $item_depth);
    }

    // Definition lists: emit the <dt> term, then open the <dd>.
    $html .= AsXML(HTML::dt(HTML::raw($defn_term)));
    $html .= $this->SetHTMLMode('dd', $item_depth);
    return $html;
}
205 /** Work horse and main loop.
207 * This function does the transform from wiki markup to HTML.
209 * Contains main-loop and calls transformer functions.
211 * @param $html string HTML header (if needed, otherwise '')
212 * (This string is prepended to the return value.)
214 * @param $content array Wiki markup as array of lines
216 * @return string HTML
218 function do_transform($html, $content)
220 global $FieldSeparator;
222 $this->content = $content;
223 $this->replacements = array();
224 $this->user_data = array();
226 // Loop over all lines of the page and apply transformation rules
227 $numlines = count($this->content);
228 for ($lnum = 0; $lnum < $numlines; $lnum++)
231 $this->linenumber = $lnum;
232 $line = $this->content[$lnum];
234 // blank lines clear the current mode (to force new paragraph)
235 if (!strlen($line) || $line == "\r") {
236 $html .= $this->SetHTMLMode('', 0);
242 // main loop applying all registered functions
243 // tokenizers, markup, html mode, ...
244 // functions are executed in order of registering
245 foreach ($this->trfrm_func as $trfrm) {
246 list($flags, $func, $regexp) = $trfrm;
248 // if HTMLmode is already set then skip all following
249 // WT_MODE_MARKUP functions
250 if ($this->mode_set && ($flags & WT_MODE_MARKUP) != 0)
253 if (!empty($regexp) && !preg_match("/$regexp/", $line))
256 // call registered function
257 if (($flags & WT_TOKENIZER) != 0)
258 $line = $this->tokenize($line, $regexp, $func);
260 $line = $func($line, $this);
263 $html .= $line . "\n";
266 $html .= $this->SetHTMLMode('', 0);
268 return new RawXml($this->untokenize($html));
270 // end do_transform()
/**
 * Register a raw replacement string and hand back its token.
 *
 * The replacement is appended to $this->replacements; the token is the
 * index of that entry wrapped in $FieldSeparator on both sides.
 *
 * @param $repl string Replacement text, stored as given.
 * @return string The token standing in for $repl.
 */
function rawtoken($repl) {
    global $FieldSeparator;

    // The index must be taken before the new entry is appended.
    $index = sizeof($this->replacements);
    $this->replacements[] = $repl;

    return $FieldSeparator . $index . $FieldSeparator;
}
/**
 * Register a replacement after converting it with AsXML().
 *
 * @param $repl mixed Replacement content; passed through AsXML() and
 *              then stored via rawtoken().
 * @return string The token returned by rawtoken().
 */
function token($repl) {
    $xml = AsXML($repl);
    return $this->rawtoken($xml);
}
285 // helper function which does actual tokenizing
286 function tokenize($str, $pattern, $func) {
287 // Find any strings in $str that match $pattern and
288 // store them in $orig, replacing them with tokens
289 // starting at number $ntokens - returns tokenized string
291 while (preg_match("/^(.*?)($pattern)/", $str, $matches)) {
292 $str = substr($str, strlen($matches[0]));
293 $new .= $matches[1] . $this->token($func($matches[2], $this));
298 function untokenize($line) {
299 global $FieldSeparator;
301 $chunks = explode ($FieldSeparator, "$line ");
303 for ($i = 1; $i < count($chunks); $i += 2)
306 $line .= $this->replacements[$tok] . $chunks[$i + 1];
311 // end class WikiTransform
314 //////////////////////////////////////////////////////////
/**
 * Transformer with the full set of tokenizers and markup functions.
 *
 * The constructor only registers transformer functions. They are applied
 * in the order in which they are registered, so the sequence below must
 * not be rearranged.
 */
class WikiPageTransform extends WikiTransform
{
    function WikiPageTransform()
    {
        global $WikiNameRegexp, $AllowedProtocols, $request;

        $this->WikiTransform();

        // Plugin processing instructions are handled before anything else.
        $this->register(WT_SIMPLE_MARKUP, 'wtm_plugin_link');
        $this->register(WT_MODE_MARKUP, 'wtm_plugin');

        // Bracket-based tokenizers, keyed by function name.
        $bracket_tokenizers = array(
            'wtt_doublebrackets' => '\[\[',
            'wtt_footnotes'      => '^\[\d+\]',
            'wtt_footnoterefs'   => '\[\d+\]',
            'wtt_bracketlinks'   => '\[.+?\]',
        );
        foreach ($bracket_tokenizers as $func => $regexp) {
            $this->register(WT_TOKENIZER, $func, $regexp);
        }

        $url_regexp = "!?\b($AllowedProtocols):[^\s<>\[\]\"'()]*[^\s<>\[\]\"'(),.?]";
        $this->register(WT_TOKENIZER, 'wtt_urls', $url_regexp);

        // Todo: get the map only when an interwikilink occurs.
        $map = InterWikiMap::GetMap($request);
        $interwiki_regexp = pcre_fix_posix_classes("!?(?<![[:alnum:]])")
            . $map->getRegexp() . ":[^\\s.,;?()]+";
        $this->register(WT_TOKENIZER, 'wtt_interwikilinks', $interwiki_regexp);

        $this->register(WT_TOKENIZER, 'wtt_bumpylinks', "!?$WikiNameRegexp");

        // Table markup is registered only if its function is defined.
        if (function_exists('wtm_table')) {
            $this->register(WT_MODE_MARKUP, 'wtm_table', '^\|');
        }

        $simple_markup = array('wtm_htmlchars', 'wtm_linebreak', 'wtm_bold_italics');
        foreach ($simple_markup as $func) {
            $this->register(WT_SIMPLE_MARKUP, $func);
        }

        // Block-level modes; wtm_paragraph is registered last.
        $mode_markup = array(
            'wtm_list_ul',
            'wtm_list_ol',
            'wtm_list_dl',
            'wtm_preformatted',
            'wtm_headings',
            'wtm_hr',
            'wtm_paragraph',
        );
        foreach ($mode_markup as $func) {
            $this->register(WT_MODE_MARKUP, $func);
        }
    }
}
360 function do_transform ($lines, $class = 'WikiPageTransform') {
361 if (is_string($lines))
362 $lines = preg_split('/[ \t\r]*\n/', trim($lines));
365 return $trfm->do_transform('', $lines);
369 extends WikiTransform {
370 function LinkTransform() {
371 global $WikiNameRegexp, $AllowedProtocols, $request;
373 $this->WikiTransform();
375 // register functions
376 // functions are applied in order of registering
378 $this->register(WT_TOKENIZER, 'wtt_doublebrackets', '\[\[');
379 $this->register(WT_TOKENIZER, 'wtt_quotetoken', '\[\d+\]');
380 $this->register(WT_TOKENIZER, 'wtt_bracketlinks', '\[.+?\]');
381 $this->register(WT_TOKENIZER, 'wtt_urls',
382 "!?\b($AllowedProtocols):[^\s<>\[\]\"'()]*[^\s<>\[\]\"'(),.?]");
384 $map = InterWikiMap::GetMap($request);
385 if (function_exists('wtt_interwikilinks')) {
386 $this->register(WT_TOKENIZER, 'wtt_interwikilinks',
387 pcre_fix_posix_classes("!?(?<![[:alnum:]])")
388 . $map->getRegexp() . ":[^\\s.,;?()]+");
390 $this->register(WT_TOKENIZER, 'wtt_bumpylinks', "!?$WikiNameRegexp");
391 $this->register(WT_SIMPLE_MARKUP, 'wtm_htmlchars');
396 Requirements for functions registered to WikiTransform:
398 Signature: function wtm_xxxx($line, &$transform)
400 $line ... current line containing wiki markup
401 (Note: it may already contain HTML from other transform functions)
402 &$transform ... WikiTransform object -- public variables of this
403 object and their use see above.
405 Functions have to return $line (doesn't matter if modified or not)
406 All conversion should take place inside $line.
408 Tokenizer functions (wtt_xxxx($match, &$transform)) receive the matched
409 text and return its replacement; tokenize() stores that replacement in
410 $transform->replacements and puts a token in its place. Back substitution
411 of tokenized strings is done by do_transform().
416 //////////////////////////////////////////////////////////
417 // Tokenizer functions
420 function wtt_doublebrackets($match, &$trfrm) {
425 function wtt_footnotes($match, &$trfrm) {
427 $page = rawurlencode($request->getArg('pagename'));
428 // FIXME: should this set HTML mode?
429 $ftnt = trim(substr($match,1,-1)) + 0;
431 $html = HTML(HTML::br());
433 $fnlist = $trfrm->user_data['footnotes'][$ftnt];
434 if (!is_array($fnlist)) {
435 $html->pushContent($fntext);
438 $trfrm->user_data['footnotes'][$ftnt] = 'footnote_seen';
439 while (list($k, $anchor) = each($fnlist)) {
440 $html->pushContent(HTML::a(array("name" => "footnote-$ftnt",
441 "href" => "$page#$anchor",
442 "class" => "footnote-rev"),
444 $fntext = ''; //$fntext = '+';
450 // [d] link to footnote
451 function wtt_footnoterefs($match, &$trfrm) {
453 $ftnt = trim(substr($match,1,-1)) + 0;
455 $footnote_definition_seen = false;
457 if (empty($trfrm->user_data['footnotes']))
458 $trfrm->user_data['footnotes'] = array();
459 if (empty($trfrm->user_data['footnotes'][$ftnt]))
460 $trfrm->user_data['footnotes'][$ftnt] = array();
461 else if (!is_array($trfrm->user_data['footnotes'][$ftnt]))
462 $footnote_definition_seen = true;
464 $page = rawurlencode($request->getArg('pagename'));
465 $link = HTML::a(array('href' => "$page#footnote-$ftnt"), "[$ftnt]");
466 if (!$footnote_definition_seen) {
467 $name = "footrev-$ftnt-" . count($trfrm->user_data['footnotes'][$ftnt]);
468 $link->setAttr('name', $name);
469 $trfrm->user_data['footnotes'][$ftnt][] = $name;
471 return HTML::sup(array('class' => 'footnote'), $link);
474 function wtt_bracketlinks($match, &$trfrm) {
476 if (preg_match('/^\[\s*\]$/', $match))
479 $link = LinkBracketLink($match);
481 if ($link->isInlineElement())
485 return new RawXml("</p>" . $link->asXML() . "<p>");
489 // replace all URL's with tokens, so we don't confuse them
490 // with Wiki words later. Wiki words in URL's break things.
491 // URLs preceded by a '!' are not linked
492 function wtt_urls($match, &$trfrm) {
493 return $match[0] == "!"
/**
 * Tokenizer callback: link InterWiki references.
 *
 * As with Wiki words, a leading '!' protects the reference: the '!' is
 * removed and the rest is returned unlinked.
 *
 * @param $match string The matched text.
 * @param $trfrm object The calling transformer (not used here).
 * @return mixed The unlinked text, or whatever the map's link() returns.
 */
function wtt_interwikilinks($match, &$trfrm) {
    $is_protected = ($match[0] == '!');
    if (!$is_protected) {
        $map = InterWikiMap::GetMap($GLOBALS['request']);
        return $map->link($match);
    }
    return substr($match, 1);
}
/**
 * Tokenizer callback: link Wiki words (BumpyText).
 *
 * A Wiki word preceded by '!' is not linked; the '!' is stripped and the
 * remaining text is returned as is.
 *
 * @param $match string The matched text.
 * @param $trfrm object The calling transformer (not used here).
 * @return mixed The unlinked text, or the result of WikiLink($match, 'auto').
 */
function wtt_bumpylinks($match, &$trfrm) {
    if ($match[0] == "!") {
        return substr($match, 1);
    }
    return WikiLink($match, 'auto');
}
515 // Just quote the token.
516 function wtt_quotetoken($match, &$trfrm) {
522 // end of tokenizer functions
523 //////////////////////////////////////////////////////////
526 //////////////////////////////////////////////////////////
527 // basic simple markup functions
/**
 * Simple markup: escape HTML metacharacters in the line.
 *
 * @param $line string Current line.
 * @param $transformer object The calling transformer (not used here).
 * @return string The line as returned by XmlElement::_quote().
 */
function wtm_htmlchars($line, &$transformer) {
    $quoted = XmlElement::_quote($line);
    return $quoted;
}
/**
 * Simple markup: every "%%%" in the line becomes a line break.
 *
 * @param $line string Current line.
 * @param $transformer object The calling transformer (not used here).
 * @return string The line with each "%%%" replaced by "<br />".
 */
function wtm_linebreak($line, &$transformer) {
    // Split on the marker and glue the pieces back with the break tag.
    $pieces = explode('%%%', $line);
    return implode('<br />', $pieces);
}
/**
 * Simple markup: __bold__ and ''italic'' text.
 *
 * @param $line string Current line.
 * @param $transformer object The calling transformer (not used here).
 * @return string The line with <strong> and <em> markup applied.
 */
function wtm_bold_italics($line, &$transformer) {
    // Patterns are applied in array order: bold first, then italics.
    $patterns = array(
        '|(__)(.*?)(__)|',
        "|('')(.*?)('')|",
    );
    $replacements = array(
        '<strong>\2</strong>',
        '<em>\2</em>',
    );
    return preg_replace($patterns, $replacements, $line);
}
548 //////////////////////////////////////////////////////////
549 // some tokens to be replaced by (dynamic) content
551 // FIXME: some plugins are in-line (maybe?) and some are block level.
552 // Here we treat them all as inline, which will probably
553 // generate some minorly invalid HTML in some cases.
/**
 * Simple markup: expand the first <?plugin-link ... ?> found in the line.
 *
 * The text before and after the processing instruction is kept; the
 * expanded plugin output is tokenized so later transformers leave it alone.
 *
 * @param $line string Current line.
 * @param $transformer object The calling transformer; its token() is used.
 * @return string The line, with the plugin-link replaced by a token.
 */
function wtm_plugin_link($line, &$transformer) {
    // FIXME: is this good syntax?
    if (!preg_match('/^(.*?)(<\?plugin-link\s+.*?\?>)(.*)$/', $line, $m)) {
        return $line;
    }

    $before = $m[1];
    $plugin_pi = $m[2];
    $after = $m[3];

    $loader = new WikiPluginLoader;
    $expanded = $loader->expandPI($plugin_pi, $GLOBALS['request']);

    return $before . $transformer->token($expanded) . $after;
}
/**
 * Mode markup: expand a line consisting solely of a <?plugin ... ?> or
 * <?plugin-form ... ?> processing instruction.
 *
 * @param $line string Current line.
 * @param $transformer object The calling transformer; its SetHTMLMode()
 *                     and token() are used.
 * @return string The untouched line, or the SetHTMLMode('', 0) result
 *                followed by a token for the plugin output.
 */
function wtm_plugin($line, &$transformer) {
    // FIXME: is this good syntax?
    $is_plugin_line = preg_match('/^<\?plugin(-form)?\s.*\?>\s*$/', $line);
    if (!$is_plugin_line) {
        return $line;
    }

    $loader = new WikiPluginLoader;
    $expanded = $loader->expandPI($line, $GLOBALS['request']);

    // SetHTMLMode() is called before token(), as in the original expression.
    $mode_html = $transformer->SetHTMLMode('', 0);
    return $mode_html . $transformer->token($expanded);
}
577 //////////////////////////////////////////////////////////
578 // mode markup functions
581 // tabless markup for unordered, ordered, and dictionary lists
582 // ul/ol list types can be mixed, so we only look at the last
583 // character. Changes e.g. from "**#*" to "###*" go unnoticed.
584 // and wouldn't make a difference to the HTML layout anyway.
/**
 * Mode markup: unordered list items <UL>, introduced by "*".
 *
 * Has to be registered before wtm_list_ol.
 *
 * The list prefix may mix '#', '*' and ';'; only its last character
 * selects the list type, and its length gives the nesting depth.
 *
 * @param $line string Current line.
 * @param $trfrm object The calling transformer; its ListItem() is used.
 * @return string The untouched line, or the list item markup followed by
 *                the line with its list prefix removed.
 */
function wtm_list_ul($line, &$trfrm) {
    if (preg_match("/^([#*;]*\*)[^#]/", $line, $matches)) {
        $numtabs = strlen($matches[1]);
        // Remove exactly the prefix that was matched. The previous
        // stripping regex did not allow ';', so a prefix such as ";*"
        // was counted for the depth but left in the item text.
        $line = (string) substr($line, $numtabs);
        $line = $trfrm->ListItem('ul', $numtabs) . $line;
    }
    return $line;
}
/**
 * Mode markup: ordered list items <OL>, introduced by "#".
 *
 * The list prefix may mix '#', '*' and ';'; only its last character
 * selects the list type, and its length gives the nesting depth.
 *
 * @param $line string Current line.
 * @param $trfrm object The calling transformer; its ListItem() is used.
 * @return string The untouched line, or the list item markup followed by
 *                the line with its list prefix removed.
 */
function wtm_list_ol($line, &$trfrm) {
    if (preg_match("/^([#*;]*\#)/", $line, $matches)) {
        $numtabs = strlen($matches[1]);
        // Remove exactly the prefix that was matched. The previous
        // stripping regex did not allow ';', so a prefix such as ";#"
        // was counted for the depth but left in the item text.
        // The cast covers a line that consists of the prefix only.
        $line = (string) substr($line, $numtabs);
        $line = $trfrm->ListItem('ol', $numtabs) . $line;
    }
    return $line;
}
/**
 * Mode markup: definition list items <DL>, written as ";term:definition".
 *
 * The prefix may mix '#', '*' and ';' but must end in ';'; its length
 * gives the nesting depth.
 *
 * @param $line string Current line.
 * @param $trfrm object The calling transformer; its ListItem() is used.
 * @return string The untouched line, or the list item markup followed by
 *                the definition text.
 */
function wtm_list_dl($line, &$trfrm) {
    if (!preg_match("/^([#*;]*;)(.*?):(.*$)/", $line, $matches)) {
        return $line;
    }

    $prefix = $matches[1];
    $term = $matches[2];
    $definition = $matches[3];

    return $trfrm->ListItem('dl', strlen($prefix), $term) . $definition;
}
/**
 * Mode markup: preformatted text, i.e. <pre>.
 *
 * A line that starts with whitespace is put into 'pre' mode; the line
 * itself, including its leading whitespace, is kept unchanged.
 *
 * @param $line string Current line.
 * @param $trfrm object The calling transformer; its SetHTMLMode() is used.
 * @return string The untouched line, or the mode markup followed by the line.
 */
function wtm_preformatted($line, &$trfrm) {
    $starts_with_space = preg_match("/^\s+/", $line);
    if (!$starts_with_space) {
        return $line;
    }
    return $trfrm->SetHTMLMode('pre') . $line;
}
/**
 * Mode markup: headings.
 *
 * Lines starting with "!", "!!" or "!!!" are headings. They map to <h4>,
 * <h3> and <h2> respectively, since <h1> is the page title
 * (patch from steph/tara <tellme@climbtothestars.org>).
 *
 * @param $line string Current line.
 * @param $trfrm object The calling transformer; its SetHTMLMode() is used.
 * @return string The untouched line, or the mode markup followed by the
 *                line without its leading "!" characters.
 */
function wtm_headings($line, &$trfrm) {
    if (!preg_match("/^(!{1,3})[^!]/", $line, $whichheading)) {
        return $line;
    }

    // More exclamation marks mean a more important heading.
    $tags = array(
        '!'   => 'h4',
        '!!'  => 'h3',
        '!!!' => 'h2',
    );
    $heading = $tags[$whichheading[1]];

    $text = preg_replace("/^!+/", '', $line);
    return $trfrm->SetHTMLMode($heading) . $text;
}
641 function wtm_table($line, &$trfrm) {
643 while (preg_match('/^(\|+)(v*)([<>^]?)([^|]*)/', $line, $m))
645 $line = substr($line, strlen($m[0]));
649 if (strlen($m[1]) > 1)
650 $td->setAttr('colspan', strlen($m[1]));
651 if (strlen($m[2]) > 0)
652 $td->setAttr('rowspan', strlen($m[2]) + 1);
655 $td->setAttr('align', 'center');
656 else if ($m[3] == '>')
657 $td->setAttr('align', 'right');
659 $td->setAttr('align', 'left');
661 // FIXME: this is a hack: can't tokenize whole <td></td> since we
662 // haven't marked up italics, etc... yet
663 $row .= $trfrm->rawtoken($td->startTag() . " ");
665 $row .= $trfrm->rawtoken(" " . $td->endTag());
667 assert(empty($line));
668 $row = $trfrm->rawtoken("<tr>") . $row . $trfrm->rawtoken("</tr>");
670 return $trfrm->SetHTMLMode(array('table',
671 array('cellpadding' => 1,
677 // four or more dashes to <hr>
678 // Note this is of type WT_MODE_MARKUP because <hr>'s aren't
679 // allowed within <p>'s. (e.g. "<p><hr></p>" is not valid HTML.)
680 function wtm_hr($line, &$trfrm) {
681 if (preg_match('/^-{4,}(.*)$/', $line, $m)) {
682 $line = $trfrm->SetHTMLMode('', 0) . '<hr />';
684 $line .= $trfrm->SetHTMLMode('p') . $m[1];
/**
 * Default mode: simple text paragraph.
 *
 * @param $line string Current line.
 * @param $trfrm object The calling transformer; its SetHTMLMode() is used.
 * @return string The markup returned by SetHTMLMode('p') followed by the line.
 */
function wtm_paragraph($line, &$trfrm) {
    $mode_html = $trfrm->SetHTMLMode('p');
    return $mode_html . $line;
}
694 // (c-file-style: "gnu")
699 // c-hanging-comment-ender-p: nil
700 // indent-tabs-mode: nil