1 <?php rcs_id('$Id: transform.php,v 1.23 2001-09-18 19:16:23 dairiki Exp $');
2 require_once('lib/WikiPlugin.php');
4 define('WT_SIMPLE_MARKUP', 0);
5 define('WT_TOKENIZER', 1);
6 define('WT_MODE_MARKUP', 2);
8 define("ZERO_LEVEL", 0);
9 define("NESTED_LEVEL", 1);
14 function WikiTransform() -- init
16 function register($type, $function, [$regexp])
17 Registers transformer functions
18 This should be done *before* calling do_transform
22 If one WT_MODE_MARKUP really sets the html mode, then
23 all successive WT_MODE_MARKUP functions are skipped
25 The transformer function is called once for each match
26 of the $regexp in the line. The matched values are tokenized
27 to protect them from further transformation.
29 $function: function name
31 $regexp: Required for WT_TOKENIZER functions. Optional for others.
32 If given, the transformer function will only be called if the
33 line matches the $regexp.
35 function SetHTMLMode($tag, $level)
36 This is a helper function used to keep track of what HTML
37 block-level element we are currently processing.
38 Block-level elements are things like paragraphs "<p>",
39 pre-formatted text "<pre>", and the various list elements:
40 "<ul>", "<ol>" and "<dl>". Note that some of these elements
41 can be nested, while others can not. (In particular, according to
42 the HTML 4.01 specification, a paragraph "<p>" element is not
43 allowed to contain any other block-level elements. Also <pre>,
44 <li>, <dt>, <dd>, <h1> ... have this same restriction.)
46 SetHTMLMode generates whatever HTML is necessary to get us into
47 the requested element type at the requested nesting level.
49 $tag ... type of HTML element to open.
50 If $tag is an array, $tag[0] gives the element type,
51 and $tag[1] should be a hash containing attribute-value
52 pairs for the element.
54 If $tag is the empty string, all open elements (down to the
55 level requested by $level) are closed. Use
56 SetHTMLMode('',0) to close all open block-level elements.
58 $level ... requested nesting level for current element.
59 The nesting level for top level block is one (which is
62 Nesting is arbitrarily limited to 10 levels
64 function do_transform($html, $content)
65 contains main-loop and calls transformer functions
67 $html ... HTML header (if needed, otherwise '')
68 $content ... wiki markup as array of lines
72 // public variables (only meaningful during do_transform)
73 var $linenumber; // current linenumber
74 var $replacements; // storage for tokenized strings of current line
75 var $user_data; // can be used by the transformer functions
76 // to store miscellaneous data.
79 var $content; // wiki markup, array of lines
80 var $mode_set; // stores if a HTML mode for this line has been set
81 var $trfrm_func; // array of registered functions
82 var $stack; // stack for SetHTMLMode (keeping track of open tags)
/**
 * Constructor: start with an empty list of registered transformer
 * functions and a fresh stack for tracking open block-level tags.
 *
 * PHP 8.0 and later no longer treat a method named after its class as
 * the constructor, so the initialisation lives in __construct().
 * (Stack is not defined in the visible part of this file.)
 */
function __construct()
{
    $this->trfrm_func = array();
    $this->stack = new Stack;
}

/**
 * Old-style constructor name, kept so that PHP 4 (which only knows this
 * form) and any explicit callers still get an initialised object.
 */
function WikiTransform()
{
    $this->__construct();
}
// Register a transformation function.
//
// $type     ... one of the WT_* constants
// $function ... name of the transformer function
// $regexp   ... optional pattern; false when none is given
//
// Entries are appended, so transformers run in registration order.
function register($type, $function, $regexp = false)
{
    $entry = array($type, $function, $regexp);
    $this->trfrm_func[] = $entry;
}
97 // sets current mode like list, preformatted text, plain text, ...
98 // takes care of closing (open) tags
99 function SetHTMLMode($tag, $level = 1)
101 if (is_array($tag)) {
109 $this->mode_set = 1; // in order to prevent other mode markup
114 // arbitrarily limit tag nesting
115 ExitWiki(gettext ("Lists nested too deep in SetHTMLOutputMode"));
118 if ($level <= $this->stack->cnt()) {
119 // $tag has fewer nestings (old: tabs) than stack,
120 // reduce stack to that tab count
121 while ($this->stack->cnt() > $level) {
122 $closetag = $this->stack->pop();
123 assert('$closetag != false');
124 $retvar .= "</$closetag>\n";
127 // if list type isn't the same,
128 // back up one more and push new tag
129 if ($tag && $tag != $this->stack->top()) {
130 $closetag = $this->stack->pop();
131 $retvar .= "</$closetag>" . StartTag($tag, $args) . "\n";
132 $this->stack->push($tag);
135 } else {// $level > $this->stack->cnt()
136 // Test for and close top level elements which are not allowed to contain
137 // other block-level elements.
138 if ($this->stack->cnt() == 1 and
139 preg_match('/^(p|pre|h\d)$/i', $this->stack->top()))
141 $closetag = $this->stack->pop();
142 $retvar .= "</$closetag>";
145 // we add the diff to the stack
146 // stack might be zero
147 if ($this->stack->cnt() < $level) {
148 while ($this->stack->cnt() < $level - 1) {
149 // This is a bit of a hack:
151 // We're not nested deep enough, and have to make up some kind of block
152 // element to nest within.
154 // Currently, this can only happen for nested list element
155 // (either <ul> <ol> or <dl>). What we used to do here is
156 // to open extra lists of whatever type was requested.
157 // This would result in invalid HTML, since a list is
158 // not allowed to contain another list without first containing
159 // a list item. ("<ul><ul><li>Item</ul></ul>" is invalid.)
161 // So now, when we need extra list elements, we use a <dl>, and
162 // open it with an empty <dd>.
163 $retvar .= "<dl><dd>";
164 $this->stack->push('dl');
167 $retvar .= StartTag($tag, $args) . "\n";
168 $this->stack->push($tag);
172 return $this->token($retvar);
177 // work horse and main loop
178 // this function does the transform from wiki markup to HTML
179 function do_transform($html, $content)
181 global $FieldSeparator;
183 $this->content = $content;
184 $this->replacements = array();
185 $this->user_data = array();
187 // Loop over all lines of the page and apply transformation rules
188 $numlines = count($this->content);
189 for ($lnum = 0; $lnum < $numlines; $lnum++)
192 $this->linenumber = $lnum;
193 $line = $this->content[$lnum];
195 // blank lines clear the current mode (to force new paragraph)
196 if (!strlen($line) || $line == "\r") {
197 $html .= $this->SetHTMLMode('', 0);
203 // main loop applying all registered functions
204 // tokenizers, markup, html mode, ...
205 // functions are executed in order of registering
206 for (reset($this->trfrm_func);
207 list($flags, $func, $regexp) = current($this->trfrm_func);
208 next($this->trfrm_func)) {
210 // if HTMLmode is already set then skip all following
211 // WT_MODE_MARKUP functions
212 if ($this->mode_set && ($flags & WT_MODE_MARKUP) != 0)
215 if (!empty($regexp) && !preg_match("/$regexp/", $line))
218 // call registered function
219 if (($flags & WT_TOKENIZER) != 0)
220 $line = $this->tokenize($line, $regexp, $func);
222 $line = $func($line, $this);
225 $html .= $line . "\n";
228 $html .= $this->SetHTMLMode('', 0);
230 return $this->untokenize($html);
232 // end do_transform()
234 // Register a new token.
235 function token($repl) {
236 global $FieldSeparator;
237 $tok = $FieldSeparator . sizeof($this->replacements) . $FieldSeparator;
238 $this->replacements[] = $repl;
242 // helper function which does actual tokenizing
243 function tokenize($str, $pattern, $func) {
244 // Find any strings in $str that match $pattern and
245 // store them in $orig, replacing them with tokens
246 // starting at number $ntokens - returns tokenized string
248 while (preg_match("/^(.*?)($pattern)/", $str, $matches)) {
249 $str = substr($str, strlen($matches[0]));
250 $new .= $matches[1] . $this->token($func($matches[2], $this));
255 function untokenize($line) {
256 global $FieldSeparator;
258 $chunks = explode ($FieldSeparator, "$line ");
260 for ($i = 1; $i < count($chunks); $i += 2)
263 $line .= $this->replacements[$tok] . $chunks[$i + 1];
268 // end class WikiTransform
271 //////////////////////////////////////////////////////////
273 function do_transform ($lines) {
274 global $WikiNameRegexp, $AllowedProtocols, $InterWikiLinkRegexp;
276 if (is_string($lines))
277 $lines = preg_split('/[ \t\r]*\n/', trim($lines));
280 $transform = new WikiTransform;
282 // register functions
283 // functions are applied in order of registering
285 $transform->register(WT_SIMPLE_MARKUP, 'wtm_plugin_link');
286 $transform->register(WT_MODE_MARKUP, 'wtm_plugin');
288 $transform->register(WT_TOKENIZER, 'wtt_doublebrackets', '\[\[');
289 $transform->register(WT_TOKENIZER, 'wtt_footnotes', '^\[\d+\]');
290 $transform->register(WT_TOKENIZER, 'wtt_footnoterefs', '\[\d+\]');
291 $transform->register(WT_TOKENIZER, 'wtt_bracketlinks', '\[.+?\]');
292 $transform->register(WT_TOKENIZER, 'wtt_urls',
293 "!?\b($AllowedProtocols):[^\s<>\[\]\"'()]*[^\s<>\[\]\"'(),.?]");
295 if (function_exists('wtt_interwikilinks')) {
296 $transform->register(WT_TOKENIZER, 'wtt_interwikilinks',
297 pcre_fix_posix_classes("!?(?<![[:alnum:]])") .
298 "$InterWikiLinkRegexp:$WikiNameRegexp");
300 $transform->register(WT_TOKENIZER, 'wtt_bumpylinks', "!?$WikiNameRegexp");
302 if (function_exists('wtm_table')) {
303 $transform->register(WT_MODE_MARKUP, 'wtm_table', '^\|');
305 $transform->register(WT_SIMPLE_MARKUP, 'wtm_htmlchars');
306 $transform->register(WT_SIMPLE_MARKUP, 'wtm_linebreak');
307 $transform->register(WT_SIMPLE_MARKUP, 'wtm_bold_italics');
309 $transform->register(WT_MODE_MARKUP, 'wtm_list_ul');
310 $transform->register(WT_MODE_MARKUP, 'wtm_list_ol');
311 $transform->register(WT_MODE_MARKUP, 'wtm_list_dl');
312 $transform->register(WT_MODE_MARKUP, 'wtm_preformatted');
313 $transform->register(WT_MODE_MARKUP, 'wtm_headings');
314 $transform->register(WT_MODE_MARKUP, 'wtm_hr');
315 $transform->register(WT_MODE_MARKUP, 'wtm_paragraph');
317 //$html = $transform->do_transform($html, $pagehash['content']);
318 return $transform->do_transform('', $lines);
322 Requirements for functions registered to WikiTransform:
324 Signature: function wtm_xxxx($line, &$transform)
326 $line ... current line containing wiki markup
327 (Note: it may already contain HTML from other transform functions)
328 &$transform ... WikiTransform object -- public variables of this
329 object and their use see above.
331 Functions have to return $line (doesn't matter if modified or not)
332 All conversion should take place inside $line.
334 Tokenizer functions should use $transform->replacements to store
335 the replacement strings. Also, they have to keep track of
336 $transform->tokencounter. See functions below. Back substitution
337 of tokenized strings is done by do_transform().
342 //////////////////////////////////////////////////////////
343 // Tokenizer functions
346 function wtt_doublebrackets($match, &$trfrm)
351 function wtt_footnotes($match, &$trfrm)
353 // FIXME: should this set HTML mode?
354 $ftnt = trim(substr($match,1,-1)) + 0;
358 $fnlist = $trfrm->user_data['footnotes'][$ftnt];
359 if (!is_array($fnlist))
360 return $html . $fntext;
362 $trfrm->user_data['footnotes'][$ftnt] = 'footnote_seen';
364 while (list($k, $anchor) = each($fnlist))
366 $html .= Element("a", array("name" => "footnote-$ftnt",
367 "href" => "#$anchor",
368 "class" => "footnote-rev"),
375 function wtt_footnoterefs($match, &$trfrm)
377 $ftnt = trim(substr($match,1,-1)) + 0;
379 $footnote_definition_seen = false;
381 if (empty($trfrm->user_data['footnotes']))
382 $trfrm->user_data['footnotes'] = array();
383 if (empty($trfrm->user_data['footnotes'][$ftnt]))
384 $trfrm->user_data['footnotes'][$ftnt] = array();
385 else if (!is_array($trfrm->user_data['footnotes'][$ftnt]))
386 $footnote_definition_seen = true;
389 $args['href'] = "#footnote-$ftnt";
390 if (!$footnote_definition_seen)
392 $args['name'] = "footrev-$ftnt-" .
393 count($trfrm->user_data['footnotes'][$ftnt]);
394 $trfrm->user_data['footnotes'][$ftnt][] = $args['name'];
397 return Element('sup', array('class' => 'footnote'),
398 QElement("a", $args, "[$ftnt]"));
// Tokenizer for bracketed links: the match is handed to ParseAndLink()
// and the 'link' entry of its result becomes the replacement text.
function wtt_bracketlinks($match, &$trfrm)
{
    $parsed = ParseAndLink($match);
    $markup = $parsed['link'];

    // FIXME: BIG HACK: see note in wtm_plugin.
    // Markup containing a closing form tag is moved outside the
    // surrounding paragraph.
    $has_form_end = (strpos($markup, "</form>") !== false);

    return $has_form_end ? "</p>" . $markup . "<p>" : $markup;
}
// Replace all URLs with tokens, so we don't confuse them
// with Wiki words later. Wiki words in URLs break things.
// URLs preceded by a '!' are not linked: the '!' is dropped and the
// rest is emitted as escaped plain text.
function wtt_urls($match, &$trfrm)
{
    $is_escaped = ($match[0] == "!");
    if (!$is_escaped) {
        return LinkURL($match);
    }

    $bare = substr($match, 1);
    return htmlspecialchars($bare);
}
// Link Wiki words (BumpyText).
// Wiki words preceded by a '!' are not linked: the '!' is dropped and
// the word is emitted as escaped plain text.
function wtt_bumpylinks($match, &$trfrm)
{
    if ($match[0] == "!") {
        $word = substr($match, 1);
        $result = htmlspecialchars($word);
    } else {
        $result = LinkWikiWord($match);
    }
    return $result;
}
433 // end of tokenizer functions
434 //////////////////////////////////////////////////////////
437 //////////////////////////////////////////////////////////
438 // basic simple markup functions
// Escape HTML metachars.
//
// Replaces '&', '<' and '>' in $line with their entity forms and
// returns the result. '&' is handled first so that the ampersands
// introduced by the other two replacements are not escaped again.
// $transformer is unused.
function wtm_htmlchars($line, &$transformer)
{
    $line = str_replace('&', '&amp;', $line);
    $line = str_replace('>', '&gt;', $line);
    $line = str_replace('<', '&lt;', $line);
    return $line;
}
// Every '%%%' in the line becomes an HTML line break.
// $transformer is unused.
function wtm_linebreak($line, &$transformer) {
    $pieces = explode('%%%', $line);
    return implode('<br>', $pieces);
}
// Inline emphasis: __text__ becomes <strong>, ''text'' becomes <em>.
// The bold rule is applied first, then the italic rule.
// $transformer is unused.
function wtm_bold_italics($line, &$transformer) {
    $patterns = array(
        '|(__)(.*?)(__)|',
        "|('')(.*?)('')|",
    );
    $replacements = array(
        '<strong>\2</strong>',
        '<em>\2</em>',
    );
    return preg_replace($patterns, $replacements, $line);
}
464 //////////////////////////////////////////////////////////
465 // some tokens to be replaced by (dynamic) content
467 // FIXME: some plugins are in-line (maybe?) and some are block level.
468 // Here we treat them all as inline, which will probably
469 // generate some minorly invalid HTML in some cases.
// Expand the first <?plugin-link ... ?> processing instruction found in
// the line. The expansion is stored as a token so later transformers
// leave it alone; text before and after the instruction is kept.
// Lines without such an instruction are returned unchanged.
function wtm_plugin_link($line, &$transformer) {
    // FIXME: is this good syntax?
    global $dbi, $request;  // FIXME: make these non-global?

    if (!preg_match('/^(.*?)(<\?plugin-link\s+.*?\?>)(.*)$/', $line, $parts)) {
        return $line;
    }

    $loader = new WikiPluginLoader;
    $expanded = $loader->expandPI($parts[2], $dbi, $request);

    return $parts[1] . $transformer->token($expanded) . $parts[3];
}
// Expand a line that consists solely of a <?plugin ... ?> or
// <?plugin-form ... ?> processing instruction. All open block-level
// elements are closed first (SetHTMLMode('', 0)) and the expansion is
// stored as a token. Any other line is returned unchanged.
function wtm_plugin($line, &$transformer) {
    // FIXME: is this good syntax?
    global $dbi, $request;  // FIXME: make these non-global?

    $is_plugin_line = preg_match('/^<\?plugin(-form)?\s.*\?>\s*$/', $line);
    if (!$is_plugin_line) {
        return $line;
    }

    $loader = new WikiPluginLoader;
    $expanded = $loader->expandPI($line, $dbi, $request);

    // Keep this call order: both calls register tokens.
    $closing = $transformer->SetHTMLMode('', 0);
    $placeholder = $transformer->token($expanded);
    return $closing . $placeholder;
}
497 //////////////////////////////////////////////////////////
498 // mode markup functions
501 // tabless markup for unordered, ordered, and dictionary lists
502 // ul/ol list types can be mixed, so we only look at the last
503 // character. Changes e.g. from "**#*" to "###*" go unnoticed.
504 // and wouldn't make a difference to the HTML layout anyway.
// Unordered lists <UL>: "*"
// Has to be registered before the ordered-list handler.
//
// A line is a <ul> item when its leading run of '#', '*' and ';'
// characters ends in '*' and is followed by a character other than '#'.
// The length of that run is the nesting level passed to SetHTMLMode().
// Returns the line with the prefix replaced by the list markup plus
// '<li>', or the line unchanged when it is not a <ul> item.
function wtm_list_ul($line, &$trfrm) {
    if (preg_match("/^([#*;]*\*)[^#]/", $line, $matches)) {
        $numtabs = strlen($matches[1]);
        // Strip with the same character class used for detection, so
        // that a ';' in the prefix (e.g. ";*item") is removed as well
        // instead of leaking into the item text.
        $line = preg_replace("/^([#*;]*\*)/", '', $line);
        $html = $trfrm->SetHTMLMode('ul', $numtabs) . '<li>';
        $line = $html . $line;
    }
    return $line;
}
// Ordered lists <OL>: "#"
//
// A line is an <ol> item when its leading run of '#', '*' and ';'
// characters ends in '#'. The length of that run is the nesting level
// passed to SetHTMLMode(). Returns the line with the prefix replaced by
// the list markup plus '<li>', or the line unchanged otherwise.
function wtm_list_ol($line, &$trfrm) {
    if (preg_match("/^([#*;]*\#)/", $line, $matches)) {
        $numtabs = strlen($matches[1]);
        // Strip with the same character class used for detection, so
        // that a ';' in the prefix (e.g. ";#item") is removed as well
        // instead of leaking into the item text.
        $line = preg_replace("/^([#*;]*\#)/", "", $line);
        $html = $trfrm->SetHTMLMode('ol', $numtabs) . '<li>';
        $line = $html . $line;
    }
    return $line;
}
// Definition lists <DL>: ";text:text"
//
// The leading run of '#', '*' and ';' characters (ending in ';') gives
// the nesting level. The text up to the first ':' is the term, the rest
// is the definition. A term that is empty after trimming produces no
// <dt>, only the <dd>. Lines that do not match are returned unchanged.
function wtm_list_dl($line, &$trfrm) {
    if (preg_match("/^([#*;]*;)(.*?):(.*$)/", $line, $matches)) {
        $numtabs = strlen($matches[1]);
        $line = $trfrm->SetHTMLMode('dl', $numtabs);
        // Compare against '' explicitly: a plain truthiness test would
        // also discard the legitimate term "0".
        if (trim($matches[2]) !== '')
            $line .= '<dt>' . $matches[2];
        $line .= '<dd>' . $matches[3];
    }
    return $line;
}
// Mode: preformatted text, i.e. <pre>.
// A line that begins with whitespace is prefixed with the markup needed
// to enter 'pre' mode; every other line is returned unchanged.
function wtm_preformatted($line, &$trfrm) {
    $starts_with_space = preg_match("/^\s+/", $line);
    if (!$starts_with_space) {
        return $line;
    }
    return $trfrm->SetHTMLMode('pre') . $line;
}
// Mode: headings, i.e. <h1>, <h2>, <h3>.
// A line starting with one, two or three '!' followed by some other
// character is a heading: '!' gives h3, '!!' gives h2, '!!!' gives h1.
// All leading '!' are removed and the heading markup is prepended.
// Lines that do not match are returned unchanged.
function wtm_headings($line, &$trfrm) {
    if (!preg_match("/^(!{1,3})[^!]/", $line, $whichheading)) {
        return $line;
    }

    // More bangs mean a more prominent heading.
    $tag_for_marker = array(
        '!'   => 'h3',
        '!!'  => 'h2',
        '!!!' => 'h1',
    );
    $heading = $tag_for_marker[$whichheading[1]];

    $text = preg_replace("/^!+/", '', $line);
    return $trfrm->SetHTMLMode($heading) . $text;
}
564 function wtm_table($line, &$trfrm)
567 while (preg_match('/^(\|+)(v*)([<>^]?)([^|]*)/', $line, $m))
569 $line = substr($line, strlen($m[0]));
572 if (strlen($m[1]) > 1)
573 $td['colspan'] = strlen($m[1]);
574 if (strlen($m[2]) > 0)
575 $td['rowspan'] = strlen($m[2]) + 1;
578 $td['align'] = 'center';
579 else if ($m[3] == '>')
580 $td['align'] = 'right';
582 $td['align'] = 'left';
584 $row .= $trfrm->token(StartTag('td', $td) . " ");
586 $row .= $trfrm->token(" </td>");
588 assert(empty($line));
589 $row = $trfrm->token("<tr>") . $row . $trfrm->token("</tr>");
591 return $trfrm->SetHTMLMode(array('table',
592 array(//'align' => 'left',
599 // four or more dashes to <hr>
600 // Note this is of type WT_MODE_MARKUP because <hr>'s aren't
601 // allowed within <p>'s. (e.g. "<p><hr></p>" is not valid HTML.)
602 function wtm_hr($line, &$trfrm) {
603 if (preg_match('/^-{4,}(.*)$/', $line, $m)) {
604 $line = $trfrm->SetHTMLMode('', 0) . '<hr>';
606 $line .= $trfrm->SetHTMLMode('p') . $m[1];
// Default mode: simple text paragraph.
// Prefixes the line with whatever markup SetHTMLMode() produces for
// entering a 'p' element and returns the combined string.
function wtm_paragraph($line, &$trfrm) {
    $opening = $trfrm->SetHTMLMode('p');
    return $opening . $line;
}
617 // (c-file-style: "gnu")
622 // c-hanging-comment-ender-p: nil
623 // indent-tabs-mode: nil