1 <?php rcs_id('$Id: transform.php,v 1.22 2001-04-09 19:28:52 dairiki Exp $');
// Transformer type flags passed as the first argument of
// WikiTransform::register(). do_transform() tests WT_TOKENIZER and
// WT_MODE_MARKUP with a bitwise AND; WT_SIMPLE_MARKUP has no bits set.
3 define('WT_SIMPLE_MARKUP', 0);
4 define('WT_TOKENIZER', 1);
5 define('WT_MODE_MARKUP', 2);
// Nesting-level constants. NOTE(review): not referenced anywhere in the
// visible part of this file -- confirm whether they are still needed.
7 define("ZERO_LEVEL", 0);
8 define("NESTED_LEVEL", 1);
13 function WikiTransform() -- init
15 function register($type, $function, [$regexp])
16 Registers transformer functions
17 This should be done *before* calling do_transform
21 If one WT_MODE_MARKUP really sets the html mode, then
22 all successive WT_MODE_MARKUP functions are skipped
24 The transformer function is called once for each match
25 of the $regexp in the line. The matched values are tokenized
26 to protect them from further transformation.
28 $function: function name
30 $regexp: Required for WT_TOKENIZER functions. Optional for others.
31 If given, the transformer function will only be called if the
32 line matches the $regexp.
34 function SetHTMLMode($tag, $level = 1)
35 This is a helper function used to keep track of what HTML
36 block-level element we are currently processing.
37 Block-level elements are things like paragraphs "<p>",
38 pre-formatted text "<pre>", and the various list elements:
39 "<ul>", "<ol>" and "<dl>". Note that some of these elements
40 can be nested, while others can not. (In particular, according to
41 the HTML 4.01 specification, a paragraph "<p>" element is not
42 allowed to contain any other block-level elements. Also <pre>,
43 <li>, <dt>, <dd>, <h1> ... have this same restriction.)
45 SetHTMLMode generates whatever HTML is necessary to get us into
46 the requested element type at the requested nesting level.
48 $tag ... type of HTML element to open.
49 If $tag is an array, $tag[0] gives the element type,
50 and $tag[1] should be a hash containing attribute-value
51 pairs for the element.
53 If $tag is the empty string, all open elements (down to the
54 level requested by $level) are closed. Use
55 SetHTMLMode('',0) to close all open block-level elements.
57 $level ... requested nesting level for current element.
58 The nesting level for top level block is one (which is the default).
61 Nesting is arbitrarily limited to 10 levels
63 function do_transform($html, $content)
64 contains main-loop and calls transformer functions
66 $html ... HTML header (if needed, otherwise '')
67 $content ... wiki markup as array of lines
71 // public variables (only meaningful during do_transform)
72 var $linenumber; // index of the line currently being transformed
73 var $replacements; // replacement strings for tokens; reset at the start of do_transform()
74 var $user_data; // can be used by the transformer functions
75 // to store miscellaneous data (reset at the start of do_transform()).
78 var $content; // wiki markup, array of lines
// Set to 1 by SetHTMLMode(); while it is set, do_transform() skips the
// remaining WT_MODE_MARKUP functions for the current line.
79 var $mode_set; // stores if a HTML mode for this line has been set
80 var $trfrm_func; // array of registered functions: array($type, $function, $regexp)
81 var $stack; // stack for SetHTMLMode (keeping track of open tags)
// Constructor: start with no registered transformer functions and an
// empty stack of open block-level tags.
//
// PHP 8 no longer treats a method named after the class as a constructor,
// so the initialisation lives in __construct().
function __construct()
{
   $this->trfrm_func = array();
   $this->stack = new Stack;
}

// Old-style (PHP 4) constructor name, kept so that PHP 4 still constructs
// the object and any explicit WikiTransform() call keeps working.
function WikiTransform()
{
   $this->__construct();
}
90 // register transformation functions
// Append one transformer to the list of registered functions.
//
// $type     ... WT_SIMPLE_MARKUP, WT_TOKENIZER or WT_MODE_MARKUP
// $function ... name of the transformer function
// $regexp   ... optional pattern (without delimiters); false if none
//
// Entries are stored in call order as array($type, $function, $regexp).
function register($type, $function, $regexp = false)
{
   $entry = array ($type, $function, $regexp);
   $this->trfrm_func[] = $entry;
}
96 // sets current mode like list, preformatted text, plain text, ...
97 // takes care of closing (open) tags
// Emit the HTML needed to get into block-level element $tag at nesting
// depth $level, closing and opening the elements tracked on $this->stack
// as required.
//
// $tag   ... element name, or an array (see the is_array() test below);
//            an empty $tag only closes elements down to $level.
// $level ... requested nesting depth (default 1).
//
// Returns the generated markup wrapped in a token (see token()).
// Side effect: sets $this->mode_set, which makes do_transform() skip the
// remaining WT_MODE_MARKUP functions for the current line.
//
// NOTE(review): several short lines of this method are not visible in
// this excerpt (the $args / $retvar setup and the nesting-limit test).
98 function SetHTMLMode($tag, $level = 1)
100 if (is_array($tag)) {
108 $this->mode_set = 1; // in order to prevent other mode markup
113 // arbitrarily limit tag nesting
// NOTE(review): the message names "SetHTMLOutputMode", not this method.
114 ExitWiki(gettext ("Lists nested too deep in SetHTMLOutputMode"));
117 if ($level <= $this->stack->cnt()) {
118 // $tag has fewer nestings (old: tabs) than stack,
119 // reduce stack to that tab count
120 while ($this->stack->cnt() > $level) {
121 $closetag = $this->stack->pop();
// NOTE(review): string-form assert() is deprecated since PHP 7.2 and is
// no longer evaluated as code in PHP 8, so this check never fires there.
122 assert('$closetag != false');
123 $retvar .= "</$closetag>\n";
126 // if list type isn't the same,
127 // back up one more and push new tag
128 if ($tag && $tag != $this->stack->top()) {
129 $closetag = $this->stack->pop();
130 $retvar .= "</$closetag>" . StartTag($tag, $args) . "\n";
131 $this->stack->push($tag);
134 } else {// $level > $this->stack->cnt()
135 // Test for and close top level elements which are not allowed to contain
136 // other block-level elements.
137 if ($this->stack->cnt() == 1 and
138 preg_match('/^(p|pre|h\d)$/i', $this->stack->top()))
140 $closetag = $this->stack->pop();
141 $retvar .= "</$closetag>";
144 // we add the diff to the stack
145 // stack might be zero
146 if ($this->stack->cnt() < $level) {
// Fill every missing intermediate level (all but the last) with a <dl>.
147 while ($this->stack->cnt() < $level - 1) {
148 // This is a bit of a hack:
150 // We're not nested deep enough, and have to make up some kind of block
151 // element to nest within.
153 // Currently, this can only happen for nested list element
154 // (either <ul> <ol> or <dl>). What we used to do here is
155 // to open extra lists of whatever type was requested.
156 // This would result in invalid HTML, since a list is
157 // not allowed to contain another list without first containing
158 // a list item. ("<ul><ul><li>Item</ul></ul>" is invalid.)
160 // So now, when we need extra list elements, we use a <dl>, and
161 // open it with an empty <dd>.
162 $retvar .= "<dl><dd>";
163 $this->stack->push('dl');
// Finally open the requested element itself at depth $level.
166 $retvar .= StartTag($tag, $args) . "\n";
167 $this->stack->push($tag);
// Tokenize the markup so later transformer functions leave it untouched.
171 return $this->token($retvar);
176 // work horse and main loop
177 // this function does the transform from wiki markup to HTML
// Transform wiki markup into HTML.
//
// $html    ... HTML generated so far; the transformed lines are appended
// $content ... wiki markup as an array of lines
//
// Returns $html with all tokens substituted back (see untokenize()).
// Resets $this->replacements and $this->user_data on every call.
//
// NOTE(review): some short lines of this method (loop braces and the
// statements that follow the bare if-conditions) are not visible in this
// excerpt.
178 function do_transform($html, $content)
// NOTE(review): $FieldSeparator is not used in the visible lines below.
180 global $FieldSeparator;
182 $this->content = $content;
183 $this->replacements = array();
184 $this->user_data = array();
186 // Loop over all lines of the page and apply transformation rules
187 $numlines = count($this->content);
188 for ($lnum = 0; $lnum < $numlines; $lnum++)
191 $this->linenumber = $lnum;
192 $line = $this->content[$lnum];
194 // blank lines clear the current mode (to force new paragraph)
195 if (!strlen($line) || $line == "\r") {
196 $html .= $this->SetHTMLMode('', 0);
202 // main loop applying all registered functions
203 // tokenizers, markup, html mode, ...
204 // functions are executed in order of registering
205 for (reset($this->trfrm_func);
206 list($flags, $func, $regexp) = current($this->trfrm_func);
207 next($this->trfrm_func)) {
209 // if HTMLmode is already set then skip all following
210 // WT_MODE_MARKUP functions
211 if ($this->mode_set && ($flags & WT_MODE_MARKUP) != 0)
// The regexp is interpolated between '/' delimiters, so a registered
// pattern must not contain an unescaped '/'.
214 if (!empty($regexp) && !preg_match("/$regexp/", $line))
217 // call registered function
// Tokenizers go through tokenize(), which calls $func once per match;
// every other function is called once with the whole line.
218 if (($flags & WT_TOKENIZER) != 0)
219 $line = $this->tokenize($line, $regexp, $func);
221 $line = $func($line, $this);
224 $html .= $line . "\n";
// Close every block-level element that is still open.
227 $html .= $this->SetHTMLMode('', 0);
229 return $this->untokenize($html);
231 // end do_transform()
233 // Register a new token.
// Store $repl as a replacement string and return the placeholder token
// that stands for it: the index of the new entry in $this->replacements,
// wrapped in $FieldSeparator on both sides.
function token($repl) {
   global $FieldSeparator;

   $index = count($this->replacements);
   $this->replacements[] = $repl;

   return $FieldSeparator . $index . $FieldSeparator;
}
241 // helper function which does actual tokenizing
// Replace every match of $pattern in $str with a token.
//
// $str     ... the line to scan
// $pattern ... regexp without delimiters; it is interpolated between '/'
//              delimiters, so it must not contain an unescaped '/'
// $func    ... name of the tokenizer function; called as
//              $func($matched_text, $this), its return value becomes the
//              replacement stored for the token
//
// Returns the tokenized string. Text between matches is copied unchanged.
function tokenize($str, $pattern, $func) {
   $new = '';

   while (preg_match("/^(.*?)($pattern)/", $str, $matches)) {
      if ($matches[0] === '') {
         // A pattern that matches the empty string at the very start
         // consumes nothing, which would loop forever; stop scanning.
         break;
      }
      // Drop the consumed prefix, then keep the unmatched lead-in and
      // substitute a token for the matched text.
      $str = substr($str, strlen($matches[0]));
      $new .= $matches[1] . $this->token($func($matches[2], $this));
   }

   // Whatever is left contains no further match.
   return $new . $str;
}
// Substitute the stored replacement strings back for their tokens.
//
// A token is an index into $this->replacements wrapped in $FieldSeparator
// (see token()), so after splitting on the separator the odd-numbered
// chunks are token indexes and the even-numbered chunks are plain text.
//
// NOTE(review): the lines that reset $line, assign $tok and return the
// result are not visible in this excerpt; $tok is presumably $chunks[$i].
254 function untokenize($line) {
255 global $FieldSeparator;
// Note that a single space is appended to the input before splitting.
257 $chunks = explode ($FieldSeparator, "$line ");
// Visit only the odd-numbered chunks (the token indexes).
259 for ($i = 1; $i < count($chunks); $i += 2)
262 $line .= $this->replacements[$tok] . $chunks[$i + 1];
267 // end class WikiTransform
270 //////////////////////////////////////////////////////////
// Build the transformer, register every transformer function and run it
// over the page content. $AllowedProtocols, $InterWikiLinkRegexp,
// $WikiNameRegexp, $pagehash and the initial $html come from outside this
// section of the file.
272 $transform = new WikiTransform;
274 // register functions
275 // functions are applied in order of registering
// Tokenizers come first: whatever they match is replaced by a token and
// is therefore protected from the markup functions registered after them.
277 $transform->register(WT_TOKENIZER, 'wtt_doublebrackets', '\[\[');
278 $transform->register(WT_TOKENIZER, 'wtt_footnotes', '^\[\d+\]');
279 $transform->register(WT_TOKENIZER, 'wtt_footnoterefs', '\[\d+\]');
280 $transform->register(WT_TOKENIZER, 'wtt_bracketlinks', '\[.+?\]');
281 $transform->register(WT_TOKENIZER, 'wtt_urls',
282 "!?\b($AllowedProtocols):[^\s<>\[\]\"'()]*[^\s<>\[\]\"'(),.?]");
// Interwiki links and tables are optional: they are registered only when
// their functions have been defined.
284 if (function_exists('wtt_interwikilinks')) {
285 $transform->register(WT_TOKENIZER, 'wtt_interwikilinks',
286 pcre_fix_posix_classes("!?(?<![[:alnum:]])") .
287 "$InterWikiLinkRegexp:$WikiNameRegexp");
289 $transform->register(WT_TOKENIZER, 'wtt_bumpylinks', "!?$WikiNameRegexp");
291 if (function_exists('wtm_table')) {
292 $transform->register(WT_MODE_MARKUP, 'wtm_table', '^\|');
// Simple markup: each function is called once for every non-blank line.
294 $transform->register(WT_SIMPLE_MARKUP, 'wtm_htmlchars');
295 $transform->register(WT_SIMPLE_MARKUP, 'wtm_linebreak');
296 $transform->register(WT_SIMPLE_MARKUP, 'wtm_bold_italics');
297 $transform->register(WT_SIMPLE_MARKUP, 'wtm_title_search');
298 $transform->register(WT_SIMPLE_MARKUP, 'wtm_fulltext_search');
299 $transform->register(WT_SIMPLE_MARKUP, 'wtm_mostpopular');
// Mode markup: once one of these has called SetHTMLMode() for a line, the
// rest are skipped. wtm_paragraph always sets a mode, so it must be last.
301 $transform->register(WT_MODE_MARKUP, 'wtm_list_ul');
302 $transform->register(WT_MODE_MARKUP, 'wtm_list_ol');
303 $transform->register(WT_MODE_MARKUP, 'wtm_list_dl');
304 $transform->register(WT_MODE_MARKUP, 'wtm_preformatted');
305 $transform->register(WT_MODE_MARKUP, 'wtm_headings');
306 $transform->register(WT_MODE_MARKUP, 'wtm_hr');
307 $transform->register(WT_MODE_MARKUP, 'wtm_paragraph');
309 $html = $transform->do_transform($html, $pagehash['content']);
312 Requirements for functions registered to WikiTransform:
314 Signature: function wtm_xxxx($line, &$transform)
316 $line ... current line containing wiki markup
317 (Note: it may already contain HTML from other transform functions)
318 &$transform ... WikiTransform object -- public variables of this
319 object and their use see above.
321 Functions have to return $line (doesn't matter if modified or not)
322 All conversion should take place inside $line.
324 Tokenizer functions (WT_TOKENIZER) have a different signature: they are
325 called as wtt_xxxx($match, &$transform), once for each match of their
326 regexp, and return the replacement string. tokenize() stores it with
327 token(); back substitution of tokenized strings is done by do_transform().
332 //////////////////////////////////////////////////////////
333 // Tokenizer functions
// Tokenizer registered for the pattern '\[\[' (two opening brackets).
// $match is the matched text, $trfrm the WikiTransform object.
// NOTE(review): the body of this function is not visible in this excerpt.
336 function wtt_doublebrackets($match, &$trfrm)
// Tokenizer for a footnote definition: "[n]" at the start of a line
// (registered with the pattern '^\[\d+\]').
//
// $match ... the matched text, e.g. "[1]"
// $trfrm ... the WikiTransform object; $trfrm->user_data['footnotes'][n]
//            holds the anchors collected by wtt_footnoterefs()
//
// NOTE(review): several lines are not visible in this excerpt, among them
// the initialisation of $html and $fntext and the end of the loop.
341 function wtt_footnotes($match, &$trfrm)
343 // FIXME: should this set HTML mode?
// Strip the brackets and force the footnote number to a number.
344 $ftnt = trim(substr($match,1,-1)) + 0;
348 $fnlist = $trfrm->user_data['footnotes'][$ftnt];
// No list of referring anchors recorded for this footnote.
349 if (!is_array($fnlist))
350 return $html . $fntext;
// Replace the anchor list with a marker; wtt_footnoterefs() treats any
// non-array value here as "definition already seen".
352 $trfrm->user_data['footnotes'][$ftnt] = 'footnote_seen';
// One link per recorded reference anchor.
// NOTE(review): each() is deprecated since PHP 7.2 and removed in PHP 8;
// this loop needs a foreach on current PHP versions.
354 while (list($k, $anchor) = each($fnlist))
356 $html .= Element("a", array("name" => "footnote-$ftnt",
357 "href" => "#$anchor",
358 "class" => "footnote-rev"),
// Tokenizer for a footnote reference "[n]" (registered with the pattern
// '\[\d+\]'). Returns a superscripted link to "#footnote-n".
//
// Until the definition of footnote n has been seen, every reference also
// gets its own anchor name, which is recorded in
// $trfrm->user_data['footnotes'][n] for wtt_footnotes() to link back to.
//
// NOTE(review): some short lines are not visible in this excerpt.
365 function wtt_footnoterefs($match, &$trfrm)
// Strip the brackets and force the footnote number to a number.
367 $ftnt = trim(substr($match,1,-1)) + 0;
369 $footnote_definition_seen = false;
371 if (empty($trfrm->user_data['footnotes']))
372 $trfrm->user_data['footnotes'] = array();
373 if (empty($trfrm->user_data['footnotes'][$ftnt]))
374 $trfrm->user_data['footnotes'][$ftnt] = array();
// A non-array entry is the marker left behind by wtt_footnotes().
375 else if (!is_array($trfrm->user_data['footnotes'][$ftnt]))
376 $footnote_definition_seen = true;
379 $args['href'] = "#footnote-$ftnt";
380 if (!$footnote_definition_seen)
// Anchor name: "footrev-<n>-<number of references recorded so far>".
382 $args['name'] = "footrev-$ftnt-" .
383 count($trfrm->user_data['footnotes'][$ftnt]);
384 $trfrm->user_data['footnotes'][$ftnt][] = $args['name'];
387 return Element('sup', array('class' => 'footnote'),
388 QElement("a", $args, "[$ftnt]"));
// Tokenizer for bracketed links: hand the matched "[...]" text to
// ParseAndLink() and return the "link" entry of its result.
function wtt_bracketlinks($match, &$trfrm)
{
   $parsed = ParseAndLink($match);

   return $parsed["link"];
}
399 // replace all URL's with tokens, so we don't confuse them
400 // with Wiki words later. Wiki words in URL's break things.
401 // URLs preceded by a '!' are not linked
// Tokenizer for URLs.
//
// A match that starts with '!' is returned as HTML-escaped plain text
// without the '!'; any other match is turned into a link by LinkURL().
function wtt_urls($match, &$trfrm)
{
   $unlinked = ($match[0] == "!");

   return $unlinked
      ? htmlspecialchars(substr($match, 1))
      : LinkURL($match);
}
409 // Link Wiki words (BumpyText)
410 // Wikiwords preceded by a '!' are not linked
// Tokenizer for wiki words (BumpyText).
//
// A match that starts with '!' is returned as HTML-escaped plain text
// without the '!'. Any other match becomes a link: to the existing page
// when IsWikiPage() finds it, otherwise an "unknown page" link.
function wtt_bumpylinks($match, &$trfrm)
{
   // The database handle is a global; without this declaration
   // IsWikiPage() would be handed an undefined local variable.
   global $dbi;

   if ($match[0] == "!")
      return htmlspecialchars(substr($match,1));

   // FIXME: make a LinkWikiWord() function?
   if (IsWikiPage($dbi, $match))
      return LinkExistingWikiWord($match);

   return LinkUnknownWikiWord($match);
}
422 // end of tokenizer functions
423 //////////////////////////////////////////////////////////
426 //////////////////////////////////////////////////////////
427 // basic simple markup functions
429 // escape HTML metachars
// Escape the HTML metacharacters '&', '>' and '<' in $line.
//
// '&' is replaced first so that the ampersands introduced by the other
// two entities are not escaped again. Quotes are left untouched.
// Returns the escaped line; $transformer is not used.
function wtm_htmlchars($line, &$transformer)
{
   $line = str_replace('&', '&amp;', $line);
   $line = str_replace('>', '&gt;', $line);
   $line = str_replace('<', '&lt;', $line);
   return $line;
}
439 // %%% are linebreaks
// Turn every "%%%" marker in $line into an HTML line break.
// Returns the modified line; $transformer is not used.
function wtm_linebreak($line, &$transformer) {
   $marker = '%%%';
   $break_tag = '<br>';

   return str_replace($marker, $break_tag, $line);
}
// Inline emphasis: __text__ becomes <strong>text</strong> and ''text''
// becomes <em>text</em>. The bold rule is applied before the italic rule.
// Returns the modified line; $transformer is not used.
function wtm_bold_italics($line, &$transformer) {
   $patterns = array(
      '|(__)(.*?)(__)|',
      "|('')(.*?)('')|",
   );
   $replacements = array(
      '<strong>\2</strong>',
      '<em>\2</em>',
   );

   // With array arguments preg_replace() applies the pairs one after the
   // other, each to the result of the previous one.
   return preg_replace($patterns, $replacements, $line);
}
453 //////////////////////////////////////////////////////////
454 // some tokens to be replaced by (dynamic) content
456 // wiki token: title search dialog
// Replace the wiki token "%%Search%%" with whatever LinkPhpwikiURL()
// returns for the title-search URL below. Lines without the token are not
// changed by the visible code.
// NOTE(review): the remaining arguments of the LinkPhpwikiURL() call and
// the end of the function are not visible in this excerpt.
457 function wtm_title_search($line, &$transformer) {
458 if (strpos($line, '%%Search%%') !== false) {
459 $html = LinkPhpwikiURL(
460 "phpwiki:?action=search&searchterm=()&searchtype=title",
463 $line = str_replace('%%Search%%', $html, $line);
468 // wiki token: fulltext search dialog
// Replace the wiki token "%%Fullsearch%%" with whatever LinkPhpwikiURL()
// returns for the full-text-search URL below. Lines without the token are
// not changed by the visible code.
// NOTE(review): the remaining arguments of the LinkPhpwikiURL() call and
// the end of the function are not visible in this excerpt.
469 function wtm_fulltext_search($line, &$transformer) {
470 if (strpos($line, '%%Fullsearch%%') !== false) {
471 $html = LinkPhpwikiURL(
472 "phpwiki:?action=search&searchterm=()&searchtype=full",
475 $line = str_replace('%%Fullsearch%%', $html, $line);
480 // wiki token: mostpopular list
// Replace the wiki token "%%Mostpopular%%" with a list of the most popular
// pages: one <DD> entry (hit count plus page link) per row returned by
// MostPopularNextMatch().
// NOTE(review): the lines that initialise and finish $html and the end of
// the function are not visible in this excerpt.
481 function wtm_mostpopular($line, &$transformer) {
// NOTE(review): $ScriptUrl is not used in the visible lines.
482 global $ScriptUrl, $dbi;
483 if (strpos($line, '%%Mostpopular%%') !== false) {
484 $query = InitMostPopular($dbi, MOST_POPULAR_LIST_LENGTH);
486 while ($qhash = MostPopularNextMatch($dbi, $query)) {
487 $html .= "<DD>$qhash[hits] ... " . LinkExistingWikiWord($qhash['pagename']) . "\n";
490 $line = str_replace('%%Mostpopular%%', $html, $line);
496 //////////////////////////////////////////////////////////
497 // mode markup functions
500 // tabless markup for unordered, ordered, and dictionary lists
501 // ul/ol list types can be mixed, so we only look at the last
502 // character. Changes e.g. from "**#*" to "###*" go unnoticed.
503 // and wouldn't make a difference to the HTML layout anyway.
505 // unordered lists <UL>: "*"
506 // has to be registered before list OL
// Mode markup for unordered list items.
//
// A line is a <ul> item when its leading run of list markers ('#', '*',
// ';') ends in '*'. The length of that run is the nesting level, and the
// whole run is removed from the line.
//
// The negative lookahead makes the pattern take the complete run: a line
// such as "**# item" (last marker '#') is left for wtm_list_ol instead of
// being matched with a shorter prefix. Stripping by length instead of by
// a second pattern keeps the removed prefix identical to the matched one,
// also when the prefix contains ';'.
//
// Returns the line prefixed with the SetHTMLMode() token and '<li>', or
// the unchanged line when it is not an unordered list item.
function wtm_list_ul($line, &$trfrm) {
   if (preg_match('/^[#*;]*\*(?![#*;])/', $line, $matches)) {
      $numtabs = strlen($matches[0]);
      $html = $trfrm->SetHTMLMode('ul', $numtabs) . '<li>';
      $line = $html . substr($line, $numtabs);
   }
   return $line;
}
517 // ordered lists <OL>: "#"
// Mode markup for ordered list items.
//
// A line is an <ol> item when its leading run of list markers ('#', '*',
// ';') ends in '#'. The length of that run is the nesting level, and the
// whole run is removed from the line.
//
// The negative lookahead makes the pattern take the complete run, so
// "#;term:def" (last marker ';') is left for wtm_list_dl and "#* item" is
// not claimed here regardless of registration order. Stripping by length
// keeps the removed prefix identical to the matched one, also when the
// prefix contains ';'.
//
// Returns the line prefixed with the SetHTMLMode() token and '<li>', or
// the unchanged line when it is not an ordered list item.
function wtm_list_ol($line, &$trfrm) {
   if (preg_match('/^[#*;]*#(?![#*;])/', $line, $matches)) {
      $numtabs = strlen($matches[0]);
      $html = $trfrm->SetHTMLMode('ol', $numtabs) . '<li>';
      $line = $html . substr($line, $numtabs);
   }
   return $line;
}
529 // definition lists <DL>: ";text:text"
// Mode markup for definition lists: ";term:definition".
//
// The leading run of list markers must end in ';'; its length is the
// nesting level. The text up to the first ':' is the term, the rest of
// the line is the definition. A <dt> is only emitted when the term is not
// blank.
//
// Returns the SetHTMLMode() token followed by the <dt>/<dd> markup, or
// the unchanged line when it is not a definition list item.
function wtm_list_dl($line, &$trfrm) {
   if (preg_match('/^([#*;]*;)(.*?):(.*$)/', $line, $matches)) {
      $numtabs = strlen($matches[1]);
      $line = $trfrm->SetHTMLMode('dl', $numtabs);
      // Compare against the empty string: a plain truth test would also
      // drop the term "0".
      if (trim($matches[2]) !== '')
         $line .= '<dt>' . $matches[2];
      $line .= '<dd>' . $matches[3];
   }
   return $line;
}
541 // mode: preformatted text, i.e. <pre>
// Mode markup for preformatted text: a line that starts with whitespace
// is put into <pre> mode. The line itself, including its leading
// whitespace, is kept as is. Other lines are returned unchanged.
function wtm_preformatted($line, &$trfrm) {
   $starts_with_space = preg_match("/^\s+/", $line);

   if (!$starts_with_space)
      return $line;

   return $trfrm->SetHTMLMode('pre') . $line;
}
549 // mode: headings, i.e. <h1>, <h2>, <h3>
550 // lines starting with !,!!,!!! are headings
// Mode markup for headings. A line starting with one, two or three '!'
// followed by some other character becomes an <h3>, <h2> or <h1> heading
// respectively. All leading '!' are removed from the line. Other lines
// are returned unchanged.
function wtm_headings($line, &$trfrm) {
   if (!preg_match("/^(!{1,3})[^!]/", $line, $whichheading))
      return $line;

   // More exclamation marks mean a more important heading.
   $tag_for_marks = array('!' => 'h3', '!!' => 'h2', '!!!' => 'h1');
   $heading = $tag_for_marks[$whichheading[1]];

   $text = preg_replace("/^!+/", '', $line);

   return $trfrm->SetHTMLMode($heading) . $text;
}
// Mode markup for table rows (registered for lines starting with '|').
//
// The line is consumed cell by cell. Each cell starts with one or more
// '|', optionally followed by 'v' characters and one alignment character
// ('<', '>' or '^'), then the cell text up to the next '|'.
// The table tags are stored as tokens; the cell text stays in the line.
//
// NOTE(review): many short lines of this function are not visible in this
// excerpt (initialisation of $row and $td, the first alignment test, the
// cell text handling and the table attributes).
563 function wtm_table($line, &$trfrm)
// $m[1] = run of '|', $m[2] = run of 'v', $m[3] = alignment, $m[4] = text
566 while (preg_match('/^(\|+)(v*)([<>^]?)([^|]*)/', $line, $m))
// Drop the cell just matched from the front of the line.
568 $line = substr($line, strlen($m[0]));
// More than one '|' means the cell spans that many columns.
571 if (strlen($m[1]) > 1)
572 $td['colspan'] = strlen($m[1]);
// Each 'v' adds one extra row to the span.
573 if (strlen($m[2]) > 0)
574 $td['rowspan'] = strlen($m[2]) + 1;
577 $td['align'] = 'center';
578 else if ($m[3] == '>')
579 $td['align'] = 'right';
581 $td['align'] = 'left';
583 $row .= $trfrm->token(StartTag('td', $td) . " ");
585 $row .= $trfrm->token(" </td>");
// The loop is expected to have consumed the whole line.
587 assert(empty($line));
588 $row = $trfrm->token("<tr>") . $row . $trfrm->token("</tr>");
590 return $trfrm->SetHTMLMode(array('table',
591 array(//'align' => 'left',
598 // four or more dashes to <hr>
599 // Note this is of type WT_MODE_MARKUP because <hr>'s aren't
600 // allowed within <p>'s. (e.g. "<p><hr></p>" is not valid HTML.)
// Mode markup for horizontal rules: a line starting with four or more
// dashes. All open block-level elements are closed before the <hr>.
// Text after the dashes ($m[1]) is appended in paragraph mode.
// NOTE(review): the line between the two statements below is not visible
// in this excerpt; it is presumably a test for non-empty $m[1] -- confirm.
601 function wtm_hr($line, &$trfrm) {
602 if (preg_match('/^-{4,}(.*)$/', $line, $m)) {
603 $line = $trfrm->SetHTMLMode('', 0) . '<hr>';
605 $line .= $trfrm->SetHTMLMode('p') . $m[1];
610 // default mode: simple text paragraph
// Default mode markup: put the line into paragraph mode.
// This function sets a mode for every line it is called with, so it only
// makes sense as the last registered WT_MODE_MARKUP function.
function wtm_paragraph($line, &$trfrm) {
   $mode_token = $trfrm->SetHTMLMode('p');

   return $mode_token . $line;
}
619 // c-file-style: "ellemtel"