1 <?php rcs_id('$Id: BlockParser.php,v 1.33 2003-02-18 03:59:11 dairiki Exp $');
2 /* Copyright (C) 2002, Geoffrey T. Dairiki <dairiki@dairiki.org>
4 * This file is part of PhpWiki.
6 * PhpWiki is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * PhpWiki is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with PhpWiki; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 require_once('lib/HtmlElement.php');
21 require_once('lib/InlineParser.php');
23 ////////////////////////////////////////////////////////////////
33 // FIXME: unify this with the RegexpSet in InlineParser.
36 * Return type from AnchoredRegexpSet::match and AnchoredRegexpSet::nextMatch.
40 class AnchoredRegexpSet_match {
47 * The text following the matched text.
52 * Index of the regular expression which matched.
58 * A set of regular expressions.
60 * In this file it is used by ParsedBlock to pick the block type for a line.
62 class AnchoredRegexpSet
66 * @param $regexps array A list of regular expressions. The
67 * regular expressions should not include any sub-pattern groups
68 * "(...)". (Anonymous groups, like "(?:...)", as well as
69 * look-ahead and look-behind assertions are fine.)
71 function AnchoredRegexpSet ($regexps) {
72 $this->_regexps = $regexps;
73 $this->_re = "/((" . join(")|(", $regexps) . "))/Ax";
77 * Search text for the next matching regexp from the Regexp Set.
79 * @param $text string The text to search.
81 * @return object An AnchoredRegexpSet_match object, or false if no match.
83 function match ($text) {
84 if (! preg_match($this->_re, $text, $m)) {
88 $match = new AnchoredRegexpSet_match;
89 $match->postmatch = substr($text, strlen($m[0]));
90 $match->match = $m[1];
91 $match->regexp_ind = count($m) - 3;
96 * Search for next matching regexp.
98 * Here, 'next' has two meanings:
100 * Match the next regexp(s) in the set, at the same position as the last match.
102 * If that fails, match the whole RegexpSet, starting after the position of the
105 * @param $text string Text to search.
107 * @param $prevMatch An AnchoredRegexpSet_match object
109 * $prevMatch should be a match object obtained by a previous
110 * match upon the same value of $text.
112 * @return object An AnchoredRegexpSet_match object, or false if no match.
114 function nextMatch ($text, $prevMatch) {
115 // Try to find match at same position.
116 $regexps = array_slice($this->_regexps, $prevMatch->regexp_ind + 1);
121 $pat= "/ ( (" . join(')|(', $regexps) . ") ) /Axs";
123 if (! preg_match($pat, $text, $m)) {
127 $match = new AnchoredRegexpSet_match;
128 $match->postmatch = substr($text, strlen($m[0]));
129 $match->match = $m[1];
130 $match->regexp_ind = count($m) - 3 + $prevMatch->regexp_ind + 1;;
137 class BlockParser_Input {
139 function BlockParser_Input ($text) {
141 // Expand leading tabs.
142 // FIXME: do this better.
144 // We want to ensure the only characters matching \s are ' ' and "\n".
146 $text = preg_replace('/(?![ \n])\s/', ' ', $text);
147 assert(!preg_match('/(?![ \n])\s/', $text));
149 $this->_lines = preg_split('/[^\S\n]*\n/', $text);
151 $this->_atSpace = true;
154 function skipSpace () {
155 // For top-level input, the end of file looks like a space.
156 // (The last block is not of class tight-bottom.)
157 $nlines = count($this->_lines);
159 if ($this->_pos >= $nlines) {
160 $this->_atSpace = true;
163 if ($this->_lines[$this->_pos] != '')
166 $this->_atSpace = true;
168 return $this->_atSpace;
171 function currentLine () {
172 if ($this->_pos >= count($this->_lines)) {
175 return $this->_lines[$this->_pos];
178 function nextLine () {
179 $this->_atSpace = $this->_lines[$this->_pos++] === '';
180 if ($this->_pos >= count($this->_lines)) {
183 return $this->_lines[$this->_pos];
186 function advance () {
187 $this->_atSpace = $this->_lines[$this->_pos++] === '';
191 return array($this->_pos, $this->_atSpace);
194 function setPos ($pos) {
195 list($this->_pos, $this->_atSpace) = $pos;
198 function getPrefix () {
202 function getDepth () {
207 if ($this->_pos < count($this->_lines))
208 return $this->_lines[$this->_pos];
213 function _debug ($tab, $msg) {
215 $where = $this->where();
216 $tab = str_repeat('____', $this->getDepth() ) . $tab;
217 printXML(HTML::div("$tab $msg: at: '",
223 class BlockParser_InputSubBlock extends BlockParser_Input
225 function BlockParser_InputSubBlock (&$input, $prefix_re, $initial_prefix = false) {
226 $this->_input = &$input;
227 $this->_prefix_pat = "/$prefix_re|\\s*\$/Ax";
228 $this->_atSpace = false;
230 if (($line = $input->currentLine()) === false)
231 $this->_line = false;
232 elseif ($initial_prefix) {
233 assert(substr($line, 0, strlen($initial_prefix)) == $initial_prefix);
234 $this->_line = (string) substr($line, strlen($initial_prefix));
235 $this->_atBlank = ! ltrim($line);
237 elseif (preg_match($this->_prefix_pat, $line, $m)) {
238 $this->_line = (string) substr($line, strlen($m[0]));
239 $this->_atBlank = ! ltrim($line);
242 $this->_line = false;
245 function skipSpace () {
246 // In contrast to the case for top-level blocks,
247 // for sub-blocks, there never appears to be any trailing space.
248 // (The last block in the sub-block should always be of class tight-bottom.)
249 while ($this->_line === '')
252 if ($this->_line === false)
253 return $this->_atSpace == 'strong_space';
255 return $this->_atSpace;
258 function currentLine () {
262 function nextLine () {
263 if ($this->_line === '')
264 $this->_atSpace = $this->_atBlank ? 'weak_space' : 'strong_space';
266 $this->_atSpace = false;
268 $line = $this->_input->nextLine();
269 if ($line !== false && preg_match($this->_prefix_pat, $line, $m)) {
270 $this->_line = (string) substr($line, strlen($m[0]));
271 $this->_atBlank = ! ltrim($line);
274 $this->_line = false;
279 function advance () {
284 return array($this->_line, $this->_atSpace, $this->_input->getPos());
287 function setPos ($pos) {
288 $this->_line = $pos[0];
289 $this->_atSpace = $pos[1];
290 $this->_input->setPos($pos[2]);
293 function getPrefix () {
294 assert ($this->_line !== false);
295 $line = $this->_input->currentLine();
296 assert ($line !== false && strlen($line) >= strlen($this->_line));
297 return substr($line, 0, strlen($line) - strlen($this->_line));
300 function getDepth () {
301 return $this->_input->getDepth() + 1;
305 return $this->_input->where();
310 class Block_HtmlElement extends HtmlElement
312 function Block_HtmlElement($tag /*, ... */) {
313 $this->_init(func_get_args());
317 function setTightness($top, $bottom) {
318 $class = (string) $this->getAttr('class');
320 $class .= " tight-top";
322 $class .= " tight-bottom";
323 $class = ltrim($class);
325 $this->setAttr('class', $class);
329 class ParsedBlock extends Block_HtmlElement {
331 function ParsedBlock (&$input, $tag = 'div', $attr = false) {
332 $this->Block_HtmlElement($tag, $attr);
333 $this->_initBlockTypes();
334 $this->_parse($input);
337 function _parse (&$input) {
338 for ($block = $this->_getBlock($input); $block; $block = $nextBlock) {
339 while ($nextBlock = $this->_getBlock($input)) {
340 // Attempt to merge current with following block.
341 if (! ($merged = $block->merge($nextBlock)) ) {
342 break; // can't merge
346 $this->pushContent($block->finish());
351 function _initBlockTypes () {
352 foreach (array('oldlists', 'list', 'dl', 'table_dl',
353 'blockquote', 'heading', 'hr', 'pre', 'email_blockquote',
356 $class = "Block_$type";
358 $this->_block_types[] = $proto;
359 $this->_regexps[] = $proto->_re;
361 $this->_regexpset = new AnchoredRegexpSet($this->_regexps);
364 function _getBlock (&$input) {
365 $this->_atSpace = $input->skipSpace();
367 if (! ($line = $input->currentLine()) )
370 $tight_top = !$this->_atSpace;
371 $re_set = &$this->_regexpset;
372 for ($m = $re_set->match($line); $m; $m = $re_set->nextMatch($line, $m)) {
373 $block = $this->_block_types[$m->regexp_ind];
374 //$input->_debug('>', get_class($block));
376 if ($block->_match($input, $m)) {
377 //$input->_debug('<', get_class($block));
378 $tight_bottom = ! $input->skipSpace();
379 $block->_setTightness($tight_top, $tight_bottom);
382 //$input->_debug('[', "_match failed");
385 trigger_error("Couldn't match block: '$line'", E_USER_NOTICE);
390 class WikiText extends ParsedBlock {
391 function WikiText ($text) {
392 $input = new BlockParser_Input($text);
393 $this->ParsedBlock($input);
397 class SubBlock extends ParsedBlock {
398 function SubBlock (&$input, $indent_re, $initial_indent = false,
399 $tag = 'div', $attr = false) {
400 $subinput = new BlockParser_InputSubBlock($input, $indent_re, $initial_indent);
401 $this->ParsedBlock($subinput, $tag, $attr);
406 * TightSubBlock is for use in parsing list item bodies.
408 * If the sub-block consists of a single paragraph, it omits
409 * the paragraph element.
411 * We go to this trouble so that "tight" lists look somewhat reasonable
412 * in older (non-CSS) browsers. (If you don't do this, then, without
413 * CSS, you only get "loose" lists.)
415 class TightSubBlock extends SubBlock {
416 function TightSubBlock (&$input, $indent_re, $initial_indent = false,
417 $tag = 'div', $attr = false) {
418 $this->SubBlock($input, $indent_re, $initial_indent, $tag, $attr);
420 // If content is a single paragraph, eliminate the paragraph...
421 if (count($this->_content) == 1) {
422 $elem = $this->_content[0];
423 if ($elem->getTag() == 'p') {
424 assert($elem->getAttr('class') == 'tight-top tight-bottom');
425 $this->setContent($elem->getContent());
433 var $_tight_top = false;
434 var $_tight_bot = false;
436 function _match (&$input, $match) {
437 trigger_error('pure virtual', E_USER_ERROR);
440 function _setTightness ($top, $bot) {
441 $this->_tight_top = $top;
442 $this->_tight_bot = $bot;
445 function merge ($followingBlock) {
450 $this->_element->setTightness($this->_tight_top, $this->_tight_bot);
451 return $this->_element;
455 class Block_blockquote extends BlockMarkup
459 var $_re = '\ +(?=\S)';
461 function _match (&$input, $m) {
462 $this->_depth = strlen($m->match);
463 $indent = sprintf("\\ {%d}", $this->_depth);
464 $this->_element = new SubBlock($input, $indent, $m->match,
469 function merge ($nextBlock) {
470 if (get_class($nextBlock) == get_class($this)) {
471 assert ($nextBlock->_depth < $this->_depth);
472 $nextBlock->_element->unshiftContent($this->_element);
473 $nextBlock->_tight_top = $this->_tight_top;
480 class Block_list extends BlockMarkup
482 //var $_tag = 'ol' or 'ul';
488 | [*] (?! \S[^*]*(?<=\S)[*](?!\S) )
491 var $_content = array();
493 function _match (&$input, $m) {
494 // A list as the first content in a list is not allowed.
497 // Should markup as <ul><li>* Item</li></ul>,
498 // not <ul><li><ul><li>Item</li></ul></li></ul>.
500 if (preg_match('/[*#+-o]/', $input->getPrefix())) {
505 $indent = sprintf("\\ {%d}", strlen($prefix));
507 $bullet = trim($m->match);
508 $this->_tag = $bullet == '#' ? 'ol' : 'ul';
509 $this->_content[] = new TightSubBlock($input, $indent, $m->match, 'li');
513 function _setTightness($top, $bot) {
514 $li = &$this->_content[0];
515 $li->setTightness($top, $bot);
518 function merge ($nextBlock) {
519 if (isa($nextBlock, 'Block_list') && $this->_tag == $nextBlock->_tag) {
520 array_splice($this->_content, count($this->_content), 0,
521 $nextBlock->_content);
528 return new Block_HtmlElement($this->_tag, false, $this->_content);
532 class Block_dl extends Block_list
535 var $_re = '\ {0,4}\S.*(?<! ~):\s*$';
537 function _match (&$input, $m) {
538 if (!($p = $this->_do_match($input, $m)))
540 list ($term, $defn) = $p;
542 $this->_content[] = new Block_HtmlElement('dt', false, $term);
543 $this->_content[] = $defn;
547 function _setTightness($top, $bot) {
548 $dt = &$this->_content[0];
549 $dd = &$this->_content[1];
551 $dt->setTightness($top, false);
552 $dd->setTightness(false, $bot);
555 function _do_match (&$input, $m) {
556 $pos = $input->getPos();
558 $firstIndent = strspn($m->match, ' ');
559 $pat = sprintf('/\ {%d,%d}(?=\s*\S)/A', $firstIndent + 1, $firstIndent + 5);
563 $line = $input->currentLine();
565 if (!$line || !preg_match($pat, $line, $mm)) {
566 $input->setPos($pos);
567 return false; // No body found.
570 $indent = strlen($mm[0]);
571 $term = TransformInline(rtrim(substr(trim($m->match),0,-1)));
572 $defn = new TightSubBlock($input, sprintf("\\ {%d}", $indent), false, 'dd');
573 return array($term, $defn);
579 class Block_table_dl_defn extends XmlContent
584 function Block_table_dl_defn ($term, $defn) {
586 if (!is_array($defn))
587 $defn = $defn->getContent();
589 $this->_ncols = $this->_ComputeNcols($defn);
591 $this->_tight_top = false;
592 $this->_tight_bot = false;
593 $this->_atSpace = true;
595 foreach ($defn as $item) {
596 if ($this->_IsASubtable($item))
597 $this->_addSubtable($item);
599 $this->_addToRow($item);
603 $th = HTML::th($term);
604 if ($this->_nrows > 1)
605 $th->setAttr('rowspan', $this->_nrows);
606 $this->_setTerm($th);
609 function setTightness($top, $bot) {
610 $this->_content[0]->setTightness($top, false);
611 $this->_content[$this->_nrows-1]->setTightness(false, $bot);
612 $this->_tight_top = $top;
613 $this->_tight_bot = $bot;
616 function _addToRow ($item) {
617 if (empty($this->_accum)) {
618 $this->_accum = HTML::td();
619 if ($this->_ncols > 2)
620 $this->_accum->setAttr('colspan', $this->_ncols - 1);
622 $this->_accum->pushContent($item);
625 function _flushRow ($tight_bottom=false) {
626 if (!empty($this->_accum)) {
627 $row = new Block_HtmlElement('tr', false, $this->_accum);
629 $row->setTightness(!$this->_atSpace, $tight_bottom);
630 $this->_atSpace = !$tight_bottom;
632 $this->pushContent($row);
633 $this->_accum = false;
638 function _addSubtable ($table) {
639 if (!($table_rows = $table->getContent()))
642 $this->_flushRow($table_rows[0]->_tight_top);
644 foreach ($table_rows as $subdef) {
645 $this->pushContent($subdef);
646 $this->_nrows += $subdef->nrows();
647 $this->_atSpace = ! $subdef->_tight_bot;
651 function _setTerm ($th) {
652 $first_row = &$this->_content[0];
653 if (isa($first_row, 'Block_table_dl_defn'))
654 $first_row->_setTerm($th);
656 $first_row->unshiftContent($th);
659 function _ComputeNcols ($defn) {
661 foreach ($defn as $item) {
662 if ($this->_IsASubtable($item)) {
663 $row = $this->_FirstDefn($item);
664 $ncols = max($ncols, $row->ncols() + 1);
670 function _IsASubtable ($item) {
671 return isa($item, 'HtmlElement')
672 && $item->getTag() == 'table'
673 && $item->getAttr('class') == 'wiki-dl-table';
676 function _FirstDefn ($subtable) {
677 $defs = $subtable->getContent();
682 return $this->_ncols;
686 return $this->_nrows;
689 function setWidth ($ncols) {
690 assert($ncols >= $this->_ncols);
691 if ($ncols <= $this->_ncols)
693 $rows = &$this->_content;
694 for ($i = 0; $i < count($rows); $i++) {
696 if (isa($row, 'Block_table_dl_defn'))
697 $row->setWidth($ncols - 1);
699 $n = count($row->_content);
700 $lastcol = &$row->_content[$n - 1];
701 $lastcol->setAttr('colspan', $ncols - 1);
707 class Block_table_dl extends Block_dl
709 var $_tag = 'dl-table'; // phony.
711 var $_re = '\ {0,4} (?:\S.*)? (?<! ~) \| \s* $';
713 function _match (&$input, $m) {
714 if (!($p = $this->_do_match($input, $m)))
716 list ($term, $defn) = $p;
718 $this->_content[] = new Block_table_dl_defn($term, $defn);
722 function _setTightness($top, $bot) {
723 $this->_content[0]->setTightness($top, $bot);
728 $defs = &$this->_content;
731 foreach ($defs as $defn)
732 $ncols = max($ncols, $defn->ncols());
734 foreach ($defs as $key => $defn)
735 $defs[$key]->setWidth($ncols);
737 return HTML::table(array('class' => 'wiki-dl-table',
745 class Block_oldlists extends Block_list
747 //var $_tag = 'ol', 'ul', or 'dl';
748 var $_re = '(?: [*] (?! \S[^*]* (?<=\S) [*](?!\S) )
749 | [#] (?! \[ .*? \] )
753 function _match (&$input, $m) {
755 if (!preg_match('/[*#;]*$/A', $input->getPrefix())) {
761 $oldindent = '[*#;](?=[#*]|;.*:.*\S)';
762 $newindent = sprintf('\\ {%d}', strlen($prefix));
763 $indent = "(?:$oldindent|$newindent)";
765 $bullet = $prefix[0];
766 if ($bullet == '*') {
770 elseif ($bullet == '#') {
776 list ($term,) = explode(':', substr($prefix, 1), 2);
779 $this->_content[] = new Block_HtmlElement('dt', false,
780 TransformInline($term));
784 $this->_content[] = new TightSubBlock($input, $indent, $m->match, $itemtag);
788 function _setTightness($top, $bot) {
789 if (count($this->_content) == 1) {
790 $li = &$this->_content[0];
791 $li->setTightness($top, $bot);
794 assert(count($this->_content) == 2);
795 $dt = &$this->_content[0];
796 $dd = &$this->_content[1];
797 $dt->setTightness($top, false);
798 $dd->setTightness(false, $bot);
803 class Block_pre extends BlockMarkup
805 var $_re = '<(?:pre|verbatim)>';
807 function _match (&$input, $m) {
808 $endtag = '</' . substr($m->match, 1);
810 $pos = $input->getPos();
812 $line = $m->postmatch;
813 while (ltrim($line) != $endtag) {
815 if (($line = $input->nextLine()) === false) {
816 $input->setPos($pos);
822 $text = join("\n", $text);
824 // FIXME: no <img>, <big>, <small>, <sup>, or <sub>'s allowed
826 if ($m->match == '<pre>')
827 $text = TransformInline($text);
829 $this->_element = new Block_HtmlElement('pre', false, $text);
835 class Block_plugin extends Block_pre
837 var $_re = '<\?plugin(?:-form)?(?!\S)';
840 /* <?plugin Backlinks
844 function _match (&$input, $m) {
845 $pos = $input->getPos();
846 $pi = $m->match . $m->postmatch;
847 while (!preg_match('/(?<!~)\?>\s*$/', $pi)) {
848 if (($line = $input->nextLine()) === false) {
849 $input->setPos($pos);
857 $loader = new WikiPluginLoader;
858 $expansion = $loader->expandPI($pi, $request);
859 $this->_element = new Block_HtmlElement('div', array('class' => 'plugin'),
865 class Block_email_blockquote extends BlockMarkup
867 var $_attr = array('class' => 'mail-style-quote');
870 function _match (&$input, $m) {
871 //$indent = str_replace(' ', '\\ ', $m->match) . '|>$';
872 $indent = $this->_re;
873 $this->_element = new SubBlock($input, $indent, $m->match,
874 'blockquote', $this->_attr);
879 class Block_hr extends BlockMarkup
881 var $_re = '-{4,}\s*$';
883 function _match (&$input, $m) {
885 $this->_element = new Block_HtmlElement('hr');
889 function _setTightness($top, $bot) {
890 // Don't tighten <hr/>s
894 class Block_heading extends BlockMarkup
898 function _match (&$input, $m) {
899 $tag = "h" . (5 - strlen($m->match));
900 $text = TransformInline(trim($m->postmatch));
903 $this->_element = new Block_HtmlElement($tag, false, $text);
908 function _setTightness($top, $bot) {
909 // Don't tighten headers.
913 class Block_p extends BlockMarkup
918 function _match (&$input, $m) {
919 $this->_text = $m->match;
924 function merge ($nextBlock) {
925 $class = get_class($nextBlock);
926 if ($class == 'block_p' && $this->_tight_bot) {
927 $this->_text .= "\n" . $nextBlock->_text;
928 $this->_tight_bot = $nextBlock->_tight_bot;
935 $content = TransformInline(trim($this->_text));
936 $p = new Block_HtmlElement('p', false, $content);
937 $p->setTightness($this->_tight_top, $this->_tight_bot);
942 ////////////////////////////////////////////////////////////////
945 function TransformText ($text, $markup = 2.0) {
946 if (isa($text, 'WikiDB_PageRevision')) {
948 $text = $rev->getPackedContent();
949 $markup = $rev->get('markup');
952 if (empty($markup) || $markup < 2.0) {
953 //include_once("lib/transform.php");
954 //return do_transform($text);
955 $text = ConvertOldMarkup($text);
958 // Expand leading tabs.
959 $text = expand_tabs($text);
963 $output = new WikiText($text);
964 return new XmlContent($output->getContent());
967 // (c-file-style: "gnu")
972 // c-hanging-comment-ender-p: nil
973 // indent-tabs-mode: nil