1 <?php rcs_id('$Id: BlockParser.php,v 1.35 2003-02-21 04:09:11 dairiki Exp $');
2 /* Copyright (C) 2002, Geoffrey T. Dairiki <dairiki@dairiki.org>
4 * This file is part of PhpWiki.
6 * PhpWiki is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * PhpWiki is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with PhpWiki; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 require_once('lib/HtmlElement.php');
21 require_once('lib/CachedMarkup.php');
22 require_once('lib/InlineParser.php');
24 ////////////////////////////////////////////////////////////////
34 // FIXME: unify this with the RegexpSet in InlineParser.
/**
 * Result object returned by AnchoredRegexpSet::match() and
 * AnchoredRegexpSet::nextMatch().
 */
class AnchoredRegexpSet_match {
    /**
     * The matched text.
     */
    var $match;

    /**
     * The text following the matched text.
     */
    var $postmatch;

    /**
     * Index of the regular expression which matched.
     */
    var $regexp_ind;
}

/**
 * A set of regular expressions, tried as one anchored alternation.
 *
 * Every pattern is compiled with the PCRE "A" (anchored) modifier, so a
 * match can only begin at the very start of the text handed in.
 *
 * This class is probably only useful for InlineTransformer.
 */
class AnchoredRegexpSet
{
    /** Constructor
     *
     * @param $regexps array A list of regular expressions.  The
     * regular expressions should not include any sub-pattern groups
     * "(...)".  (Anonymous groups, like "(?:...)", as well as
     * look-ahead and look-behind assertions are fine.)  Each regexp
     * is wrapped in its own capturing group; the index of the last
     * captured group is what identifies which regexp matched.
     */
    function AnchoredRegexpSet ($regexps) {
        $this->_regexps = $regexps;
        $this->_re = "/((" . join(")|(", $regexps) . "))/Ax";
    }

    /**
     * Match the start of text against the whole set.
     *
     * @param $text string The text to search.
     *
     * @return object An AnchoredRegexpSet_match object, or false if no match.
     */
    function match ($text) {
        if (preg_match($this->_re, $text, $groups)) {
            return $this->_buildMatch($text, $groups, 0);
        }
        return false;
    }

    /**
     * Try the regexps which follow the previously matched one.
     *
     * Only the regexps listed after $prevMatch->regexp_ind are tried,
     * again anchored at the start of $text.
     *
     * @param $text string Text to search.
     *
     * @param $prevMatch An AnchoredRegexpSet_match object.
     * $prevMatch should be a match object obtained by a previous
     * match upon the same value of $text.
     *
     * @return object An AnchoredRegexpSet_match object, or false if no match.
     */
    function nextMatch ($text, $prevMatch) {
        $offset = $prevMatch->regexp_ind + 1;
        $remaining = array_slice($this->_regexps, $offset);
        if (!$remaining) {
            // Nothing left to try after the previous match.
            return false;
        }

        $pattern = "/ ( (" . join(')|(', $remaining) . ") ) /Axs";
        if (preg_match($pattern, $text, $groups)) {
            return $this->_buildMatch($text, $groups, $offset);
        }
        return false;
    }

    /**
     * Turn a preg_match() result into a match object.
     *
     * preg_match() drops trailing groups that did not participate, so
     * count($groups) - 3 is the position of the alternative that
     * matched within the compiled pattern; $offset shifts that
     * position back to an index into the full regexp list.
     */
    function _buildMatch ($text, $groups, $offset) {
        $result = new AnchoredRegexpSet_match;
        $result->postmatch = substr($text, strlen($groups[0]));
        $result->match = $groups[1];
        $result->regexp_ind = count($groups) - 3 + $offset;
        return $result;
    }
}
138 class BlockParser_Input {
/**
 * Constructor: normalise whitespace in $text and split it into lines.
 *
 * Every whitespace character other than ' ' and "\n" is replaced by a
 * single space.  The text is then split on newlines, discarding any
 * whitespace that immediately precedes each newline, and leading
 * blank lines are removed.
 *
 * @param $text string The wiki text to parse.
 */
function BlockParser_Input ($text) {
    // Expand leading tabs.
    // FIXME: do this better.
    //
    // We want to ensure the only characters matching \s are ' ' and "\n".
    $text = preg_replace('/(?![ \n])\s/', ' ', $text);
    assert(!preg_match('/(?![ \n])\s/', $text));

    $this->_lines = preg_split('/[^\S\n]*\n/', $text);
    // NOTE(review): initial cursor position; skipSpace() and
    // currentLine() read $this->_pos -- confirm against the full source.
    $this->_pos = 0;

    // Strip leading blank lines.  Compare against '' explicitly: a
    // plain truthiness test would also discard a first line of "0",
    // which is real content.
    while ($this->_lines and $this->_lines[0] === '')
        array_shift($this->_lines);
    $this->_atSpace = false;
}
159 function skipSpace () {
160 $nlines = count($this->_lines);
162 if ($this->_pos >= $nlines) {
163 $this->_atSpace = false;
166 if ($this->_lines[$this->_pos] != '')
169 $this->_atSpace = true;
171 return $this->_atSpace;
174 function currentLine () {
175 if ($this->_pos >= count($this->_lines)) {
178 return $this->_lines[$this->_pos];
181 function nextLine () {
182 $this->_atSpace = $this->_lines[$this->_pos++] === '';
183 if ($this->_pos >= count($this->_lines)) {
186 return $this->_lines[$this->_pos];
189 function advance () {
190 $this->_atSpace = $this->_lines[$this->_pos++] === '';
194 return array($this->_pos, $this->_atSpace);
197 function setPos ($pos) {
198 list($this->_pos, $this->_atSpace) = $pos;
201 function getPrefix () {
205 function getDepth () {
210 if ($this->_pos < count($this->_lines))
211 return $this->_lines[$this->_pos];
216 function _debug ($tab, $msg) {
218 $where = $this->where();
219 $tab = str_repeat('____', $this->getDepth() ) . $tab;
220 printXML(HTML::div("$tab $msg: at: '",
226 class BlockParser_InputSubBlock extends BlockParser_Input
228 function BlockParser_InputSubBlock (&$input, $prefix_re, $initial_prefix = false) {
229 $this->_input = &$input;
230 $this->_prefix_pat = "/$prefix_re|\\s*\$/Ax";
231 $this->_atSpace = false;
233 if (($line = $input->currentLine()) === false)
234 $this->_line = false;
235 elseif ($initial_prefix) {
236 assert(substr($line, 0, strlen($initial_prefix)) == $initial_prefix);
237 $this->_line = (string) substr($line, strlen($initial_prefix));
238 $this->_atBlank = ! ltrim($line);
240 elseif (preg_match($this->_prefix_pat, $line, $m)) {
241 $this->_line = (string) substr($line, strlen($m[0]));
242 $this->_atBlank = ! ltrim($line);
245 $this->_line = false;
248 function skipSpace () {
249 // In contrast to the case for top-level blocks,
250 // for sub-blocks, there never appears to be any trailing space.
251 // (The last block in the sub-block should always be of class tight-bottom.)
252 while ($this->_line === '')
255 if ($this->_line === false)
256 return $this->_atSpace == 'strong_space';
258 return $this->_atSpace;
261 function currentLine () {
265 function nextLine () {
266 if ($this->_line === '')
267 $this->_atSpace = $this->_atBlank ? 'weak_space' : 'strong_space';
269 $this->_atSpace = false;
271 $line = $this->_input->nextLine();
272 if ($line !== false && preg_match($this->_prefix_pat, $line, $m)) {
273 $this->_line = (string) substr($line, strlen($m[0]));
274 $this->_atBlank = ! ltrim($line);
277 $this->_line = false;
282 function advance () {
287 return array($this->_line, $this->_atSpace, $this->_input->getPos());
290 function setPos ($pos) {
291 $this->_line = $pos[0];
292 $this->_atSpace = $pos[1];
293 $this->_input->setPos($pos[2]);
296 function getPrefix () {
297 assert ($this->_line !== false);
298 $line = $this->_input->currentLine();
299 assert ($line !== false && strlen($line) >= strlen($this->_line));
300 return substr($line, 0, strlen($line) - strlen($this->_line));
303 function getDepth () {
304 return $this->_input->getDepth() + 1;
308 return $this->_input->where();
313 class Block_HtmlElement extends HtmlElement
315 function Block_HtmlElement($tag /*, ... */) {
316 $this->_init(func_get_args());
320 function setTightness($top, $bottom) {
321 $this->setInClass('tightenable');
322 $this->setInClass('top', $top);
323 $this->setInClass('bottom', $bottom);
327 class ParsedBlock extends Block_HtmlElement {
329 function ParsedBlock (&$input, $tag = 'div', $attr = false) {
330 $this->Block_HtmlElement($tag, $attr);
331 $this->_initBlockTypes();
332 $this->_parse($input);
335 function _parse (&$input) {
336 for ($block = $this->_getBlock($input); $block; $block = $nextBlock) {
337 while ($nextBlock = $this->_getBlock($input)) {
338 // Attempt to merge current with following block.
339 if (! ($merged = $block->merge($nextBlock)) ) {
340 break; // can't merge
344 $this->pushContent($block->finish());
349 function _initBlockTypes () {
350 foreach (array('oldlists', 'list', 'dl', 'table_dl',
351 'blockquote', 'heading', 'hr', 'pre', 'email_blockquote',
354 $class = "Block_$type";
356 $this->_block_types[] = $proto;
357 $this->_regexps[] = $proto->_re;
359 $this->_regexpset = new AnchoredRegexpSet($this->_regexps);
/**
 * Parse the next block from $input.
 *
 * Skips blank lines, then tries each block type whose regexp matches
 * the current line, in order, until one accepts the input via its
 * _match() method.  The accepted block has its tightness set from
 * whether blank space preceded and follows it.
 *
 * @param $input object Input object; must provide skipSpace() and
 * currentLine().
 *
 * @return object The matched block, or false at end of input or when
 * no block type accepts the line (a notice is raised in that case).
 */
function _getBlock (&$input) {
    $this->_atSpace = $input->skipSpace();

    // Only false (no more input) or an empty string ends parsing.  A
    // plain truthiness test would also stop at a line of "0" and
    // silently drop everything after it.
    $line = $input->currentLine();
    if ($line === false || $line === '')
        return false;

    $tight_top = !$this->_atSpace;
    $re_set = &$this->_regexpset;
    for ($m = $re_set->match($line); $m; $m = $re_set->nextMatch($line, $m)) {
        $block = $this->_block_types[$m->regexp_ind];
        //$input->_debug('>', get_class($block));

        if ($block->_match($input, $m)) {
            //$input->_debug('<', get_class($block));
            $tight_bottom = ! $input->skipSpace();
            $block->_setTightness($tight_top, $tight_bottom);
            return $block;
        }
        //$input->_debug('[', "_match failed");
    }

    trigger_error("Couldn't match block: '$line'", E_USER_NOTICE);
    return false;
}
388 class WikiText extends ParsedBlock {
389 function WikiText ($text) {
390 $input = new BlockParser_Input($text);
391 $this->ParsedBlock($input);
395 class SubBlock extends ParsedBlock {
396 function SubBlock (&$input, $indent_re, $initial_indent = false,
397 $tag = 'div', $attr = false) {
398 $subinput = new BlockParser_InputSubBlock($input, $indent_re, $initial_indent);
399 $this->ParsedBlock($subinput, $tag, $attr);
404 * TightSubBlock is for use in parsing list item bodies.
406 * If the sub-block consists of a single paragraph, it omits
407 * the paragraph element.
409 * We go to this trouble so that "tight" lists look somewhat reasonable
410 * in older (non-CSS) browsers. (If you don't do this, then, without
411 * CSS, you only get "loose" lists.)
413 class TightSubBlock extends SubBlock {
414 function TightSubBlock (&$input, $indent_re, $initial_indent = false,
415 $tag = 'div', $attr = false) {
416 $this->SubBlock($input, $indent_re, $initial_indent, $tag, $attr);
418 // If content is a single paragraph, eliminate the paragraph...
419 if (count($this->_content) == 1) {
420 $elem = $this->_content[0];
421 if ($elem->getTag() == 'p') {
422 assert($elem->getAttr('class') == 'tightenable top bottom');
423 $this->setContent($elem->getContent());
432 function _match (&$input, $match) {
433 trigger_error('pure virtual', E_USER_ERROR);
436 function _setTightness ($top, $bot) {
437 $this->_element->setTightness($top, $bot);
440 function merge ($followingBlock) {
445 return $this->_element;
449 class Block_blockquote extends BlockMarkup
453 var $_re = '\ +(?=\S)';
455 function _match (&$input, $m) {
456 $this->_depth = strlen($m->match);
457 $indent = sprintf("\\ {%d}", $this->_depth);
458 $this->_element = new SubBlock($input, $indent, $m->match,
463 function merge ($nextBlock) {
464 if (get_class($nextBlock) == get_class($this)) {
465 assert ($nextBlock->_depth < $this->_depth);
466 $nextBlock->_element->unshiftContent($this->_element);
467 $nextBlock->_tight_top = $this->_tight_top;
474 class Block_list extends BlockMarkup
476 //var $_tag = 'ol' or 'ul';
482 | [*] (?! \S[^*]*(?<=\S)[*](?!\S) )
485 var $_content = array();
487 function _match (&$input, $m) {
488 // A list as the first content in a list is not allowed.
491 // Should markup as <ul><li>* Item</li></ul>,
492 // not <ul><li><ul><li>Item</li></ul></li></ul>.
494 if (preg_match('/[*#+-o]/', $input->getPrefix())) {
499 $indent = sprintf("\\ {%d}", strlen($prefix));
501 $bullet = trim($m->match);
502 $this->_tag = $bullet == '#' ? 'ol' : 'ul';
503 $this->_content[] = new TightSubBlock($input, $indent, $m->match, 'li');
507 function _setTightness($top, $bot) {
508 $li = &$this->_content[0];
509 $li->setTightness($top, $bot);
512 function merge ($nextBlock) {
513 if (isa($nextBlock, 'Block_list') && $this->_tag == $nextBlock->_tag) {
514 array_splice($this->_content, count($this->_content), 0,
515 $nextBlock->_content);
522 return new Block_HtmlElement($this->_tag, false, $this->_content);
526 class Block_dl extends Block_list
530 function Block_dl () {
531 $this->_re = '\ {0,4}\S.*(?<!'.ESCAPE_CHAR.'):\s*$';
534 function _match (&$input, $m) {
535 if (!($p = $this->_do_match($input, $m)))
537 list ($term, $defn, $loose) = $p;
539 $this->_content[] = new Block_HtmlElement('dt', false, $term);
540 $this->_content[] = $defn;
541 $this->_tight_defn = !$loose;
545 function _setTightness($top, $bot) {
546 $dt = &$this->_content[0];
547 $dd = &$this->_content[1];
549 $dt->setTightness($top, $this->_tight_defn);
550 $dd->setTightness($this->_tight_defn, $bot);
553 function _do_match (&$input, $m) {
554 $pos = $input->getPos();
556 $firstIndent = strspn($m->match, ' ');
557 $pat = sprintf('/\ {%d,%d}(?=\s*\S)/A', $firstIndent + 1, $firstIndent + 5);
560 $loose = $input->skipSpace();
561 $line = $input->currentLine();
563 if (!$line || !preg_match($pat, $line, $mm)) {
564 $input->setPos($pos);
565 return false; // No body found.
568 $indent = strlen($mm[0]);
569 $term = TransformInline(rtrim(substr(trim($m->match),0,-1)));
570 $defn = new TightSubBlock($input, sprintf("\\ {%d}", $indent), false, 'dd');
571 return array($term, $defn, $loose);
577 class Block_table_dl_defn extends XmlContent
582 function Block_table_dl_defn ($term, $defn) {
584 if (!is_array($defn))
585 $defn = $defn->getContent();
587 $this->_next_tight_top = false; // value irrelevant - gets fixed later
588 $this->_ncols = $this->_ComputeNcols($defn);
591 foreach ($defn as $item) {
592 if ($this->_IsASubtable($item))
593 $this->_addSubtable($item);
595 $this->_addToRow($item);
599 $th = HTML::th($term);
600 if ($this->_nrows > 1)
601 $th->setAttr('rowspan', $this->_nrows);
602 $this->_setTerm($th);
605 function setTightness($tight_top, $tight_bot) {
606 $this->_tight_top = $tight_top;
607 $this->_tight_bot = $tight_bot;
608 $first = &$this->firstTR();
609 $last = &$this->lastTR();
610 $first->setInClass('top', $tight_top);
611 $last->setInClass('bottom', $tight_bot);
614 function _addToRow ($item) {
615 if (empty($this->_accum)) {
616 $this->_accum = HTML::td();
617 if ($this->_ncols > 2)
618 $this->_accum->setAttr('colspan', $this->_ncols - 1);
620 $this->_accum->pushContent($item);
623 function _flushRow ($tight_bottom=false) {
624 if (!empty($this->_accum)) {
625 $row = new Block_HtmlElement('tr', false, $this->_accum);
627 $row->setTightness($this->_next_tight_top, $tight_bottom);
628 $this->_next_tight_top = $tight_bottom;
630 $this->pushContent($row);
631 $this->_accum = false;
636 function _addSubtable ($table) {
637 if (!($table_rows = $table->getContent()))
640 $this->_flushRow($table_rows[0]->_tight_top);
642 foreach ($table_rows as $subdef) {
643 $this->pushContent($subdef);
644 $this->_nrows += $subdef->nrows();
645 $this->_next_tight_top = $subdef->_tight_bot;
649 function _setTerm ($th) {
650 $first_row = &$this->_content[0];
651 if (isa($first_row, 'Block_table_dl_defn'))
652 $first_row->_setTerm($th);
654 $first_row->unshiftContent($th);
657 function _ComputeNcols ($defn) {
659 foreach ($defn as $item) {
660 if ($this->_IsASubtable($item)) {
661 $row = $this->_FirstDefn($item);
662 $ncols = max($ncols, $row->ncols() + 1);
668 function _IsASubtable ($item) {
669 return isa($item, 'HtmlElement')
670 && $item->getTag() == 'table'
671 && $item->getAttr('class') == 'wiki-dl-table';
674 function _FirstDefn ($subtable) {
675 $defs = $subtable->getContent();
680 return $this->_ncols;
684 return $this->_nrows;
687 function & firstTR() {
688 $first = &$this->_content[0];
689 if (isa($first, 'Block_table_dl_defn'))
690 return $first->firstTR();
694 function & lastTR() {
695 $last = &$this->_content[$this->_nrows - 1];
696 if (isa($last, 'Block_table_dl_defn'))
697 return $last->lastTR();
701 function setWidth ($ncols) {
702 assert($ncols >= $this->_ncols);
703 if ($ncols <= $this->_ncols)
705 $rows = &$this->_content;
706 for ($i = 0; $i < count($rows); $i++) {
708 if (isa($row, 'Block_table_dl_defn'))
709 $row->setWidth($ncols - 1);
711 $n = count($row->_content);
712 $lastcol = &$row->_content[$n - 1];
713 $lastcol->setAttr('colspan', $ncols - 1);
719 class Block_table_dl extends Block_dl
721 var $_tag = 'dl-table'; // phony.
723 function Block_table_dl() {
724 $this->_re = '\ {0,4} (?:\S.*)? (?<!'.ESCAPE_CHAR.') \| \s* $';
727 function _match (&$input, $m) {
728 if (!($p = $this->_do_match($input, $m)))
730 list ($term, $defn, $loose) = $p;
732 $this->_content[] = new Block_table_dl_defn($term, $defn);
736 function _setTightness($top, $bot) {
737 $this->_content[0]->setTightness($top, $bot);
742 $defs = &$this->_content;
745 foreach ($defs as $defn)
746 $ncols = max($ncols, $defn->ncols());
748 foreach ($defs as $key => $defn)
749 $defs[$key]->setWidth($ncols);
751 return HTML::table(array('class' => 'wiki-dl-table',
759 class Block_oldlists extends Block_list
761 //var $_tag = 'ol', 'ul', or 'dl';
762 var $_re = '(?: [*] (?! \S[^*]* (?<=\S) [*](?!\S) )
763 | [#] (?! \[ .*? \] )
767 function _match (&$input, $m) {
769 if (!preg_match('/[*#;]*$/A', $input->getPrefix())) {
775 $oldindent = '[*#;](?=[#*]|;.*:.*\S)';
776 $newindent = sprintf('\\ {%d}', strlen($prefix));
777 $indent = "(?:$oldindent|$newindent)";
779 $bullet = $prefix[0];
780 if ($bullet == '*') {
784 elseif ($bullet == '#') {
790 list ($term,) = explode(':', substr($prefix, 1), 2);
793 $this->_content[] = new Block_HtmlElement('dt', false,
794 TransformInline($term));
798 $this->_content[] = new TightSubBlock($input, $indent, $m->match, $itemtag);
802 function _setTightness($top, $bot) {
803 if (count($this->_content) == 1) {
804 $li = &$this->_content[0];
805 $li->setTightness($top, $bot);
808 assert(count($this->_content) == 2);
809 $dt = &$this->_content[0];
810 $dd = &$this->_content[1];
811 $dt->setTightness($top, false);
812 $dd->setTightness(false, $bot);
817 class Block_pre extends BlockMarkup
819 var $_re = '<(?:pre|verbatim)>';
821 function _match (&$input, $m) {
822 $endtag = '</' . substr($m->match, 1);
824 $pos = $input->getPos();
826 $line = $m->postmatch;
827 while (ltrim($line) != $endtag) {
829 if (($line = $input->nextLine()) === false) {
830 $input->setPos($pos);
836 $text = join("\n", $text);
838 // FIXME: no <img>, <big>, <small>, <sup>, or <sub>'s allowed
840 if ($m->match == '<pre>')
841 $text = TransformInline($text);
843 $this->_element = new Block_HtmlElement('pre', false, $text);
849 class Block_plugin extends Block_pre
851 var $_re = '<\?plugin(?:-form)?(?!\S)';
854 /* <?plugin Backlinks
858 function _match (&$input, $m) {
859 $pos = $input->getPos();
860 $pi = $m->match . $m->postmatch;
861 while (!preg_match('/(?<!'.ESCAPE_CHAR.')\?>\s*$/', $pi)) {
862 if (($line = $input->nextLine()) === false) {
863 $input->setPos($pos);
870 $this->_element = new Cached_PluginInvocation($pi);
875 class Block_email_blockquote extends BlockMarkup
877 var $_attr = array('class' => 'mail-style-quote');
880 function _match (&$input, $m) {
881 //$indent = str_replace(' ', '\\ ', $m->match) . '|>$';
882 $indent = $this->_re;
883 $this->_element = new SubBlock($input, $indent, $m->match,
884 'blockquote', $this->_attr);
889 class Block_hr extends BlockMarkup
891 var $_re = '-{4,}\s*$';
893 function _match (&$input, $m) {
895 $this->_element = new Block_HtmlElement('hr');
899 function _setTightness($top, $bot) {
900 // Don't tighten <hr/>s
904 class Block_heading extends BlockMarkup
908 function _match (&$input, $m) {
909 $tag = "h" . (5 - strlen($m->match));
910 $text = TransformInline(trim($m->postmatch));
913 $this->_element = new Block_HtmlElement($tag, false, $text);
918 function _setTightness($top, $bot) {
919 // Don't tighten headers.
923 class Block_p extends BlockMarkup
928 function _match (&$input, $m) {
929 $this->_text = $m->match;
934 function _setTightness ($top, $bot) {
935 $this->_tight_top = $top;
936 $this->_tight_bot = $bot;
/**
 * Merge a directly following paragraph into this one.
 *
 * Two paragraphs are joined only when the following block is also a
 * Block_p and no blank space separates them (this block is tight at
 * the bottom).  The merged paragraph takes over the bottom tightness
 * of the block it absorbed.
 *
 * @param $nextBlock object The block which follows this one.
 *
 * @return object This block when the merge happened, false otherwise.
 */
function merge ($nextBlock) {
    // get_class() yields a lower-cased name on PHP 4 but the declared
    // case ("Block_p") on PHP 5 and later, so normalise before comparing.
    $class = strtolower(get_class($nextBlock));
    if ($class == 'block_p' && $this->_tight_bot) {
        $this->_text .= "\n" . $nextBlock->_text;
        $this->_tight_bot = $nextBlock->_tight_bot;
        return $this;
    }
    return false;
}
950 $content = TransformInline(trim($this->_text));
951 $p = new Block_HtmlElement('p', false, $content);
952 $p->setTightness($this->_tight_top, $this->_tight_bot);
957 ////////////////////////////////////////////////////////////////
960 function TransformText ($text, $markup = 2.0, $basepage=false) {
961 if (isa($text, 'WikiDB_PageRevision')) {
963 $text = $rev->getPackedContent();
964 $markup = $rev->get('markup');
967 if (empty($markup) || $markup < 2.0) {
968 //include_once("lib/transform.php");
969 //return do_transform($text);
970 $text = ConvertOldMarkup($text);
973 // Expand leading tabs.
974 $text = expand_tabs($text);
978 $output = new WikiText($text);
981 // This is for immediate consumption.
982 // We must bind the contents to a base pagename so that
983 // relative page links can be properly linkified...
984 return new CacheableMarkup($output->getContent(), $basepage);
987 return new XmlContent($output->getContent());
990 // (c-file-style: "gnu")
995 // c-hanging-comment-ender-p: nil
996 // indent-tabs-mode: nil