4 * HtmlParser Class: Conversion HTML => wikimarkup
5 * Requires XmlParser, XmlElement and the expat (or now the libxml) library. This is all in core.
9 * Copyright (C) 2004 Reini Urban
11 * This file is part of PhpWiki.
13 * PhpWiki is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * PhpWiki is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License along
24 * with PhpWiki; if not, write to the Free Software Foundation, Inc.,
25 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
29 * Base class to implement html => wikitext converters,
30 * extendable for various wiki syntax versions.
31 * This is needed to be able to use htmlarea-alike editors,
32 * and to import XML or HTML documents.
34 * See also php-html.sf.net for a php-only version, if
35 * you don't have the expat/libxml extension included.
36 * See also http://search.cpan.org/~diberri/HTML-WikiConverter/
40 // RssParser contains the XML (expat) and url-grabber methods
41 require_once 'lib/XmlParser.php';
46 public $dialect, $_handlers, $root;
49 * dialect: "PhpWiki2", "PhpWiki"
50 * possible further dialects: MediaWiki, kwiki, c2
/**
 * Old-style constructor: instantiates the dialect object and sets up the
 * underlying XML parser.
 *
 * @param string $dialect  Suffix of the "HtmlParser_<dialect>" class to use.
 * @param string $encoding Handed on unchanged to XmlParser().
 */
function HtmlParser($dialect = "PhpWiki2", $encoding = '')
{
    $dialectClass = "HtmlParser_" . $dialect;
    if (!class_exists($dialectClass)) {
        trigger_error(sprintf("unknown HtmlParser dialect %s", $dialect), E_USER_ERROR);
    } else {
        $this->dialect = new $dialectClass;
    }

    // The parser shares the dialect's handler table by reference.
    $this->_handlers =& $this->dialect->_handlers;
    $this->XmlParser($encoding);

    // Keep tag names as written and skip whitespace-only values.
    $parserOptions = array(
        XML_OPTION_CASE_FOLDING => 0,
        XML_OPTION_SKIP_WHITE => 1,
    );
    foreach ($parserOptions as $option => $value) {
        xml_parser_set_option($this->_parser, $option, $value);
    }
}
66 // The three callbacks, called on walking through the HTML tree.
67 // No extensions needed from XmlParser.
// Callback signatures for element start, element end and character data.
// No statements are visible inside any of the three bodies.
// NOTE(review): the comment preceding these says no extensions are needed
// from XmlParser -- confirm whether these are meant to be live (empty)
// overrides or are disabled in the complete file.
69 function tag_open($parser, $name, $attrs='') {
71 function tag_close($parser, $name, $attrs='') {
73 function cdata($parser, $data) {
// Uses $GLOBALS['xml_parser_root'] as the tree root when $this->root is null,
// then converts that root to wiki markup with wikify().
// NOTE(review): neither $file nor $debug is referenced by the visible
// statements -- confirm that these statements really belong to parse_url().
75 function parse_url($file, $debug=false)
80 if (is_null($this->root))
81 $this->root = $GLOBALS['xml_parser_root'];
82 $output = $this->wikify($this->root);
/**
 * Recursively converts a parsed node into wiki markup.
 *
 * For an XmlElement the handler registered for its tag in the dialect's
 * $_handlers table decides the output:
 *  - a string naming a dialect method: that method renders the node;
 *  - an array: the rendered children are wrapped between the first and
 *    the last array element;
 *  - any other non-empty value: it is emitted in front of the rendered
 *    children;
 *  - nothing registered: only the rendered children are emitted.
 * Any other node is treated as text.
 *
 * @param mixed $node   XmlElement or text node.
 * @param mixed $parent Element containing $node, or null.
 * @return string Wiki markup for $node.
 */
function wikify($node, $parent = null)
{
    $output = '';
    if (isa($node, 'XmlElement')) {
        $dialect =& $this->dialect;
        // Tags without a registered handler must not trigger an
        // undefined-index notice; they fall through to the last branch.
        $conv = isset($dialect->_handlers[$node->_tag])
            ? $dialect->_handlers[$node->_tag]
            : null;

        if (is_string($conv) and method_exists($dialect, $conv)) {
            $output = $dialect->$conv($node);
        } elseif (is_array($conv)) {
            foreach ($node->getContent() as $n) {
                $output .= $this->wikify($n, $node);
            }
            // A one-element array uses the same marker on both sides.
            $output = $conv[0] . $output . $conv[count($conv) - 1];
        } elseif (!empty($conv)) {
            // Literal replacement markup, followed by any children.
            $output = $conv;
            foreach ($node->getContent() as $n) {
                $output .= $this->wikify($n, $node);
            }
        } else {
            foreach ($node->getContent() as $n) {
                $output .= $this->wikify($n, $node);
            }
        }
    } else {
        $output = $node;
        // Collapse runs of spaces everywhere except directly inside <pre>.
        // The result of preg_replace() has to be assigned: it does not
        // modify its subject in place.
        if ($parent and $parent->_tag != 'pre')
            $output = preg_replace("/ {2,}/", " ", $output);
        if (trim($output) == '')
            $output = '';
    }
    return $output;
}
120 * $output = $parser->elem_contents( $elem );
121 * Returns a wikified version of the contents of the specified
122 * HTML element. This is done by passing each element of this
123 * element's content list through the C<wikify()> method, and
124 * returning the concatenated result.
/**
 * Returns the wikified contents of $node.
 *
 * For an XmlElement every child is passed through wikify() and the
 * results are concatenated; anything else is wikified directly.
 *
 * @param mixed $node XmlElement or text node.
 * @return string
 */
function elem_contents($node)
{
    $output = '';
    if (isa($node, 'XmlElement')) {
        foreach ($node->getContent() as $child) {
            // $node itself is the parent of its children (the same
            // convention wikify() uses when it recurses).
            $output .= $this->wikify($child, $node);
        }
    } else {
        // Wikify the node that was passed in; the previous code referred
        // to an undefined variable here.
        $output = $this->wikify($node);
    }
    return $output;
}
140 // Private function: _elem_attr_str( $elem, @attrs )
142 // Returns a string containing a list of attribute names and
143 // values associated with the specified HTML element. Only
144 // attribute names included in @attrs will be added to the
145 // string of attributes that is returned. The return value
146 // is suitable for inserting into an HTML document, as
147 // attribute name/value pairs are specified in attr="value"
/**
 * Builds a ' name="value"' string from the attributes of $node whose
 * lower-cased name is listed in $attrs. Attribute order follows $node->_attr.
 *
 * @param object $node  Element exposing an $_attr name => value array.
 * @param array  $attrs Lower-case attribute names to include.
 * @return string Concatenated attribute pairs, or '' when none match.
 */
function _elem_attr_str($node, $attrs)
{
    $pieces = array();
    foreach ($node->_attr as $attr => $val) {
        $attr = strtolower($attr);
        if (!in_array($attr, $attrs)) {
            continue;
        }
        $pieces[] = " $attr=\"$val\"";
    }
    return implode('', $pieces);
}
162 // Private function: _elem_has_ancestor( $elem, $tagname )
164 // Returns true if the specified HtmlElement has an ancestor element
165 // whose element tag equals $tag. This is useful for determining if
166 // an element belongs to the specified tag.
/**
 * Tells whether any ancestor of $node carries the tag $tag.
 *
 * @param object $node Element whose ->parent chain is followed.
 * @param string $tag  Tag name to look for.
 * @return bool True as soon as a matching ancestor is found.
 */
function _elem_has_ancestor($node, $tag)
{
    // Walk up the parent chain instead of recursing.
    for ($current = $node; isset($current->parent); $current = $current->parent) {
        if ($current->parent->_tag == $tag) {
            return true;
        }
    }
    return false;
}
178 // Private function: _elem_is_image_div( $elem )
180 // Returns true if $elem is a container element (P or DIV) meant only to
183 // More specifically, returns true if the given element is a DIV or P
184 // element and the only child it contains is an IMG tag or an IMG tag
185 // contained within a sole A tag (not counting child elements with
186 // whitespace text only).
/**
 * Tells whether $node is a DIV or P whose only significant child is an
 * IMG, or an A that in turn contains only an IMG.
 *
 * Whitespace-only text children are ignored, as the description of this
 * helper promises; text children are never dereferenced as elements.
 *
 * @param mixed $node Element to test; null and non-objects yield false.
 * @return bool
 */
function _elem_is_image_div($node)
{
    // Return false if node is undefined or isn't a DIV/P at all
    if (!is_object($node) or !in_array($node->_tag, array("div", "p")))
        return false;

    // Level 0 inspects the container itself; level 1 inspects a sole A child.
    $container = $node;
    for ($level = 0; $level < 2; $level++) {
        $sole = null;
        $significant = 0;
        foreach ($container->getContent() as $child) {
            if (is_string($child) and trim($child) == '')
                continue; // whitespace-only text does not count
            $significant++;
            $sole = $child;
        }
        if ($significant != 1 or !is_object($sole))
            return false;
        if ($sole->_tag == 'img')
            return true;
        if ($level == 0 and $sole->_tag == 'a') {
            $container = $sole;
            continue;
        }
        return false;
    }
    return false;
}
206 /** preserves tags and content
/**
 * Default handler: keeps the element unchanged by delegating to
 * wikify_preserve().
 *
 * @param object $node Element to render.
 * @return string
 */
function wikify_default($node)
{
    $preserved = $this->wikify_preserve($node);
    return $preserved;
}
213 /** preserves tags and content
/**
 * Renders the element through its own asXML() method, keeping both the
 * tags and the content.
 *
 * @param object $node Element exposing asXML().
 * @return string Whatever $node->asXML() returns.
 */
function wikify_preserve($node)
{
    $xml = $node->asXML();
    return $xml;
}
225 class HtmlParser_PhpWiki2
// Constructor of the PhpWiki2 dialect. The visible lines are entries of a
// tag name => handler table. As consumed by HtmlParser::wikify(), an array
// entry supplies the markup placed before and after the rendered children
// (a one-element array is used on both sides), and a string entry naming a
// method of this dialect makes that method render the element.
// NOTE(review): 'nowiki' maps to wikify_verbatim while 'verbatim' maps to
// wikify_default -- confirm that this pairing is intentional.
228 function HtmlParser_PhpWiki2()
241 'strong' => array("*"),
246 // PRE blocks are handled specially (see tidy_whitespace and
248 'pre' => array("<pre>", "</pre>"),
250 'dl' => array('', "\n\n"),
251 'dt' => array(';', ''),
252 'dd' => array(':', ''),
254 'p' => array("\n\n", "\n\n"),
255 'ul' => array('', "\n"),
256 'ol' => array('', "\n"),
258 'li' => "wikify_list_item",
259 'table' => "wikify_table",
263 'div' => array('', "\n\n"),
264 'img' => "wikify_img",
265 'a' => "wikify_link",
266 'span' => array('', ''),
275 'font' => array('', ''),
276 'sup' => "wikify_default",
277 'sub' => "wikify_default",
278 'nowiki' => "wikify_verbatim",
279 'verbatim' => "wikify_default",
280 'noinclude' => "wikify_noinclude",
// Renders a table element: the wikified children are placed between a
// leading "| \n" and a trailing "|\n\n".
284 function wikify_table($node)
287 return "| \n" . $this->elem_contents($node) . "|\n\n";
/**
 * Renders a table row: a newline and "| " followed by the wikified cells.
 *
 * @param object $node The row element.
 * @return string
 */
function wikify_tr($node)
{
    $cells = $this->elem_contents($node);
    return "\n| " . $cells;
}
// Renders a table header cell. The line starts with the current
// $this->ident prefix (empty when unset) followed by "| ", and the value
// returned is "$output |\n".
// NOTE(review): the preg_replace() call below cannot work as written. The
// pattern "s/^\s+/" begins with the letter "s", which PCRE does not accept
// as a delimiter, and the return value is discarded anyway, so $content is
// never stripped of leading whitespace. Likely intent:
// $content = preg_replace('/^\s+/', '', $content); -- confirm.
295 function wikify_th($node)
297 $ident = empty($this->ident) ? '' : $this->ident;
298 $output = "$ident| ";
299 $content = $this->elem_contents($node);
300 preg_replace("s/^\s+/", "", $content);
303 return "$output |\n";
/**
 * Renders a list item as one line of wiki markup.
 *
 * Items of an ordered list get "#", all other items get "*". The nearest
 * enclosing list decides, so a bullet list nested inside a numbered list
 * still produces bullets. (The markers were previously swapped.)
 *
 * @param object $node The LI element.
 * @return string Marker, a space, the trimmed item text and a newline.
 */
function wikify_list_item($node)
{
    $marker = '*';
    for ($current = $node; isset($current->parent); $current = $current->parent) {
        $listTag = $current->parent->_tag;
        if ($listTag == 'ol') {
            $marker = '#';
            break;
        }
        if ($listTag == 'ul') {
            break;
        }
    }
    return $marker . " " . trim($this->elem_contents($node)) . "\n";
}
// Renders an A element. $url is the href attribute passed through
// absolute_url(); $title is the trimmed, wikified content of the link.
// When $url equals $title the bare URL is returned, otherwise the
// "[ url | title ]" form.
// NOTE(review): the statements executed by the two parent checks (link
// inside a header, link inside an image container) are not visible here.
// NOTE(review): $url == $title is a loose comparison -- confirm that this
// is acceptable for numeric-looking strings.
311 function wikify_link($node)
313 $url = $this->absolute_url($node->getAttr('href'));
314 $title = $this->elem_contents($node);
316 $title = trim($title);
318 // Just return the link title if this tag is contained
319 // within an header tag
320 if (isset($node->parent) and preg_match('/^h\d$/', $node->parent->_tag))
323 // Return if this is a link to an image contained within
324 if (isset($node->parent) and $this->_elem_is_image_div($node->parent))
327 // If HREF is the same as the link title, then
328 // just return the URL (it'll be converted into
329 // a clickable link by the wiki engine)
330 if ($url == $title) return $url;
331 return "[ $url | $title ]";
// Renders a heading. $level is the tag name without its first character,
// the markup is "!" repeated (4 - $level) times, and the result is the
// markup, a space, the trimmed heading text and a blank line.
// NOTE(review): for levels above 4 the repeat count is negative, which
// str_repeat() rejects -- confirm that the surrounding (not visible)
// statements guard against that.
334 function wikify_h($node)
336 $level = substr($node->_tag, 1);
338 $markup = str_repeat('!', 4 - $level);
342 return $markup . ' ' . trim($this->elem_contents($node)) . "\n\n";
/**
 * Wraps the wikified contents of $node in a <verbatim> block that starts
 * on a new line.
 *
 * @param object $node Element whose contents are wrapped.
 * @return string
 */
function wikify_verbatim($node)
{
    $contents = $this->elem_contents($node);
    $wrapped = "\n<verbatim>\n$contents\n</verbatim>";
    return $wrapped;
}
/**
 * Drops the element's own tags and returns only its wikified contents.
 *
 * @param object $node Element to unwrap.
 * @return string
 */
function wikify_noinclude($node)
{
    $inner = $this->elem_contents($node);
    return $inner;
}
/**
 * Converts an IMG element into "[ file attr=value ... ]" markup.
 *
 * Attributes emitted, in this order:
 *  - align:  the IMG's own align attribute, or else a float direction
 *            taken from the style/class of a wrapping image container
 *            (the parent, or the grandparent when the IMG sits in an A);
 *  - width / height / size: only when the numeric attribute differs from
 *            the real image dimension; "size=WxH" when both differ;
 *  - alt:    the alternate text, when present.
 *
 * Fixes over the previous version: a stray ";" made the class-based float
 * match unconditional, the height comparison and the size replacement
 * used undefined variables, "align=" was emitted twice for container
 * alignment, and a failed getimagesize() was destructured unchecked.
 *
 * @param object $node The IMG element.
 * @return string
 */
function wikify_img($node)
{
    $image_url = $this->absolute_url($node->getAttr('src'));
    $file = basename($image_url);
    $alignment = $node->getAttr('align');
    $this->log("Processing IMG tag for SRC: " . $image_url . "...");

    // Grab attributes to be added to the [ Image ] markup (since 1.3.10)
    $attrs = array();

    // Find the wrapping image container, if there is one.
    $image_div = null;
    $parent = isset($node->parent) ? $node->parent : null;
    if ($parent and $this->_elem_is_image_div($parent))
        $image_div = $parent;
    elseif ($parent and isset($parent->parent) and $this->_elem_is_image_div($parent->parent))
        $image_div = $parent->parent;

    if (!$alignment and $image_div) {
        $css_style = (string)$image_div->getAttr('style');
        $css_class = (string)$image_div->getAttr('class');

        // float => align: Check for float attribute; if it's there,
        // then we'll add it to the [Image] syntax
        if (preg_match("/float\:\s*(right|left)/i", $css_style, $m))
            $alignment = $m[1];
        elseif (preg_match("/float(right|left)/i", $css_class, $m))
            $alignment = $m[1];

        if ($alignment) {
            $this->log("  Image is contained within a DIV that specifies $alignment alignment");
            $this->log("  Adding '$alignment' to [Image] markup attributes");
        } else {
            $this->log("  Image is not contained within a DIV for alignment");
        }
    } else {
        $this->log("  Image is not contained within a DIV");
    }
    // Emit the alignment exactly once, whichever source it came from.
    if ($alignment)
        $attrs[] = "align=$alignment";

    // Check if we need to request a thumbnail of this
    // image; it's needed if the specified width attribute
    // differs from the default size of the image
    if ($width = $node->getAttr('width')) {
        $this->log("  Image has WIDTH attribute of $width");
        $this->log("  Checking whether resulting [Image] markup should specify a thumbnail...");

        // Fetch the image to learn its real dimensions.
        $abs_url = $image_url;
        $this->log("    Fetching image '$abs_url' from the network");
        $size = getimagesize($abs_url);
        // getimagesize() returns false on failure; the real dimensions
        // are then unknown and never equal the requested ones.
        $actual_w = is_array($size) ? $size[0] : null;
        $actual_h = is_array($size) ? $size[1] : null;

        // If the WIDTH attribute of the IMG tag is not equal
        // to the actual width of the image, then we need to
        // create a thumbnail
        $width_added = false;
        if (preg_match("/^\d+$/", $width) and $width != $actual_w) {
            $this->log("    IMG tag's WIDTH attribute ($width) differs from actual width of image ($actual_w)");
            $this->log("      -- that means we're going to need a thumbnail");
            $this->log("    Adding 'width' to list of attributes for [Image] markup");
            $attrs[] = "width=$width";
            $width_added = true;
        }

        $height = (string)$node->getAttr('height');
        if (preg_match("/^\d+$/", $height) and $height != $actual_h) {
            $this->log("    IMG tag's HEIGHT attribute ($height) differs from actual height of image ($actual_h)");
            $this->log("      -- that means we're going to need a thumbnail");
            $this->log("    Adding 'height' to list of attributes for [Image] markup");
            if ($width_added)
                // Fold the width entry just added into a single size entry.
                $attrs[count($attrs) - 1] = "size=" . $width . "x" . $height;
            else
                $attrs[] = "height=$height";
        }
    }

    if ($alt = $node->getAttr('alt')) {
        $this->log("  Adding alternate text '$alt' to [Image] markup");
        $attrs[] = "alt=$alt";
    }

    $attr_str = implode(' ', $attrs);
    $this->log("...done processing IMG tag\n");
    return "[ $file $attr_str ]";
}
442 // c-hanging-comment-ender-p: nil
443 // indent-tabs-mode: nil