2 rcs_id('$Id: HtmlParser.php,v 1.2 2004-10-19 13:23:06 rurban Exp $');
4 * HtmlParser Class: Conversion HTML => wikimarkup
5 * Requires XmlParser, XmlElement and the expat library
9 Copyright (C) 2004 Reini Urban
11 This file is part of PhpWiki.
13 PhpWiki is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2 of the License, or
16 (at your option) any later version.
18 PhpWiki is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with PhpWiki; if not, write to the Free Software
25 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 * Base class to implement html => wikitext converters,
30 * extendable for various wiki syntax versions.
31 * This is needed to be able to use htmlarea-alike editors,
32 * and to import HTML documents.
34 * See also php-html.sf.net for a php-only version, if
35 * you don't have the expat extension included.
36 * See also http://search.cpan.org/~diberri/HTML-WikiConverter/
40 // XmlParser contains the XML (expat) and url-grabber methods
41 require_once('lib/XmlParser.php');
46 var $dialect, $_handlers, $root;
/**
 * Sets up a parser that converts HTML into the markup of one wiki dialect.
 *
 * dialect: "PhpWiki2", "PhpWiki"
 * possible more dialects: MediaWiki, kwiki, c2
 *
 * The converter class is looked up as "HtmlParser_" . $dialect; an unknown
 * dialect raises an E_USER_ERROR. The dialect's handler table is shared by
 * reference, and the underlying XML parser is configured to keep tag case
 * and to skip whitespace.
 */
function HtmlParser($dialect = "PhpWiki2", $encoding = '') {
    $dialect_class = "HtmlParser_".$dialect;
    if (!class_exists($dialect_class)) {
        trigger_error(sprintf("unknown HtmlParser dialect %s",$dialect),E_USER_ERROR);
    } else {
        $this->dialect = new $dialect_class;
    }
    $this->_handlers =& $this->dialect->_handlers;

    $this->XmlParser($encoding);
    $parser_options = array(XML_OPTION_CASE_FOLDING => 0,
                            XML_OPTION_SKIP_WHITE   => 1);
    foreach ($parser_options as $option => $value) {
        xml_parser_set_option($this->_parser, $option, $value);
    }
}
65 // The three callbacks, called on walking through the HTML tree.
66 // No extensions needed from XmlParser.
68 function tag_open($parser, $name, $attrs='') {
70 function tag_close($parser, $name, $attrs='') {
72 function cdata($parser, $data) {
74 function parse_url($file, $debug=false)
78 if (is_null($this->root))
79 $this->root = $GLOBALS['xml_parser_root'];
80 $output = $this->wikify( $this->root );
/**
 * Converts one parsed node, and recursively its children, to wiki markup.
 *
 * Elements are dispatched on the dialect's handler table, keyed by tag name:
 *  - a string naming a method of the dialect: that method builds the output;
 *  - a non-empty array: its first and last entries wrap the converted children
 *    (a one-element array therefore uses the same string on both sides);
 *  - any other non-empty value: used as a literal prefix before the children
 *    (NOTE(review): inferred from the handler table usage - confirm);
 *  - no handler: the converted children are returned unwrapped.
 *
 * Anything that is not an XmlElement is treated as text. Runs of spaces are
 * collapsed unless the parent is a PRE element, and whitespace-only text
 * becomes the empty string.
 *
 * @param mixed  $node   XmlElement or text content
 * @param object $parent element containing $node, or null
 * @return string wiki markup
 */
function wikify ($node, $parent = null) {
    if (!isa($node, 'XmlElement')) {
        $text = is_scalar($node) ? (string) $node : '';
        // preg_replace() returns the new string; the result must be assigned.
        if ($parent and $parent->_tag != 'pre')
            $text = preg_replace("/ {2,}/", " ", $text);
        return (trim($text) == '') ? '' : $text;
    }

    $dialect =& $this->dialect;
    $tag = $node->_tag;
    // Tags without a registered handler fall through to plain child output.
    $conv = isset($dialect->_handlers[$tag]) ? $dialect->_handlers[$tag] : null;

    if (is_string($conv) and method_exists($dialect, $conv))
        return $dialect->$conv($node);

    $inner = '';
    foreach ($node->getContent() as $child) {
        $inner .= $this->wikify($child, $node);
    }

    if (is_array($conv) and count($conv) > 0)
        return $conv[0] . $inner . $conv[count($conv)-1];
    if (!is_array($conv) and !empty($conv))
        return $conv . $inner;
    return $inner;
}
/**
 * $output = $parser->elem_contents( $elem );
 *
 * Returns a wikified version of the contents of the specified
 * HTML element. This is done by passing each element of this
 * element's content list through the wikify() method, and
 * returning the concatenated result.
 *
 * Each child is converted with $node as its parent, so that wikify()
 * can tell when text sits directly inside a PRE element. A $node that
 * is not an XmlElement is converted on its own.
 *
 * @param mixed $node XmlElement or text content
 * @return string wiki markup
 */
function elem_contents($node) {
    if (!isa($node, 'XmlElement'))
        return $this->wikify($node);

    $output = '';
    foreach ($node->getContent() as $child) {
        $output .= $this->wikify($child, $node);
    }
    return $output;
}
// Private function: _elem_attr_str( $elem, @attrs )
//
// Returns a string containing a list of attribute names and
// values associated with the specified HTML element. Only
// attribute names included in @attrs will be added to the
// string of attributes that is returned. The return value
// is suitable for inserting into an HTML document, as
// attribute name/value pairs are specified in attr="value"
// format. Attribute names are lower-cased before they are
// compared and emitted; values are HTML-escaped so that a
// quote or ampersand inside a value cannot break the markup.
// An element without attributes yields the empty string.
function _elem_attr_str($node, $attrs) {
    $s = '';
    if (empty($node->_attr) or !is_array($node->_attr))
        return $s;

    foreach ($node->_attr as $attr => $val) {
        $attr = strtolower($attr);
        if (in_array($attr, $attrs))
            $s .= " $attr=\"" . htmlspecialchars((string) $val, ENT_COMPAT) . "\"";
    }
    return $s;
}
// Private function: _elem_has_ancestor( $elem, $tagname )
//
// Walks up the chain of ->parent links starting at $node and reports
// whether any element on the way carries the tag $tag. The node itself
// is not considered, only its ancestors.
function _elem_has_ancestor($node, $tag) {
    $cursor = $node;
    while (isset($cursor->parent)) {
        $cursor = $cursor->parent;
        if ($cursor->_tag == $tag)
            return true;
    }
    return false;
}
// Private function: _elem_is_image_div( $elem )
//
// Returns true if $elem is a container element (P or DIV) that holds
// nothing but an image.
//
// More specifically, returns true if the given element is a DIV or P
// element and the only child it contains is an IMG tag or an IMG tag
// contained within a sole A tag (not counting child elements with
// whitespace text only). Returns false for anything else, including
// a missing element.
function _elem_is_image_div( $node ) {
    // Return false if node is undefined or isn't a DIV or P at all
    if (!is_object($node) or !isset($node->_tag)
        or !in_array($node->_tag, array("div","p")))
        return false;

    // Descend at most two levels: container -> [A ->] IMG.
    $current = $node;
    for ($depth = 0; $depth < 2; $depth++) {
        // Text children consisting of whitespace only do not count.
        $significant = array();
        foreach ($current->getContent() as $child) {
            if (is_string($child) and trim($child) == '')
                continue;
            $significant[] = $child;
        }
        // Exactly one child is allowed, and it must be an element.
        if (count($significant) != 1 or !is_object($significant[0]))
            return false;

        $only = $significant[0];
        if ($only->_tag == 'img')
            return true;
        // A single A wrapper is accepted directly below the container only.
        if ($depth > 0 or $only->_tag != 'a')
            return false;
        $current = $only;
    }
    return false;
}
/**
 * Default conversion: preserves tags and content by delegating to
 * wikify_preserve().
 */
function wikify_default($node) {
    $preserved = $this->wikify_preserve($node);
    return $preserved;
}
/**
 * Preserves tags and content: the result is whatever the node's own
 * asXML() method returns.
 */
function wikify_preserve($node) {
    $xml = $node->asXML();
    return $xml;
}
/**
 * Logging hook used by the converters; this implementation discards
 * the message.
 */
function log($dummy) {
    return;
}
// Dialect converter for the "PhpWiki2" dialect name accepted by HtmlParser
// (the dialect class is looked up as "HtmlParser_" . $dialect).
216 class HtmlParser_PhpWiki2
// Constructor: registers the tag => handler entries that
// HtmlParser::wikify() reads from ->_handlers.
219 function HtmlParser_PhpWiki2() {
// Handler values as interpreted by wikify():
//  - a string naming a method of this class: the method converts the element;
//  - an array: its first and last entries are put before and after the
//    converted children, so a one-element array uses the same string twice.
231 'strong' => array( "*" ),
233 'em' => array( "_" ),
236 // PRE blocks are handled specially (see tidy_whitespace and
238 'pre' => array( "<pre>", "</pre>" ),
240 'dl' => array( '', "\n\n" ),
241 'dt' => array( ';', '' ),
242 'dd' => array( ':', '' ),
244 'p' => array( "\n\n", "\n\n" ),
245 'ul' => array( '', "\n" ),
246 'ol' => array( '', "\n" ),
// List items and tables need per-element logic, hence method handlers.
248 'li' => "wikify_list_item",
249 'table' => "wikify_table",
253 'div' => array( '', "\n\n" ),
254 'img' => "wikify_img",
255 'a' => "wikify_link",
// Empty prefix and suffix: the tag is dropped, its content is kept.
256 'span' => array( '', '' ),
265 'font' => array( '', '' ),
// wikify_default keeps the element as XML (see wikify_preserve).
266 'sup' => "wikify_default",
267 'sub' => "wikify_default",
268 'nowiki' => "wikify_verbatim",
269 'verbatim' => "wikify_default",
// Converts a TABLE element: the converted contents of the element are
// placed between a leading "| \n" and a trailing "|\n\n".
273 function wikify_table( $node ) {
275 return "| \n" . $this->elem_contents($node) . "|\n\n";
/**
 * Converts a TR element: starts a new line with "| " and appends the
 * converted contents of the row.
 */
function wikify_tr( $node ) {
    $row = $this->elem_contents($node);
    return "\n| " . $row;
}
/**
 * Converts a table header cell into "<ident>| <content> |\n".
 *
 * The optional $this->ident prefix is used when set. Leading whitespace
 * is removed from the converted cell content.
 *
 * NOTE(review): the listing this was restored from lacks the lines between
 * the whitespace stripping and the return; appending the content to the
 * cell is assumed - confirm against the original file.
 */
function wikify_th( $node ) {
    $ident = empty($this->ident) ? '' : $this->ident;
    // A valid PCRE pattern whose result is actually used; the former
    // "s/^\s+/" was a Perl-style leftover and its result was discarded.
    $content = preg_replace('/^\s+/', '', $this->elem_contents($node));
    return "$ident| $content |\n";
}
/**
 * Converts an LI element into one list line: "# item" inside an ordered
 * list, "* item" otherwise.
 *
 * The nearest enclosing OL or UL decides the marker, so a bullet list
 * nested in a numbered list still yields bullets.
 */
function wikify_list_item( $node ) {
    $marker = '*';
    $ancestor = $node;
    while (isset($ancestor->parent)) {
        $ancestor = $ancestor->parent;
        if ($ancestor->_tag == 'ol') {
            $marker = '#';
            break;
        }
        if ($ancestor->_tag == 'ul')
            break;
    }
    return $marker . " " . trim($this->elem_contents($node)) . "\n";
}
// Converts an A element into wiki link markup.
// The href is passed through absolute_url() and the link title is the
// trimmed, converted content of the element. The result is the bare URL
// when URL and title are equal, and "[ url | title ]" otherwise.
294 function wikify_link( $node ) {
295 $url = $this->absolute_url( $node->getAttr('href') );
296 $title = $this->elem_contents($node);
298 $title = trim($title);
300 // Just return the link title if this tag is contained
301 // within a header tag (parent tag is "h" followed by one digit)
// NOTE(review): the statement guarded by this "if" is absent from this listing.
302 if (isset($node->parent) and preg_match('/^h\d$/',$node->parent->_tag))
305 // Return if this is a link to an image contained within
// a DIV or P that holds nothing but that image (see _elem_is_image_div)
// NOTE(review): the statement guarded by this "if" is absent from this listing.
306 if (isset($node->parent) and $this->_elem_is_image_div($node->parent))
309 // If HREF is the same as the link title, then
310 // just return the URL (it'll be converted into
311 // a clickable link by the wiki engine)
312 if ($url == $title) return $url;
313 return "[ $url | $title ]";
// Converts a heading element into a line of "!" markers followed by the
// trimmed, converted heading text and a blank line.
316 function wikify_h( $node ) {
// The level is whatever follows the first character of the tag name.
317 $level = substr($node->_tag,1);
// 4 - level markers: "!!!" for level 1, "!!" for level 2, "!" for level 3.
// NOTE(review): for level 4 and above the repeat count is zero or negative;
// lines are missing from this listing here - confirm how that is handled.
319 $markup = str_repeat('!',4 - $level);
323 return $markup.' '.trim($this->elem_contents($node))."\n\n";
/**
 * Wraps the converted contents of the element in a <verbatim> block,
 * preceded by a newline.
 */
function wikify_verbatim( $node ) {
    $body = $this->elem_contents( $node );
    return "\n<verbatim>\n" . $body . "\n</verbatim>";
}
/**
 * Converts an IMG element into "[ file attr=value ... ]" image markup.
 *
 * Attributes carried over:
 *  - align: the IMG's own ALIGN attribute, or else a "float" found in the
 *    style or class of an enclosing image-only DIV/P (added once);
 *  - width / height: only when they are plain numbers that differ from the
 *    real image size; when both differ they are merged into size=WxH;
 *  - alt.
 *
 * A WIDTH attribute makes this method call getimagesize() on the image URL
 * to learn the real dimensions; if that fails, the real size is treated as
 * unknown and numeric width/height attributes are kept.
 *
 * NOTE(review): height is only examined when a WIDTH attribute is present;
 * this nesting was restored from an incomplete listing - confirm.
 */
function wikify_img( $node ) {
    $image_url = $this->absolute_url( $node->getAttr('src') );
    $file = basename( $image_url );
    $alignment = $node->getAttr('align');
    $this->log( "Processing IMG tag for SRC: ".$image_url."..." );

    // Grab attributes to be added to the [ Image ] markup (since 1.3.10)
    $attrs = array();

    // The image container is either the direct parent or, for a linked
    // image, the parent of the enclosing A element.
    $parent = isset($node->parent) ? $node->parent : null;
    $grandparent = ($parent and isset($parent->parent)) ? $parent->parent : null;
    $image_div = null;
    if ($this->_elem_is_image_div( $parent ))
        $image_div = $parent;
    elseif ($this->_elem_is_image_div( $grandparent ))
        $image_div = $grandparent;

    if ( !$alignment and $image_div ) {
        $css_style = (string) $image_div->getAttr('style');
        $css_class = (string) $image_div->getAttr('class');

        // float => align: Check for float attribute; if it's there,
        // then we'll add it to the [Image] syntax
        if (preg_match("/float\:\s*(right|left)/i",$css_style,$m))
            $alignment = $m[1];
        elseif (preg_match("/float(right|left)/i",$css_class,$m))
            $alignment = $m[1];

        if ($alignment) {
            $this->log( " Image is contained within a DIV that specifies $alignment alignment" );
            $this->log( " Adding '$alignment' to [Image] markup attributes" );
        } else {
            $this->log( " Image is not contained within a DIV for alignment" );
        }
    } else {
        $this->log( " Image is not contained within a DIV" );
    }
    // Single place where align is added, whatever its origin.
    if ($alignment)
        $attrs[] = "align=$alignment";

    // Check if we need to request a thumbnail of this
    // image; it's needed if the specified width attribute
    // differs from the default size of the image
    if( $width = $node->getAttr('width') ) {
        $this->log( " Image has WIDTH attribute of $width" );
        $this->log( " Checking whether resulting [Image] markup should specify a thumbnail..." );

        // Download the image from the network to read its real size
        $abs_url = $this->absolute_url( $node->getAttr('src') );
        $this->log( " Fetching image '$abs_url' from the network" );
        $size = getimagesize( $abs_url );
        $actual_w = is_array($size) ? $size[0] : null;
        $actual_h = is_array($size) ? $size[1] : null;

        // If the WIDTH attribute of the IMG tag is not equal
        // to the actual width of the image, then we need to
        // create a thumbnail
        $width_added = false;
        if( preg_match("/^\d+$/",$width) and $width != $actual_w ) {
            $this->log( " IMG tag's WIDTH attribute ($width) differs from actual width of image ($actual_w)" );
            $this->log( " -- that means we're going to need a thumbnail" );
            $this->log( " Adding 'width' to list of attributes for [Image] markup" );
            $attrs[] = "width=$width";
            $width_added = true;
        }

        $height = (string) $node->getAttr('height');
        if( preg_match("/^\d+$/",$height) and $height != $actual_h ) {
            $this->log( " IMG tag's HEIGHT attribute ($height) differs from actual height of image ($actual_h)" );
            $this->log( " -- that means we're going to need a thumbnail" );
            $this->log( " Adding 'height' to list of attributes for [Image] markup" );
            if ($width_added) {
                // Replace the width entry just added by a combined size.
                array_pop($attrs);
                $attrs[] = "size=".$width."x".$height;
            } else {
                $attrs[] = "height=$height";
            }
        }
    }

    if ($alt = $node->getAttr('alt')) {
        $this->log( " Adding alternate text '$alt' to [Image] markup" );
        $attrs[] = "alt=$alt";
    }

    $attr_str = join(' ', $attrs);
    $this->log( "...done processing IMG tag\n" );
    return "[ $file $attr_str ]";
}
412 // $Log: not supported by cvs2svn $
413 // Revision 1.1 2004/05/24 17:31:31 rurban
414 // new XmlParser and HtmlParser, RssParser based on that.
423 // c-hanging-comment-ender-p: nil
424 // indent-tabs-mode: nil