4 * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
5 * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts.
8 * Recent changes to PHP's DOM extension have resulted in some fatal
9 * error conditions with the original version of PH5P. Pending changes,
10 * this lexer will punt to DirectLex if DOM throws an exception.
13 class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex {
15 public function tokenizeHTML($html, $config, $context) {
16 $new_html = $this->normalize($html, $config, $context);
17 $new_html = $this->wrapHTML($new_html, $config, $context);
19 $parser = new HTML5($new_html);
20 $doc = $parser->save();
21 } catch (DOMException $e) {
22 // Uh oh, it failed. Punt to DirectLex.
23 $lexer = new HTMLPurifier_Lexer_DirectLex();
24 $context->register('PH5PError', $e); // save the error, so we can detect it
25 return $lexer->tokenizeHTML($html, $config, $context); // use original HTML
29 $doc->getElementsByTagName('html')->item(0)-> // <html>
30 getElementsByTagName('body')->item(0)-> // <body>
31 getElementsByTagName('div')->item(0) // <div>
40 Copyright 2007 Jeroen van der Meer <http://jero.net/>
42 Permission is hereby granted, free of charge, to any person obtaining a
43 copy of this software and associated documentation files (the
44 "Software"), to deal in the Software without restriction, including
45 without limitation the rights to use, copy, modify, merge, publish,
46 distribute, sublicense, and/or sell copies of the Software, and to
47 permit persons to whom the Software is furnished to do so, subject to
48 the following conditions:
50 The above copyright notice and this permission notice shall be included
51 in all copies or substantial portions of the Software.
53 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
54 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
55 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
56 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
57 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
58 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
59 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
70 private $content_model;
71 private $escape = false;
72 private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
73 'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
74 'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
75 'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
76 'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
77 'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
78 'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
79 'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
80 'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
81 'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
82 'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
83 'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
84 'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
85 'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
86 'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
87 'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
88 'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
89 'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
90 'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
91 'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
92 'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
93 'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
94 'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
95 'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
96 'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
97 'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
98 'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
99 'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
100 'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
101 'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
102 'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
103 'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
104 'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
105 'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
106 'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
107 'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
108 'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
109 'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
110 'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
111 'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
112 'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
113 'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
127 public function __construct($data) {
131 $this->EOF = strlen($data);
132 $this->tree = new HTML5TreeConstructer;
133 $this->content_model = self::PCDATA;
135 $this->state = 'data';
137 while($this->state !== null) {
138 $this->{$this->state.'State'}();
142 public function save() {
143 return $this->tree->save();
146 private function char() {
147 return ($this->char < $this->EOF)
148 ? $this->data[$this->char]
152 private function character($s, $l = 0) {
153 if($s + $l < $this->EOF) {
155 return $this->data[$s];
157 return substr($this->data, $s, $l);
162 private function characters($char_class, $start) {
163 return preg_replace('#^(['.$char_class.']+).*#s', '\\1', substr($this->data, $start));
166 private function dataState() {
167 // Consume the next input character
169 $char = $this->char();
171 if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
172 /* U+0026 AMPERSAND (&)
173 When the content model flag is set to one of the PCDATA or RCDATA
174 states: switch to the entity data state. Otherwise: treat it as per
175 the "anything else" entry below. */
176 $this->state = 'entityData';
178 } elseif($char === '-') {
179 /* If the content model flag is set to either the RCDATA state or
180 the CDATA state, and the escape flag is false, and there are at
181 least three characters before this one in the input stream, and the
182 last four characters in the input stream, including this one, are
183 U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
184 and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
185 if(($this->content_model === self::RCDATA || $this->content_model ===
186 self::CDATA) && $this->escape === false &&
187 $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--') {
188 $this->escape = true;
191 /* In any case, emit the input character as a character token. Stay
192 in the data state. */
193 $this->emitToken(array(
194 'type' => self::CHARACTR,
198 /* U+003C LESS-THAN SIGN (<) */
199 } elseif($char === '<' && ($this->content_model === self::PCDATA ||
200 (($this->content_model === self::RCDATA ||
201 $this->content_model === self::CDATA) && $this->escape === false))) {
202 /* When the content model flag is set to the PCDATA state: switch
203 to the tag open state.
205 When the content model flag is set to either the RCDATA state or
206 the CDATA state and the escape flag is false: switch to the tag
209 Otherwise: treat it as per the "anything else" entry below. */
210 $this->state = 'tagOpen';
212 /* U+003E GREATER-THAN SIGN (>) */
213 } elseif($char === '>') {
214 /* If the content model flag is set to either the RCDATA state or
215 the CDATA state, and the escape flag is true, and the last three
216 characters in the input stream including this one are U+002D
217 HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
218 set the escape flag to false. */
219 if(($this->content_model === self::RCDATA ||
220 $this->content_model === self::CDATA) && $this->escape === true &&
221 $this->character($this->char, 3) === '-->') {
222 $this->escape = false;
225 /* In any case, emit the input character as a character token.
226 Stay in the data state. */
227 $this->emitToken(array(
228 'type' => self::CHARACTR,
232 } elseif($this->char === $this->EOF) {
234 Emit an end-of-file token. */
237 } elseif($this->content_model === self::PLAINTEXT) {
238 /* When the content model flag is set to the PLAINTEXT state
239 THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
240 the text and emit it as a character token. */
241 $this->emitToken(array(
242 'type' => self::CHARACTR,
243 'data' => substr($this->data, $this->char)
250 THIS DIFFERS GREATLY FROM THE SPEC: Get as many characters as
251 otherwise would also be treated as a character token and emit it
252 as a single character token. Stay in the data state. */
253 $len = strcspn($this->data, '<&', $this->char);
254 $char = substr($this->data, $this->char, $len);
255 $this->char += $len - 1;
257 $this->emitToken(array(
258 'type' => self::CHARACTR,
262 $this->state = 'data';
266 private function entityDataState() {
267 // Attempt to consume an entity.
268 $entity = $this->entity();
270 // If nothing is returned, emit a U+0026 AMPERSAND character token.
271 // Otherwise, emit the character token that was returned.
272 $char = (!$entity) ? '&' : $entity;
273 $this->emitToken(array(
274 'type' => self::CHARACTR,
278 // Finally, switch to the data state.
279 $this->state = 'data';
282 private function tagOpenState() {
283 switch($this->content_model) {
286 /* If the next input character is a U+002F SOLIDUS (/) character,
287 consume it and switch to the close tag open state. If the next
288 input character is not a U+002F SOLIDUS (/) character, emit a
289 U+003C LESS-THAN SIGN character token and switch to the data
290 state to process the next input character. */
291 if($this->character($this->char + 1) === '/') {
293 $this->state = 'closeTagOpen';
296 $this->emitToken(array(
297 'type' => self::CHARACTR,
301 $this->state = 'data';
306 // If the content model flag is set to the PCDATA state
307 // Consume the next input character:
309 $char = $this->char();
312 /* U+0021 EXCLAMATION MARK (!)
313 Switch to the markup declaration open state. */
314 $this->state = 'markupDeclarationOpen';
316 } elseif($char === '/') {
317 /* U+002F SOLIDUS (/)
318 Switch to the close tag open state. */
319 $this->state = 'closeTagOpen';
321 } elseif(preg_match('/^[A-Za-z]$/', $char)) {
322 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
323 Create a new start tag token, set its tag name to the lowercase
324 version of the input character (add 0x0020 to the character's code
325 point), then switch to the tag name state. (Don't emit the token
326 yet; further details will be filled in before it is emitted.) */
327 $this->token = array(
328 'name' => strtolower($char),
329 'type' => self::STARTTAG,
333 $this->state = 'tagName';
335 } elseif($char === '>') {
336 /* U+003E GREATER-THAN SIGN (>)
337 Parse error. Emit a U+003C LESS-THAN SIGN character token and a
338 U+003E GREATER-THAN SIGN character token. Switch to the data state. */
339 $this->emitToken(array(
340 'type' => self::CHARACTR,
344 $this->state = 'data';
346 } elseif($char === '?') {
347 /* U+003F QUESTION MARK (?)
348 Parse error. Switch to the bogus comment state. */
349 $this->state = 'bogusComment';
353 Parse error. Emit a U+003C LESS-THAN SIGN character token and
354 reconsume the current input character in the data state. */
355 $this->emitToken(array(
356 'type' => self::CHARACTR,
361 $this->state = 'data';
367 private function closeTagOpenState() {
368 $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
369 $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
371 if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
372 (!$the_same || ($the_same && (!preg_match('/[\t\n\x0b\x0c >\/]/',
373 $this->character($this->char + 1 + strlen($next_node))) || $this->EOF === $this->char)))) {
374 /* If the content model flag is set to the RCDATA or CDATA states then
375 examine the next few characters. If they do not match the tag name of
376 the last start tag token emitted (case insensitively), or if they do but
377 they are not immediately followed by one of the following characters:
378 * U+0009 CHARACTER TABULATION
379 * U+000A LINE FEED (LF)
380 * U+000B LINE TABULATION
381 * U+000C FORM FEED (FF)
383 * U+003E GREATER-THAN SIGN (>)
386 ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
387 token, a U+002F SOLIDUS character token, and switch to the data state
388 to process the next input character. */
389 $this->emitToken(array(
390 'type' => self::CHARACTR,
394 $this->state = 'data';
397 /* Otherwise, if the content model flag is set to the PCDATA state,
398 or if the next few characters do match that tag name, consume the
399 next input character: */
401 $char = $this->char();
403 if(preg_match('/^[A-Za-z]$/', $char)) {
404 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
405 Create a new end tag token, set its tag name to the lowercase version
406 of the input character (add 0x0020 to the character's code point), then
407 switch to the tag name state. (Don't emit the token yet; further details
408 will be filled in before it is emitted.) */
409 $this->token = array(
410 'name' => strtolower($char),
411 'type' => self::ENDTAG
414 $this->state = 'tagName';
416 } elseif($char === '>') {
417 /* U+003E GREATER-THAN SIGN (>)
418 Parse error. Switch to the data state. */
419 $this->state = 'data';
421 } elseif($this->char === $this->EOF) {
423 Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
424 SOLIDUS character token. Reconsume the EOF character in the data state. */
425 $this->emitToken(array(
426 'type' => self::CHARACTR,
431 $this->state = 'data';
434 /* Parse error. Switch to the bogus comment state. */
435 $this->state = 'bogusComment';
440 private function tagNameState() {
441 // Consume the next input character:
443 $char = $this->character($this->char);
445 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
446 /* U+0009 CHARACTER TABULATION
447 U+000A LINE FEED (LF)
448 U+000B LINE TABULATION
449 U+000C FORM FEED (FF)
451 Switch to the before attribute name state. */
452 $this->state = 'beforeAttributeName';
454 } elseif($char === '>') {
455 /* U+003E GREATER-THAN SIGN (>)
456 Emit the current tag token. Switch to the data state. */
457 $this->emitToken($this->token);
458 $this->state = 'data';
460 } elseif($this->char === $this->EOF) {
462 Parse error. Emit the current tag token. Reconsume the EOF
463 character in the data state. */
464 $this->emitToken($this->token);
467 $this->state = 'data';
469 } elseif($char === '/') {
470 /* U+002F SOLIDUS (/)
471 Parse error unless this is a permitted slash. Switch to the before
472 attribute name state. */
473 $this->state = 'beforeAttributeName';
477 Append the current input character to the current tag token's tag name.
478 Stay in the tag name state. */
479 $this->token['name'] .= strtolower($char);
480 $this->state = 'tagName';
484 private function beforeAttributeNameState() {
485 // Consume the next input character:
487 $char = $this->character($this->char);
489 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
490 /* U+0009 CHARACTER TABULATION
491 U+000A LINE FEED (LF)
492 U+000B LINE TABULATION
493 U+000C FORM FEED (FF)
495 Stay in the before attribute name state. */
496 $this->state = 'beforeAttributeName';
498 } elseif($char === '>') {
499 /* U+003E GREATER-THAN SIGN (>)
500 Emit the current tag token. Switch to the data state. */
501 $this->emitToken($this->token);
502 $this->state = 'data';
504 } elseif($char === '/') {
505 /* U+002F SOLIDUS (/)
506 Parse error unless this is a permitted slash. Stay in the before
507 attribute name state. */
508 $this->state = 'beforeAttributeName';
510 } elseif($this->char === $this->EOF) {
512 Parse error. Emit the current tag token. Reconsume the EOF
513 character in the data state. */
514 $this->emitToken($this->token);
517 $this->state = 'data';
521 Start a new attribute in the current tag token. Set that attribute's
522 name to the current input character, and its value to the empty string.
523 Switch to the attribute name state. */
524 $this->token['attr'][] = array(
525 'name' => strtolower($char),
529 $this->state = 'attributeName';
533 private function attributeNameState() {
534 // Consume the next input character:
536 $char = $this->character($this->char);
538 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
539 /* U+0009 CHARACTER TABULATION
540 U+000A LINE FEED (LF)
541 U+000B LINE TABULATION
542 U+000C FORM FEED (FF)
544 Switch to the after attribute name state. */
545 $this->state = 'afterAttributeName';
547 } elseif($char === '=') {
548 /* U+003D EQUALS SIGN (=)
549 Switch to the before attribute value state. */
550 $this->state = 'beforeAttributeValue';
552 } elseif($char === '>') {
553 /* U+003E GREATER-THAN SIGN (>)
554 Emit the current tag token. Switch to the data state. */
555 $this->emitToken($this->token);
556 $this->state = 'data';
558 } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
559 /* U+002F SOLIDUS (/)
560 Parse error unless this is a permitted slash. Switch to the before
561 attribute name state. */
562 $this->state = 'beforeAttributeName';
564 } elseif($this->char === $this->EOF) {
566 Parse error. Emit the current tag token. Reconsume the EOF
567 character in the data state. */
568 $this->emitToken($this->token);
571 $this->state = 'data';
575 Append the current input character to the current attribute's name.
576 Stay in the attribute name state. */
577 $last = count($this->token['attr']) - 1;
578 $this->token['attr'][$last]['name'] .= strtolower($char);
580 $this->state = 'attributeName';
584 private function afterAttributeNameState() {
585 // Consume the next input character:
587 $char = $this->character($this->char);
589 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
590 /* U+0009 CHARACTER TABULATION
591 U+000A LINE FEED (LF)
592 U+000B LINE TABULATION
593 U+000C FORM FEED (FF)
595 Stay in the after attribute name state. */
596 $this->state = 'afterAttributeName';
598 } elseif($char === '=') {
599 /* U+003D EQUALS SIGN (=)
600 Switch to the before attribute value state. */
601 $this->state = 'beforeAttributeValue';
603 } elseif($char === '>') {
604 /* U+003E GREATER-THAN SIGN (>)
605 Emit the current tag token. Switch to the data state. */
606 $this->emitToken($this->token);
607 $this->state = 'data';
609 } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
610 /* U+002F SOLIDUS (/)
611 Parse error unless this is a permitted slash. Switch to the
612 before attribute name state. */
613 $this->state = 'beforeAttributeName';
615 } elseif($this->char === $this->EOF) {
617 Parse error. Emit the current tag token. Reconsume the EOF
618 character in the data state. */
619 $this->emitToken($this->token);
622 $this->state = 'data';
626 Start a new attribute in the current tag token. Set that attribute's
627 name to the current input character, and its value to the empty string.
628 Switch to the attribute name state. */
629 $this->token['attr'][] = array(
630 'name' => strtolower($char),
634 $this->state = 'attributeName';
638 private function beforeAttributeValueState() {
639 // Consume the next input character:
641 $char = $this->character($this->char);
643 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
644 /* U+0009 CHARACTER TABULATION
645 U+000A LINE FEED (LF)
646 U+000B LINE TABULATION
647 U+000C FORM FEED (FF)
649 Stay in the before attribute value state. */
650 $this->state = 'beforeAttributeValue';
652 } elseif($char === '"') {
653 /* U+0022 QUOTATION MARK (")
654 Switch to the attribute value (double-quoted) state. */
655 $this->state = 'attributeValueDoubleQuoted';
657 } elseif($char === '&') {
658 /* U+0026 AMPERSAND (&)
659 Switch to the attribute value (unquoted) state and reconsume
660 this input character. */
662 $this->state = 'attributeValueUnquoted';
664 } elseif($char === '\'') {
665 /* U+0027 APOSTROPHE (')
666 Switch to the attribute value (single-quoted) state. */
667 $this->state = 'attributeValueSingleQuoted';
669 } elseif($char === '>') {
670 /* U+003E GREATER-THAN SIGN (>)
671 Emit the current tag token. Switch to the data state. */
672 $this->emitToken($this->token);
673 $this->state = 'data';
677 Append the current input character to the current attribute's value.
678 Switch to the attribute value (unquoted) state. */
679 $last = count($this->token['attr']) - 1;
680 $this->token['attr'][$last]['value'] .= $char;
682 $this->state = 'attributeValueUnquoted';
686 private function attributeValueDoubleQuotedState() {
687 // Consume the next input character:
689 $char = $this->character($this->char);
692 /* U+0022 QUOTATION MARK (")
693 Switch to the before attribute name state. */
694 $this->state = 'beforeAttributeName';
696 } elseif($char === '&') {
697 /* U+0026 AMPERSAND (&)
698 Switch to the entity in attribute value state. */
699 $this->entityInAttributeValueState('double');
701 } elseif($this->char === $this->EOF) {
703 Parse error. Emit the current tag token. Reconsume the character
704 in the data state. */
705 $this->emitToken($this->token);
708 $this->state = 'data';
712 Append the current input character to the current attribute's value.
713 Stay in the attribute value (double-quoted) state. */
714 $last = count($this->token['attr']) - 1;
715 $this->token['attr'][$last]['value'] .= $char;
717 $this->state = 'attributeValueDoubleQuoted';
721 private function attributeValueSingleQuotedState() {
722 // Consume the next input character:
724 $char = $this->character($this->char);
727 /* U+0027 APOSTROPHE (')
728 Switch to the before attribute name state. */
729 $this->state = 'beforeAttributeName';
731 } elseif($char === '&') {
732 /* U+0026 AMPERSAND (&)
733 Switch to the entity in attribute value state. */
734 $this->entityInAttributeValueState('single');
736 } elseif($this->char === $this->EOF) {
738 Parse error. Emit the current tag token. Reconsume the character
739 in the data state. */
740 $this->emitToken($this->token);
743 $this->state = 'data';
747 Append the current input character to the current attribute's value.
748 Stay in the attribute value (single-quoted) state. */
749 $last = count($this->token['attr']) - 1;
750 $this->token['attr'][$last]['value'] .= $char;
752 $this->state = 'attributeValueSingleQuoted';
756 private function attributeValueUnquotedState() {
757 // Consume the next input character:
759 $char = $this->character($this->char);
761 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
762 /* U+0009 CHARACTER TABULATION
763 U+000A LINE FEED (LF)
764 U+000B LINE TABULATION
765 U+000C FORM FEED (FF)
767 Switch to the before attribute name state. */
768 $this->state = 'beforeAttributeName';
770 } elseif($char === '&') {
771 /* U+0026 AMPERSAND (&)
772 Switch to the entity in attribute value state. */
773 $this->entityInAttributeValueState();
775 } elseif($char === '>') {
776 /* U+003E GREATER-THAN SIGN (>)
777 Emit the current tag token. Switch to the data state. */
778 $this->emitToken($this->token);
779 $this->state = 'data';
783 Append the current input character to the current attribute's value.
784 Stay in the attribute value (unquoted) state. */
785 $last = count($this->token['attr']) - 1;
786 $this->token['attr'][$last]['value'] .= $char;
788 $this->state = 'attributeValueUnquoted';
792 private function entityInAttributeValueState() {
793 // Attempt to consume an entity.
794 $entity = $this->entity();
796 // If nothing is returned, append a U+0026 AMPERSAND character to the
797 // current attribute's value. Otherwise, emit the character token that
803 $last = count($this->token['attr']) - 1;
804 $this->token['attr'][$last]['value'] .= $char;
807 private function bogusCommentState() {
808 /* Consume every character up to the first U+003E GREATER-THAN SIGN
809 character (>) or the end of the file (EOF), whichever comes first. Emit
810 a comment token whose data is the concatenation of all the characters
811 starting from and including the character that caused the state machine
812 to switch into the bogus comment state, up to and including the last
813 consumed character before the U+003E character, if any, or up to the
814 end of the file otherwise. (If the comment was started by the end of
815 the file (EOF), the token is empty.) */
816 $data = $this->characters('^>', $this->char);
817 $this->emitToken(array(
819 'type' => self::COMMENT
822 $this->char += strlen($data);
824 /* Switch to the data state. */
825 $this->state = 'data';
827 /* If the end of the file was reached, reconsume the EOF character. */
828 if($this->char === $this->EOF) {
829 $this->char = $this->EOF - 1;
833 private function markupDeclarationOpenState() {
834 /* If the next two characters are both U+002D HYPHEN-MINUS (-)
835 characters, consume those two characters, create a comment token whose
836 data is the empty string, and switch to the comment state. */
837 if($this->character($this->char + 1, 2) === '--') {
839 $this->state = 'comment';
840 $this->token = array(
842 'type' => self::COMMENT
845 /* Otherwise if the next seven characters are a case-insensitive match
846 for the word "DOCTYPE", then consume those characters and switch to the
848 } elseif(strtolower($this->character($this->char + 1, 7)) === 'doctype') {
850 $this->state = 'doctype';
852 /* Otherwise, it is a parse error. Switch to the bogus comment state.
853 The next character that is consumed, if any, is the first character
854 that will be in the comment. */
857 $this->state = 'bogusComment';
861 private function commentState() {
862 /* Consume the next input character: */
864 $char = $this->char();
866 /* U+002D HYPHEN-MINUS (-) */
868 /* Switch to the comment dash state */
869 $this->state = 'commentDash';
872 } elseif($this->char === $this->EOF) {
873 /* Parse error. Emit the comment token. Reconsume the EOF character
874 in the data state. */
875 $this->emitToken($this->token);
877 $this->state = 'data';
881 /* Append the input character to the comment token's data. Stay in
882 the comment state. */
883 $this->token['data'] .= $char;
887 private function commentDashState() {
888 /* Consume the next input character: */
890 $char = $this->char();
892 /* U+002D HYPHEN-MINUS (-) */
894 /* Switch to the comment end state */
895 $this->state = 'commentEnd';
898 } elseif($this->char === $this->EOF) {
899 /* Parse error. Emit the comment token. Reconsume the EOF character
900 in the data state. */
901 $this->emitToken($this->token);
903 $this->state = 'data';
907 /* Append a U+002D HYPHEN-MINUS (-) character and the input
908 character to the comment token's data. Switch to the comment state. */
909 $this->token['data'] .= '-'.$char;
910 $this->state = 'comment';
914 private function commentEndState() {
915 /* Consume the next input character: */
917 $char = $this->char();
920 $this->emitToken($this->token);
921 $this->state = 'data';
923 } elseif($char === '-') {
924 $this->token['data'] .= '-';
926 } elseif($this->char === $this->EOF) {
927 $this->emitToken($this->token);
929 $this->state = 'data';
932 $this->token['data'] .= '--'.$char;
933 $this->state = 'comment';
937 private function doctypeState() {
938 /* Consume the next input character: */
940 $char = $this->char();
942 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
943 $this->state = 'beforeDoctypeName';
947 $this->state = 'beforeDoctypeName';
951 private function beforeDoctypeNameState() {
952 /* Consume the next input character: */
954 $char = $this->char();
956 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
957 // Stay in the before DOCTYPE name state.
959 } elseif(preg_match('/^[a-z]$/', $char)) {
960 $this->token = array(
961 'name' => strtoupper($char),
962 'type' => self::DOCTYPE,
966 $this->state = 'doctypeName';
968 } elseif($char === '>') {
969 $this->emitToken(array(
971 'type' => self::DOCTYPE,
975 $this->state = 'data';
977 } elseif($this->char === $this->EOF) {
978 $this->emitToken(array(
980 'type' => self::DOCTYPE,
985 $this->state = 'data';
988 $this->token = array(
990 'type' => self::DOCTYPE,
994 $this->state = 'doctypeName';
998 private function doctypeNameState() {
999 /* Consume the next input character: */
1001 $char = $this->char();
1003 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1004 $this->state = 'AfterDoctypeName';
1006 } elseif($char === '>') {
1007 $this->emitToken($this->token);
1008 $this->state = 'data';
1010 } elseif(preg_match('/^[a-z]$/', $char)) {
1011 $this->token['name'] .= strtoupper($char);
1013 } elseif($this->char === $this->EOF) {
1014 $this->emitToken($this->token);
1016 $this->state = 'data';
1019 $this->token['name'] .= $char;
1022 $this->token['error'] = ($this->token['name'] === 'HTML')
1027 private function afterDoctypeNameState() {
1028 /* Consume the next input character: */
1030 $char = $this->char();
1032 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1033 // Stay in the after DOCTYPE name state.
1035 } elseif($char === '>') {
1036 $this->emitToken($this->token);
1037 $this->state = 'data';
1039 } elseif($this->char === $this->EOF) {
1040 $this->emitToken($this->token);
1042 $this->state = 'data';
1045 $this->token['error'] = true;
1046 $this->state = 'bogusDoctype';
1050 private function bogusDoctypeState() {
1051 /* Consume the next input character: */
1053 $char = $this->char();
1056 $this->emitToken($this->token);
1057 $this->state = 'data';
1059 } elseif($this->char === $this->EOF) {
1060 $this->emitToken($this->token);
1062 $this->state = 'data';
1065 // Stay in the bogus DOCTYPE state.
1069 private function entity() {
1070 $start = $this->char;
1072 // This section defines how to consume an entity. This definition is
1073 // used when parsing entities in text and in attributes.
1075 // The behaviour depends on the identity of the next character (the
1076 // one immediately after the U+0026 AMPERSAND character):
1078 switch($this->character($this->char + 1)) {
1079 // U+0023 NUMBER SIGN (#)
1082 // The behaviour further depends on the character after the
1083 // U+0023 NUMBER SIGN:
1084 switch($this->character($this->char + 1)) {
1085 // U+0078 LATIN SMALL LETTER X
1086 // U+0058 LATIN CAPITAL LETTER X
1089 // Follow the steps below, but using the range of
1090 // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1091 // NINE, U+0061 LATIN SMALL LETTER A through to U+0066
1092 // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
1093 // A, through to U+0046 LATIN CAPITAL LETTER F (in other
1094 // words, 0-9, A-F, a-f).
1096 $char_class = '0-9A-Fa-f';
1101 // Follow the steps below, but using the range of
1102 // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1103 // NINE (i.e. just 0-9).
1105 $char_class = '0-9';
1109 // Consume as many characters as match the range of characters
1112 $e_name = $this->characters($char_class, $this->char + $char + 1);
1113 $entity = $this->character($start, $this->char);
1114 $cond = strlen($e_name) > 0;
1116 // The rest of the parsing happens below.
1121 // Consume the maximum number of characters possible, with the
1122 // consumed characters case-sensitively matching one of the
1123 // identifiers in the first column of the entities table.
1124 $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
1125 $len = strlen($e_name);
1127 for($c = 1; $c <= $len; $c++) {
1128 $id = substr($e_name, 0, $c);
1131 if(in_array($id, $this->entities)) {
1132 if ($e_name[$c-1] !== ';') {
1133 if ($c < $len && $e_name[$c] == ';') {
1134 $this->char++; // consume extra semicolon
1142 $cond = isset($entity);
1143 // The rest of the parsing happens bellow.
1148 // If no match can be made, then this is a parse error. No
1149 // characters are consumed, and nothing is returned.
1150 $this->char = $start;
1154 // Return a character token for the character corresponding to the
1155 // entity name (as given by the second column of the entities table).
1156 return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8');
/**
 * Hands a token to the tree constructor and updates the content model.
 *
 * The token is passed to $this->tree->emitToken() and the value it returns
 * is kept in $emit. In one branch $emit becomes the new
 * $this->content_model; for an end tag token the content model is reset to
 * self::PCDATA.
 *
 * NOTE(review): the condition guarding the first branch is not visible in
 * this listing - confirm against the full source.
 */
1159 private function emitToken($token) {
1160 $emit = $this->tree->emitToken($token);
1163 $this->content_model = $emit;
1165 } elseif($token['type'] === self::ENDTAG) {
1166 $this->content_model = self::PCDATA;
/**
 * Ends tokenisation: clears the tokeniser state and sends one last token
 * to the tree constructor.
 *
 * NOTE(review): the contents of the emitted token array are not visible in
 * this listing (presumably an end-of-file token) - confirm.
 */
1170 private function EOF() {
1171 $this->state = null;
1172 $this->tree->emitToken(array(
/**
 * Tree construction stage: receives tokens through emitToken() and builds
 * nodes in a DOMDocument.
 *
 * State kept here includes the stack of open elements ($stack), the list of
 * active formatting elements ($a_formatting), pointers to the head and form
 * elements, and lookup tables of element names ($scoping, $formatting,
 * $special). The constants enumerate the construction phases, the insertion
 * modes used during the main phase, and the element types.
 *
 * NOTE(review): some property and constant declarations are not visible in
 * this listing.
 */
1178 class HTML5TreeConstructer {
1179 public $stack = array();
1184 private $foster_parent = null;
1185 private $a_formatting = array();
1187 private $head_pointer = null;
1188 private $form_pointer = null;
1190 private $scoping = array('button','caption','html','marquee','object','table','td','th');
1191 private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
1192 private $special = array('address','area','base','basefont','bgsound',
1193 'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
1194 'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
1195 'h6','head','hr','iframe','image','img','input','isindex','li','link',
1196 'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
1197 'option','p','param','plaintext','pre','script','select','spacer','style',
1198 'tbody','textarea','tfoot','thead','title','tr','ul','wbr');
1200 // The different phases.
1201 const INIT_PHASE = 0;
1202 const ROOT_PHASE = 1;
1203 const MAIN_PHASE = 2;
1204 const END_PHASE = 3;
1206 // The different insertion modes for the main phase.
1207 const BEFOR_HEAD = 0;
1209 const AFTER_HEAD = 2;
1212 const IN_CAPTION = 5;
1213 const IN_CGROUP = 6;
1217 const IN_SELECT = 10;
1218 const AFTER_BODY = 11;
1219 const IN_FRAME = 12;
1220 const AFTR_FRAME = 13;
1222 // The different types of elements.
1225 const FORMATTING = 2;
/**
 * Puts the tree constructor into its starting state (initial phase,
 * "before head" insertion mode) and creates the DOMDocument that will
 * receive the parsed nodes.
 *
 * The document is set to UTF-8 encoding, keeps whitespace, substitutes
 * entities, and has strict error checking switched off.
 */
1230 public function __construct() {
1231 $this->phase = self::INIT_PHASE;
1232 $this->mode = self::BEFOR_HEAD;
1233 $this->dom = new DOMDocument;
1235 $this->dom->encoding = 'UTF-8';
1236 $this->dom->preserveWhiteSpace = true;
1237 $this->dom->substituteEntities = true;
1238 $this->dom->strictErrorChecking = false;
/**
 * Entry point for tokens coming from the tokeniser.
 *
 * Dispatches the token to the handler of the current phase ($this->phase)
 * and returns whatever that handler returns. The break statements after
 * each return are never reached.
 */
1241 // Process tag tokens
1242 public function emitToken($token) {
1243 switch($this->phase) {
1244 case self::INIT_PHASE: return $this->initPhase($token); break;
1245 case self::ROOT_PHASE: return $this->rootElementPhase($token); break;
1246 case self::MAIN_PHASE: return $this->mainPhase($token); break;
1247 case self::END_PHASE : return $this->trailingEndPhase($token); break;
/**
 * Handles a token while the tree constructor is in the initial phase.
 *
 * - Tokens flagged with a truthy 'error', comment, start tag, end tag and
 *   end-of-file tokens, and character tokens containing anything other than
 *   whitespace switch the phase to ROOT_PHASE and are reprocessed by
 *   rootElementPhase(), whose result is returned.
 * - A token whose 'error' flag is set but falsy (a correct DOCTYPE) creates
 *   a DOMDocumentType and switches to ROOT_PHASE.
 * - Whitespace-only character data is appended to the document as a text
 *   node.
 *
 * NOTE(review): what happens to $doctype after it is created is not visible
 * in this listing - confirm against the full source.
 */
1251 private function initPhase($token) {
1252 /* Initially, the tree construction stage must handle each token
1253 emitted from the tokenisation stage as follows: */
1255 /* A DOCTYPE token that is marked as being in error
1259 A character token that is not one of U+0009 CHARACTER TABULATION,
1260 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1262 An end-of-file token */
1263 if((isset($token['error']) && $token['error']) ||
1264 $token['type'] === HTML5::COMMENT ||
1265 $token['type'] === HTML5::STARTTAG ||
1266 $token['type'] === HTML5::ENDTAG ||
1267 $token['type'] === HTML5::EOF ||
1268 ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
1269 !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))) {
1270 /* This specification does not define how to handle this case. In
1271 particular, user agents may ignore the entirety of this specification
1272 altogether for such documents, and instead invoke special parse modes
1273 with a greater emphasis on backwards compatibility. */
1275 $this->phase = self::ROOT_PHASE;
1276 return $this->rootElementPhase($token);
1278 /* A DOCTYPE token marked as being correct */
1279 } elseif(isset($token['error']) && !$token['error']) {
1280 /* Append a DocumentType node to the Document node, with the name
1281 attribute set to the name given in the DOCTYPE token (which will be
1282 "HTML"), and the other attributes specific to DocumentType objects
1283 set to null, empty lists, or the empty string as appropriate. */
1284 $doctype = new DOMDocumentType(null, null, 'HTML');
1286 /* Then, switch to the root element phase of the tree construction
1288 $this->phase = self::ROOT_PHASE;
1290 /* A character token that is one of U+0009 CHARACTER TABULATION,
1291 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1293 } elseif(isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/',
1295 /* Append that character to the Document node. */
1296 $text = $this->dom->createTextNode($token['data']);
1297 $this->dom->appendChild($text);
/**
 * Handles a token in the root element phase.
 *
 * - DOCTYPE tokens are ignored.
 * - Comment tokens become comment nodes appended to the document.
 * - Whitespace-only character tokens become text nodes appended to the
 *   document.
 * - Any other character token, a start tag, an end tag or end-of-file
 *   creates the html element, appends it to the document, pushes it onto
 *   the stack of open elements, switches to MAIN_PHASE and returns the
 *   result of reprocessing the token in mainPhase().
 */
1301 private function rootElementPhase($token) {
1302 /* After the initial phase, as each token is emitted from the tokenisation
1303 stage, it must be processed as described in this section. */
1305 /* A DOCTYPE token */
1306 if($token['type'] === HTML5::DOCTYPE) {
1307 // Parse error. Ignore the token.
1309 /* A comment token */
1310 } elseif($token['type'] === HTML5::COMMENT) {
1311 /* Append a Comment node to the Document object with the data
1312 attribute set to the data given in the comment token. */
1313 $comment = $this->dom->createComment($token['data']);
1314 $this->dom->appendChild($comment);
1316 /* A character token that is one of U+0009 CHARACTER TABULATION,
1317 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1319 } elseif($token['type'] === HTML5::CHARACTR &&
1320 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1321 /* Append that character to the Document node. */
1322 $text = $this->dom->createTextNode($token['data']);
1323 $this->dom->appendChild($text);
1325 /* A character token that is not one of U+0009 CHARACTER TABULATION,
1326 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED
1327 (FF), or U+0020 SPACE
1330 An end-of-file token */
1331 } elseif(($token['type'] === HTML5::CHARACTR &&
1332 !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
1333 $token['type'] === HTML5::STARTTAG ||
1334 $token['type'] === HTML5::ENDTAG ||
1335 $token['type'] === HTML5::EOF) {
1336 /* Create an HTMLElement node with the tag name html, in the HTML
1337 namespace. Append it to the Document object. Switch to the main
1338 phase and reprocess the current token. */
1339 $html = $this->dom->createElement('html');
1340 $this->dom->appendChild($html);
1341 $this->stack[] = $html;
1343 $this->phase = self::MAIN_PHASE;
1344 return $this->mainPhase($token);
/**
 * Handles a token in the main phase.
 *
 * - DOCTYPE tokens are ignored.
 * - A start tag named "html" copies each of its attributes onto the first
 *   element of the stack of open elements unless that element already has
 *   an attribute of the same name.
 * - An end-of-file token generates implied end tags.
 * - Everything else is dispatched on the current insertion mode
 *   ($this->mode) and the handler's result is returned.
 *
 * NOTE(review): self::END_PHASE is declared among the phase constants
 * (value 3) but is used below as a case of the insertion-mode switch;
 * verify that it does not collide with an insertion-mode constant of the
 * same value, which would make that case unreachable.
 */
1348 private function mainPhase($token) {
1349 /* Tokens in the main phase must be handled as follows: */
1351 /* A DOCTYPE token */
1352 if($token['type'] === HTML5::DOCTYPE) {
1353 // Parse error. Ignore the token.
1355 /* A start tag token with the tag name "html" */
1356 } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
1357 /* If this start tag token was not the first start tag token, then
1358 it is a parse error. */
1360 /* For each attribute on the token, check to see if the attribute
1361 is already present on the top element of the stack of open elements.
1362 If it is not, add the attribute and its corresponding value to that
1364 foreach($token['attr'] as $attr) {
1365 if(!$this->stack[0]->hasAttribute($attr['name'])) {
1366 $this->stack[0]->setAttribute($attr['name'], $attr['value']);
1370 /* An end-of-file token */
1371 } elseif($token['type'] === HTML5::EOF) {
1372 /* Generate implied end tags. */
1373 $this->generateImpliedEndTags();
1375 /* Anything else. */
1377 /* Depends on the insertion mode: */
1378 switch($this->mode) {
1379 case self::BEFOR_HEAD: return $this->beforeHead($token); break;
1380 case self::IN_HEAD: return $this->inHead($token); break;
1381 case self::AFTER_HEAD: return $this->afterHead($token); break;
1382 case self::IN_BODY: return $this->inBody($token); break;
1383 case self::IN_TABLE: return $this->inTable($token); break;
1384 case self::IN_CAPTION: return $this->inCaption($token); break;
1385 case self::IN_CGROUP: return $this->inColumnGroup($token); break;
1386 case self::IN_TBODY: return $this->inTableBody($token); break;
1387 case self::IN_ROW: return $this->inRow($token); break;
1388 case self::IN_CELL: return $this->inCell($token); break;
1389 case self::IN_SELECT: return $this->inSelect($token); break;
1390 case self::AFTER_BODY: return $this->afterBody($token); break;
1391 case self::IN_FRAME: return $this->inFrameset($token); break;
1392 case self::AFTR_FRAME: return $this->afterFrameset($token); break;
1393 case self::END_PHASE: return $this->trailingEndPhase($token); break;
/**
 * Handles a token in the "before head" insertion mode.
 *
 * - Whitespace-only character tokens are inserted as text into the current
 *   node.
 * - Comment tokens are inserted as comments.
 * - A "head" start tag inserts the element, records it in
 *   $this->head_pointer and switches the insertion mode to IN_HEAD.
 * - Any other start tag, an "html" end tag, or a non-whitespace character
 *   token first calls beforeHead() again with a synthetic start tag token
 *   and then returns the result of reprocessing the original token in
 *   inHead().
 * - Any other end tag is ignored.
 *
 * NOTE(review): the whitespace pattern in the fourth branch has no "+"
 * quantifier, unlike the pattern used in the first branch. Whitespace-only
 * tokens are already consumed by the first branch, so this looks harmless,
 * but confirm it is intentional.
 */
1398 private function beforeHead($token) {
1399 /* Handle the token as follows: */
1401 /* A character token that is one of U+0009 CHARACTER TABULATION,
1402 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1404 if($token['type'] === HTML5::CHARACTR &&
1405 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1406 /* Append the character to the current node. */
1407 $this->insertText($token['data']);
1409 /* A comment token */
1410 } elseif($token['type'] === HTML5::COMMENT) {
1411 /* Append a Comment node to the current node with the data attribute
1412 set to the data given in the comment token. */
1413 $this->insertComment($token['data']);
1415 /* A start tag token with the tag name "head" */
1416 } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
1417 /* Create an element for the token, append the new element to the
1418 current node and push it onto the stack of open elements. */
1419 $element = $this->insertElement($token);
1421 /* Set the head element pointer to this new element node. */
1422 $this->head_pointer = $element;
1424 /* Change the insertion mode to "in head". */
1425 $this->mode = self::IN_HEAD;
1427 /* A start tag token whose tag name is one of: "base", "link", "meta",
1428 "script", "style", "title". Or an end tag with the tag name "html".
1429 Or a character token that is not one of U+0009 CHARACTER TABULATION,
1430 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1431 or U+0020 SPACE. Or any other start tag token */
1432 } elseif($token['type'] === HTML5::STARTTAG ||
1433 ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
1434 ($token['type'] === HTML5::CHARACTR && !preg_match('/^[\t\n\x0b\x0c ]$/',
1436 /* Act as if a start tag token with the tag name "head" and no
1437 attributes had been seen, then reprocess the current token. */
1438 $this->beforeHead(array(
1440 'type' => HTML5::STARTTAG,
1444 return $this->inHead($token);
1446 /* Any other end tag */
1447 } elseif($token['type'] === HTML5::ENDTAG) {
1448 /* Parse error. Ignore the token. */
/**
 * Handles a token in the "in head" insertion mode.
 *
 * - Character tokens that are whitespace-only, or that arrive while the
 *   current node is a title, style or script element, are inserted as text.
 * - Comment tokens are inserted as comments.
 * - End tags for title, style or script pop the current node and return
 *   HTML5::PCDATA.
 * - A "title" start tag inserts the element and returns HTML5::RCDATA;
 *   "style" and "script" start tags insert the element and return
 *   HTML5::CDATA.
 * - "base", "link" and "meta" start tags insert the element.
 * - A "head" end tag pops the head element when it is the current node and
 *   switches the insertion mode to AFTER_HEAD.
 * - A "head" start tag, or any end tag other than "html", is ignored.
 * - Anything else leaves the head (via a synthetic end tag when the head
 *   element is the current node, otherwise by switching the mode directly)
 *   and returns the result of reprocessing the token in afterHead().
 *
 * The value returned by the title/style/script branches is the content
 * model the tokeniser should switch to.
 *
 * NOTE(review): the title, style and base/link/meta branches check that
 * $this->head_pointer is not null before using it, but the script branch
 * and the isSameNode() calls use it unguarded - confirm it can never be
 * null on those paths.
 */
1452 private function inHead($token) {
1453 /* Handle the token as follows: */
1455 /* A character token that is one of U+0009 CHARACTER TABULATION,
1456 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1459 THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
1460 or script element, append the character to the current node regardless
1462 if(($token['type'] === HTML5::CHARACTR &&
1463 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
1464 $token['type'] === HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
1465 array('title', 'style', 'script')))) {
1466 /* Append the character to the current node. */
1467 $this->insertText($token['data']);
1469 /* A comment token */
1470 } elseif($token['type'] === HTML5::COMMENT) {
1471 /* Append a Comment node to the current node with the data attribute
1472 set to the data given in the comment token. */
1473 $this->insertComment($token['data']);
1475 } elseif($token['type'] === HTML5::ENDTAG &&
1476 in_array($token['name'], array('title', 'style', 'script'))) {
1477 array_pop($this->stack);
1478 return HTML5::PCDATA;
1480 /* A start tag with the tag name "title" */
1481 } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
1482 /* Create an element for the token and append the new element to the
1483 node pointed to by the head element pointer, or, if that is null
1484 (innerHTML case), to the current node. */
1485 if($this->head_pointer !== null) {
1486 $element = $this->insertElement($token, false);
1487 $this->head_pointer->appendChild($element);
1490 $element = $this->insertElement($token);
1493 /* Switch the tokeniser's content model flag to the RCDATA state. */
1494 return HTML5::RCDATA;
1496 /* A start tag with the tag name "style" */
1497 } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
1498 /* Create an element for the token and append the new element to the
1499 node pointed to by the head element pointer, or, if that is null
1500 (innerHTML case), to the current node. */
1501 if($this->head_pointer !== null) {
1502 $element = $this->insertElement($token, false);
1503 $this->head_pointer->appendChild($element);
1506 $this->insertElement($token);
1509 /* Switch the tokeniser's content model flag to the CDATA state. */
1510 return HTML5::CDATA;
1512 /* A start tag with the tag name "script" */
1513 } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
1514 /* Create an element for the token. */
1515 $element = $this->insertElement($token, false);
1516 $this->head_pointer->appendChild($element);
1518 /* Switch the tokeniser's content model flag to the CDATA state. */
1519 return HTML5::CDATA;
1521 /* A start tag with the tag name "base", "link", or "meta" */
1522 } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
1523 array('base', 'link', 'meta'))) {
1524 /* Create an element for the token and append the new element to the
1525 node pointed to by the head element pointer, or, if that is null
1526 (innerHTML case), to the current node. */
1527 if($this->head_pointer !== null) {
1528 $element = $this->insertElement($token, false);
1529 $this->head_pointer->appendChild($element);
1530 array_pop($this->stack);
1533 $this->insertElement($token);
1536 /* An end tag with the tag name "head" */
1537 } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
1538 /* If the current node is a head element, pop the current node off
1539 the stack of open elements. */
1540 if($this->head_pointer->isSameNode(end($this->stack))) {
1541 array_pop($this->stack);
1543 /* Otherwise, this is a parse error. */
1548 /* Change the insertion mode to "after head". */
1549 $this->mode = self::AFTER_HEAD;
1551 /* A start tag with the tag name "head" or an end tag except "html". */
1552 } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
1553 ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
1554 // Parse error. Ignore the token.
1558 /* If the current node is a head element, act as if an end tag
1559 token with the tag name "head" had been seen. */
1560 if($this->head_pointer->isSameNode(end($this->stack))) {
1561 $this->inHead(array(
1563 'type' => HTML5::ENDTAG
1566 /* Otherwise, change the insertion mode to "after head". */
1568 $this->mode = self::AFTER_HEAD;
1571 /* Then, reprocess the current token. */
1572 return $this->afterHead($token);
/**
 * Handles a token in the "after head" insertion mode.
 *
 * - Whitespace-only character tokens are inserted as text into the current
 *   node.
 * - Comment tokens are inserted as comments.
 * - A "body" start tag inserts the element and switches the insertion mode
 *   to IN_BODY.
 * - A "frameset" start tag inserts the element and switches the insertion
 *   mode to IN_FRAME.
 * - Start tags for base, link, meta, script, style or title switch back to
 *   IN_HEAD and return the result of reprocessing the token in inHead().
 * - Anything else calls afterHead() again with a synthetic start tag token
 *   and then returns the result of reprocessing the original token in
 *   inBody().
 */
1576 private function afterHead($token) {
1577 /* Handle the token as follows: */
1579 /* A character token that is one of U+0009 CHARACTER TABULATION,
1580 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1582 if($token['type'] === HTML5::CHARACTR &&
1583 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1584 /* Append the character to the current node. */
1585 $this->insertText($token['data']);
1587 /* A comment token */
1588 } elseif($token['type'] === HTML5::COMMENT) {
1589 /* Append a Comment node to the current node with the data attribute
1590 set to the data given in the comment token. */
1591 $this->insertComment($token['data']);
1593 /* A start tag token with the tag name "body" */
1594 } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') {
1595 /* Insert a body element for the token. */
1596 $this->insertElement($token);
1598 /* Change the insertion mode to "in body". */
1599 $this->mode = self::IN_BODY;
1601 /* A start tag token with the tag name "frameset" */
1602 } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') {
1603 /* Insert a frameset element for the token. */
1604 $this->insertElement($token);
1606 /* Change the insertion mode to "in frameset". */
1607 $this->mode = self::IN_FRAME;
1609 /* A start tag token whose tag name is one of: "base", "link", "meta",
1610 "script", "style", "title" */
1611 } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
1612 array('base', 'link', 'meta', 'script', 'style', 'title'))) {
1613 /* Parse error. Switch the insertion mode back to "in head" and
1614 reprocess the token. */
1615 $this->mode = self::IN_HEAD;
1616 return $this->inHead($token);
1620 /* Act as if a start tag token with the tag name "body" and no
1621 attributes had been seen, and then reprocess the current token. */
1622 $this->afterHead(array(
1624 'type' => HTML5::STARTTAG,
1628 return $this->inBody($token);
1632 private function inBody($token) {
1633 /* Handle the token as follows: */
1635 switch($token['type']) {
1636 /* A character token */
1637 case HTML5::CHARACTR:
1638 /* Reconstruct the active formatting elements, if any. */
1639 $this->reconstructActiveFormattingElements();
1641 /* Append the token's character to the current node. */
1642 $this->insertText($token['data']);
1645 /* A comment token */
1646 case HTML5::COMMENT:
1647 /* Append a Comment node to the current node with the data
1648 attribute set to the data given in the comment token. */
1649 $this->insertComment($token['data']);
1652 case HTML5::STARTTAG:
1653 switch($token['name']) {
1654 /* A start tag token whose tag name is one of: "script",
1656 case 'script': case 'style':
1657 /* Process the token as if the insertion mode had been "in
1659 return $this->inHead($token);
1662 /* A start tag token whose tag name is one of: "base", "link",
1664 case 'base': case 'link': case 'meta': case 'title':
1665 /* Parse error. Process the token as if the insertion mode
1666 had been "in head". */
1667 return $this->inHead($token);
1670 /* A start tag token with the tag name "body" */
1672 /* Parse error. If the second element on the stack of open
1673 elements is not a body element, or, if the stack of open
1674 elements has only one node on it, then ignore the token.
1676 if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
1679 /* Otherwise, for each attribute on the token, check to see
1680 if the attribute is already present on the body element (the
1681 second element) on the stack of open elements. If it is not,
1682 add the attribute and its corresponding value to that
1685 foreach($token['attr'] as $attr) {
1686 if(!$this->stack[1]->hasAttribute($attr['name'])) {
1687 $this->stack[1]->setAttribute($attr['name'], $attr['value']);
1693 /* A start tag whose tag name is one of: "address",
1694 "blockquote", "center", "dir", "div", "dl", "fieldset",
1695 "listing", "menu", "ol", "p", "ul" */
1696 case 'address': case 'blockquote': case 'center': case 'dir':
1697 case 'div': case 'dl': case 'fieldset': case 'listing':
1698 case 'menu': case 'ol': case 'p': case 'ul':
1699 /* If the stack of open elements has a p element in scope,
1700 then act as if an end tag with the tag name p had been
1702 if($this->elementInScope('p')) {
1703 $this->emitToken(array(
1705 'type' => HTML5::ENDTAG
1709 /* Insert an HTML element for the token. */
1710 $this->insertElement($token);
1713 /* A start tag whose tag name is "form" */
1715 /* If the form element pointer is not null, ignore the
1716 token with a parse error. */
1717 if($this->form_pointer !== null) {
1722 /* If the stack of open elements has a p element in
1723 scope, then act as if an end tag with the tag name p
1725 if($this->elementInScope('p')) {
1726 $this->emitToken(array(
1728 'type' => HTML5::ENDTAG
1732 /* Insert an HTML element for the token, and set the
1733 form element pointer to point to the element created. */
1734 $element = $this->insertElement($token);
1735 $this->form_pointer = $element;
1739 /* A start tag whose tag name is "li", "dd" or "dt" */
1740 case 'li': case 'dd': case 'dt':
1741 /* If the stack of open elements has a p element in scope,
1742 then act as if an end tag with the tag name p had been
1744 if($this->elementInScope('p')) {
1745 $this->emitToken(array(
1747 'type' => HTML5::ENDTAG
1751 $stack_length = count($this->stack) - 1;
1753 for($n = $stack_length; 0 <= $n; $n--) {
1754 /* 1. Initialise node to be the current node (the
1755 bottommost node of the stack). */
1757 $node = $this->stack[$n];
1758 $cat = $this->getElementCategory($node->tagName);
1760 /* 2. If node is an li, dd or dt element, then pop all
1761 the nodes from the current node up to node, including
1762 node, then stop this algorithm. */
1763 if($token['name'] === $node->tagName || ($token['name'] !== 'li'
1764 && ($node->tagName === 'dd' || $node->tagName === 'dt'))) {
1765 for($x = $stack_length; $x >= $n ; $x--) {
1766 array_pop($this->stack);
1772 /* 3. If node is not in the formatting category, and is
1773 not in the phrasing category, and is not an address or
1774 div element, then stop this algorithm. */
1775 if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1776 $node->tagName !== 'address' && $node->tagName !== 'div') {
1781 /* Finally, insert an HTML element with the same tag
1782 name as the token's. */
1783 $this->insertElement($token);
1786 /* A start tag token whose tag name is "plaintext" */
1788 /* If the stack of open elements has a p element in scope,
1789 then act as if an end tag with the tag name p had been
1791 if($this->elementInScope('p')) {
1792 $this->emitToken(array(
1794 'type' => HTML5::ENDTAG
1798 /* Insert an HTML element for the token. */
1799 $this->insertElement($token);
1801 return HTML5::PLAINTEXT;
1804 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
1806 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1807 /* If the stack of open elements has a p element in scope,
1808 then act as if an end tag with the tag name p had been seen. */
1809 if($this->elementInScope('p')) {
1810 $this->emitToken(array(
1812 'type' => HTML5::ENDTAG
1816 /* If the stack of open elements has in scope an element whose
1817 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1818 this is a parse error; pop elements from the stack until an
1819 element with one of those tag names has been popped from the
1821 while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1822 array_pop($this->stack);
1825 /* Insert an HTML element for the token. */
1826 $this->insertElement($token);
1829 /* A start tag whose tag name is "a" */
1831 /* If the list of active formatting elements contains
1832 an element whose tag name is "a" between the end of the
1833 list and the last marker on the list (or the start of
1834 the list if there is no marker on the list), then this
1835 is a parse error; act as if an end tag with the tag name
1836 "a" had been seen, then remove that element from the list
1837 of active formatting elements and the stack of open
1838 elements if the end tag didn't already remove it (it
1839 might not have if the element is not in table scope). */
1840 $leng = count($this->a_formatting);
1842 for($n = $leng - 1; $n >= 0; $n--) {
1843 if($this->a_formatting[$n] === self::MARKER) {
1846 } elseif($this->a_formatting[$n]->nodeName === 'a') {
1847 $this->emitToken(array(
1849 'type' => HTML5::ENDTAG
1855 /* Reconstruct the active formatting elements, if any. */
1856 $this->reconstructActiveFormattingElements();
1858 /* Insert an HTML element for the token. */
1859 $el = $this->insertElement($token);
1861 /* Add that element to the list of active formatting
1863 $this->a_formatting[] = $el;
1866 /* A start tag whose tag name is one of: "b", "big", "em", "font",
1867 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1868 case 'b': case 'big': case 'em': case 'font': case 'i':
1869 case 'nobr': case 's': case 'small': case 'strike':
1870 case 'strong': case 'tt': case 'u':
1871 /* Reconstruct the active formatting elements, if any. */
1872 $this->reconstructActiveFormattingElements();
1874 /* Insert an HTML element for the token. */
1875 $el = $this->insertElement($token);
1877 /* Add that element to the list of active formatting
1879 $this->a_formatting[] = $el;
1882 /* A start tag token whose tag name is "button" */
1884 /* If the stack of open elements has a button element in scope,
1885 then this is a parse error; act as if an end tag with the tag
1886 name "button" had been seen, then reprocess the token. (We don't
1887 do that. Unnecessary.) */
1888 if($this->elementInScope('button')) {
1889 $this->inBody(array(
1891 'type' => HTML5::ENDTAG
1895 /* Reconstruct the active formatting elements, if any. */
1896 $this->reconstructActiveFormattingElements();
1898 /* Insert an HTML element for the token. */
1899 $this->insertElement($token);
1901 /* Insert a marker at the end of the list of active
1902 formatting elements. */
1903 $this->a_formatting[] = self::MARKER;
1906 /* A start tag token whose tag name is one of: "marquee", "object" */
1907 case 'marquee': case 'object':
1908 /* Reconstruct the active formatting elements, if any. */
1909 $this->reconstructActiveFormattingElements();
1911 /* Insert an HTML element for the token. */
1912 $this->insertElement($token);
1914 /* Insert a marker at the end of the list of active
1915 formatting elements. */
1916 $this->a_formatting[] = self::MARKER;
1919 /* A start tag token whose tag name is "xmp" */
1921 /* Reconstruct the active formatting elements, if any. */
1922 $this->reconstructActiveFormattingElements();
1924 /* Insert an HTML element for the token. */
1925 $this->insertElement($token);
1927 /* Switch the content model flag to the CDATA state. */
1928 return HTML5::CDATA;
1931 /* A start tag whose tag name is "table" */
1933 /* If the stack of open elements has a p element in scope,
1934 then act as if an end tag with the tag name p had been seen. */
1935 if($this->elementInScope('p')) {
1936 $this->emitToken(array(
1938 'type' => HTML5::ENDTAG
1942 /* Insert an HTML element for the token. */
1943 $this->insertElement($token);
1945 /* Change the insertion mode to "in table". */
1946 $this->mode = self::IN_TABLE;
1949 /* A start tag whose tag name is one of: "area", "basefont",
1950 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1951 case 'area': case 'basefont': case 'bgsound': case 'br':
1952 case 'embed': case 'img': case 'param': case 'spacer':
1954 /* Reconstruct the active formatting elements, if any. */
1955 $this->reconstructActiveFormattingElements();
1957 /* Insert an HTML element for the token. */
1958 $this->insertElement($token);
1960 /* Immediately pop the current node off the stack of open elements. */
1961 array_pop($this->stack);
1964 /* A start tag whose tag name is "hr" */
1966 /* If the stack of open elements has a p element in scope,
1967 then act as if an end tag with the tag name p had been seen. */
1968 if($this->elementInScope('p')) {
1969 $this->emitToken(array(
1971 'type' => HTML5::ENDTAG
1975 /* Insert an HTML element for the token. */
1976 $this->insertElement($token);
1978 /* Immediately pop the current node off the stack of open elements. */
1979 array_pop($this->stack);
1982 /* A start tag whose tag name is "image" */
1984 /* Parse error. Change the token's tag name to "img" and
1985 reprocess it. (Don't ask.) */
1986 $token['name'] = 'img';
1987 return $this->inBody($token);
1990 /* A start tag whose tag name is "input" */
1992 /* Reconstruct the active formatting elements, if any. */
1993 $this->reconstructActiveFormattingElements();
1995 /* Insert an input element for the token. */
1996 $element = $this->insertElement($token, false);
1998 /* If the form element pointer is not null, then associate the
1999 input element with the form element pointed to by the form
2001 $this->form_pointer !== null
2002 ? $this->form_pointer->appendChild($element)
2003 : end($this->stack)->appendChild($element);
2005 /* Pop that input element off the stack of open elements. */
2006 array_pop($this->stack);
2009 /* A start tag whose tag name is "isindex" */
2014 /* If the form element pointer is not null,
2015 then ignore the token. */
2016 if($this->form_pointer === null) {
2017 /* Act as if a start tag token with the tag name "form" had
2019 $this->inBody(array(
2021 'type' => HTML5::STARTTAG,
2025 /* Act as if a start tag token with the tag name "hr" had
2027 $this->inBody(array(
2029 'type' => HTML5::STARTTAG,
2033 /* Act as if a start tag token with the tag name "p" had
2035 $this->inBody(array(
2037 'type' => HTML5::STARTTAG,
2041 /* Act as if a start tag token with the tag name "label"
2043 $this->inBody(array(
2045 'type' => HTML5::STARTTAG,
2049 /* Act as if a stream of character tokens had been seen. */
2050 $this->insertText('This is a searchable index. '.
2051 'Insert your search keywords here: ');
2053 /* Act as if a start tag token with the tag name "input"
2054 had been seen, with all the attributes from the "isindex"
2055 token, except with the "name" attribute set to the value
2056 "isindex" (ignoring any explicit "name" attribute). */
2057 $attr = $token['attr'];
2058 $attr[] = array('name' => 'name', 'value' => 'isindex');
2060 $this->inBody(array(
2062 'type' => HTML5::STARTTAG,
2066 /* Act as if a stream of character tokens had been seen
2067 (see below for what they should say). */
2068 $this->insertText('This is a searchable index. '.
2069 'Insert your search keywords here: ');
2071 /* Act as if an end tag token with the tag name "label"
2073 $this->inBody(array(
2075 'type' => HTML5::ENDTAG
2078 /* Act as if an end tag token with the tag name "p" had
2080 $this->inBody(array(
2082 'type' => HTML5::ENDTAG
2085 /* Act as if a start tag token with the tag name "hr" had
2087 $this->inBody(array(
2089 'type' => HTML5::ENDTAG
2092 /* Act as if an end tag token with the tag name "form" had
2094 $this->inBody(array(
2096 'type' => HTML5::ENDTAG
2101 /* A start tag whose tag name is "textarea" */
2103 $this->insertElement($token);
2105 /* Switch the tokeniser's content model flag to the
2107 return HTML5::RCDATA;
2110 /* A start tag whose tag name is one of: "iframe", "noembed",
2112 case 'iframe': case 'noembed': case 'noframes':
2113 $this->insertElement($token);
2115 /* Switch the tokeniser's content model flag to the CDATA state. */
2116 return HTML5::CDATA;
2119 /* A start tag whose tag name is "select" */
2121 /* Reconstruct the active formatting elements, if any. */
2122 $this->reconstructActiveFormattingElements();
2124 /* Insert an HTML element for the token. */
2125 $this->insertElement($token);
2127 /* Change the insertion mode to "in select". */
2128 $this->mode = self::IN_SELECT;
2131 /* A start or end tag whose tag name is one of: "caption", "col",
2132 "colgroup", "frame", "frameset", "head", "option", "optgroup",
2133 "tbody", "td", "tfoot", "th", "thead", "tr". */
2134 case 'caption': case 'col': case 'colgroup': case 'frame':
2135 case 'frameset': case 'head': case 'option': case 'optgroup':
2136 case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2138 // Parse error. Ignore the token.
2141 /* A start or end tag whose tag name is one of: "event-source",
2142 "section", "nav", "article", "aside", "header", "footer",
2143 "datagrid", "command" */
2144 case 'event-source': case 'section': case 'nav': case 'article':
2145 case 'aside': case 'header': case 'footer': case 'datagrid':
2147 // Work in progress!
2150 /* A start tag token not covered by the previous entries */
2152 /* Reconstruct the active formatting elements, if any. */
2153 $this->reconstructActiveFormattingElements();
2155 $this->insertElement($token, true, true);
2161 switch($token['name']) {
2162 /* An end tag with the tag name "body" */
2164 /* If the second element in the stack of open elements is
2165 not a body element, this is a parse error. Ignore the token.
2167 if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2170 /* If the current node is not the body element, then this
2171 is a parse error. */
2172 } elseif(end($this->stack)->nodeName !== 'body') {
2176 /* Change the insertion mode to "after body". */
2177 $this->mode = self::AFTER_BODY;
2180 /* An end tag with the tag name "html" */
2182 /* Act as if an end tag with tag name "body" had been seen,
2183 then, if that token wasn't ignored, reprocess the current
2185 $this->inBody(array(
2187 'type' => HTML5::ENDTAG
2190 return $this->afterBody($token);
2193 /* An end tag whose tag name is one of: "address", "blockquote",
2194 "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2195 "ol", "pre", "ul" */
2196 case 'address': case 'blockquote': case 'center': case 'dir':
2197 case 'div': case 'dl': case 'fieldset': case 'listing':
2198 case 'menu': case 'ol': case 'pre': case 'ul':
2199 /* If the stack of open elements has an element in scope
2200 with the same tag name as that of the token, then generate
2201 implied end tags. */
2202 if($this->elementInScope($token['name'])) {
2203 $this->generateImpliedEndTags();
2205 /* Now, if the current node is not an element with
2206 the same tag name as that of the token, then this
2207 is a parse error. */
2210 /* If the stack of open elements has an element in
2211 scope with the same tag name as that of the token,
2212 then pop elements from this stack until an element
2213 with that tag name has been popped from the stack. */
2214 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2215 if($this->stack[$n]->nodeName === $token['name']) {
2219 array_pop($this->stack);
2224 /* An end tag whose tag name is "form" */
2226 /* If the stack of open elements has an element in scope
2227 with the same tag name as that of the token, then generate
2228 implied end tags. */
2229 if($this->elementInScope($token['name'])) {
2230 $this->generateImpliedEndTags();
2234 if(end($this->stack)->nodeName !== $token['name']) {
2235 /* Now, if the current node is not an element with the
2236 same tag name as that of the token, then this is a parse
2241 /* Otherwise, if the current node is an element with
2242 the same tag name as that of the token pop that element
2244 array_pop($this->stack);
2247 /* In any case, set the form element pointer to null. */
2248 $this->form_pointer = null;
2251 /* An end tag whose tag name is "p" */
2253 /* If the stack of open elements has a p element in scope,
2254 then generate implied end tags, except for p elements. */
2255 if($this->elementInScope('p')) {
2256 $this->generateImpliedEndTags(array('p'));
2258 /* If the current node is not a p element, then this is
2262 /* If the stack of open elements has a p element in
2263 scope, then pop elements from this stack until the stack
2264 no longer has a p element in scope. */
2265 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2266 if($this->elementInScope('p')) {
2267 array_pop($this->stack);
2276 /* An end tag whose tag name is "dd", "dt", or "li" */
2277 case 'dd': case 'dt': case 'li':
2278 /* If the stack of open elements has an element in scope
2279 whose tag name matches the tag name of the token, then
2280 generate implied end tags, except for elements with the
2281 same tag name as the token. */
2282 if($this->elementInScope($token['name'])) {
2283 $this->generateImpliedEndTags(array($token['name']));
2285 /* If the current node is not an element with the same
2286 tag name as the token, then this is a parse error. */
2289 /* If the stack of open elements has an element in scope
2290 whose tag name matches the tag name of the token, then
2291 pop elements from this stack until an element with that
2292 tag name has been popped from the stack. */
2293 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2294 if($this->stack[$n]->nodeName === $token['name']) {
2298 array_pop($this->stack);
2303 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2305 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2306 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2308 /* If the stack of open elements has in scope an element whose
2309 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2310 generate implied end tags. */
2311 if($this->elementInScope($elements)) {
2312 $this->generateImpliedEndTags();
2314 /* Now, if the current node is not an element with the same
2315 tag name as that of the token, then this is a parse error. */
2318 /* If the stack of open elements has in scope an element
2319 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2320 "h6", then pop elements from the stack until an element
2321 with one of those tag names has been popped from the stack. */
2322 while($this->elementInScope($elements)) {
2323 array_pop($this->stack);
2328 /* An end tag whose tag name is one of: "a", "b", "big", "em",
2329 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2330 case 'a': case 'b': case 'big': case 'em': case 'font':
2331 case 'i': case 'nobr': case 's': case 'small': case 'strike':
2332 case 'strong': case 'tt': case 'u':
2333 /* 1. Let the formatting element be the last element in
2334 the list of active formatting elements that:
2335 * is between the end of the list and the last scope
2336 marker in the list, if any, or the start of the list
2338 * has the same tag name as the token.
2341 for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2342 if($this->a_formatting[$a] === self::MARKER) {
2345 } elseif($this->a_formatting[$a]->tagName === $token['name']) {
2346 $formatting_element = $this->a_formatting[$a];
2347 $in_stack = in_array($formatting_element, $this->stack, true);
2353 /* If there is no such node, or, if that node is
2354 also in the stack of open elements but the element
2355 is not in scope, then this is a parse error. Abort
2356 these steps. The token is ignored. */
2357 if(!isset($formatting_element) || ($in_stack &&
2358 !$this->elementInScope($token['name']))) {
2361 /* Otherwise, if there is such a node, but that node
2362 is not in the stack of open elements, then this is a
2363 parse error; remove the element from the list, and
2364 abort these steps. */
2365 } elseif(isset($formatting_element) && !$in_stack) {
2366 unset($this->a_formatting[$fe_af_pos]);
2367 $this->a_formatting = array_merge($this->a_formatting);
2371 /* 2. Let the furthest block be the topmost node in the
2372 stack of open elements that is lower in the stack
2373 than the formatting element, and is not an element in
2374 the phrasing or formatting categories. There might
2376 $fe_s_pos = array_search($formatting_element, $this->stack, true);
2377 $length = count($this->stack);
2379 for($s = $fe_s_pos + 1; $s < $length; $s++) {
2380 $category = $this->getElementCategory($this->stack[$s]->nodeName);
2382 if($category !== self::PHRASING && $category !== self::FORMATTING) {
2383 $furthest_block = $this->stack[$s];
2387 /* 3. If there is no furthest block, then the UA must
2388 skip the subsequent steps and instead just pop all
2389 the nodes from the bottom of the stack of open
2390 elements, from the current node up to the formatting
2391 element, and remove the formatting element from the
2392 list of active formatting elements. */
2393 if(!isset($furthest_block)) {
2394 for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2395 array_pop($this->stack);
2398 unset($this->a_formatting[$fe_af_pos]);
2399 $this->a_formatting = array_merge($this->a_formatting);
2403 /* 4. Let the common ancestor be the element
2404 immediately above the formatting element in the stack
2405 of open elements. */
2406 $common_ancestor = $this->stack[$fe_s_pos - 1];
2408 /* 5. If the furthest block has a parent node, then
2409 remove the furthest block from its parent node. */
2410 if($furthest_block->parentNode !== null) {
2411 $furthest_block->parentNode->removeChild($furthest_block);
2414 /* 6. Let a bookmark note the position of the
2415 formatting element in the list of active formatting
2416 elements relative to the elements on either side
2417 of it in the list. */
2418 $bookmark = $fe_af_pos;
2420 /* 7. Let node and last node be the furthest block.
2421 Follow these steps: */
2422 $node = $furthest_block;
2423 $last_node = $furthest_block;
2426 for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
2427 /* 7.1 Let node be the element immediately
2428 prior to node in the stack of open elements. */
2429 $node = $this->stack[$n];
2431 /* 7.2 If node is not in the list of active
2432 formatting elements, then remove node from
2433 the stack of open elements and then go back
2435 if(!in_array($node, $this->a_formatting, true)) {
2436 unset($this->stack[$n]);
2437 $this->stack = array_merge($this->stack);
2444 /* 7.3 Otherwise, if node is the formatting
2445 element, then go to the next step in the overall
2447 if($node === $formatting_element) {
2450 /* 7.4 Otherwise, if last node is the furthest
2451 block, then move the aforementioned bookmark to
2452 be immediately after the node in the list of
2453 active formatting elements. */
2454 } elseif($last_node === $furthest_block) {
2455 $bookmark = array_search($node, $this->a_formatting, true) + 1;
2458 /* 7.5 If node has any children, perform a
2459 shallow clone of node, replace the entry for
2460 node in the list of active formatting elements
2461 with an entry for the clone, replace the entry
2462 for node in the stack of open elements with an
2463 entry for the clone, and let node be the clone. */
2464 if($node->hasChildNodes()) {
2465 $clone = $node->cloneNode();
2466 $s_pos = array_search($node, $this->stack, true);
2467 $a_pos = array_search($node, $this->a_formatting, true);
2469 $this->stack[$s_pos] = $clone;
2470 $this->a_formatting[$a_pos] = $clone;
2474 /* 7.6 Insert last node into node, first removing
2475 it from its previous parent node if any. */
2476 if($last_node->parentNode !== null) {
2477 $last_node->parentNode->removeChild($last_node);
2480 $node->appendChild($last_node);
2482 /* 7.7 Let last node be node. */
2486 /* 8. Insert whatever last node ended up being in
2487 the previous step into the common ancestor node,
2488 first removing it from its previous parent node if
2490 if($last_node->parentNode !== null) {
2491 $last_node->parentNode->removeChild($last_node);
2494 $common_ancestor->appendChild($last_node);
2496 /* 9. Perform a shallow clone of the formatting
2498 $clone = $formatting_element->cloneNode();
2500 /* 10. Take all of the child nodes of the furthest
2501 block and append them to the clone created in the
2503 while($furthest_block->hasChildNodes()) {
2504 $child = $furthest_block->firstChild;
2505 $furthest_block->removeChild($child);
2506 $clone->appendChild($child);
2509 /* 11. Append that clone to the furthest block. */
2510 $furthest_block->appendChild($clone);
2512 /* 12. Remove the formatting element from the list
2513 of active formatting elements, and insert the clone
2514 into the list of active formatting elements at the
2515 position of the aforementioned bookmark. */
2516 $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
2517 unset($this->a_formatting[$fe_af_pos]);
2518 $this->a_formatting = array_merge($this->a_formatting);
2520 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2521 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2522 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2524 /* 13. Remove the formatting element from the stack
2525 of open elements, and insert the clone into the stack
2526 of open elements immediately after (i.e. in a more
2527 deeply nested position than) the position of the
2528 furthest block in that stack. */
2529 $fe_s_pos = array_search($formatting_element, $this->stack, true);
2530 $fb_s_pos = array_search($furthest_block, $this->stack, true);
2531 unset($this->stack[$fe_s_pos]);
2533 $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2534 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2535 $this->stack = array_merge($s_part1, array($clone), $s_part2);
2537 /* 14. Jump back to step 1 in this series of steps. */
2538 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2542 /* An end tag token whose tag name is one of: "button",
2543 "marquee", "object" */
2544 case 'button': case 'marquee': case 'object':
2545 /* If the stack of open elements has an element in scope whose
2546 tag name matches the tag name of the token, then generate implied
2548 if($this->elementInScope($token['name'])) {
2549 $this->generateImpliedEndTags();
2551 /* Now, if the current node is not an element with the same
2552 tag name as the token, then this is a parse error. */
2555 /* Now, if the stack of open elements has an element in scope
2556 whose tag name matches the tag name of the token, then pop
2557 elements from the stack until that element has been popped from
2558 the stack, and clear the list of active formatting elements up
2559 to the last marker. */
2560 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2561 if($this->stack[$n]->nodeName === $token['name']) {
2565 array_pop($this->stack);
2568 $marker = end(array_keys($this->a_formatting, self::MARKER, true));
2570 for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
2571 array_pop($this->a_formatting);
2576 /* Or an end tag whose tag name is one of: "area", "basefont",
2577 "bgsound", "br", "embed", "hr", "iframe", "image", "img",
2578 "input", "isindex", "noembed", "noframes", "param", "select",
2579 "spacer", "table", "textarea", "wbr" */
2580 case 'area': case 'basefont': case 'bgsound': case 'br':
2581 case 'embed': case 'hr': case 'iframe': case 'image':
2582 case 'img': case 'input': case 'isindex': case 'noembed':
2583 case 'noframes': case 'param': case 'select': case 'spacer':
2584 case 'table': case 'textarea': case 'wbr':
2585 // Parse error. Ignore the token.
2588 /* An end tag token not covered by the previous entries */
2590 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2591 /* Initialise node to be the current node (the bottommost
2592 node of the stack). */
2593 $node = end($this->stack);
2595 /* If node has the same tag name as the end tag token,
2597 if($token['name'] === $node->nodeName) {
2598 /* Generate implied end tags. */
2599 $this->generateImpliedEndTags();
2601 /* If the tag name of the end tag token does not
2602 match the tag name of the current node, this is a
2606 /* Pop all the nodes from the current node up to
2607 node, including node, then stop this algorithm. */
2608 for($x = count($this->stack) - $n; $x >= $n; $x--) {
2609 array_pop($this->stack);
2613 $category = $this->getElementCategory($node);
2615 if($category !== self::SPECIAL && $category !== self::SCOPING) {
2616 /* Otherwise, if node is in neither the formatting
2617 category nor the phrasing category, then this is a
2618 parse error. Stop this algorithm. The end tag token
/**
 * Tree-construction handler for the "in table" insertion mode.
 *
 * @param array $token Token from the tokeniser. 'type' is always read;
 *                     'name' is read for start/end tags and 'data' for
 *                     character and comment tokens.
 * @return mixed false when an end tag "table" is ignored because no table
 *               element is in table scope; otherwise the result of the
 *               handler the token was handed on to, or null.
 */
private function inTable($token) {
    $clear = array('html', 'table');

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'caption') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'colgroup') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'col') {
        /* Act as if a start tag token with the tag name "colgroup" had
        been seen, then let the column group handler process the token. */
        $this->inTable(array(
            'name' => 'colgroup',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('tbody', 'tfoot', 'thead'), true)) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    in_array($token['name'], array('td', 'th', 'tr'), true)) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(array(
            'name' => 'tbody',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'table') {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $this->inTable(array(
            'name' => 'table',
            'type' => HTML5::ENDTAG
        ));

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            return false;
        }

        /* Generate implied end tags. */
        $this->generateImpliedEndTags();

        /* Now, if the current node is not a table element, then this
        is a parse error. (Not reported.) */

        /* Pop elements from this stack until a table element has been
        popped from the stack. */
        do {
            $current = end($this->stack)->nodeName;
            array_pop($this->stack);
        } while($current !== 'table');

        /* Reset the insertion mode appropriately. */
        $this->resetInsertionMode();

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td',
    'tfoot', 'th', 'thead', 'tr'), true)) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if(in_array(end($this->stack)->nodeName,
        array('table', 'tbody', 'tfoot', 'thead', 'tr'), true)) {
            /* Find the last table element in the stack of open elements;
            $n is left pointing at its position. */
            $table = null;

            for($n = count($this->stack) - 1; $n >= 0; $n--) {
                if($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if($table === null) {
                /* No table element in the stack (innerHTML case): the
                foster parent is the first element in the stack of open
                elements (the html element). */
                $this->foster_parent = $this->stack[0];

            } elseif($table->parentNode !== null &&
            $table->parentNode->nodeType === XML_ELEMENT_NODE) {
                /* The table has a parent element: that is the foster
                parent. */
                $this->foster_parent = $table->parentNode;

            } elseif($n > 0) {
                /* The table has no parent, or its parent is not an
                element: the foster parent is the element before the
                table in the stack of open elements. The nodeType test
                has to sit on the branch above, otherwise any non-null
                parent is accepted and this case is never reached for a
                non-element parent. */
                $this->foster_parent = $this->stack[$n - 1];
            }
        }

        /* inBody() hands back a content-model flag for some start tags
        (e.g. HTML5::RCDATA for "textarea"); pass it on instead of
        dropping it. NOTE(review): assumes the caller forwards this value
        to the tokeniser, as the other "return $this->..." reprocessing
        paths suggest -- confirm against the mode dispatcher. */
        return $this->inBody($token);
    }
}
/**
 * Tree-construction handler for the "in caption" insertion mode.
 *
 * @param array $token Token from the tokeniser ('type' is always read,
 *                     'name' for start/end tags).
 * @return mixed Result of the handler the token was reprocessed by
 *               (inTable() or inBody()), or null when the token was
 *               consumed or ignored here.
 */
private function inCaption($token) {
    /* An end tag whose tag name is "caption" */
    if($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if($this->elementInScope($token['name'], true)) {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. (Not reported.) */

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            do {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);
            } while($node !== 'caption');

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) || ($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table')) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then reprocess the current token. */
        $this->inCaption(array(
            'name' => 'caption',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) {
        // Parse error. Ignore the token. ("td" was missing from this list,
        // so a stray </td> inside a caption fell through to inBody().)

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body".
        inBody() hands back a content-model flag for some start tags
        (e.g. HTML5::RCDATA for "textarea"); pass it on instead of
        dropping it. NOTE(review): assumes the caller forwards this value
        to the tokeniser -- confirm against the mode dispatcher. */
        return $this->inBody($token);
    }
}
/**
 * Tree-construction handler for the "in column group" insertion mode.
 *
 * @param array $token Token from the tokeniser ('type' is always read,
 *                     'name' for start/end tags, 'data' for character
 *                     and comment tokens).
 * @return mixed Result of inTable() when the token is reprocessed there,
 *               otherwise null.
 */
private function inColumnGroup($token) {
    $kind = $token['type'];

    /* Whitespace-only character token (tab, LF, VT, FF or space):
    append it as a text node to the current node. */
    if($kind === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $whitespace = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($whitespace);
        return;
    }

    /* Comment token: append a Comment node carrying the token's data to
    the current node. */
    if($kind === HTML5::COMMENT) {
        $remark = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($remark);
        return;
    }

    /* Start tag "col": insert the element and pop it straight off the
    stack of open elements again. */
    if($kind === HTML5::STARTTAG && $token['name'] === 'col') {
        $this->insertElement($token);
        array_pop($this->stack);
        return;
    }

    if($kind === HTML5::ENDTAG) {
        $tag = $token['name'];

        /* End tag "colgroup": a parse error that is ignored when the
        current node is the root html element (innerHTML case);
        otherwise pop the current node (which will be a colgroup
        element) and switch the insertion mode to "in table". */
        if($tag === 'colgroup') {
            if(end($this->stack)->nodeName !== 'html') {
                array_pop($this->stack);
                $this->mode = self::IN_TABLE;
            }
            return;
        }

        /* End tag "col": parse error, ignore the token. */
        if($tag === 'col') {
            return;
        }
    }

    /* Anything else: act as if an end tag with the tag name "colgroup"
    had been seen, then reprocess the current token in "in table". */
    $impliedEnd = array(
        'name' => 'colgroup',
        'type' => HTML5::ENDTAG
    );
    $this->inColumnGroup($impliedEnd);

    return $this->inTable($token);
}
/**
 * Tree-construction handler for the "in table body" insertion mode.
 *
 * @param array $token Token from the tokeniser ('type' is always read,
 *                     'name' for start/end tags).
 * @return mixed Result of the handler the token was reprocessed by
 *               (inRow(), mainPhase() or inTable()), or null when the
 *               token was consumed or ignored here.
 */
private function inTableBody($token) {
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(array(
            'name' => 'tr',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table".
    (The list used to read 'tfoor' and the second test checked for a
    start tag, so neither <tfoot> nor </table> was matched here.) */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'), true)) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if($this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(array(
                'name' => end($this->stack)->nodeName,
                'type' => HTML5::ENDTAG
            ));

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'), true)) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table".
        The result is passed on so that a value produced further down
        the chain is not dropped. NOTE(review): assumes the caller
        forwards it to the tokeniser -- confirm against the mode
        dispatcher. */
        return $this->inTable($token);
    }
}
/**
 * Tree-construction handler for the "in row" insertion mode.
 *
 * @param array $token Token from the tokeniser ('type' is always read,
 *                     'name' for start/end tags).
 * @return mixed Result of the handler the token was reprocessed by
 *               (inTableBody() or inTable()), or null when the token was
 *               consumed or ignored here.
 */
private function inRow($token) {
    $clear = array('tr', 'html');

    /* Synthetic token used wherever the algorithm says "act as if an end
    tag with the tag name "tr" had been seen". */
    $endRow = array(
        'name' => 'tr',
        'type' => HTML5::ENDTAG
    );

    /* A start tag whose tag name is one of: "th", "td" */
    if($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'), true)) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. The
        implied end tag is ignored exactly when no tr element is in table
        scope, so test for that up front. */
        if($this->elementInScope('tr', true)) {
            $this->inRow($endRow);

            /* The implied end tag switched the insertion mode to "in
            table body", so that handler must see the token. It used to
            be handed to inCell(), which discards these start tags when
            no td/th element is in table scope. */
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if($this->elementInScope($token['name'], true)) {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token (in the table
            body handler, not inCell()). */
            $this->inRow($endRow);

            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'), true)) {
        /* Parse error. Ignore the token. ("tr" is handled by the second
        case above and never reaches this list.) */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table".
        The result is passed on so that a value produced further down
        the chain is not dropped. NOTE(review): assumes the caller
        forwards it to the tokeniser -- confirm against the mode
        dispatcher. */
        return $this->inTable($token);
    }
}
/**
 * Processes a token while the insertion mode is "in cell".
 *
 * @param array $token Token emitted by the tokeniser. 'type' is always read;
 *                     'name' is only read once the token is known to be a
 *                     start or end tag.
 * @return mixed The result of inRow() when the token is reprocessed there,
 *               otherwise null.
 */
private function inCell($token) {
    $is_start_tag = $token['type'] === HTML5::STARTTAG;
    $is_end_tag   = $token['type'] === HTML5::ENDTAG;

    /* An end tag whose tag name is one of: "td", "th" */
    if($is_end_tag && ($token['name'] === 'td' || $token['name'] === 'th')) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if($this->elementInScope($token['name'], true)) {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* Now, if the current node is not an element with the same tag
            name as the token, then this is a parse error. (Not reported.) */

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. The null
            check only stops the loop if the stack is ever exhausted. */
            do {
                $popped = array_pop($this->stack);
            } while($popped !== null && $popped->nodeName !== $token['name']);

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($is_start_tag && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'))) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) */
        if($this->elementInScope(array('td', 'th'), true)) {
            /* Otherwise, close the cell and reprocess the current token. */
            $this->closeCell();
            return $this->inRow($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    } elseif($is_end_tag && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html'))) {
        /* Parse error. Ignore the token. */

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    } elseif($is_end_tag && in_array($token['name'],
    array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored. */
        if($this->elementInScope($token['name'], true)) {
            /* Otherwise, close the cell and reprocess the current token. */
            $this->closeCell();
            return $this->inRow($token);
        }

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
/**
 * Processes a token while the insertion mode is "in select".
 *
 * @param array $token Token emitted by the tokeniser. 'type' is always read;
 *                     'data' is read for character and comment tokens and
 *                     'name' only for start and end tags.
 * @return void
 */
private function inSelect($token) {
    /* Handle the token as follows: */
    $is_start_tag = $token['type'] === HTML5::STARTTAG;
    $is_end_tag   = $token['type'] === HTML5::ENDTAG;

    /* A character token */
    if($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif($is_start_tag && $token['name'] === 'option') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif($is_start_tag && $token['name'] === 'optgroup') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if(end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'optgroup',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif($is_end_tag && $token['name'] === 'optgroup') {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $elements_in_stack = count($this->stack);

        if($elements_in_stack >= 2 &&
        $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
        $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. The current node is re-read here because the
        implied option end tag above may have popped the stack, and the
        comparison is made on the node's name rather than on the node
        object itself. */
        $current = end($this->stack);

        if($current !== false && $current->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif($is_end_tag && $token['name'] === 'option') {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if(end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif($is_end_tag && $token['name'] === 'select') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if($this->elementInScope($token['name'], true)) {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. The null check only
            stops the loop if the stack is ever exhausted. */
            do {
                $popped = array_pop($this->stack);
            } while($popped !== null && $popped->nodeName !== 'select');

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select" */
    } elseif($is_start_tag && $token['name'] === 'select') {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect(array(
            'name' => 'select',
            'type' => HTML5::ENDTAG
        ));

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif($is_end_tag && in_array($token['name'], array('caption', 'table',
    'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'))) {
        /* Parse error. */

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if($this->elementInScope($token['name'], true)) {
            $this->inSelect(array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            ));

            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Processes a token while the insertion mode is "after body".
 *
 * @param array $token Token emitted by the tokeniser.
 * @return mixed The result of inBody() when the token is reprocessed in the
 *               "in body" mode, otherwise null.
 */
private function afterBody($token) {
    $type = $token['type'];

    /* A comment token: append a Comment node to the first element in the
    stack of open elements (the html element), carrying the token's data. */
    if($type === HTML5::COMMENT) {
        $this->stack[0]->appendChild(
            $this->dom->createComment($token['data'])
        );
        return;
    }

    /* A character token consisting only of tab, line feed, line tabulation,
    form feed or space: process it as it would be processed in "in body". */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->inBody($token);
        return;
    }

    /* An end tag with the tag name "html": switch to the trailing end
    phase. (The innerHTML case, in which the token would be ignored, is not
    handled here.) */
    if($type === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else: parse error. Set the insertion mode to "in body" and
    reprocess the token. */
    $this->mode = self::IN_BODY;
    return $this->inBody($token);
}
/**
 * Processes a token while the insertion mode is "in frameset".
 *
 * @param array $token Token emitted by the tokeniser. 'name' is optional:
 *                     tokens without it are never treated as tags.
 * @return void
 */
private function inFrameset($token) {
    /* Handle the token as follows: */
    $type = $token['type'];

    /* Not every token carries a name; reading it unconditionally would
    raise an undefined-index notice for those tokens. */
    $name = isset($token['name']) ? $token['name'] : null;

    /* A character token consisting only of tab, line feed, line tabulation,
    form feed or space */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset" */
    } elseif($type === HTML5::STARTTAG && $name === 'frameset') {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif($type === HTML5::ENDTAG && $name === 'frameset') {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if(end($this->stack)->nodeName !== 'html') {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the current node is no longer a frameset element, then
            change the insertion mode to "after frameset". While an outer
            frameset is still open the mode must stay "in frameset". */
            $current = end($this->stack);

            if($current === false || $current->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

    /* A start tag with the tag name "frame" */
    } elseif($type === HTML5::STARTTAG && $name === 'frame') {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif($type === HTML5::STARTTAG && $name === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Processes a token while the insertion mode is "after frameset".
 *
 * @param array $token Token emitted by the tokeniser. 'name' is optional:
 *                     tokens without it are never treated as tags.
 * @return void
 */
private function afterFrameset($token) {
    /* Handle the token as follows: */
    $type = $token['type'];

    /* Not every token carries a name; reading it unconditionally would
    raise an undefined-index notice for those tokens. */
    $name = isset($token['name']) ? $token['name'] : null;

    /* A character token consisting only of tab, line feed, line tabulation,
    form feed or space */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "html" */
    } elseif($type === HTML5::ENDTAG && $name === 'html') {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

    /* A start tag with the tag name "noframes" */
    } elseif($type === HTML5::STARTTAG && $name === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Processes a token emitted while the parser is in the trailing end phase.
 *
 * @param array $token Token emitted by the tokeniser.
 * @return mixed The result of mainPhase() when the parser switches back to
 *               the main phase, otherwise null.
 */
private function trailingEndPhase($token) {
    /* After the main phase, as each token is emitted from the tokenisation
    stage, it must be processed as described in this section. */
    $type = $token['type'];

    /* A DOCTYPE token */
    if($type === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif($type === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A character token consisting only of tab, line feed, line tabulation,
    form feed or space */
    } elseif($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* A character token that is not whitespace-only. Or a start tag token.
    Or an end tag token. Whitespace-only character tokens were consumed by
    the branch above, so any character token reaching this point qualifies;
    re-testing it with the whitespace pattern would never match and the
    text would be dropped. */
    } elseif($type === HTML5::CHARACTR || $type === HTML5::STARTTAG ||
    $type === HTML5::ENDTAG) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif($type === HTML5::EOF) {
        /* Nothing left to do. */
    }
}
/**
 * Creates an element for a start tag token, attaches it to the tree and
 * pushes it onto the stack of open elements.
 *
 * @param array $token  Start tag token; 'name' and 'attr' (a list of
 *                      name/value pairs) are read.
 * @param bool  $append Not consulted by this method.
 * @param bool  $check  When true, the tag name is reduced to characters
 *                      libxml2 accepts before the element is created.
 * @return DOMElement The newly created element.
 */
private function insertElement($token, $append = true, $check = false) {
    $tag_name = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names
    if ($check) {
        // Keep only ASCII letters, digits and hyphens, then drop any
        // leading hyphens and digits ('0..9' is a character range)
        $tag_name = ltrim(preg_replace('/[^a-z0-9-]/i', '', $tag_name), '-0..9');

        // Nothing usable left: fall back to an arbitrary generic element
        if ($tag_name === '') {
            $tag_name = 'span';
        }
    }

    $element = $this->dom->createElement($tag_name);

    // The first occurrence of an attribute wins; later duplicates are skipped
    foreach($token['attr'] as $attribute) {
        if($element->hasAttribute($attribute['name'])) {
            continue;
        }

        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
/**
 * Creates a text node holding $data and attaches it via appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 * @return void
 */
private function insertText($data) {
    $this->appendToRealParent($this->dom->createTextNode($data));
}
/**
 * Creates a comment node holding $data and attaches it via
 * appendToRealParent().
 *
 * @param string $data Text of the new comment node.
 * @return void
 */
private function insertComment($data) {
    $this->appendToRealParent($this->dom->createComment($data));
}
/**
 * Attaches a node either to the current node or, when a foster parent is
 * set, to that foster parent. The foster parent is cleared after use.
 *
 * @param DOMNode $node Node to insert.
 * @return void
 */
private function appendToRealParent($node) {
    $foster = $this->foster_parent;

    // Ordinary case: the node becomes the last child of the current node.
    if($foster === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* If the foster parent element is the parent element of the last table
    element in the stack of open elements, then the new node must be
    inserted immediately before that table element; otherwise, the new node
    must be appended to the foster parent element. */
    $last_table = null;

    // Search from the top of the stack for a table that is attached to a parent.
    foreach(array_reverse($this->stack) as $candidate) {
        if($candidate->nodeName === 'table' && $candidate->parentNode !== null) {
            $last_table = $candidate;
            break;
        }
    }

    if($last_table !== null && $foster->isSameNode($last_table->parentNode)) {
        $foster->insertBefore($node, $last_table);
    } else {
        $foster->appendChild($node);
    }

    $this->foster_parent = null;
}
/**
 * Tells whether the stack of open elements has an element in scope.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names,
 *                            in which case any one of them being in scope
 *                            is sufficient.
 * @param bool         $table True for the "has an element in table scope"
 *                            variant, false for the plain "in scope" variant.
 * @return bool True when a matching element is in scope, false otherwise.
 */
private function elementInScope($el, $table = false) {
    if(is_array($el)) {
        foreach($el as $element) {
            if($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    /* 1. Initialise node to be the current node (the bottommost node of
    the stack), then walk towards the top of the stack. */
    for($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        if($node->tagName === $el) {
            /* 2. If node is the target node, terminate in a match state. */
            return true;

        } elseif($node->tagName === 'table') {
            /* 3. Otherwise, if node is a table element, terminate in a failure
            state. */
            return false;

        } elseif($table === false && in_array($node->tagName, array('caption', 'td',
        'th', 'button', 'marquee', 'object'))) {
            /* 4. Otherwise, if the algorithm is the "has an element in scope"
            variant (rather than the "has an element in table scope" variant),
            and node is one of the following, terminate in a failure state.
            The table-scope variant must look past these elements, e.g. to
            find the tbody that encloses an open cell. */
            return false;

        } elseif($node === $node->ownerDocument->documentElement) {
            /* 5. Otherwise, if node is an html element (root element), terminate
            in a failure state. (This can only happen if the node is the topmost
            node of the stack of open elements, and prevents the next step from
            being invoked if there are no more elements in the stack.) */
            return false;
        }

        /* Otherwise, set node to the previous entry in the stack of open
        elements and return to step 2. */
    }

    /* The stack was exhausted without a match. */
    return false;
}
/**
 * Reconstructs the active formatting elements: every entry after the last
 * marker / still-open element in the list is cloned, the clone is appended
 * to the current node, pushed onto the stack of open elements and stored in
 * the list in place of the entry it was cloned from.
 *
 * The list is addressed by position, so it is expected to be indexed
 * 0..count-1.
 *
 * @return false|null False when there was nothing to reconstruct, otherwise
 *                    null.
 */
private function reconstructActiveFormattingElements() {
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if($formatting_elements === 0) {
        return false;
    }

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    $entry = end($this->a_formatting);

    if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    /* 3-7. Walk backwards from the last entry until the entry before the
    current position is a marker or an element in the stack of open
    elements, or until the start of the list is reached. The position is
    left on the earliest entry that needs reconstructing, never on the
    marker / open element itself, which must not be cloned. */
    $a = $formatting_elements - 1;

    while($a > 0) {
        $previous = $this->a_formatting[$a - 1];

        if($previous === self::MARKER || in_array($previous, $this->stack, true)) {
            break;
        }

        $a--;
    }

    /* 8-11. Clone every entry from that position to the end of the list. */
    for(; $a < $formatting_elements; $a++) {
        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $this->a_formatting[$a]->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;
    }
}
/**
 * Removes entries from the end of the list of active formatting elements
 * up to and including the most recently added marker. If the list holds no
 * marker it is emptied.
 *
 * @return void
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker() {
    /* When the steps below require the UA to clear the list of active
    formatting elements up to the last marker, the UA must perform the
    following steps: */

    /* The emptiness check guarantees termination when no marker is present;
    without it the loop would keep popping an empty list forever. */
    while(!empty($this->a_formatting)) {
        /* 1. Let entry be the last (most recently added) entry in the list
        of active formatting elements.
        2. Remove entry from the list of active formatting elements. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if($entry === self::MARKER) {
            break;
        }
    }
}
/**
 * Pops elements whose end tag is implied off the stack of open elements.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 * @return void
 */
private function generateImpliedEndTags($exclude = array()) {
    /* When the steps below require the UA to generate implied end tags,
    then, if the current node is a dd element, a dt element, an li element,
    a p element, a td element, a th element, or a tr element, the UA must
    act as if an end tag with the respective tag name had been seen and
    then generate implied end tags again. Here that is done by popping the
    current node directly. */
    $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    /* The emptiness check keeps end() from yielding false, which has no
    nodeName, should the stack ever be exhausted. */
    while(!empty($this->stack) &&
    in_array(end($this->stack)->nodeName, $elements, true)) {
        array_pop($this->stack);
    }
}
/**
 * Classifies an element by its tag name.
 *
 * The special, scoping and formatting lists are consulted in that order and
 * the first list containing the tag name decides the category.
 *
 * @param DOMElement $node Element to classify.
 * @return mixed One of self::SPECIAL, self::SCOPING, self::FORMATTING or,
 *               when no list contains the tag name, self::PHRASING.
 */
private function getElementCategory($node) {
    $tag = $node->tagName;

    // Ordered (tag list, category) pairs; earlier pairs take precedence.
    $lookup = array(
        array($this->special,    self::SPECIAL),
        array($this->scoping,    self::SCOPING),
        array($this->formatting, self::FORMATTING),
    );

    foreach($lookup as $pair) {
        if(in_array($tag, $pair[0])) {
            return $pair[1];
        }
    }

    return self::PHRASING;
}
/**
 * Pops the stack of open elements until the current node's name is one of
 * the given tag names.
 *
 * @param array $elements Tag names at which popping stops.
 * @return void
 */
private function clearStackToTableContext($elements) {
    /* When the steps above require the UA to clear the stack back to a
    table context, it means that the UA must, while the current node is not
    a table element or an html element, pop elements from the stack of open
    elements. If this causes any elements to be popped from the stack, then
    this is a parse error. (Not reported.) */
    while(!in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
/**
 * Resets the insertion mode from the stack of open elements: the stack is
 * scanned from the current node upwards and the first element with a known
 * tag name determines the mode.
 *
 * @return void
 */
private function resetInsertionMode() {
    /* Steps 4-13: tag names that map directly onto an insertion mode.
    Note that head maps to "in body" ("in body"! not "in head"!). */
    $mode_for = array(
        'select'   => self::IN_SELECT,
        'td'       => self::IN_CELL,
        'th'       => self::IN_CELL,
        'tr'       => self::IN_ROW,
        'tbody'    => self::IN_TBODY,
        'thead'    => self::IN_TBODY,
        'tfoot'    => self::IN_TBODY,
        'caption'  => self::IN_CAPTION,
        'colgroup' => self::IN_CGROUP,
        'table'    => self::IN_TABLE,
        'head'     => self::IN_BODY,
        'body'     => self::IN_BODY,
        'frameset' => self::IN_FRAME,
    );

    /* 2. Let node be the last node in the stack of open elements, then
    work towards the first. */
    for($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];
        $name = $node->nodeName;

        if(isset($mode_for[$name])) {
            $this->mode = $mode_for[$name];
            return;
        }

        /* 14. If node is an html element, then: if the head element
        pointer is null, switch the insertion mode to "before head",
        otherwise, switch the insertion mode to "after head". In either
        case, abort these steps. (innerHTML case) */
        if($name === 'html') {
            $this->mode = ($this->head_pointer === null)
                ? self::BEFOR_HEAD
                : self::AFTER_HEAD;
            return;
        }

        /* 3 and 15. If node is the first node in the stack of open
        elements ("last" is true) and nothing above matched, set the
        insertion mode to "in body" and abort these steps. (innerHTML case) */
        if($this->stack[0]->isSameNode($node)) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
/**
 * Closes the open table cell, if any.
 *
 * @return void
 */
private function closeCell() {
    /* If the stack of open elements has a td or th element in table scope,
    then act as if an end tag token with that tag name had been seen. td is
    tried first; th is only considered when no td is in table scope. */
    if($this->elementInScope('td', true)) {
        $cell = 'td';
    } elseif($this->elementInScope('th', true)) {
        $cell = 'th';
    } else {
        return;
    }

    $this->inCell(array(
        'name' => $cell,
        'type' => HTML5::ENDTAG
    ));
}
3900 public function save() {