4 * Copyright 2010, Moxiecode Systems AB
5 * Released under LGPL License.
7 * License: http://tinymce.moxiecode.com/license
8 * Contributing: http://tinymce.moxiecode.com/contributing
12 var Node = tinymce.html.Node;
15 * This class parses HTML code into a DOM-like structure of nodes. It removes redundant whitespace and makes
16 * sure that the node tree is valid according to the specified schema. For example: <p>a<p>b</p>c</p> will become <p>a</p><p>b</p><p>c</p>
19 * var parser = new tinymce.html.DomParser({validate: true}, schema);
20 * var rootNode = parser.parse('<h1>content</h1>');
22 * @class tinymce.html.DomParser
27 * Constructs a new DomParser instance.
31 * @param {Object} settings Name/value collection of settings. comment, cdata, text, start and end are callbacks.
32 * @param {tinymce.html.Schema} schema HTML Schema class to use when parsing.
// DomParser constructor: normalises the settings object and stores the schema on the instance.
34 tinymce.html.DomParser = function(settings, schema) {
// Filter registries (nodeFilters, attributeFilters) and the match collections shared by the closures below.
35 var self = this, nodeFilters = {}, attributeFilters = [], matchedNodes = {}, matchedAttributes = {};
37 settings = settings || {};
// Validation defaults to true; a caller-supplied "validate" key is used as is.
38 settings.validate = "validate" in settings ? settings.validate : true;
39 settings.root_name = settings.root_name || 'body'; // name given to the root node created by parse()
// Fall back to a default Schema when none is passed in.
40 self.schema = schema = schema || new tinymce.html.Schema();
42 function fixInvalidChildren(nodes) {
43 var ni, node, parent, parents, newParent, currentNode, tempNode, childNode, i,
44 childClone, nonEmptyElements, nonSplitableElements, sibling, nextNode;
46 nonSplitableElements = tinymce.makeMap('tr,td,th,tbody,thead,tfoot,table');
47 nonEmptyElements = schema.getNonEmptyElements();
49 for (ni = 0; ni < nodes.length; ni++) {
56 // Get list of all parent nodes until we find a valid parent to stick the child into
58 for (parent = node.parent; parent && !schema.isValidChild(parent.name, node.name) && !nonSplitableElements[parent.name]; parent = parent.parent)
61 // Found a suitable parent
62 if (parent && parents.length > 1) {
63 // Reverse the array since it makes looping easier
66 // Clone the related parent and insert that after the moved node
67 newParent = currentNode = self.filterNode(parents[0].clone());
69 // Start cloning and moving children on the left side of the target node
70 for (i = 0; i < parents.length - 1; i++) {
71 if (schema.isValidChild(currentNode.name, parents[i].name)) {
72 tempNode = self.filterNode(parents[i].clone());
73 currentNode.append(tempNode);
75 tempNode = currentNode;
77 for (childNode = parents[i].firstChild; childNode && childNode != parents[i + 1]; ) {
78 nextNode = childNode.next;
79 tempNode.append(childNode);
83 currentNode = tempNode;
86 if (!newParent.isEmpty(nonEmptyElements)) {
87 parent.insert(newParent, parents[0], true);
88 parent.insert(node, newParent);
90 parent.insert(node, parents[0], true);
93 // Check if the element is empty by looking through it's contents and special treatment for <p><br /></p>
95 if (parent.isEmpty(nonEmptyElements) || parent.firstChild === parent.lastChild && parent.firstChild.name === 'br') {
96 parent.empty().remove();
98 } else if (node.parent) {
99 // If it's an LI try to find a UL/OL for it or wrap it
100 if (node.name === 'li') {
102 if (sibling && (sibling.name === 'ul' || sibling.name === 'ul')) {
103 sibling.append(node);
108 if (sibling && (sibling.name === 'ul' || sibling.name === 'ul')) {
109 sibling.insert(node, sibling.firstChild, true);
113 node.wrap(self.filterNode(new Node('ul', 1)));
117 // Try wrapping the element in a DIV
118 if (schema.isValidChild(node.parent.name, 'div') && schema.isValidChild('div', node.name)) {
119 node.wrap(self.filterNode(new Node('div', 1)));
121 // We failed wrapping it, then remove or unwrap it
122 if (node.name === 'style' || node.name === 'script')
123 node.empty().remove();
132 * Runs the specified node through the element and attribute filters.
134 * @param {tinymce.html.Node} node The node to run filters on.
135 * @return {tinymce.html.Node} The passed in node.
// Records an already-created node in the match collections (matchedNodes / matchedAttributes)
// that parse() later hands to the registered filter callbacks.
// NOTE(review): the declarations of name, list and i, and the loop header, are on lines not visible here.
137 self.filterNode = function(node) {
140 // Collect the node when a node filter is registered under name (presumably node.name - confirm)
141 if (name in nodeFilters) {
142 list = matchedNodes[name];
147 matchedNodes[name] = [node]; // start a list holding this node
150 // Collect the node for each attribute filter whose attribute it carries
151 i = attributeFilters.length;
153 name = attributeFilters[i].name;
155 if (name in node.attributes.map) {
156 list = matchedAttributes[name];
161 matchedAttributes[name] = [node]; // start a list holding this node
169 * Adds a node filter function to the parser. The parser will collect the specified nodes by name
170 * and then execute the callback once it has finished parsing the document.
173 * parser.addNodeFilter('p,h1', function(nodes, name) {
174 * for (var i = 0; i < nodes.length; i++) {
175 * console.log(nodes[i].name);
178 * @method addNodeFilter
179 * @param {String} name Comma separated list of node names to collect.
180 * @param {function} callback Callback function to execute once it has collected nodes.
// Registers a node filter for every entry of the comma separated name list.
// NOTE(review): the statement that stores callback in the list is on a line not visible here.
182 self.addNodeFilter = function(name, callback) {
183 tinymce.each(tinymce.explode(name), function(name) { // inner name shadows the outer one and holds a single entry
184 var list = nodeFilters[name];
187 nodeFilters[name] = list = []; // create the callback list for this name
194 * Adds an attribute filter function to the parser. The parser will collect nodes that have the specified attributes
195 * and then execute the callback once it has finished parsing the document.
198 * parser.addAttributeFilter('src,href', function(nodes, name) {
199 * for (var i = 0; i < nodes.length; i++) {
200 * console.log(nodes[i].name);
203 * @method addAttributeFilter
204 * @param {String} name Comma separated list of attribute names to collect.
205 * @param {function} callback Callback function to execute once it has collected nodes.
// Registers callback for every attribute name in the comma separated name list.
207 self.addAttributeFilter = function(name, callback) {
208 tinymce.each(tinymce.explode(name), function(name) {
// Reuse the existing entry for this attribute name when there is one
211 for (i = 0; i < attributeFilters.length; i++) {
212 if (attributeFilters[i].name === name) {
213 attributeFilters[i].callbacks.push(callback);
// Add a new entry (NOTE(review): the early exit after a match is presumably on a line not shown - confirm)
218 attributeFilters.push({name: name, callbacks: [callback]});
223 * Parses the specified HTML string into a DOM like node tree and returns the result.
226 * var rootNode = new DomParser({...}).parse('<b>text</b>');
228 * @param {String} html Html string to sax parse.
229 * @param {Object} args Optional args object that gets passed to all filter functions.
230 * @return {tinymce.html.Node} Root node containing the tree.
// Parses html into a node tree; the visible code later repairs invalid children and runs the
// registered node and attribute filters, passing args through to every filter callback.
232 self.parse = function(html, args) {
233 var parser, rootNode, node, nodes, i, l, fi, fl, list, name, validate,
234 blockElements, startWhiteSpaceRegExp, invalidChildren = [],
235 endWhiteSpaceRegExp, allWhiteSpaceRegExp, whiteSpaceElements, children, nonEmptyElements;
239 matchedAttributes = {}; // start this parse with an empty attribute match collection
// Elements treated as blocks for whitespace trimming: a fixed list plus the schema's block elements
240 blockElements = tinymce.extend(tinymce.makeMap('script,style,head,html,body,title,meta,param'), schema.getBlockElements());
241 nonEmptyElements = schema.getNonEmptyElements();
242 children = schema.children;
243 validate = settings.validate;
245 whiteSpaceElements = schema.getWhiteSpaceElements();
// Whitespace in these patterns means space, tab, CR and LF only
246 startWhiteSpaceRegExp = /^[ \t\r\n]+/;
247 endWhiteSpaceRegExp = /[ \t\r\n]+$/;
248 allWhiteSpaceRegExp = /[ \t\r\n]+/g;
// Creates a Node of the given name and type and records it in matchedNodes when a node
// filter is registered for that name. Callers in this file append the result to the tree.
250 function createNode(name, type) {
251 var node = new Node(name, type), list;
253 if (name in nodeFilters) {
254 list = matchedNodes[name];
259 matchedNodes[name] = [node]; // start a list holding this node
// Strips trailing whitespace from the text nodes that directly precede node, walking backwards
// through the previous siblings for as long as they are text nodes.
265 function removeWhitespaceBefore(node) {
266 var textNode, textVal, sibling;
268 for (textNode = node.prev; textNode && textNode.type === 3; ) { // type 3 is what this file uses for '#text' nodes
269 textVal = textNode.value.replace(endWhiteSpaceRegExp, '');
271 if (textVal.length > 0) {
272 textNode.value = textVal;
273 textNode = textNode.prev;
275 sibling = textNode.prev; // remember the previous sibling before handling textNode (handling is on lines not shown)
// SAX parser whose callbacks build the node tree by appending to the current node.
282 parser = new tinymce.html.SaxParser({
284 fix_self_closing : !validate, // Let the DOM parser handle <li> in <li> or <p> in <p> for better results
// CDATA section: appended as a '#cdata' node of type 4
286 cdata: function(text) {
287 node.append(createNode('#cdata', 4)).value = text;
// Text: whitespace is normalised before a '#text' node is created
290 text: function(text, raw) {
293 // Collapse whitespace runs to a single space unless the current element preserves whitespace
294 if (!whiteSpaceElements[node.name]) {
295 text = text.replace(allWhiteSpaceRegExp, ' ');
297 if (node.lastChild && blockElements[node.lastChild.name]) // text directly after a block element: drop leading whitespace
298 text = text.replace(startWhiteSpaceRegExp, '');
301 // Only create a text node when something is left
302 if (text.length !== 0) {
303 textNode = createNode('#text', 3);
304 textNode.raw = !!raw;
305 node.append(textNode).value = text;
// Comment: appended as a '#comment' node of type 8
309 comment: function(text) {
310 node.append(createNode('#comment', 8)).value = text;
// Processing instruction: appended as a type 7 node named after the instruction
313 pi: function(name, text) {
314 node.append(createNode(name, 7)).value = text;
315 removeWhitespaceBefore(node);
// Doctype: appended as a '#doctype' node of type 10
318 doctype: function(text) {
321 newNode = node.append(createNode('#doctype', 10));
322 newNode.value = text;
323 removeWhitespaceBefore(node);
// SAX start-tag callback: creates the element node, appends it to the current node, records it
// as an invalid child and/or attribute filter match where applicable, and trims whitespace
// before block elements.
326 start: function(name, attrs, empty) {
327 var newNode, attrFiltersLen, elementRule, textNode, attrName, text, sibling, parent;
329 elementRule = validate ? schema.getElementRule(name) : {}; // no schema lookup when validation is off
331 newNode = createNode(elementRule.outputName || name, 1); // the rule's outputName, when set, replaces the parsed name
332 newNode.attributes = attrs;
333 newNode.shortEnded = empty;
335 node.append(newNode);
337 // Collect the node as invalid when the schema knows both the parent and the child but does not
338 // list the child under the parent; unknown elements are skipped since they are probably custom elements
339 parent = children[node.name];
340 if (parent && children[newNode.name] && !parent[newNode.name])
341 invalidChildren.push(newNode);
// Record the node for every attribute filter whose attribute it carries
343 attrFiltersLen = attributeFilters.length;
344 while (attrFiltersLen--) {
345 attrName = attributeFilters[attrFiltersLen].name;
347 if (attrName in attrs.map) {
348 list = matchedAttributes[attrName];
353 matchedAttributes[attrName] = [newNode]; // start a list holding this node
357 // Trim whitespace before block
358 if (blockElements[name])
359 removeWhitespaceBefore(newNode);
361 // Change current node if the element wasn't empty i.e not <br /> or <img />
367 end: function(name) {
368 var textNode, elementRule, text, sibling, tempNode;
370 elementRule = validate ? schema.getElementRule(name) : {};
372 if (blockElements[name]) {
373 if (!whiteSpaceElements[node.name]) {
374 // Trim whitespace at beginning of block
375 for (textNode = node.firstChild; textNode && textNode.type === 3; ) {
376 text = textNode.value.replace(startWhiteSpaceRegExp, '');
378 if (text.length > 0) {
379 textNode.value = text;
380 textNode = textNode.next;
382 sibling = textNode.next;
388 // Trim whitespace at end of block
389 for (textNode = node.lastChild; textNode && textNode.type === 3; ) {
390 text = textNode.value.replace(endWhiteSpaceRegExp, '');
392 if (text.length > 0) {
393 textNode.value = text;
394 textNode = textNode.prev;
396 sibling = textNode.prev;
403 // Trim start white space
404 textNode = node.prev;
405 if (textNode && textNode.type === 3) {
406 text = textNode.value.replace(startWhiteSpaceRegExp, '');
409 textNode.value = text;
415 // Handle empty nodes
416 if (elementRule.removeEmpty || elementRule.paddEmpty) {
417 if (node.isEmpty(nonEmptyElements)) {
418 if (elementRule.paddEmpty)
419 node.empty().append(new Node('#text', '3')).value = '\u00a0';
421 // Leave nodes that have a name like <a name="name">
422 if (!node.attributes.map.name) {
423 tempNode = node.parent;
424 node.empty().remove();
// Root of the resulting tree; it is also the initial current node for the SAX callbacks (type 11).
437 rootNode = node = new Node(settings.root_name, 11);
// Repair the children that the start callback collected as invalid
442 fixInvalidChildren(invalidChildren);
// Run node filters: every callback registered for a name receives the nodes collected under it
445 for (name in matchedNodes) {
446 list = nodeFilters[name];
447 nodes = matchedNodes[name];
449 // Skip nodes that are no longer attached to the tree (no parent)
452 if (!nodes[fi].parent)
456 for (i = 0, l = list.length; i < l; i++)
457 list[i](nodes, name, args);
460 // Run attribute filters
461 for (i = 0, l = attributeFilters.length; i < l; i++) {
462 list = attributeFilters[i];
464 if (list.name in matchedAttributes) {
465 nodes = matchedAttributes[list.name];
467 // Skip nodes that are no longer attached to the tree (no parent)
470 if (!nodes[fi].parent)
474 for (fi = 0, fl = list.callbacks.length; fi < fl; fi++)
475 list.callbacks[fi](nodes, list.name, args);
482 // Remove <br> at the end of block elements. Gecko and WebKit inject BR elements to
483 // make it possible to place the caret inside empty blocks. This logic tries to remove
484 // these elements and keep br elements that were intended to be there intact
485 if (settings.remove_trailing_brs) {
486 self.addNodeFilter('br', function(nodes, name) {
487 var i, l = nodes.length, node, blockElements = schema.getBlockElements(),
488 nonEmptyElements = schema.getNonEmptyElements(), parent, prev, prevName;
490 // Must loop forwards since it will otherwise remove all brs in <p>a<br><br><br></p>
491 for (i = 0; i < l; i++) {
493 parent = node.parent;
495 if (blockElements[node.parent.name] && node === parent.lastChild) {
496 // Loop all nodes to the right of the current node and check for other BR elements
497 // excluding bookmarks since they are invisible
500 prevName = prev.name;
503 if (prevName !== "span" || prev.attr('data-mce-type') !== 'bookmark') {
504 // Found a non BR element
505 if (prevName !== "br")
508 // Found another br it's a <br><br> structure then don't remove anything
509 if (prevName === 'br') {
521 // Is the parent to be considered empty after we removed the BR
522 if (parent.isEmpty(nonEmptyElements)) {
523 elementRule = schema.getElementRule(parent.name);
525 // Remove or padd the element depending on schema rule
526 if (elementRule.removeEmpty)
528 else if (elementRule.paddEmpty)
529 parent.empty().append(new tinymce.html.Node('#text', 3)).value = '\u00a0';