4 * Copyright 2010, Moxiecode Systems AB
5 * Released under LGPL License.
7 * License: http://tinymce.moxiecode.com/license
8 * Contributing: http://tinymce.moxiecode.com/contributing
13 * This class parses HTML code using pure JavaScript and executes various events for each item it finds. It will
14 * always execute the events in the right order for tag soup code like <b><p></b></p>. It will also remove elements
15 * and attributes that don't fit the schema if the validate setting is enabled.
18 * var parser = new tinymce.html.SaxParser({
21 * comment: function(text) {
22 * console.log('Comment:', text);
25 * cdata: function(text) {
26 * console.log('CDATA:', text);
29 * text: function(text, raw) {
30 * console.log('Text:', text, 'Raw:', raw);
33 * start: function(name, attrs, empty) {
34 * console.log('Start:', name, attrs, empty);
37 * end: function(name) {
38 * console.log('End:', name);
41 * pi: function(name, text) {
42 * console.log('PI:', name, text);
45 * doctype: function(text) {
46 * console.log('DocType:', text);
49 * @class tinymce.html.SaxParser
54 * Constructs a new SaxParser instance.
58 * @param {Object} settings Name/value collection of settings. comment, cdata, text, start, end, pi and doctype are callbacks.
59 * @param {tinymce.html.Schema} schema HTML Schema class to use when parsing.
61 tinymce.html.SaxParser = function(settings, schema) {
62 var self = this, noop = function() {};
64 settings = settings || {};
65 self.schema = schema = schema || new tinymce.html.Schema();
67 if (settings.fix_self_closing !== false)
68 settings.fix_self_closing = true;
70 // Add handler functions from settings and setup default handlers
71 tinymce.each('comment cdata text start end pi doctype'.split(' '), function(name) {
73 self[name] = settings[name] || noop;
77 * Parses the specified HTML string and executes the callbacks for each item it finds.
80 * new SaxParser({...}).parse('<b>text</b>');
82 * @param {String} html Html string to sax parse.
84 self.parse = function(html) {
85 var self = this, matches, index = 0, value, endRegExp, stack = [], attrList, i, text, name, isInternalElement, removeInternalElements,
86 shortEndedElements, fillAttrsMap, isShortEnded, validate, elementRule, isValidElement, attr, attribsValue,
87 validAttributesMap, validAttributePatterns, attributesRequired, attributesDefault, attributesForced, selfClosing,
88 tokenRegExp, attrRegExp, specialElements, attrValue, idCount = 0, decode = tinymce.html.Entities.decode, fixSelfClosing;
90 function processEndTag(name) {
93 // Find position of parent of the same type
96 if (stack[pos].name === name)
102 // Close all the open elements
103 for (i = stack.length - 1; i >= pos; i--) {
110 // Remove the open elements from the stack
115 // Precompile RegExps and map objects
116 tokenRegExp = new RegExp('<(?:' +
117 '(?:!--([\\w\\W]*?)-->)|' + // Comment
118 '(?:!\\[CDATA\\[([\\w\\W]*?)\\]\\]>)|' + // CDATA
119 '(?:!DOCTYPE([\\w\\W]*?)>)|' + // DOCTYPE
120 '(?:\\?([^\\s\\/<>]+) ?([\\w\\W]*?)[?/]>)|' + // PI
121 '(?:\\/([^>]+)>)|' + // End element
122 '(?:([^\\s\\/<>]+)\\s*((?:[^"\'>]+(?:(?:"[^"]*")|(?:\'[^\']*\')|[^>]*))*)>)' + // Start element
125 attrRegExp = /([\w:\-]+)(?:\s*=\s*(?:(?:\"((?:\\.|[^\"])*)\")|(?:\'((?:\\.|[^\'])*)\')|([^>\s]+)))?/g;
127 'script' : /<\/script[^>]*>/gi,
128 'style' : /<\/style[^>]*>/gi,
129 'noscript' : /<\/noscript[^>]*>/gi
132 // Setup lookup tables for empty elements and boolean attributes
133 shortEndedElements = schema.getShortEndedElements();
134 selfClosing = schema.getSelfClosingElements();
135 fillAttrsMap = schema.getBoolAttrs();
136 validate = settings.validate;
137 removeInternalElements = settings.remove_internals;
138 fixSelfClosing = settings.fix_self_closing;
140 while (matches = tokenRegExp.exec(html)) {
142 if (index < matches.index)
143 self.text(decode(html.substr(index, matches.index - index)));
145 if (value = matches[6]) { // End element
146 processEndTag(value.toLowerCase());
147 } else if (value = matches[7]) { // Start element
148 value = value.toLowerCase();
149 isShortEnded = value in shortEndedElements;
151 // Is self closing tag for example an <li> after an open <li>
152 if (fixSelfClosing && selfClosing[value] && stack.length > 0 && stack[stack.length - 1].name === value)
153 processEndTag(value);
156 if (!validate || (elementRule = schema.getElementRule(value))) {
157 isValidElement = true;
159 // Grab attributes map and patterns when validation is enabled
161 validAttributesMap = elementRule.attributes;
162 validAttributePatterns = elementRule.attributePatterns;
166 if (attribsValue = matches[8]) {
167 isInternalElement = attribsValue.indexOf('data-mce-type') !== -1; // Check if the element is an internal element
169 // If the element has internal attributes then remove it if we are told to do so
170 if (isInternalElement && removeInternalElements)
171 isValidElement = false;
176 attribsValue.replace(attrRegExp, function(match, name, value, val2, val3) {
179 name = name.toLowerCase();
180 value = name in fillAttrsMap ? name : decode(value || val2 || val3 || ''); // Handle boolean attribute than value attribute
182 // Validate name and value
183 if (validate && !isInternalElement && name.indexOf('data-') !== 0) {
184 attrRule = validAttributesMap[name];
186 // Find rule by pattern matching
187 if (!attrRule && validAttributePatterns) {
188 i = validAttributePatterns.length;
190 attrRule = validAttributePatterns[i];
191 if (attrRule.pattern.test(name))
200 // No attribute rule found
205 if (attrRule.validValues && !(value in attrRule.validValues))
209 // Add attribute to list and map
210 attrList.map[name] = value;
221 // Process attributes if validation is enabled
222 if (validate && !isInternalElement) {
223 attributesRequired = elementRule.attributesRequired;
224 attributesDefault = elementRule.attributesDefault;
225 attributesForced = elementRule.attributesForced;
227 // Handle forced attributes
228 if (attributesForced) {
229 i = attributesForced.length;
231 attr = attributesForced[i];
233 attrValue = attr.value;
235 if (attrValue === '{$uid}')
236 attrValue = 'mce_' + idCount++;
238 attrList.map[name] = attrValue;
239 attrList.push({name: name, value: attrValue});
243 // Handle default attributes
244 if (attributesDefault) {
245 i = attributesDefault.length;
247 attr = attributesDefault[i];
250 if (!(name in attrList.map)) {
251 attrValue = attr.value;
253 if (attrValue === '{$uid}')
254 attrValue = 'mce_' + idCount++;
256 attrList.map[name] = attrValue;
257 attrList.push({name: name, value: attrValue});
262 // Handle required attributes
263 if (attributesRequired) {
264 i = attributesRequired.length;
266 if (attributesRequired[i] in attrList.map)
270 // None of the required attributes were found
272 isValidElement = false;
275 // Invalidate element if it's marked as bogus
276 if (attrList.map['data-mce-bogus'])
277 isValidElement = false;
281 self.start(value, attrList, isShortEnded);
283 isValidElement = false;
285 // Treat script, noscript and style a bit different since they may include code that looks like elements
286 if (endRegExp = specialElements[value]) {
287 endRegExp.lastIndex = index = matches.index + matches[0].length;
289 if (matches = endRegExp.exec(html)) {
291 text = html.substr(index, matches.index - index);
293 index = matches.index + matches[0].length;
295 text = html.substr(index);
299 if (isValidElement && text.length > 0)
300 self.text(text, true);
305 tokenRegExp.lastIndex = index;
309 // Push value on to stack
311 if (!attribsValue || attribsValue.indexOf('/') != attribsValue.length - 1)
312 stack.push({name: value, valid: isValidElement});
313 else if (isValidElement)
316 } else if (value = matches[1]) { // Comment
318 } else if (value = matches[2]) { // CDATA
320 } else if (value = matches[3]) { // DOCTYPE
322 } else if (value = matches[4]) { // PI
323 self.pi(value, matches[5]);
326 index = matches.index + matches[0].length;
330 if (index < html.length)
331 self.text(decode(html.substr(index)));
333 // Close any open elements
334 for (i = stack.length - 1; i >= 0; i--) {
338 self.end(value.name);