/** * SaxParser.js * * Copyright 2010, Moxiecode Systems AB * Released under LGPL License. * * License: http://tinymce.moxiecode.com/license * Contributing: http://tinymce.moxiecode.com/contributing */ (function(tinymce) { /** * This class parses HTML code using pure JavaScript and executes various events for each item it finds. It will * always execute the events in the right order for tag soup code like

. It will also remove elements * and attributes that doesn't fit the schema if the validate setting is enabled. * * @example * var parser = new tinymce.html.SaxParser({ * validate: true, * * comment: function(text) { * console.log('Comment:', text); * }, * * cdata: function(text) { * console.log('CDATA:', text); * }, * * text: function(text, raw) { * console.log('Text:', text, 'Raw:', raw); * }, * * start: function(name, attrs, empty) { * console.log('Start:', name, attrs, empty); * }, * * end: function(name) { * console.log('End:', name); * }, * * pi: function(name, text) { * console.log('PI:', name, text); * }, * * doctype: function(text) { * console.log('DocType:', text); * } * }, schema); * @class tinymce.html.SaxParser * @version 3.4 */ /** * Constructs a new SaxParser instance. * * @constructor * @method SaxParser * @param {Object} settings Name/value collection of settings. comment, cdata, text, start and end are callbacks. * @param {tinymce.html.Schema} schema HTML Schema class to use when parsing. */ tinymce.html.SaxParser = function(settings, schema) { var self = this, noop = function() {}; settings = settings || {}; self.schema = schema = schema || new tinymce.html.Schema(); if (settings.fix_self_closing !== false) settings.fix_self_closing = true; // Add handler functions from settings and setup default handlers tinymce.each('comment cdata text start end pi doctype'.split(' '), function(name) { if (name) self[name] = settings[name] || noop; }); /** * Parses the specified HTML string and executes the callbacks for each item it finds. * * @example * new SaxParser({...}).parse('text'); * @method parse * @param {String} html Html string to sax parse. */ self.parse = function(html) { var self = this, matches, index = 0, value, endRegExp, stack = [], attrList, i, text, name, isInternalElement, removeInternalElements, shortEndedElements, fillAttrsMap, isShortEnded, validate, elementRule, isValidElement, attr, attribsValue, validAttributesMap, validAttributePatterns, attributesRequired, attributesDefault, attributesForced, selfClosing, tokenRegExp, attrRegExp, specialElements, attrValue, idCount = 0, decode = tinymce.html.Entities.decode, fixSelfClosing; function processEndTag(name) { var pos, i; // Find position of parent of the same type pos = stack.length; while (pos--) { if (stack[pos].name === name) break; } // Found parent if (pos >= 0) { // Close all the open elements for (i = stack.length - 1; i >= pos; i--) { name = stack[i]; if (name.valid) self.end(name.name); } // Remove the open elements from the stack stack.length = pos; } }; // Precompile RegExps and map objects tokenRegExp = new RegExp('<(?:' + '(?:!--([\\w\\W]*?)-->)|' + // Comment '(?:!\\[CDATA\\[([\\w\\W]*?)\\]\\]>)|' + // CDATA '(?:!DOCTYPE([\\w\\W]*?)>)|' + // DOCTYPE '(?:\\?([^\\s\\/<>]+) ?([\\w\\W]*?)[?/]>)|' + // PI '(?:\\/([^>]+)>)|' + // End element '(?:([^\\s\\/<>]+)\\s*((?:[^"\'>]+(?:(?:"[^"]*")|(?:\'[^\']*\')|[^>]*))*)>)' + // Start element ')', 'g'); attrRegExp = /([\w:\-]+)(?:\s*=\s*(?:(?:\"((?:\\.|[^\"])*)\")|(?:\'((?:\\.|[^\'])*)\')|([^>\s]+)))?/g; specialElements = { 'script' : /<\/script[^>]*>/gi, 'style' : /<\/style[^>]*>/gi, 'noscript' : /<\/noscript[^>]*>/gi }; // Setup lookup tables for empty elements and boolean attributes shortEndedElements = schema.getShortEndedElements(); selfClosing = schema.getSelfClosingElements(); fillAttrsMap = schema.getBoolAttrs(); validate = settings.validate; removeInternalElements = settings.remove_internals; fixSelfClosing = settings.fix_self_closing; while (matches = tokenRegExp.exec(html)) { // Text if (index < matches.index) self.text(decode(html.substr(index, matches.index - index))); if (value = matches[6]) { // End element processEndTag(value.toLowerCase()); } else if (value = matches[7]) { // Start element value = value.toLowerCase(); isShortEnded = value in shortEndedElements; // Is self closing tag for example an
  • after an open
  • if (fixSelfClosing && selfClosing[value] && stack.length > 0 && stack[stack.length - 1].name === value) processEndTag(value); // Validate element if (!validate || (elementRule = schema.getElementRule(value))) { isValidElement = true; // Grab attributes map and patters when validation is enabled if (validate) { validAttributesMap = elementRule.attributes; validAttributePatterns = elementRule.attributePatterns; } // Parse attributes if (attribsValue = matches[8]) { isInternalElement = attribsValue.indexOf('data-mce-type') !== -1; // Check if the element is an internal element // If the element has internal attributes then remove it if we are told to do so if (isInternalElement && removeInternalElements) isValidElement = false; attrList = []; attrList.map = {}; attribsValue.replace(attrRegExp, function(match, name, value, val2, val3) { var attrRule, i; name = name.toLowerCase(); value = name in fillAttrsMap ? name : decode(value || val2 || val3 || ''); // Handle boolean attribute than value attribute // Validate name and value if (validate && !isInternalElement && name.indexOf('data-') !== 0) { attrRule = validAttributesMap[name]; // Find rule by pattern matching if (!attrRule && validAttributePatterns) { i = validAttributePatterns.length; while (i--) { attrRule = validAttributePatterns[i]; if (attrRule.pattern.test(name)) break; } // No rule matched if (i === -1) attrRule = null; } // No attribute rule found if (!attrRule) return; // Validate value if (attrRule.validValues && !(value in attrRule.validValues)) return; } // Add attribute to list and map attrList.map[name] = value; attrList.push({ name: name, value: value }); }); } else { attrList = []; attrList.map = {}; } // Process attributes if validation is enabled if (validate && !isInternalElement) { attributesRequired = elementRule.attributesRequired; attributesDefault = elementRule.attributesDefault; attributesForced = elementRule.attributesForced; // Handle forced attributes if (attributesForced) { i = attributesForced.length; while (i--) { attr = attributesForced[i]; name = attr.name; attrValue = attr.value; if (attrValue === '{$uid}') attrValue = 'mce_' + idCount++; attrList.map[name] = attrValue; attrList.push({name: name, value: attrValue}); } } // Handle default attributes if (attributesDefault) { i = attributesDefault.length; while (i--) { attr = attributesDefault[i]; name = attr.name; if (!(name in attrList.map)) { attrValue = attr.value; if (attrValue === '{$uid}') attrValue = 'mce_' + idCount++; attrList.map[name] = attrValue; attrList.push({name: name, value: attrValue}); } } } // Handle required attributes if (attributesRequired) { i = attributesRequired.length; while (i--) { if (attributesRequired[i] in attrList.map) break; } // None of the required attributes where found if (i === -1) isValidElement = false; } // Invalidate element if it's marked as bogus if (attrList.map['data-mce-bogus']) isValidElement = false; } if (isValidElement) self.start(value, attrList, isShortEnded); } else isValidElement = false; // Treat script, noscript and style a bit different since they may include code that looks like elements if (endRegExp = specialElements[value]) { endRegExp.lastIndex = index = matches.index + matches[0].length; if (matches = endRegExp.exec(html)) { if (isValidElement) text = html.substr(index, matches.index - index); index = matches.index + matches[0].length; } else { text = html.substr(index); index = html.length; } if (isValidElement && text.length > 0) self.text(text, true); if (isValidElement) self.end(value); tokenRegExp.lastIndex = index; continue; } // Push value on to stack if (!isShortEnded) { if (!attribsValue || attribsValue.indexOf('/') != attribsValue.length - 1) stack.push({name: value, valid: isValidElement}); else if (isValidElement) self.end(value); } } else if (value = matches[1]) { // Comment self.comment(value); } else if (value = matches[2]) { // CDATA self.cdata(value); } else if (value = matches[3]) { // DOCTYPE self.doctype(value); } else if (value = matches[4]) { // PI self.pi(value, matches[5]); } index = matches.index + matches[0].length; } // Text if (index < html.length) self.text(decode(html.substr(index))); // Close any open elements for (i = stack.length - 1; i >= 0; i--) { value = stack[i]; if (value.valid) self.end(value.name); } }; } })(tinymce);