/*
Copyright (c) 2010, Yahoo! Inc. All rights reserved.
Code licensed under the BSD License:
http://developer.yahoo.com/yui/license.html
version: 3.3.0
build: 3167
*/
YUI.add('text-wordbreak', function(Y) {

/**
 * Provides utility methods for splitting strings on word breaks and determining
 * whether a character index represents a word boundary.
 *
 * @module text
 * @submodule text-wordbreak
 */

/**
 * <p>
 * Provides utility methods for splitting strings on word breaks and determining
 * whether a character index represents a word boundary, using the generic word
 * breaking algorithm defined in the Unicode Text Segmentation guidelines
 * (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
 * Annex #29</a>).
 * </p>
 *
 * <p>
 * This algorithm provides a reasonable default for many languages. However, it
 * does not cover language or context specific requirements, and it does not
 * provide meaningful results at all for languages that don't use spaces between
 * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
 * word breaking services usually provide significantly better results with
 * better performance.
 * </p>
 *
 * @class Text.WordBreak
 * @static
 */

var Text   = Y.Text,
    WBData = Text.Data.WordBreak,

// Constants representing code point classifications.
ALETTER      = 0,
MIDNUMLET    = 1,
MIDLETTER    = 2,
MIDNUM       = 3,
NUMERIC      = 4,
CR           = 5,
LF           = 6,
NEWLINE      = 7,
EXTEND       = 8,
FORMAT       = 9,
KATAKANA     = 10,
EXTENDNUMLET = 11,
OTHER        = 12,

// RegExp objects generated from code point data. Each regex matches a single
// character against a set of Unicode code points. The index of each item in
// this array must match its corresponding code point constant value defined
// above.
SETS = [
    new RegExp(WBData.aletter),
    new RegExp(WBData.midnumlet),
    new RegExp(WBData.midletter),
    new RegExp(WBData.midnum),
    new RegExp(WBData.numeric),
    new RegExp(WBData.cr),
    new RegExp(WBData.lf),
    new RegExp(WBData.newline),
    new RegExp(WBData.extend),
    new RegExp(WBData.format),
    new RegExp(WBData.katakana),
    new RegExp(WBData.extendnumlet)
],

EMPTY_STRING = '',
PUNCTUATION  = new RegExp('^' + WBData.punctuation + '$'),
WHITESPACE   = /\s/,

WordBreak = {
    // -- Public Static Methods ------------------------------------------------

    /**
     * Splits the specified string into an array of individual words.
     *
     * @method getWords
     * @param {String} string String to split.
     * @param {Object} options (optional) Options object containing zero or more
     *   of the following properties:
     *
     * <dl>
     *   <dt>ignoreCase (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the string will be converted to lowercase
     *     before being split. Default is <code>false</code>.
     *   </dd>
     *
     *   <dt>includePunctuation (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the returned array will include punctuation
     *     characters. Default is <code>false</code>.
     *   </dd>
     *
     *   <dt>includeWhitespace (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the returned array will include whitespace
     *     characters. Default is <code>false</code>.
     *   </dd>
     * </dl>
     * @return {Array} Array of words.
     * @static
     */
    getWords: function (string, options) {
        var i     = 0,
            word  = [],
            words = [],
            includePunctuation,
            includeWhitespace,
            len,
            map,
            token;

        if (!options) {
            options = {};
        }

        // Lowercase *before* classifying. toLowerCase() may change the length
        // of the string (e.g. U+0130 lowercases to two code units), so the
        // classification map has to be built from the same string the
        // characters are read from, otherwise indices drift apart.
        if (options.ignoreCase) {
            string = string.toLowerCase();
        }

        map = WordBreak._classify(string);
        len = map.length;

        includePunctuation = options.includePunctuation;
        includeWhitespace  = options.includeWhitespace;

        // Loop through each character in the classification map and determine
        // whether it precedes a word boundary, building an array of distinct
        // words as we go.
        for (; i < len; ++i) {
            // Append this character to the current word.
            word.push(string.charAt(i));

            // If there's a word boundary between the current character and the
            // next character, append the current word to the words array and
            // start building a new word.
            if (WordBreak._isWordBoundary(map, i)) {
                token = word.join(EMPTY_STRING);

                // Whitespace-containing and punctuation-only tokens are
                // dropped unless the caller explicitly asked for them.
                if (token &&
                        (includeWhitespace  || !WHITESPACE.test(token)) &&
                        (includePunctuation || !PUNCTUATION.test(token))) {

                    words.push(token);
                }

                word = [];
            }
        }

        return words;
    },

    /**
     * Returns an array containing only unique words from the specified string.
     * For example, the string <code>'foo bar baz foo'</code> would result in
     * the array <code>['foo', 'bar', 'baz']</code>.
     *
     * @method getUniqueWords
     * @param {String} string String to split.
     * @param {Object} options (optional) Options (see <code>getWords()</code>
     *   for details).
     * @return {Array} Array of unique words.
     * @static
     */
    getUniqueWords: function (string, options) {
        return Y.Array.unique(WordBreak.getWords(string, options));
    },

    /**
     * <p>
     * Returns <code>true</code> if there is a word boundary between the
     * specified character index and the next character index (or the end of the
     * string).
     * </p>
     *
     * <p>
     * Note that there are always word breaks at the beginning and end of a
     * string, so <code>isWordBoundary('', 0)</code> and
     * <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
     * </p>
     *
     * @method isWordBoundary
     * @param {String} string String to test.
     * @param {Number} index Character index to test within the string.
     * @return {Boolean} <code>true</code> for a word boundary,
     *   <code>false</code> otherwise.
     * @static
     */
    isWordBoundary: function (string, index) {
        return WordBreak._isWordBoundary(WordBreak._classify(string), index);
    },

    // -- Protected Static Methods ---------------------------------------------

    /**
     * Returns a character classification map for the specified string. The
     * map holds one entry per <code>charAt()</code> position; each entry is
     * the index of the first regex in <code>SETS</code> that matches the
     * character, or <code>OTHER</code> when none match.
     *
     * @method _classify
     * @param {String} string String to classify.
     * @return {Array} Classification map.
     * @protected
     * @static
     */
    _classify: function (string) {
        var chr,
            map          = [],
            i            = 0,
            j,
            set,
            stringLength = string.length,
            setsLength   = SETS.length,
            type;

        for (; i < stringLength; ++i) {
            chr  = string.charAt(i);
            type = OTHER;

            // First matching set wins; its array index doubles as the
            // classification constant.
            for (j = 0; j < setsLength; ++j) {
                set = SETS[j];

                if (set && set.test(chr)) {
                    type = j;
                    break;
                }
            }

            map.push(type);
        }

        return map;
    },

    /**
     * <p>
     * Returns <code>true</code> if there is a word boundary between the
     * specified character index and the next character index (or the end of the
     * string).
     * </p>
     *
     * <p>
     * Note that there are always word breaks at the beginning and end of a
     * string, so <code>_isWordBoundary('', 0)</code> and
     * <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
     * </p>
     *
     * <p>
     * Indices below zero or past the last entry of the map return
     * <code>false</code> (index <code>0</code> of an empty map being the one
     * exception, as described above).
     * </p>
     *
     * @method _isWordBoundary
     * @param {Array} map Character classification map generated by
     *   <code>_classify</code>.
     * @param {Number} index Character index to test.
     * @return {Boolean}
     * @protected
     * @static
     */
    _isWordBoundary: function (map, index) {
        var prevType,
            type     = map[index],
            nextType = map[index + 1],
            nextNextType;

        if (index < 0 || (index > map.length - 1 && index !== 0)) {
            return false;
        }

        // WB2. Always break at the end of the text. This has to be decided
        // before the pairwise rules below: the WB4 check would otherwise
        // report "no boundary" after a trailing extend/format character (or a
        // final character that follows one), which contradicts the documented
        // contract and makes getWords() silently drop the last word.
        if (index >= map.length - 1) {
            return true;
        }

        // WB5. Don't break between most letters.
        if (type === ALETTER && nextType === ALETTER) {
            return false;
        }

        nextNextType = map[index + 2];

        // WB6. Don't break letters across certain punctuation.
        if (type === ALETTER &&
                (nextType === MIDLETTER || nextType === MIDNUMLET) &&
                nextNextType === ALETTER) {
            return false;
        }

        prevType = map[index - 1];

        // WB7. Don't break letters across certain punctuation.
        if ((type === MIDLETTER || type === MIDNUMLET) &&
                nextType === ALETTER &&
                prevType === ALETTER) {
            return false;
        }

        // WB8/WB9/WB10. Don't break inside sequences of digits or digits
        // adjacent to letters.
        if ((type === NUMERIC || type === ALETTER) &&
                (nextType === NUMERIC || nextType === ALETTER)) {
            return false;
        }

        // WB11. Don't break inside numeric sequences like "3.2" or
        // "3,456.789".
        if ((type === MIDNUM || type === MIDNUMLET) &&
                nextType === NUMERIC &&
                prevType === NUMERIC) {
            return false;
        }

        // WB12. Don't break inside numeric sequences like "3.2" or
        // "3,456.789".
        if (type === NUMERIC &&
                (nextType === MIDNUM || nextType === MIDNUMLET) &&
                nextNextType === NUMERIC) {
            return false;
        }

        // WB4. Ignore format and extend characters. Note that this check
        // suppresses every boundary directly adjacent to such a character
        // (current, previous or next position).
        if (type === EXTEND || type === FORMAT ||
                prevType === EXTEND || prevType === FORMAT ||
                nextType === EXTEND || nextType === FORMAT) {
            return false;
        }

        // WB3. Don't break inside CRLF.
        if (type === CR && nextType === LF) {
            return false;
        }

        // WB3a. Break before newlines (including CR and LF).
        if (type === NEWLINE || type === CR || type === LF) {
            return true;
        }

        // WB3b. Break after newlines (including CR and LF).
        if (nextType === NEWLINE || nextType === CR || nextType === LF) {
            return true;
        }

        // WB13. Don't break between Katakana characters.
        if (type === KATAKANA && nextType === KATAKANA) {
            return false;
        }

        // WB13a. Don't break from extenders.
        if (nextType === EXTENDNUMLET &&
                (type === ALETTER || type === NUMERIC || type === KATAKANA ||
                type === EXTENDNUMLET)) {
            return false;
        }

        // WB13b. Don't break from extenders.
        if (type === EXTENDNUMLET &&
                (nextType === ALETTER || nextType === NUMERIC ||
                nextType === KATAKANA)) {
            return false;
        }

        // Break after any character not covered by the rules above.
        return true;
    }
};

Text.WordBreak = WordBreak;


}, '3.3.0' ,{requires:['array-extras', 'text-data-wordbreak']});