jssource/src_files/include/javascript/yui3/build/text/text-wordbreak.js

   1 /*
   2 Copyright (c) 2010, Yahoo! Inc. All rights reserved.
   3 Code licensed under the BSD License:
   4 http://developer.yahoo.com/yui/license.html
   5 version: 3.3.0
   6 build: 3167
   7 */
   8 YUI.add('text-wordbreak', function(Y) {
   9
  10 /**
  11  * Provides utility methods for splitting strings on word breaks and determining
  12  * whether a character index represents a word boundary.
  13  *
  14  * @module text
  15  * @submodule text-wordbreak
  16  */
  17
  18 /**
  19  * <p>
  20  * Provides utility methods for splitting strings on word breaks and determining
  21  * whether a character index represents a word boundary, using the generic word
  22  * breaking algorithm defined in the Unicode Text Segmentation guidelines
  23  * (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
  24  * Annex #29</a>).
  25  * </p>
  26  *
  27  * <p>
  28  * This algorithm provides a reasonable default for many languages. However, it
  29  * does not cover language or context specific requirements, and it does not
  30  * provide meaningful results at all for languages that don't use spaces between
  31  * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
  32  * word breaking services usually provide significantly better results with
  33  * better performance.
  34  * </p>
  35  *
  36  * @class Text.WordBreak
  37  * @static
  38  */
  39
  40 var Text   = Y.Text,
  41     WBData = Text.Data.WordBreak,
  42
  43 // Constants representing code point classifications.
  44 ALETTER      = 0,
  45 MIDNUMLET    = 1,
  46 MIDLETTER    = 2,
  47 MIDNUM       = 3,
  48 NUMERIC      = 4,
  49 CR           = 5,
  50 LF           = 6,
  51 NEWLINE      = 7,
  52 EXTEND       = 8,
  53 FORMAT       = 9,
  54 KATAKANA     = 10,
  55 EXTENDNUMLET = 11,
  56 OTHER        = 12,
  57
  58 // RegExp objects generated from code point data. Each regex matches a single
  59 // character against a set of Unicode code points. The index of each item in
  60 // this array must match its corresponding code point constant value defined
  61 // above.
  62 SETS = [
  63     new RegExp(WBData.aletter),
  64     new RegExp(WBData.midnumlet),
  65     new RegExp(WBData.midletter),
  66     new RegExp(WBData.midnum),
  67     new RegExp(WBData.numeric),
  68     new RegExp(WBData.cr),
  69     new RegExp(WBData.lf),
  70     new RegExp(WBData.newline),
  71     new RegExp(WBData.extend),
  72     new RegExp(WBData.format),
  73     new RegExp(WBData.katakana),
  74     new RegExp(WBData.extendnumlet)
  75 ],
  76
  77 EMPTY_STRING = '',
  78 PUNCTUATION  = new RegExp('^' + WBData.punctuation + '$'),
  79 WHITESPACE   = /\s/,
  80
  81 WordBreak = {
  82     // -- Public Static Methods ------------------------------------------------
  83
  84     /**
  85      * Splits the specified string into an array of individual words.
  86      *
  87      * @method getWords
  88      * @param {String} string String to split.
  89      * @param {Object} options (optional) Options object containing zero or more
  90      *   of the following properties:
  91      *
  92      * <dl>
  93      *   <dt>ignoreCase (Boolean)</dt>
  94      *   <dd>
  95      *     If <code>true</code>, the string will be converted to lowercase
  96      *     before being split. Default is <code>false</code>.
  97      *   </dd>
  98      *
  99      *   <dt>includePunctuation (Boolean)</dt>
 100      *   <dd>
 101      *     If <code>true</code>, the returned array will include punctuation
 102      *     characters. Default is <code>false</code>.
 103      *   </dd>
 104      *
 105      *   <dt>includeWhitespace (Boolean)</dt>
 106      *   <dd>
 107      *     If <code>true</code>, the returned array will include whitespace
 108      *     characters. Default is <code>false</code>.
 109      *   </dd>
 110      * </dl>
 111      * @return {Array} Array of words.
 112      * @static
 113      */
 114     getWords: function (string, options) {
 115         var i     = 0,
 116             map   = WordBreak._classify(string),
 117             len   = map.length,
 118             word  = [],
 119             words = [],
 120             chr,
 121             includePunctuation,
 122             includeWhitespace;
 123
 124         if (!options) {
 125             options = {};
 126         }
 127
 128         if (options.ignoreCase) {
 129             string = string.toLowerCase();
 130         }
 131
 132         includePunctuation = options.includePunctuation;
 133         includeWhitespace  = options.includeWhitespace;
 134
 135         // Loop through each character in the classification map and determine
 136         // whether it precedes a word boundary, building an array of distinct
 137         // words as we go.
 138         for (; i < len; ++i) {
 139             chr = string.charAt(i);
 140
 141             // Append this character to the current word.
 142             word.push(chr);
 143
 144             // If there's a word boundary between the current character and the
 145             // next character, append the current word to the words array and
 146             // start building a new word.
 147             if (WordBreak._isWordBoundary(map, i)) {
 148                 word = word.join(EMPTY_STRING);
 149
 150                 if (word &&
 151                         (includeWhitespace  || !WHITESPACE.test(word)) &&
 152                         (includePunctuation || !PUNCTUATION.test(word))) {
 153                     words.push(word);
 154                 }
 155
 156                 word = [];
 157             }
 158         }
 159
 160         return words;
 161     },
 162
 163     /**
 164      * Returns an array containing only unique words from the specified string.
 165      * For example, the string <code>'foo bar baz foo'</code> would result in
 166      * the array <code>['foo', 'bar', 'baz']</code>.
 167      *
 168      * @method getUniqueWords
 169      * @param {String} string String to split.
 170      * @param {Object} options (optional) Options (see <code>getWords()</code>
 171      *   for details).
 172      * @return {Array} Array of unique words.
 173      * @static
 174      */
 175     getUniqueWords: function (string, options) {
 176         return Y.Array.unique(WordBreak.getWords(string, options));
 177     },
 178
 179     /**
 180      * <p>
 181      * Returns <code>true</code> if there is a word boundary between the
 182      * specified character index and the next character index (or the end of the
 183      * string).
 184      * </p>
 185      *
 186      * <p>
 187      * Note that there are always word breaks at the beginning and end of a
 188      * string, so <code>isWordBoundary('', 0)</code> and
 189      * <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
 190      * </p>
 191      *
 192      * @method isWordBoundary
 193      * @param {String} string String to test.
 194      * @param {Number} index Character index to test within the string.
 195      * @return {Boolean} <code>true</code> for a word boundary,
 196      *   <code>false</code> otherwise.
 197      * @static
 198      */
 199     isWordBoundary: function (string, index) {
 200         return WordBreak._isWordBoundary(WordBreak._classify(string), index);
 201     },
 202
 203     // -- Protected Static Methods ---------------------------------------------
 204
 205     /**
 206      * Returns a character classification map for the specified string.
 207      *
 208      * @method _classify
 209      * @param {String} string String to classify.
 210      * @return {Array} Classification map.
 211      * @protected
 212      * @static
 213      */
 214     _classify: function (string) {
 215         var chr,
 216             map          = [],
 217             i            = 0,
 218             j,
 219             set,
 220             stringLength = string.length,
 221             setsLength   = SETS.length,
 222             type;
 223
 224         for (; i < stringLength; ++i) {
 225             chr  = string.charAt(i);
 226             type = OTHER;
 227
 228             for (j = 0; j < setsLength; ++j) {
 229                 set = SETS[j];
 230
 231                 if (set && set.test(chr)) {
 232                     type = j;
 233                     break;
 234                 }
 235             }
 236
 237             map.push(type);
 238         }
 239
 240         return map;
 241     },
 242
 243     /**
 244      * <p>
 245      * Returns <code>true</code> if there is a word boundary between the
 246      * specified character index and the next character index (or the end of the
 247      * string).
 248      * </p>
 249      *
 250      * <p>
 251      * Note that there are always word breaks at the beginning and end of a
 252      * string, so <code>_isWordBoundary('', 0)</code> and
 253      * <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
 254      * </p>
 255      *
 256      * @method _isWordBoundary
 257      * @param {Array} map Character classification map generated by
 258      *   <code>_classify</code>.
 259      * @param {Number} index Character index to test.
 260      * @return {Boolean}
 261      * @protected
 262      * @static
 263      */
 264     _isWordBoundary: function (map, index) {
 265         var prevType,
 266             type     = map[index],
 267             nextType = map[index + 1],
 268             nextNextType;
 269
 270         if (index < 0 || (index > map.length - 1 && index !== 0)) {
 271             return false;
 272         }
 273
 274         // WB5. Don't break between most letters.
 275         if (type === ALETTER && nextType === ALETTER) {
 276             return false;
 277         }
 278
 279         nextNextType = map[index + 2];
 280
 281         // WB6. Don't break letters across certain punctuation.
 282         if (type === ALETTER &&
 283                 (nextType === MIDLETTER || nextType === MIDNUMLET) &&
 284                 nextNextType === ALETTER) {
 285             return false;
 286         }
 287
 288         prevType = map[index - 1];
 289
 290         // WB7. Don't break letters across certain punctuation.
 291         if ((type === MIDLETTER || type === MIDNUMLET) &&
 292                 nextType === ALETTER &&
 293                 prevType === ALETTER) {
 294             return false;
 295         }
 296
 297         // WB8/WB9/WB10. Don't break inside sequences of digits or digits
 298         // adjacent to letters.
 299         if ((type === NUMERIC || type === ALETTER) &&
 300                 (nextType === NUMERIC || nextType === ALETTER)) {
 301             return false;
 302         }
 303
 304         // WB11. Don't break inside numeric sequences like "3.2" or
 305         // "3,456.789".
 306         if ((type === MIDNUM || type === MIDNUMLET) &&
 307                 nextType === NUMERIC &&
 308                 prevType === NUMERIC) {
 309             return false;
 310         }
 311
 312         // WB12. Don't break inside numeric sequences like "3.2" or
 313         // "3,456.789".
 314         if (type === NUMERIC &&
 315                 (nextType === MIDNUM || nextType === MIDNUMLET) &&
 316                 nextNextType === NUMERIC) {
 317             return false;
 318         }
 319
 320         // WB4. Ignore format and extend characters.
 321         if (type === EXTEND || type === FORMAT ||
 322                 prevType === EXTEND || prevType === FORMAT ||
 323                 nextType === EXTEND || nextType === FORMAT) {
 324             return false;
 325         }
 326
 327         // WB3. Don't break inside CRLF.
 328         if (type === CR && nextType === LF) {
 329             return false;
 330         }
 331
 332         // WB3a. Break before newlines (including CR and LF).
 333         if (type === NEWLINE || type === CR || type === LF) {
 334             return true;
 335         }
 336
 337         // WB3b. Break after newlines (including CR and LF).
 338         if (nextType === NEWLINE || nextType === CR || nextType === LF) {
 339             return true;
 340         }
 341
 342         // WB13. Don't break between Katakana characters.
 343         if (type === KATAKANA && nextType === KATAKANA) {
 344             return false;
 345         }
 346
 347         // WB13a. Don't break from extenders.
 348         if (nextType === EXTENDNUMLET &&
 349                 (type === ALETTER || type === NUMERIC || type === KATAKANA ||
 350                 type === EXTENDNUMLET)) {
 351             return false;
 352         }
 353
 354         // WB13b. Don't break from extenders.
 355         if (type === EXTENDNUMLET &&
 356                 (nextType === ALETTER || nextType === NUMERIC ||
 357                 nextType === KATAKANA)) {
 358             return false;
 359         }
 360
 361         // Break after any character not covered by the rules above.
 362         return true;
 363     }
 364 };
 365
 366 Text.WordBreak = WordBreak;
 367
 368
 369 }, '3.3.0' ,{requires:['array-extras', 'text-data-wordbreak']});