jssource/src_files/include/javascript/yui3/build/text/text.js

   1 /*
   2 Copyright (c) 2010, Yahoo! Inc. All rights reserved.
   3 Code licensed under the BSD License:
   4 http://developer.yahoo.com/yui/license.html
   5 version: 3.3.0
   6 build: 3167
   7 */
   8 YUI.add('text-accentfold', function(Y) {
   9
  10 /**
  11  * Text utilities.
  12  *
  13  * @module text
  14  * @since 3.3.0
  15  */
  16
  17 /**
  18  * Provides a basic accent folding implementation that converts common accented
  19  * letters (like "á") to their non-accented forms (like "a").
  20  *
  21  * @module text
  22  * @submodule text-accentfold
  23  */
  24
  25 /**
  26  * <p>
  27  * Provides a basic accent folding implementation that converts common accented
  28  * letters (like "á") to their non-accented forms (like "a").
  29  * </p>
  30  *
  31  * <p>
  32  * This implementation is not comprehensive, and should only be used as a last
  33  * resort when accent folding can't be done on the server. A comprehensive
  34  * accent folding implementation would require much more character data to be
  35  * sent to the browser, resulting in a significant performance penalty. This
  36  * implementation strives for a compromise between usefulness and performance.
  37  * </p>
  38  *
  39  * <p>
  40  * Accent folding is a destructive operation that can't be reversed, and may
  41  * change or destroy the actual meaning of the text depending on the language.
  42  * It should not be used on strings that will later be displayed to a user,
  43  * unless this is done with the understanding that linguistic meaning may be
  44  * lost and that you may in fact confuse or insult the user by doing so.
  45  * </p>
  46  *
  47  * <p>
  48  * When used for matching, accent folding is likely to produce erroneous matches
  49  * for languages in which characters with diacritics are considered different
  50  * from their base characters, or where correct folding would map to other
  51  * character sequences than just stripped characters. For example, in German
  52  * "ü" is a character that's clearly different from "u" and should match "ue"
  53  * instead. The word "betrügen" means "to defraud", while "betrugen" is the past
  54  * tense of "to behave". The name "Müller" is expected to match "Mueller", but
  55  * not "Muller". On the other hand, accent folding falls short for languages
  56  * where different base characters are expected to match. In Japanese, for
  57  * example, hiragana and katakana characters with the same pronunciation ("あ"
  58  * and "ア") are commonly treated as equivalent for lookups, but accent folding
  59  * treats them as different.
  60  * </p>
  61  *
  62  * @class Text.AccentFold
  63  * @static
  64  */
  65
  66 var YArray   = Y.Array,
  67     Text     = Y.Text,
  68     FoldData = Text.Data.AccentFold,
  69
  70 AccentFold = {
  71     // -- Public Static Methods ------------------------------------------------
  72
  73     /**
  74      * Returns <code>true</code> if the specified string contains one or more
  75      * characters that can be folded, <code>false</code> otherwise.
  76      *
  77      * @method canFold
  78      * @param {String} string String to test.
  79      * @return {Boolean}
  80      * @static
  81      */
  82     canFold: function (string) {
  83         var letter;
  84
  85         for (letter in FoldData) {
  86             if (FoldData.hasOwnProperty(letter) &&
  87                     string.search(FoldData[letter]) !== -1) {
  88                 return true;
  89             }
  90         }
  91
  92         return false;
  93     },
  94
  95     /**
  96      * Compares the accent-folded versions of two strings and returns
  97      * <code>true</code> if they're the same, <code>false</code> otherwise. If
  98      * a custom comparison function is supplied, the accent-folded strings will
  99      * be passed to that function for comparison.
 100      *
 101      * @method compare
 102      * @param {String} a First string to compare.
 103      * @param {String} b Second string to compare.
 104      * @param {Function} func (optional) Custom comparison function. Should
 105      *   return a truthy or falsy value.
 106      * @return {Boolean} Results of the comparison.
 107      * @static
 108      */
 109     compare: function (a, b, func) {
 110         var aFolded = AccentFold.fold(a),
 111             bFolded = AccentFold.fold(b);
 112
 113         return func ? !!func(aFolded, bFolded) : aFolded === bFolded;
 114     },
 115
 116     /**
 117      * <p>
 118      * Returns a copy of <em>haystack</em> containing only the strings for which
 119      * the supplied function returns <code>true</code>.
 120      * </p>
 121      *
 122      * <p>
 123      * While comparisons will be made using accent-folded strings, the returned
 124      * array of matches will contain the original strings that were passed in.
 125      * </p>
 126      *
 127      * @method filter
 128      * @param {Array} haystack Array of strings to filter.
 129      * @param {Function} func Comparison function. Will receive an accent-folded
 130      *   haystack string as an argument, and should return a truthy or falsy
 131      *   value.
 132      * @return {Array} Filtered copy of <em>haystack</em>.
 133      * @static
 134      */
 135     filter: function (haystack, func) {
 136         return YArray.filter(haystack, function (item) {
 137             return func(AccentFold.fold(item));
 138         });
 139     },
 140
 141     /**
 142      * Accent-folds the specified string or array of strings and returns a copy
 143      * in which common accented letters have been converted to their closest
 144      * non-accented, lowercase forms.
 145      *
 146      * @method fold
 147      * @param {String|Array} input String or array of strings to be folded.
 148      * @return {String|Array} Folded string or array of strings.
 149      * @static
 150      */
 151     fold: function (input) {
 152         if (Y.Lang.isArray(input)) {
 153             return YArray.map(input, AccentFold.fold);
 154         }
 155
 156         input = input.toLowerCase();
 157
 158         Y.Object.each(FoldData, function (regex, letter) {
 159             input = input.replace(regex, letter);
 160         });
 161
 162         return input;
 163     }
 164 };
 165
 166 Text.AccentFold = AccentFold;
 167
 168
 169 }, '3.3.0' ,{requires:['array-extras', 'text-data-accentfold']});
 170 YUI.add('text-data-accentfold', function(Y) {
 171
 172 // The following tool was very helpful in creating these mappings:
 173 // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:toNFKD%3D/^a/:]&abb=on
 174
 175 Y.namespace('Text.Data').AccentFold = {
 176     0: /[⁰₀⓪０]/gi,
 177     1: /[¹₁①１]/gi,
 178     2: /[²₂②２]/gi,
 179     3: /[³₃③３]/gi,
 180     4: /[⁴₄④４]/gi,
 181     5: /[⁵₅⑤５]/gi,
 182     6: /[⁶₆⑥６]/gi,
 183     7: /[⁷₇⑦７]/gi,
 184     8: /[⁸₈⑧８]/gi,
 185     9: /[⁹₉⑨９]/gi,
 186     a: /[ªà-åāăąǎǟǡǻȁȃȧᵃḁẚạảấầẩẫậắằẳẵặⓐａ]/gi,
 187     b: /[ᵇḃḅḇⓑｂ]/gi,
 188     c: /[çćĉċčᶜḉⓒｃ]/gi,
 189     d: /[ďᵈḋḍḏḑḓⅾⓓｄ]/gi,
 190     e: /[è-ëēĕėęěȅȇȩᵉḕḗḙḛḝẹẻẽếềểễệₑℯⓔｅ]/gi,
 191     f: /[ᶠḟⓕｆ]/gi,
 192     g: /[ĝğġģǧǵᵍḡℊⓖｇ]/gi,
 193     h: /[ĥȟʰḣḥḧḩḫẖℎⓗｈ]/gi,
 194     i: /[ì-ïĩīĭįĳǐȉȋᵢḭḯỉịⁱℹⅰⓘｉ]/gi,
 195     j: /[ĵǰʲⓙⱼｊ]/gi,
 196     k: /[ķǩᵏḱḳḵⓚｋ]/gi,
 197     l: /[ĺļľŀǉˡḷḹḻḽℓⅼⓛｌ]/gi,
 198     m: /[ᵐḿṁṃⅿⓜｍ]/gi,
 199     n: /[ñńņňǹṅṇṉṋⁿⓝｎ]/gi,
 200     o: /[ºò-öōŏőơǒǫǭȍȏȫȭȯȱᵒṍṏṑṓọỏốồổỗộớờởỡợₒℴⓞｏ]/gi,
 201     p: /[ᵖṕṗⓟｐ]/gi,
 202     q: /[ʠⓠｑ]/gi,
 203     r: /[ŕŗřȑȓʳᵣṙṛṝṟⓡｒ]/gi,
 204     s: /[śŝşšſșˢṡṣṥṧṩẛⓢｓ]/gi,
 205     t: /[ţťțᵗṫṭṯṱẗⓣｔ]/gi,
 206     u: /[ù-üũūŭůűųưǔǖǘǚǜȕȗᵘᵤṳṵṷṹṻụủứừửữựⓤｕ]/gi,
 207     v: /[ᵛᵥṽṿⅴⓥｖ]/gi,
 208     w: /[ŵʷẁẃẅẇẉẘⓦｗ]/gi,
 209     x: /[ˣẋẍₓⅹⓧｘ]/gi,
 210     y: /[ýÿŷȳʸẏẙỳỵỷỹⓨｙ]/gi,
 211     z: /[źżžᶻẑẓẕⓩｚ]/gi
 212 };
 213
 214
 215 }, '3.3.0' );
 216 YUI.add('text-data-wordbreak', function(Y) {
 217
 218 Y.namespace('Text.Data').WordBreak = {
 219     // The UnicodeSet utility is helpful for enumerating the specific code
 220     // points covered by each of these regular expressions:
 221     // http://unicode.org/cldr/utility/list-unicodeset.jsp
 222     //
 223     // The code sets from which these regexes were derived can be generated
 224     // by the UnicodeSet utility using the links here:
 225     // http://unicode.org/cldr/utility/properties.jsp?a=Word_Break#Word_Break
 226
 227     aletter     : '[A-Za-zªµºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬˮͰ-ʹͶͷͺ-ͽΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԧԱ-Ֆՙա-ևא-תװ-׳ؠ-يٮٯٱ-ۓەۥۦۮۯۺ-ۼۿܐܒ-ܯݍ-ޥޱߊ-ߪߴߵߺࠀ-ࠕࠚࠤࠨࡀ-ࡘऄ-हऽॐक़-ॡॱ-ॷॹ-ॿঅ-ঌএঐও-নপ-রলশ-হঽৎড়ঢ়য়-ৡৰৱਅ-ਊਏਐਓ-ਨਪ-ਰਲਲ਼ਵਸ਼ਸਹਖ਼-ੜਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલળવ-હઽૐૠૡଅ-ଌଏଐଓ-ନପ-ରଲଳଵ-ହଽଡ଼ଢ଼ୟ-ୡୱஃஅ-ஊஎ-ஐஒ-கஙசஜஞடணதந-பம-ஹௐఅ-ఌఎ-ఐఒ-నప-ళవ-హఽౘౙౠౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽೞೠೡೱೲഅ-ഌഎ-ഐഒ-ഺഽൎൠൡൺ-ൿඅ-ඖක-නඳ-රලව-ෆༀཀ-ཇཉ-ཬྈ-ྌႠ-Ⴥა-ჺჼᄀ-ቈቊ-ቍቐ-ቖቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏼᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛮ-ᛰᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰᠠ-ᡷᢀ-ᢨᢪᢰ-ᣵᤀ-ᤜᨀ-ᨖᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮᮯᯀ-ᯥᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱⁿₐ-ₜℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎⅠ-ↈⒶ-ⓩⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⴀ-ⴥⴰ-ⵥⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ々〻〼ㄅ-ㄭㄱ-ㆎㆠ-ㆺꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪꘫꙀ-ꙮꙿ-ꚗꚠ-ꛯꜗ-ꜟꜢ-ꞈꞋ-ꞎꞐꞑꞠ-ꞩꟺ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏꨀ-ꨨꩀ-ꩂꩄ-ꩋꬁ-ꬆꬉ-ꬎꬑ-ꬖꬠ-ꬦꬨ-ꬮꯀ-ꯢ가-힣ힰ-ퟆퟋ-ퟻﬀ-ﬆﬓ-ﬗיִײַ-ﬨשׁ-זּטּ-לּמּנּסּףּפּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼＡ-Ｚａ-ｚﾠ-ﾾￂ-ￇￊ-ￏￒ-ￗￚ-ￜ]',
 228     midnumlet   : "['\\.‘’․﹒＇．]",
 229     midletter   : '[:··״‧︓﹕：]',
 230     midnum      : '[,;;։،؍٬߸⁄︐︔﹐﹔，；]',
 231     numeric     : '[0-9٠-٩٫۰-۹߀-߉०-९০-৯੦-੯૦-૯୦-୯௦-௯౦-౯೦-೯൦-൯๐-๙໐-໙༠-༩၀-၉႐-႙០-៩᠐-᠙᥆-᥏᧐-᧙᪀-᪉᪐-᪙᭐-᭙᮰-᮹᱀-᱉᱐-᱙꘠-꘩꣐-꣙꤀-꤉꧐-꧙꩐-꩙꯰-꯹]',
 232     cr          : '\\r',
 233     lf          : '\\n',
 234     newline     : '[\u000B\u000C\u0085\u2028\u2029]',
 235     extend      : '[\u0300-\u036F\u0483-\u0489\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0711\u0730-\u074A\u07A6-\u07B0\u07EB-\u07F3\u0816-\u0819\u081B-\u0823\u0825-\u0827\u0829-\u082D\u0859-\u085B\u0900-\u0903\u093A-\u093C\u093E-\u094F\u0951-\u0957\u0962\u0963\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A51\u0A70\u0A71\u0A75\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AE2\u0AE3\u0B01-\u0B03\u0B3C\u0B3E-\u0B44\u0B47\u0B48\u0B4B-\u0B4D\u0B56\u0B57\u0B62\u0B63\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C01-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C62\u0C63\u0C82\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3\u0D02\u0D03\u0D3E-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D62\u0D63\u0D82\u0D83\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2\u0DF3\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EB9\u0EBB\u0EBC\u0EC8-\u0ECD\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86\u0F87\u0F8D-\u0F97\u0F99-\u0FBC\u0FC6\u102B-\u103E\u1056-\u1059\u105E-\u1060\u1062-\u1064\u1067-\u106D\u1071-\u1074\u1082-\u108D\u108F\u109A-\u109D\u135D-\u135F\u1712-\u1714\u1732-\u1734\u1752\u1753\u1772\u1773\u17B6-\u17D3\u17DD\u180B-\u180D\u18A9\u1920-\u192B\u1930-\u193B\u19B0-\u19C0\u19C8\u19C9\u1A17-\u1A1B\u1A55-\u1A5E\u1A60-\u1A7C\u1A7F\u1B00-\u1B04\u1B34-\u1B44\u1B6B-\u1B73\u1B80-\u1B82\u1BA1-\u1BAA\u1BE6-\u1BF3\u1C24-\u1C37\u1CD0-\u1CD2\u1CD4-\u1CE8\u1CED\u1CF2\u1DC0-\u1DE6\u1DFC-\u1DFF\u200C\u200D\u20D0-\u20F0\u2CEF-\u2CF1\u2D7F\u2DE0-\u2DFF\u302A-\u302F\u3099\u309A\uA66F-\uA672\uA67C\uA67D\uA6F0\uA6F1\uA802\uA806\uA80B\uA823-\uA827\uA880\uA881\uA8B4-\uA8C4\uA8E0-\uA8F1\uA926-\uA92D\uA947-\uA953\uA980-\uA983\uA9B3-\uA9C0\uAA29-\uAA36\uAA43\uAA4C\uAA4D\uAA7B\uAAB0\uAAB2-\uAAB4\uAAB7\uAAB8\uAABE\uAABF\uAAC1\uABE3-\uABEA\uABEC\uABED\uFB1E\uFE00-\uFE0F\uFE20-\uFE26\uFF9E\uFF9F]',
 236     format      : '[\u00AD\u0600-\u0603\u06DD\u070F\u17B4\u17B5\u200E\u200F\u202A-\u202E\u2060-\u2064\u206A-\u206F\uFEFF\uFFF9-\uFFFB]',
 237     katakana    : '[〱-〵゛゜゠-ヺー-ヿㇰ-ㇿ㋐-㋾㌀-㍗ｦ-ﾝ]',
 238     extendnumlet: '[_‿⁀⁔︳︴﹍-﹏＿]',
 239     punctuation : '[!-#%-*,-\\/:;?@\\[-\\]_{}¡«·»¿;·՚-՟։֊־׀׃׆׳״؉؊،؍؛؞؟٪-٭۔܀-܍߷-߹࠰-࠾࡞।॥॰෴๏๚๛༄-༒༺-༽྅࿐-࿔࿙࿚၊-၏჻፡-፨᐀᙭᙮᚛᚜᛫-᛭᜵᜶។-៖៘-៚᠀-᠊᥄᥅᨞᨟᪠-᪦᪨-᪭᭚-᭠᯼-᯿᰻-᰿᱾᱿᳓‐-‧‰-⁃⁅-⁑⁓-⁞⁽⁾₍₎〈〉❨-❵⟅⟆⟦-⟯⦃-⦘⧘-⧛⧼⧽⳹-⳼⳾⳿⵰⸀-⸮⸰⸱、-〃〈-】〔-〟〰〽゠・꓾꓿꘍-꘏꙳꙾꛲-꛷꡴-꡷꣎꣏꣸-꣺꤮꤯꥟꧁-꧍꧞꧟꩜-꩟꫞꫟꯫﴾﴿︐-︙︰-﹒﹔-﹡﹣﹨﹪﹫！-＃％-＊，-／：；？＠［-］＿｛｝｟-･]'
 240 };
 241
 242
 243 }, '3.3.0' );
 244 YUI.add('text-wordbreak', function(Y) {
 245
 246 /**
 247  * Provides utility methods for splitting strings on word breaks and determining
 248  * whether a character index represents a word boundary.
 249  *
 250  * @module text
 251  * @submodule text-wordbreak
 252  */
 253
 254 /**
 255  * <p>
 256  * Provides utility methods for splitting strings on word breaks and determining
 257  * whether a character index represents a word boundary, using the generic word
 258  * breaking algorithm defined in the Unicode Text Segmentation guidelines
 259  * (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
 260  * Annex #29</a>).
 261  * </p>
 262  *
 263  * <p>
 264  * This algorithm provides a reasonable default for many languages. However, it
 265  * does not cover language or context specific requirements, and it does not
 266  * provide meaningful results at all for languages that don't use spaces between
 267  * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
 268  * word breaking services usually provide significantly better results with
 269  * better performance.
 270  * </p>
 271  *
 272  * @class Text.WordBreak
 273  * @static
 274  */
 275
 276 var Text   = Y.Text,
 277     WBData = Text.Data.WordBreak,
 278
 279 // Constants representing code point classifications.
 280 ALETTER      = 0,
 281 MIDNUMLET    = 1,
 282 MIDLETTER    = 2,
 283 MIDNUM       = 3,
 284 NUMERIC      = 4,
 285 CR           = 5,
 286 LF           = 6,
 287 NEWLINE      = 7,
 288 EXTEND       = 8,
 289 FORMAT       = 9,
 290 KATAKANA     = 10,
 291 EXTENDNUMLET = 11,
 292 OTHER        = 12,
 293
 294 // RegExp objects generated from code point data. Each regex matches a single
 295 // character against a set of Unicode code points. The index of each item in
 296 // this array must match its corresponding code point constant value defined
 297 // above.
 298 SETS = [
 299     new RegExp(WBData.aletter),
 300     new RegExp(WBData.midnumlet),
 301     new RegExp(WBData.midletter),
 302     new RegExp(WBData.midnum),
 303     new RegExp(WBData.numeric),
 304     new RegExp(WBData.cr),
 305     new RegExp(WBData.lf),
 306     new RegExp(WBData.newline),
 307     new RegExp(WBData.extend),
 308     new RegExp(WBData.format),
 309     new RegExp(WBData.katakana),
 310     new RegExp(WBData.extendnumlet)
 311 ],
 312
 313 EMPTY_STRING = '',
 314 PUNCTUATION  = new RegExp('^' + WBData.punctuation + '$'),
 315 WHITESPACE   = /\s/,
 316
 317 WordBreak = {
 318     // -- Public Static Methods ------------------------------------------------
 319
 320     /**
 321      * Splits the specified string into an array of individual words.
 322      *
 323      * @method getWords
 324      * @param {String} string String to split.
 325      * @param {Object} options (optional) Options object containing zero or more
 326      *   of the following properties:
 327      *
 328      * <dl>
 329      *   <dt>ignoreCase (Boolean)</dt>
 330      *   <dd>
 331      *     If <code>true</code>, the string will be converted to lowercase
 332      *     before being split. Default is <code>false</code>.
 333      *   </dd>
 334      *
 335      *   <dt>includePunctuation (Boolean)</dt>
 336      *   <dd>
 337      *     If <code>true</code>, the returned array will include punctuation
 338      *     characters. Default is <code>false</code>.
 339      *   </dd>
 340      *
 341      *   <dt>includeWhitespace (Boolean)</dt>
 342      *   <dd>
 343      *     If <code>true</code>, the returned array will include whitespace
 344      *     characters. Default is <code>false</code>.
 345      *   </dd>
 346      * </dl>
 347      * @return {Array} Array of words.
 348      * @static
 349      */
 350     getWords: function (string, options) {
 351         var i     = 0,
 352             map   = WordBreak._classify(string),
 353             len   = map.length,
 354             word  = [],
 355             words = [],
 356             chr,
 357             includePunctuation,
 358             includeWhitespace;
 359
 360         if (!options) {
 361             options = {};
 362         }
 363
 364         if (options.ignoreCase) {
 365             string = string.toLowerCase();
 366         }
 367
 368         includePunctuation = options.includePunctuation;
 369         includeWhitespace  = options.includeWhitespace;
 370
 371         // Loop through each character in the classification map and determine
 372         // whether it precedes a word boundary, building an array of distinct
 373         // words as we go.
 374         for (; i < len; ++i) {
 375             chr = string.charAt(i);
 376
 377             // Append this character to the current word.
 378             word.push(chr);
 379
 380             // If there's a word boundary between the current character and the
 381             // next character, append the current word to the words array and
 382             // start building a new word.
 383             if (WordBreak._isWordBoundary(map, i)) {
 384                 word = word.join(EMPTY_STRING);
 385
 386                 if (word &&
 387                         (includeWhitespace  || !WHITESPACE.test(word)) &&
 388                         (includePunctuation || !PUNCTUATION.test(word))) {
 389                     words.push(word);
 390                 }
 391
 392                 word = [];
 393             }
 394         }
 395
 396         return words;
 397     },
 398
 399     /**
 400      * Returns an array containing only unique words from the specified string.
 401      * For example, the string <code>'foo bar baz foo'</code> would result in
 402      * the array <code>['foo', 'bar', 'baz']</code>.
 403      *
 404      * @method getUniqueWords
 405      * @param {String} string String to split.
 406      * @param {Object} options (optional) Options (see <code>getWords()</code>
 407      *   for details).
 408      * @return {Array} Array of unique words.
 409      * @static
 410      */
 411     getUniqueWords: function (string, options) {
 412         return Y.Array.unique(WordBreak.getWords(string, options));
 413     },
 414
 415     /**
 416      * <p>
 417      * Returns <code>true</code> if there is a word boundary between the
 418      * specified character index and the next character index (or the end of the
 419      * string).
 420      * </p>
 421      *
 422      * <p>
 423      * Note that there are always word breaks at the beginning and end of a
 424      * string, so <code>isWordBoundary('', 0)</code> and
 425      * <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
 426      * </p>
 427      *
 428      * @method isWordBoundary
 429      * @param {String} string String to test.
 430      * @param {Number} index Character index to test within the string.
 431      * @return {Boolean} <code>true</code> for a word boundary,
 432      *   <code>false</code> otherwise.
 433      * @static
 434      */
 435     isWordBoundary: function (string, index) {
 436         return WordBreak._isWordBoundary(WordBreak._classify(string), index);
 437     },
 438
 439     // -- Protected Static Methods ---------------------------------------------
 440
 441     /**
 442      * Returns a character classification map for the specified string.
 443      *
 444      * @method _classify
 445      * @param {String} string String to classify.
 446      * @return {Array} Classification map.
 447      * @protected
 448      * @static
 449      */
 450     _classify: function (string) {
 451         var chr,
 452             map          = [],
 453             i            = 0,
 454             j,
 455             set,
 456             stringLength = string.length,
 457             setsLength   = SETS.length,
 458             type;
 459
 460         for (; i < stringLength; ++i) {
 461             chr  = string.charAt(i);
 462             type = OTHER;
 463
 464             for (j = 0; j < setsLength; ++j) {
 465                 set = SETS[j];
 466
 467                 if (set && set.test(chr)) {
 468                     type = j;
 469                     break;
 470                 }
 471             }
 472
 473             map.push(type);
 474         }
 475
 476         return map;
 477     },
 478
 479     /**
 480      * <p>
 481      * Returns <code>true</code> if there is a word boundary between the
 482      * specified character index and the next character index (or the end of the
 483      * string).
 484      * </p>
 485      *
 486      * <p>
 487      * Note that there are always word breaks at the beginning and end of a
 488      * string, so <code>_isWordBoundary('', 0)</code> and
 489      * <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
 490      * </p>
 491      *
 492      * @method _isWordBoundary
 493      * @param {Array} map Character classification map generated by
 494      *   <code>_classify</code>.
 495      * @param {Number} index Character index to test.
 496      * @return {Boolean}
 497      * @protected
 498      * @static
 499      */
 500     _isWordBoundary: function (map, index) {
 501         var prevType,
 502             type     = map[index],
 503             nextType = map[index + 1],
 504             nextNextType;
 505
 506         if (index < 0 || (index > map.length - 1 && index !== 0)) {
 507             return false;
 508         }
 509
 510         // WB5. Don't break between most letters.
 511         if (type === ALETTER && nextType === ALETTER) {
 512             return false;
 513         }
 514
 515         nextNextType = map[index + 2];
 516
 517         // WB6. Don't break letters across certain punctuation.
 518         if (type === ALETTER &&
 519                 (nextType === MIDLETTER || nextType === MIDNUMLET) &&
 520                 nextNextType === ALETTER) {
 521             return false;
 522         }
 523
 524         prevType = map[index - 1];
 525
 526         // WB7. Don't break letters across certain punctuation.
 527         if ((type === MIDLETTER || type === MIDNUMLET) &&
 528                 nextType === ALETTER &&
 529                 prevType === ALETTER) {
 530             return false;
 531         }
 532
 533         // WB8/WB9/WB10. Don't break inside sequences of digits or digits
 534         // adjacent to letters.
 535         if ((type === NUMERIC || type === ALETTER) &&
 536                 (nextType === NUMERIC || nextType === ALETTER)) {
 537             return false;
 538         }
 539
 540         // WB11. Don't break inside numeric sequences like "3.2" or
 541         // "3,456.789".
 542         if ((type === MIDNUM || type === MIDNUMLET) &&
 543                 nextType === NUMERIC &&
 544                 prevType === NUMERIC) {
 545             return false;
 546         }
 547
 548         // WB12. Don't break inside numeric sequences like "3.2" or
 549         // "3,456.789".
 550         if (type === NUMERIC &&
 551                 (nextType === MIDNUM || nextType === MIDNUMLET) &&
 552                 nextNextType === NUMERIC) {
 553             return false;
 554         }
 555
 556         // WB4. Ignore format and extend characters.
 557         if (type === EXTEND || type === FORMAT ||
 558                 prevType === EXTEND || prevType === FORMAT ||
 559                 nextType === EXTEND || nextType === FORMAT) {
 560             return false;
 561         }
 562
 563         // WB3. Don't break inside CRLF.
 564         if (type === CR && nextType === LF) {
 565             return false;
 566         }
 567
 568         // WB3a. Break before newlines (including CR and LF).
 569         if (type === NEWLINE || type === CR || type === LF) {
 570             return true;
 571         }
 572
 573         // WB3b. Break after newlines (including CR and LF).
 574         if (nextType === NEWLINE || nextType === CR || nextType === LF) {
 575             return true;
 576         }
 577
 578         // WB13. Don't break between Katakana characters.
 579         if (type === KATAKANA && nextType === KATAKANA) {
 580             return false;
 581         }
 582
 583         // WB13a. Don't break from extenders.
 584         if (nextType === EXTENDNUMLET &&
 585                 (type === ALETTER || type === NUMERIC || type === KATAKANA ||
 586                 type === EXTENDNUMLET)) {
 587             return false;
 588         }
 589
 590         // WB13b. Don't break from extenders.
 591         if (type === EXTENDNUMLET &&
 592                 (nextType === ALETTER || nextType === NUMERIC ||
 593                 nextType === KATAKANA)) {
 594             return false;
 595         }
 596
 597         // Break after any character not covered by the rules above.
 598         return true;
 599     }
 600 };
 601
 602 Text.WordBreak = WordBreak;
 603
 604
 605 }, '3.3.0' ,{requires:['array-extras', 'text-data-wordbreak']});
 606
 607
 608 YUI.add('text', function(Y){}, '3.3.0' ,{use:['text-accentfold', 'text-wordbreak']});
 609