3
Copyright 2011 Yahoo! Inc. All rights reserved.
4
Licensed under the BSD License.
5
http://yuilibrary.com/license/
7
YUI.add('text-wordbreak', function(Y) {
10
* Provides utility methods for splitting strings on word breaks and determining
11
* whether a character index represents a word boundary.
14
* @submodule text-wordbreak
19
* Provides utility methods for splitting strings on word breaks and determining
20
* whether a character index represents a word boundary, using the generic word
21
* breaking algorithm defined in the Unicode Text Segmentation guidelines
22
* (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard Annex #29</a>).
27
* This algorithm provides a reasonable default for many languages. However, it
28
* does not cover language or context specific requirements, and it does not
29
* provide meaningful results at all for languages that don't use spaces between
30
* words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
31
* word breaking services usually provide significantly better results.
35
* @class Text.WordBreak
40
WBData = Text.Data.WordBreak,
42
// Constants representing code point classifications.
57
// RegExp objects generated from code point data. Each regex matches a single
58
// character against a set of Unicode code points. The index of each item in
59
// this array must match its corresponding code point constant value defined above.
62
new RegExp(WBData.aletter),
63
new RegExp(WBData.midnumlet),
64
new RegExp(WBData.midletter),
65
new RegExp(WBData.midnum),
66
new RegExp(WBData.numeric),
67
new RegExp(WBData.cr),
68
new RegExp(WBData.lf),
69
new RegExp(WBData.newline),
70
new RegExp(WBData.extend),
71
new RegExp(WBData.format),
72
new RegExp(WBData.katakana),
73
new RegExp(WBData.extendnumlet)
77
PUNCTUATION = new RegExp('^' + WBData.punctuation + '$'),
81
// -- Public Static Methods ------------------------------------------------
84
* Splits the specified string into an array of individual words.
87
* @param {String} string String to split.
88
* @param {Object} options (optional) Options object containing zero or more
89
* of the following properties:
92
* <dt>ignoreCase (Boolean)</dt>
94
* If <code>true</code>, the string will be converted to lowercase
95
* before being split. Default is <code>false</code>.
98
* <dt>includePunctuation (Boolean)</dt>
100
* If <code>true</code>, the returned array will include punctuation
101
* characters. Default is <code>false</code>.
104
* <dt>includeWhitespace (Boolean)</dt>
106
* If <code>true</code>, the returned array will include whitespace
107
* characters. Default is <code>false</code>.
110
* @return {Array} Array of words.
113
getWords: function (string, options) {
115
map = WordBreak._classify(string),
127
if (options.ignoreCase) {
128
string = string.toLowerCase();
131
includePunctuation = options.includePunctuation;
132
includeWhitespace = options.includeWhitespace;
134
// Loop through each character in the classification map and determine
135
// whether it precedes a word boundary, building an array of distinct words as we go.
137
for (; i < len; ++i) {
138
chr = string.charAt(i);
140
// Append this character to the current word.
143
// If there's a word boundary between the current character and the
144
// next character, append the current word to the words array and
145
// start building a new word.
146
if (WordBreak._isWordBoundary(map, i)) {
147
word = word.join(EMPTY_STRING);
150
(includeWhitespace || !WHITESPACE.test(word)) &&
151
(includePunctuation || !PUNCTUATION.test(word))) {
163
* Returns an array containing only unique words from the specified string.
164
* For example, the string <code>'foo bar baz foo'</code> would result in
165
* the array <code>['foo', 'bar', 'baz']</code>.
167
* @method getUniqueWords
168
* @param {String} string String to split.
169
* @param {Object} options (optional) Options (see <code>getWords()</code> for details).
171
* @return {Array} Array of unique words.
174
getUniqueWords: function (string, options) {
175
return Y.Array.unique(WordBreak.getWords(string, options));
180
* Returns <code>true</code> if there is a word boundary between the
181
* specified character index and the next character index (or the end of the string).
186
* Note that there are always word breaks at the beginning and end of a
187
* string, so <code>isWordBoundary('', 0)</code> and
188
* <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
191
* @method isWordBoundary
192
* @param {String} string String to test.
193
* @param {Number} index Character index to test within the string.
194
* @return {Boolean} <code>true</code> for a word boundary,
195
* <code>false</code> otherwise.
198
isWordBoundary: function (string, index) {
199
return WordBreak._isWordBoundary(WordBreak._classify(string), index);
202
// -- Protected Static Methods ---------------------------------------------
205
* Returns a character classification map for the specified string.
208
* @param {String} string String to classify.
209
* @return {Array} Classification map.
213
_classify: function (string) {
219
stringLength = string.length,
220
setsLength = SETS.length,
223
for (; i < stringLength; ++i) {
224
chr = string.charAt(i);
227
for (j = 0; j < setsLength; ++j) {
230
if (set && set.test(chr)) {
244
* Returns <code>true</code> if there is a word boundary between the
245
* specified character index and the next character index (or the end of the string).
250
* Note that there are always word breaks at the beginning and end of a
251
* string, so <code>_isWordBoundary('', 0)</code> and
252
* <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
255
* @method _isWordBoundary
256
* @param {Array} map Character classification map generated by
257
* <code>_classify</code>.
258
* @param {Number} index Character index to test.
263
_isWordBoundary: function (map, index) {
266
nextType = map[index + 1],
269
if (index < 0 || (index > map.length - 1 && index !== 0)) {
273
// WB5. Don't break between most letters.
274
if (type === ALETTER && nextType === ALETTER) {
278
nextNextType = map[index + 2];
280
// WB6. Don't break letters across certain punctuation.
281
if (type === ALETTER &&
282
(nextType === MIDLETTER || nextType === MIDNUMLET) &&
283
nextNextType === ALETTER) {
287
prevType = map[index - 1];
289
// WB7. Don't break letters across certain punctuation.
290
if ((type === MIDLETTER || type === MIDNUMLET) &&
291
nextType === ALETTER &&
292
prevType === ALETTER) {
296
// WB8/WB9/WB10. Don't break inside sequences of digits or digits
297
// adjacent to letters.
298
if ((type === NUMERIC || type === ALETTER) &&
299
(nextType === NUMERIC || nextType === ALETTER)) {
303
// WB11. Don't break inside numeric sequences like "3.2" or "3,456.789".
305
if ((type === MIDNUM || type === MIDNUMLET) &&
306
nextType === NUMERIC &&
307
prevType === NUMERIC) {
311
// WB12. Don't break inside numeric sequences like "3.2" or "3,456.789".
313
if (type === NUMERIC &&
314
(nextType === MIDNUM || nextType === MIDNUMLET) &&
315
nextNextType === NUMERIC) {
319
// WB4. Ignore format and extend characters.
320
if (type === EXTEND || type === FORMAT ||
321
prevType === EXTEND || prevType === FORMAT ||
322
nextType === EXTEND || nextType === FORMAT) {
326
// WB3. Don't break inside CRLF.
327
if (type === CR && nextType === LF) {
331
// WB3a. Break after newlines (including CR and LF): the current character is a newline.
332
if (type === NEWLINE || type === CR || type === LF) {
336
// WB3b. Break before newlines (including CR and LF): the next character is a newline.
337
if (nextType === NEWLINE || nextType === CR || nextType === LF) {
341
// WB13. Don't break between Katakana characters.
342
if (type === KATAKANA && nextType === KATAKANA) {
346
// WB13a. Don't break from extenders.
347
if (nextType === EXTENDNUMLET &&
348
(type === ALETTER || type === NUMERIC || type === KATAKANA ||
349
type === EXTENDNUMLET)) {
353
// WB13b. Don't break from extenders.
354
if (type === EXTENDNUMLET &&
355
(nextType === ALETTER || nextType === NUMERIC ||
356
nextType === KATAKANA)) {
360
// Break after any character not covered by the rules above.
365
Text.WordBreak = WordBreak;
368
}, '3.4.1' ,{requires:['array-extras', 'text-data-wordbreak']});