@ephox/polaris
Advanced tools
Comparing version 6.0.9 to 6.1.0-alpha.0
@@ -9,2 +9,5 @@ # Changelog | ||
### Added | ||
- New `Words.getWordsWithIndices` API to extract words and their start and end indices when provided an array of characters. #TINY-9654 | ||
## 6.0.2 - 2022-06-29 | ||
@@ -11,0 +14,0 @@ |
@@ -1,5 +0,7 @@ | ||
import * as WordOptions from '../words/Words'; | ||
declare type GetWordsApi = <T>(chars: T[], extract: (char: T) => string, options?: WordOptions.WordOptions) => T[][]; | ||
import * as Words from '../words/Words'; | ||
declare type GetWordsApi = <T>(chars: T[], extract: (char: T) => string, options?: Words.WordOptions) => Words.Word<T>[]; | ||
declare const getWords: GetWordsApi; | ||
export { getWords }; | ||
declare type GetWordsAndIndicesApi = <T>(chars: T[], extract: (char: T) => string, options?: Words.WordOptions) => Words.WordsWithIndices<T>; | ||
declare const getWordsWithIndices: GetWordsAndIndicesApi; | ||
export { getWords, getWordsWithIndices }; | ||
//# sourceMappingURL=Words.d.ts.map |
@@ -1,4 +0,5 @@ | ||
import * as WordOptions from '../words/Words'; | ||
const getWords = WordOptions.getWords; | ||
export { getWords }; | ||
import * as Words from '../words/Words'; | ||
const getWords = Words.getWords; | ||
const getWordsWithIndices = Words.getWordsWithIndices; | ||
export { getWords, getWordsWithIndices }; | ||
//# sourceMappingURL=Words.js.map |
@@ -1,2 +0,2 @@ | ||
declare const punctuationStr = "[!-#%-*,-\\/:;?@\\[-\\]_{}\u00A1\u00AB\u00B7\u00BB\u00BF;\u00B7\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1361-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u3008\u3009\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30\u2E31\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]"; | ||
declare const punctuationStr = "[~^\u2116|!-*+-\\/:;?@\\[-\\]_{}\u00A1\u00AB\u00B7\u00BB\u00BF;\u00B7\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1361-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u3008\u3009\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30\u2E31\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]"; | ||
declare const characterIndices: { | ||
@@ -3,0 +3,0 @@ ALETTER: number; |
/* eslint-disable max-len */ | ||
const punctuationStr = '[!-#%-*,-\\/:;?@\\[-\\]_{}\u00A1\u00AB\u00B7\u00BB\u00BF;\u00B7\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1361-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u3008\u3009\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30\u2E31\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]'; | ||
const punctuationStr = `[~^№|!-*+-\\/:;?@\\[-\\]_{}\u00A1\u00AB\u00B7\u00BB\u00BF;\u00B7\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1361-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u3008\u3009\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30\u2E31\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]`; | ||
const regExps = { | ||
@@ -4,0 +4,0 @@ aletter: '[A-Za-z\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EC\u02EE\u0370-\u0374\u0376\u0377\u037A-\u037D\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03F5\u03F7-\u0481\u048A-\u0527\u0531-\u0556\u0559\u0561-\u0587\u05D0-\u05EA\u05F0-\u05F3\u0620-\u064A\u066E\u066F\u0671-\u06D3\u06D5\u06E5\u06E6\u06EE\u06EF\u06FA-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1\u07CA-\u07EA\u07F4\u07F5\u07FA\u0800-\u0815\u081A\u0824\u0828\u0840-\u0858\u0904-\u0939\u093D\u0950\u0958-\u0961\u0971-\u0977\u0979-\u097F\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC\u09DD\u09DF-\u09E1\u09F0\u09F1\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AD0\u0AE0\u0AE1\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B35-\u0B39\u0B3D\u0B5C\u0B5D\u0B5F-\u0B61\u0B71\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0BD0\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C3D\u0C58\u0C59\u0C60\u0C61\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0\u0CE1\u0CF1\u0CF2\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D3A\u0D3D\u0D4E\u0D60\u0D61\u0D7A-\u0D7F\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0F00\u0F40-\u0F47\u0F49-\u0F6C\u0F88-\u0F8C\u10A0-\u10C5\u10D0-\u10FA\u10FC\u1100-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1380-\u138F\u13A0-\u13F4\u1401-\u166C\u166F-\u167F\u1681-\u169A\u16A0-\u16EA\u16EE-\u16F0\u1700-\u170C\u170E-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1820-\u1877\u1880-\u18A8\u18AA\u18B0-\u18F5\u1900-\u191C\u1A00-\u1A16\u1B05-\u1B33\u1B45-\u1
B4B\u1B83-\u1BA0\u1BAE\u1BAF\u1BC0-\u1BE5\u1C00-\u1C23\u1C4D-\u1C4F\u1C5A-\u1C7D\u1CE9-\u1CEC\u1CEE-\u1CF1\u1D00-\u1DBF\u1E00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2071\u207F\u2090-\u209C\u2102\u2107\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2139\u213C-\u213F\u2145-\u2149\u214E\u2160-\u2188\u24B6-\u24E9\u2C00-\u2C2E\u2C30-\u2C5E\u2C60-\u2CE4\u2CEB-\u2CEE\u2D00-\u2D25\u2D30-\u2D65\u2D6F\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u2E2F\u3005\u303B\u303C\u3105-\u312D\u3131-\u318E\u31A0-\u31BA\uA000-\uA48C\uA4D0-\uA4FD\uA500-\uA60C\uA610-\uA61F\uA62A\uA62B\uA640-\uA66E\uA67F-\uA697\uA6A0-\uA6EF\uA717-\uA71F\uA722-\uA788\uA78B-\uA78E\uA790\uA791\uA7A0-\uA7A9\uA7FA-\uA801\uA803-\uA805\uA807-\uA80A\uA80C-\uA822\uA840-\uA873\uA882-\uA8B3\uA8F2-\uA8F7\uA8FB\uA90A-\uA925\uA930-\uA946\uA960-\uA97C\uA984-\uA9B2\uA9CF\uAA00-\uAA28\uAA40-\uAA42\uAA44-\uAA4B\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E\uABC0-\uABE2\uAC00-\uD7A3\uD7B0-\uD7C6\uD7CB-\uD7FB\uFB00-\uFB06\uFB13-\uFB17\uFB1D\uFB1F-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF21-\uFF3A\uFF41-\uFF5A\uFFA0-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC]', |
@@ -47,5 +47,12 @@ import { characterIndices as ci } from './UnicodeData'; | ||
// WB4. Ignore format and extend characters. | ||
if (type === ci.EXTEND || type === ci.FORMAT || | ||
prevType === ci.EXTEND || prevType === ci.FORMAT || | ||
nextType === ci.EXTEND || nextType === ci.FORMAT) { | ||
if ((type === ci.EXTEND || type === ci.FORMAT) && | ||
(nextType === ci.ALETTER || nextType === ci.NUMERIC || nextType === ci.KATAKANA || | ||
nextType === ci.EXTEND || nextType === ci.FORMAT) | ||
|| | ||
(nextType === ci.EXTEND || | ||
// TINY-9654: Only ignore format characters if they do not precede a word boundary. Since some format characters overlap with whitespace characters (ex: \ufeff) and | ||
// our word extraction logic excludes whitespace characters, if a whitespace-overlapping format character that precedes a word boundary is not split on, whichever word | ||
// it is a part of will not be added to the list of extracted words, causing inaccuracies. | ||
nextType === ci.FORMAT && (nextNextType === ci.ALETTER || nextNextType === ci.NUMERIC || nextNextType === ci.KATAKANA || nextNextType === ci.EXTEND || nextNextType === ci.FORMAT)) | ||
&& (type === ci.ALETTER || type === ci.NUMERIC || type === ci.KATAKANA || type === ci.EXTEND || type === ci.FORMAT)) { | ||
return false; | ||
@@ -52,0 +59,0 @@ } |
@@ -0,1 +1,10 @@ | ||
export declare type Word<T> = T[]; | ||
interface WordIndex { | ||
readonly start: number; | ||
readonly end: number; | ||
} | ||
export interface WordsWithIndices<T> { | ||
readonly words: Word<T>[]; | ||
readonly indices: WordIndex[]; | ||
} | ||
export interface WordOptions { | ||
@@ -5,4 +14,5 @@ includeWhitespace?: boolean; | ||
} | ||
declare const getWords: <T>(chars: T[], extract: (char: T) => string, options?: WordOptions) => T[][]; | ||
export { getWords }; | ||
declare const getWordsWithIndices: <T>(chars: Word<T>, extract: (char: T) => string, options?: WordOptions) => WordsWithIndices<T>; | ||
declare const getWords: <T>(chars: Word<T>, extract: (char: T) => string, options?: WordOptions) => Word<T>[]; | ||
export { getWords, getWordsWithIndices }; | ||
//# sourceMappingURL=Words.d.ts.map |
@@ -1,2 +0,2 @@ | ||
import { Unicode } from '@ephox/katamari'; | ||
import { Arr } from '@ephox/katamari'; | ||
import { classify } from './StringMapper'; | ||
@@ -23,4 +23,5 @@ import * as UnicodeData from './UnicodeData'; | ||
}; | ||
const findWords = (chars, sChars, characterMap, options) => { | ||
const findWordsWithIndices = (chars, sChars, characterMap, options) => { | ||
const words = []; | ||
const indices = []; | ||
let word = []; | ||
@@ -48,2 +49,6 @@ // Loop through each character in the classification map and determine whether | ||
words.push(word); | ||
indices.push({ | ||
start: startOfWord, | ||
end: endOfWord | ||
}); | ||
} | ||
@@ -53,3 +58,3 @@ word = []; | ||
} | ||
return words; | ||
return { words, indices }; | ||
}; | ||
@@ -60,3 +65,3 @@ const getDefaultOptions = () => ({ | ||
}); | ||
const getWords = (chars, extract, options) => { | ||
const getWordsWithIndices = (chars, extract, options) => { | ||
options = { | ||
@@ -66,16 +71,8 @@ ...getDefaultOptions(), | ||
}; | ||
const filteredChars = []; | ||
const extractedChars = []; | ||
// tslint:disable-next-line:prefer-for-of | ||
for (let i = 0; i < chars.length; i++) { | ||
const ch = extract(chars[i]); | ||
if (ch !== Unicode.zeroWidth) { | ||
filteredChars.push(chars[i]); | ||
extractedChars.push(ch); | ||
} | ||
} | ||
const extractedChars = Arr.map(chars, extract); | ||
const characterMap = classify(extractedChars); | ||
return findWords(filteredChars, extractedChars, characterMap, options); | ||
return findWordsWithIndices(chars, extractedChars, characterMap, options); | ||
}; | ||
export { getWords }; | ||
const getWords = (chars, extract, options) => getWordsWithIndices(chars, extract, options).words; | ||
export { getWords, getWordsWithIndices }; | ||
//# sourceMappingURL=Words.js.map |
@@ -44,4 +44,30 @@ import { Assert, UnitTest } from '@ephox/bedrock-client'; | ||
assertWords(['42.6±4.2'], '42.6±4.2'); | ||
assertWords(['ab'], 'a\ufeffb'); | ||
// TINY-9654: Does not split on extend characters (ex: \u0300) | ||
assertWords(['a\u0300b'], 'a\u0300b'); | ||
assertWords(['a\u0300bc'], 'a\u0300bc'); | ||
assertWords(['ab\u0300c'], 'ab\u0300c'); | ||
assertWords(['a\u0300b', 'c'], 'a\u0300b c'); | ||
assertWords(['\u0300b'], '\u0300b'); | ||
assertWords(['a', '\u0300b'], 'a \u0300b'); | ||
assertWords(['\u0300'], '\u0300'); | ||
assertWords(['\u0300'], '\u0300 '); | ||
assertWords(['a\u0300'], 'a\u0300'); | ||
assertWords(['a\u0300'], 'a\u0300 '); | ||
// TINY-9654: Does not split on format characters (ex: \ufeff) if they do not precede a word boundary | ||
// TINY-9654: Does not strip \ufeff characters (obsolete TINY-1166 fix removed) | ||
assertWords(['a\ufeffb'], 'a\ufeffb'); | ||
assertWords(['a\ufeffbc'], 'a\ufeffbc'); | ||
assertWords(['ab\ufeffc'], 'ab\ufeffc'); | ||
assertWords(['a\ufeffb', 'c'], 'a\ufeffb c'); | ||
assertWords(['\ufeffb'], '\ufeffb'); | ||
assertWords(['a', '\ufeffb'], 'a \ufeffb'); | ||
// TINY-9654: Split on format characters if they precede a word boundary. Some format characters overlap with whitespace | ||
// characters (ex: \ufeff). Since whitespace characters are not extracted, if a whitespace-overlapping format character that | ||
// precedes a word boundary is not split on, whichever word it is a part of will not be added to the list of extracted words, | ||
// causing inaccuracies. | ||
assertWords([], '\ufeff'); | ||
assertWords([], '\ufeff '); | ||
assertWords(['a'], 'a\ufeff'); | ||
assertWords(['a'], 'a\ufeff '); | ||
}); | ||
//# sourceMappingURL=WordsTest.js.map |
{ | ||
"name": "@ephox/polaris", | ||
"description": "This project does data manipulation on arrays and strings.", | ||
"version": "6.0.9", | ||
"version": "6.1.0-alpha.0", | ||
"repository": { | ||
@@ -28,3 +28,2 @@ "type": "git", | ||
"scripts": { | ||
"prepublishOnly": "tsc -b", | ||
"test": "bedrock-auto -b chrome-headless -d src/test/ts", | ||
@@ -38,4 +37,3 @@ "test-manual": "bedrock -d src/test/ts", | ||
"module": "./lib/main/ts/ephox/polaris/api/Main.js", | ||
"types": "./lib/main/ts/ephox/polaris/api/Main.d.ts", | ||
"gitHead": "d0a354c4921b1ce815cfb36cc72ea3a309932fee" | ||
"types": "./lib/main/ts/ephox/polaris/api/Main.d.ts" | ||
} |
@@ -1,8 +0,12 @@ | ||
import * as WordOptions from '../words/Words'; | ||
import * as Words from '../words/Words'; | ||
type GetWordsApi = <T>(chars: T[], extract: (char: T) => string, options?: WordOptions.WordOptions) => T[][]; | ||
const getWords: GetWordsApi = WordOptions.getWords; | ||
type GetWordsApi = <T>(chars: T[], extract: (char: T) => string, options?: Words.WordOptions) => Words.Word<T>[]; | ||
const getWords: GetWordsApi = Words.getWords; | ||
type GetWordsAndIndicesApi = <T>(chars: T[], extract: (char: T) => string, options?: Words.WordOptions) => Words.WordsWithIndices<T>; | ||
const getWordsWithIndices: GetWordsAndIndicesApi = Words.getWordsWithIndices; | ||
export { | ||
getWords | ||
getWords, | ||
getWordsWithIndices | ||
}; |
/* eslint-disable max-len */ | ||
const punctuationStr = '[!-#%-*,-\\/:;?@\\[-\\]_{}\u00A1\u00AB\u00B7\u00BB\u00BF;\u00B7\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1361-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u3008\u3009\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30\u2E31\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]'; | ||
const punctuationStr = `[~^№|!-*+-\\/:;?@\\[-\\]_{}\u00A1\u00AB\u00B7\u00BB\u00BF;\u00B7\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1361-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u3008\u3009\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30\u2E31\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]`; | ||
@@ -5,0 +5,0 @@ const regExps = { |
@@ -59,5 +59,12 @@ import { CharacterMap } from './StringMapper'; | ||
// WB4. Ignore format and extend characters. | ||
if (type === ci.EXTEND || type === ci.FORMAT || | ||
prevType === ci.EXTEND || prevType === ci.FORMAT || | ||
nextType === ci.EXTEND || nextType === ci.FORMAT) { | ||
if ((type === ci.EXTEND || type === ci.FORMAT) && | ||
(nextType === ci.ALETTER || nextType === ci.NUMERIC || nextType === ci.KATAKANA || | ||
nextType === ci.EXTEND || nextType === ci.FORMAT) | ||
|| | ||
(nextType === ci.EXTEND || | ||
// TINY-9654: Only ignore format characters if they do not precede a word boundary. Since some format characters overlap with whitespace characters (ex: \ufeff) and | ||
// our word extraction logic excludes whitespace characters, if a whitespace-overlapping format character that precedes a word boundary is not split on, whichever word | ||
// it is a part of will not be added to the list of extracted words, causing inaccuracies. | ||
nextType === ci.FORMAT && (nextNextType === ci.ALETTER || nextNextType === ci.NUMERIC || nextNextType === ci.KATAKANA || nextNextType === ci.EXTEND || nextNextType === ci.FORMAT)) | ||
&& (type === ci.ALETTER || type === ci.NUMERIC || type === ci.KATAKANA || type === ci.EXTEND || type === ci.FORMAT)) { | ||
return false; | ||
@@ -64,0 +71,0 @@ } |
@@ -1,2 +0,2 @@ | ||
import { Unicode } from '@ephox/katamari'; | ||
import { Arr } from '@ephox/katamari'; | ||
@@ -29,6 +29,19 @@ import { CharacterMap, classify } from './StringMapper'; | ||
const findWords = <T>(chars: T[], sChars: string[], characterMap: CharacterMap, options: WordOptions): T[][] => { | ||
const words: T[][] = []; | ||
let word: T[] = []; | ||
export type Word<T> = T[]; | ||
interface WordIndex { | ||
readonly start: number; | ||
readonly end: number; | ||
} | ||
export interface WordsWithIndices<T> { | ||
readonly words: Word<T>[]; | ||
readonly indices: WordIndex[]; | ||
} | ||
const findWordsWithIndices = <T>(chars: Word<T>, sChars: string[], characterMap: CharacterMap, options: WordOptions): WordsWithIndices<T> => { | ||
const words: Word<T>[] = []; | ||
const indices: WordIndex[] = []; | ||
let word: Word<T> = []; | ||
// Loop through each character in the classification map and determine whether | ||
@@ -61,2 +74,6 @@ // it precedes a word boundary, building an array of distinct words as we go. | ||
words.push(word); | ||
indices.push({ | ||
start: startOfWord, | ||
end: endOfWord | ||
}); | ||
} | ||
@@ -68,3 +85,3 @@ | ||
return words; | ||
return { words, indices }; | ||
}; | ||
@@ -82,3 +99,3 @@ | ||
const getWords = <T>(chars: T[], extract: (char: T) => string, options?: WordOptions): T[][] => { | ||
const getWordsWithIndices = <T>(chars: Word<T>, extract: (char: T) => string, options?: WordOptions): WordsWithIndices<T> => { | ||
options = { | ||
@@ -88,21 +105,13 @@ ...getDefaultOptions(), | ||
}; | ||
const filteredChars: T[] = []; | ||
const extractedChars: string[] = []; | ||
// tslint:disable-next-line:prefer-for-of | ||
for (let i = 0; i < chars.length; i++) { | ||
const ch = extract(chars[i]); | ||
if (ch !== Unicode.zeroWidth) { | ||
filteredChars.push(chars[i]); | ||
extractedChars.push(ch); | ||
} | ||
} | ||
const extractedChars: string[] = Arr.map(chars, extract); | ||
const characterMap: CharacterMap = classify(extractedChars); | ||
return findWords(filteredChars, extractedChars, characterMap, options); | ||
return findWordsWithIndices(chars, extractedChars, characterMap, options); | ||
}; | ||
const getWords = <T>(chars: Word<T>, extract: (char: T) => string, options?: WordOptions): Word<T>[] => | ||
getWordsWithIndices(chars, extract, options).words; | ||
export { | ||
getWords | ||
getWords, | ||
getWordsWithIndices | ||
}; |
@@ -55,3 +55,32 @@ import { Assert, UnitTest } from '@ephox/bedrock-client'; | ||
assertWords([ '42.6±4.2' ], '42.6±4.2'); | ||
assertWords([ 'ab' ], 'a\ufeffb'); | ||
// TINY-9654: Does not split on extend characters (ex: \u0300) | ||
assertWords([ 'a\u0300b' ], 'a\u0300b'); | ||
assertWords([ 'a\u0300bc' ], 'a\u0300bc'); | ||
assertWords([ 'ab\u0300c' ], 'ab\u0300c'); | ||
assertWords([ 'a\u0300b', 'c' ], 'a\u0300b c'); | ||
assertWords([ '\u0300b' ], '\u0300b'); | ||
assertWords([ 'a', '\u0300b' ], 'a \u0300b'); | ||
assertWords([ '\u0300' ], '\u0300'); | ||
assertWords([ '\u0300' ], '\u0300 '); | ||
assertWords([ 'a\u0300' ], 'a\u0300'); | ||
assertWords([ 'a\u0300' ], 'a\u0300 '); | ||
// TINY-9654: Does not split on format characters (ex: \ufeff) if they do not precede a word boundary | ||
// TINY-9654: Does not strip \ufeff characters (obsolete TINY-1166 fix removed) | ||
assertWords([ 'a\ufeffb' ], 'a\ufeffb'); | ||
assertWords([ 'a\ufeffbc' ], 'a\ufeffbc'); | ||
assertWords([ 'ab\ufeffc' ], 'ab\ufeffc'); | ||
assertWords([ 'a\ufeffb', 'c' ], 'a\ufeffb c'); | ||
assertWords([ '\ufeffb' ], '\ufeffb'); | ||
assertWords([ 'a', '\ufeffb' ], 'a \ufeffb'); | ||
// TINY-9654: Split on format characters if they precede a word boundary. Some format characters overlap with whitespace | ||
// characters (ex: \ufeff). Since whitespace characters are not extracted, if a whitespace-overlapping format character that | ||
// precedes a word boundary is not split on, whichever word it is a part of will not be added to the list of extracted words, | ||
// causing inaccuracies. | ||
assertWords([ ], '\ufeff'); | ||
assertWords([ ], '\ufeff '); | ||
assertWords([ 'a' ], 'a\ufeff'); | ||
assertWords([ 'a' ], 'a\ufeff '); | ||
}); |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
No v1
Quality: Package is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
253497
3844
239
2