@formatjs/intl-segmenter
Advanced tools
+3
-3
| { | ||
| "name": "@formatjs/intl-segmenter", | ||
| "description": "Polyfill for Intl.Segmenter", | ||
| "version": "12.0.7", | ||
| "version": "12.0.8", | ||
| "license": "MIT", | ||
@@ -15,4 +15,4 @@ "author": "Matija Gaspar <matijagaspar@gmail.com>", | ||
| "tslib": "^2.8.0", | ||
| "@formatjs/ecma402-abstract": "3.0.7", | ||
| "@formatjs/intl-localematcher": "0.7.4" | ||
| "@formatjs/intl-localematcher": "0.7.5", | ||
| "@formatjs/ecma402-abstract": "3.0.8" | ||
| }, | ||
@@ -19,0 +19,0 @@ "bugs": "https://github.com/formatjs/formatjs/issues", |
@@ -1,7 +0,7 @@ | ||
| import { Segmenter } from './src/segmenter.js'; | ||
| Object.defineProperty(Intl, 'Segmenter', { | ||
| value: Segmenter, | ||
| enumerable: false, | ||
| writable: true, | ||
| configurable: true, | ||
| import { Segmenter } from "./src/segmenter.js"; | ||
| Object.defineProperty(Intl, "Segmenter", { | ||
| value: Segmenter, | ||
| enumerable: false, | ||
| writable: true, | ||
| configurable: true | ||
| }); |
+8
-8
@@ -1,10 +0,10 @@ | ||
| import { Segmenter } from './src/segmenter.js'; | ||
| import { shouldPolyfill } from './should-polyfill.js'; | ||
| import { Segmenter } from "./src/segmenter.js"; | ||
| import { shouldPolyfill } from "./should-polyfill.js"; | ||
| if (shouldPolyfill()) { | ||
| Object.defineProperty(Intl, 'Segmenter', { | ||
| value: Segmenter, | ||
| enumerable: false, | ||
| writable: true, | ||
| configurable: true, | ||
| }); | ||
| Object.defineProperty(Intl, "Segmenter", { | ||
| value: Segmenter, | ||
| enumerable: false, | ||
| writable: true, | ||
| configurable: true | ||
| }); | ||
| } |
| export function shouldPolyfill() { | ||
| return !Intl.Segmenter; | ||
| return !Intl.Segmenter; | ||
| } |
@@ -1,15 +0,12 @@ | ||
| export var replaceVariables = function (variables, input) { | ||
| var findVarRegex = /\$[A-Za-z0-9_]+/gm; | ||
| return input.replaceAll(findVarRegex, function (match) { | ||
| if (!(match in variables)) { | ||
| throw new Error("No such variable ".concat(match)); | ||
| } | ||
| return variables[match]; | ||
| }); | ||
| export const replaceVariables = (variables, input) => { | ||
| const findVarRegex = /\$[A-Za-z0-9_]+/gm; | ||
| return input.replaceAll(findVarRegex, (match) => { | ||
| if (!(match in variables)) { | ||
| throw new Error(`No such variable ${match}`); | ||
| } | ||
| return variables[match]; | ||
| }); | ||
| }; | ||
| export var isSurrogate = function (str, pos) { | ||
| return (0xd800 <= str.charCodeAt(pos - 1) && | ||
| str.charCodeAt(pos - 1) <= 0xdbff && | ||
| 0xdc00 <= str.charCodeAt(pos) && | ||
| str.charCodeAt(pos) <= 0xdfff); | ||
| export const isSurrogate = (str, pos) => { | ||
| return 55296 <= str.charCodeAt(pos - 1) && str.charCodeAt(pos - 1) <= 56319 && 56320 <= str.charCodeAt(pos) && str.charCodeAt(pos) <= 57343; | ||
| }; | ||
@@ -16,0 +13,0 @@ // alternative surrogate check mimicking the java implementation |
+42
-42
| type SegmentResult = { | ||
| segment: string; | ||
| breakingRule?: string; | ||
| nonBreakingRules?: string[]; | ||
| segment: string; | ||
| breakingRule?: string; | ||
| nonBreakingRules?: string[]; | ||
| } | undefined; | ||
| export interface SegmenterOptions { | ||
| localeMatcher?: 'lookup' | 'best fit'; | ||
| granularity?: 'word' | 'sentence' | 'grapheme'; | ||
| localeMatcher?: "lookup" | "best fit"; | ||
| granularity?: "word" | "sentence" | "grapheme"; | ||
| } | ||
| export interface SegmenterResolvedOptions { | ||
| locale: string; | ||
| granularity: NonNullable<SegmenterOptions['granularity']>; | ||
| locale: string; | ||
| granularity: NonNullable<SegmenterOptions["granularity"]>; | ||
| } | ||
| declare const breaksAtResult: (breaks: boolean, matchingRule: string) => { | ||
| breaks: boolean; | ||
| matchingRule: string; | ||
| breaks: boolean; | ||
| matchingRule: string; | ||
| }; | ||
| export declare class Segmenter { | ||
| private readonly rules; | ||
| private readonly ruleSortedKeys; | ||
| private readonly mergedSegmentationTypeValue; | ||
| constructor(locales: string | string[] | undefined, options: SegmenterOptions); | ||
| breaksAt(position: number, input: string): ReturnType<typeof breaksAtResult>; | ||
| segment(input: string): SegmentIterator; | ||
| resolvedOptions(): SegmenterResolvedOptions; | ||
| static availableLocales: Set<string>; | ||
| static supportedLocalesOf(locales?: string | string[], options?: Pick<SegmenterOptions, 'localeMatcher'>): string[]; | ||
| static readonly polyfilled = true; | ||
| private readonly rules; | ||
| private readonly ruleSortedKeys; | ||
| private readonly mergedSegmentationTypeValue; | ||
| constructor(locales: string | string[] | undefined, options: SegmenterOptions); | ||
| breaksAt(position: number, input: string): ReturnType<typeof breaksAtResult>; | ||
| segment(input: string): SegmentIterator; | ||
| resolvedOptions(): SegmenterResolvedOptions; | ||
| static availableLocales: Set<string>; | ||
| static supportedLocalesOf(locales?: string | string[], options?: Pick<SegmenterOptions, "localeMatcher">): string[]; | ||
| static readonly polyfilled: true; | ||
| } | ||
| declare class SegmentIterator implements Iterable<SegmentResult>, Iterator<SegmentResult> { | ||
| private readonly segmenter; | ||
| private lastSegmentIndex; | ||
| private input; | ||
| constructor(segmenter: Segmenter, input: string); | ||
| [Symbol.iterator](): SegmentIterator; | ||
| next(): { | ||
| done: boolean; | ||
| value: { | ||
| segment: string; | ||
| index: number; | ||
| input: string; | ||
| isWordLike?: boolean; | ||
| }; | ||
| } | { | ||
| done: boolean; | ||
| value: undefined; | ||
| }; | ||
| containing(positionInput: number): { | ||
| segment: string; | ||
| index: number; | ||
| input: string; | ||
| isWordLike?: boolean; | ||
| } | undefined; | ||
| private readonly segmenter; | ||
| private lastSegmentIndex; | ||
| private input; | ||
| constructor(segmenter: Segmenter, input: string); | ||
| [Symbol.iterator](): SegmentIterator; | ||
| next(): { | ||
| done: boolean; | ||
| value: { | ||
| segment: string; | ||
| index: number; | ||
| input: string; | ||
| isWordLike?: boolean; | ||
| }; | ||
| } | { | ||
| done: boolean; | ||
| value: undefined; | ||
| }; | ||
| containing(positionInput: number): { | ||
| segment: string; | ||
| index: number; | ||
| input: string; | ||
| isWordLike?: boolean; | ||
| } | undefined; | ||
| } | ||
| export type { SegmentIterator }; |
+332
-328
@@ -1,346 +0,350 @@ | ||
| import { __assign, __spreadArray } from "tslib"; | ||
| import { CanonicalizeLocaleList, GetOption, GetOptionsObject, SupportedLocales, getInternalSlot, getMultiInternalSlots, setInternalSlot, } from '@formatjs/ecma402-abstract'; | ||
| import { ResolveLocale } from '@formatjs/intl-localematcher'; | ||
| import { SegmentationRules } from './cldr-segmentation-rules.generated.js'; | ||
| import { isSurrogate, replaceVariables } from './segmentation-utils.js'; | ||
| import { CanonicalizeLocaleList, GetOption, GetOptionsObject, SupportedLocales, getInternalSlot, getMultiInternalSlots, setInternalSlot } from "@formatjs/ecma402-abstract"; | ||
| import { ResolveLocale } from "@formatjs/intl-localematcher"; | ||
| import { SegmentationRules } from "./cldr-segmentation-rules.generated.js"; | ||
| import { isSurrogate, replaceVariables } from "./segmentation-utils.js"; | ||
| // Cached regex patterns for word character detection | ||
| // Note: Unicode property escape regex is created at runtime in try-catch | ||
| // to avoid compile-time errors when targeting ES5 | ||
| var WORD_CHARACTERS_BASIC_REGEX = /\w/; | ||
| const WORD_CHARACTERS_BASIC_REGEX = /\w/; | ||
| // Lazy-initialized Unicode word character regex (null if not supported) | ||
| var WORD_CHARACTERS_UNICODE_REGEX = undefined; | ||
| let WORD_CHARACTERS_UNICODE_REGEX = undefined; | ||
| /** | ||
| * Adds $ to before rules and ^ to after rules for strictness | ||
| * Replaces variables | ||
| * Initializes the RegExp | ||
| * | ||
| * @param rule raw rule string from cldr-segmentation-rules.generated | ||
| * @param variables | ||
| * @param after appends ^ if true and $ if false | ||
| * @returns | ||
| */ | ||
| var generateRuleRegex = function (rule, variables, after) { | ||
| return new RegExp("".concat(after ? '^' : '').concat(replaceVariables(variables, rule)).concat(after ? '' : '$')); | ||
| * Adds $ to before rules and ^ to after rules for strictness | ||
| * Replaces variables | ||
| * Initializes the RegExp | ||
| * | ||
| * @param rule raw rule string from cldr-segmentation-rules.generated | ||
| * @param variables | ||
| * @param after appends ^ if true and $ if false | ||
| * @returns | ||
| */ | ||
| const generateRuleRegex = (rule, variables, after) => { | ||
| return new RegExp(`${after ? "^" : ""}${replaceVariables(variables, rule)}${after ? "" : "$"}`); | ||
| }; | ||
| var prepareLocaleSegmentationRules = function (segmentationTypeValue) { | ||
| var preparedRules = {}; | ||
| for (var _i = 0, _a = Object.keys(segmentationTypeValue.segmentRules); _i < _a.length; _i++) { | ||
| var ruleNr = _a[_i]; | ||
| var ruleValue = segmentationTypeValue.segmentRules[ruleNr]; | ||
| var preparedRule = { | ||
| breaks: ruleValue.breaks, | ||
| }; | ||
| if ('before' in ruleValue && ruleValue.before) { | ||
| preparedRule.before = generateRuleRegex(ruleValue.before, segmentationTypeValue.variables, false); | ||
| } | ||
| if ('after' in ruleValue && ruleValue.after) { | ||
| preparedRule.after = generateRuleRegex(ruleValue.after, segmentationTypeValue.variables, true); | ||
| } | ||
| preparedRules[ruleNr] = preparedRule; | ||
| } | ||
| return preparedRules; | ||
| const prepareLocaleSegmentationRules = (segmentationTypeValue) => { | ||
| const preparedRules = {}; | ||
| for (const ruleNr of Object.keys(segmentationTypeValue.segmentRules)) { | ||
| const ruleValue = segmentationTypeValue.segmentRules[ruleNr]; | ||
| const preparedRule = { breaks: ruleValue.breaks }; | ||
| if ("before" in ruleValue && ruleValue.before) { | ||
| preparedRule.before = generateRuleRegex(ruleValue.before, segmentationTypeValue.variables, false); | ||
| } | ||
| if ("after" in ruleValue && ruleValue.after) { | ||
| preparedRule.after = generateRuleRegex(ruleValue.after, segmentationTypeValue.variables, true); | ||
| } | ||
| preparedRules[ruleNr] = preparedRule; | ||
| } | ||
| return preparedRules; | ||
| }; | ||
| var breaksAtResult = function (breaks, matchingRule) { return ({ | ||
| breaks: breaks, | ||
| matchingRule: matchingRule, | ||
| }); }; | ||
| var Segmenter = /** @class */ (function () { | ||
| function Segmenter(locales, options) { | ||
| var _newTarget = this.constructor; | ||
| if (_newTarget === undefined) { | ||
| throw TypeError("Constructor Intl.Segmenter requires 'new'"); | ||
| } | ||
| var requestedLocales = CanonicalizeLocaleList(locales); | ||
| options = GetOptionsObject(options); | ||
| var opt = Object.create(null); | ||
| var matcher = GetOption(options, 'localeMatcher', 'string', ['lookup', 'best fit'], 'best fit'); | ||
| opt.localeMatcher = matcher; | ||
| var granularity = GetOption(options, 'granularity', 'string', ['word', 'sentence', 'grapheme'], 'grapheme'); | ||
| setSlot(this, 'granularity', granularity); | ||
| //TODO: figure out correct availible locales | ||
| var r = ResolveLocale(Segmenter.availableLocales, //availible locales | ||
| requestedLocales, opt, [], // there is no relevantExtensionKeys | ||
| {}, function () { return ''; } //use only root rules | ||
| ); | ||
| setSlot(this, 'locale', r.locale); | ||
| //root rules based on granularity | ||
| this.mergedSegmentationTypeValue = SegmentationRules.root[granularity]; | ||
| //merge root rules with locale ones if locale is specified | ||
| if (r.locale.length) { | ||
| var localeOverrides = SegmentationRules[r.locale]; | ||
| if (granularity in localeOverrides) { | ||
| var localeSegmentationTypeValue = localeOverrides[granularity]; | ||
| this.mergedSegmentationTypeValue.variables = __assign(__assign({}, this.mergedSegmentationTypeValue.variables), localeSegmentationTypeValue.variables); | ||
| this.mergedSegmentationTypeValue.segmentRules = __assign(__assign({}, this.mergedSegmentationTypeValue.segmentRules), localeSegmentationTypeValue.segmentRules); | ||
| this.mergedSegmentationTypeValue.suppressions = __spreadArray(__spreadArray([], this.mergedSegmentationTypeValue.suppressions, true), localeSegmentationTypeValue.suppressions, true); | ||
| } | ||
| } | ||
| //prepare rules | ||
| this.rules = prepareLocaleSegmentationRules(this.mergedSegmentationTypeValue); | ||
| //order rule keys | ||
| this.ruleSortedKeys = Object.keys(this.rules).sort(function (a, b) { return Number(a) - Number(b); }); | ||
| } | ||
| Segmenter.prototype.breaksAt = function (position, input) { | ||
| var ruleSortedKeys = this.ruleSortedKeys; | ||
| var rules = this.rules; | ||
| var mergedSegmentationTypeValue = this.mergedSegmentationTypeValue; | ||
| //artificial rule 0.2 | ||
| if (position === 0) { | ||
| return breaksAtResult(true, '0.2'); | ||
| } | ||
| if (position === input.length) { | ||
| //rule 0.3 | ||
| return breaksAtResult(true, '0.3'); | ||
| } | ||
| //artificial rule 0.1: js specific, due to es5 regex not being unicode aware | ||
| //number 0.1 chosen to mimic java implementation, but needs to execute after 0.2 and 0.3 to be inside the string bounds | ||
| if (isSurrogate(input, position)) { | ||
| return breaksAtResult(false, '0.1'); | ||
| } | ||
| var stringBeforeBreak = input.substring(0, position); | ||
| var stringAfterBreak = input.substring(position); | ||
| //artificial rule 0.4: handle suppressions | ||
| if ('suppressions' in mergedSegmentationTypeValue) { | ||
| for (var _i = 0, _a = mergedSegmentationTypeValue.suppressions; _i < _a.length; _i++) { | ||
| var suppressions = _a[_i]; | ||
| if (stringBeforeBreak.trim().endsWith(suppressions)) { | ||
| return breaksAtResult(false, '0.4'); | ||
| } | ||
| } | ||
| } | ||
| // loop through rules and find a match | ||
| for (var _b = 0, ruleSortedKeys_1 = ruleSortedKeys; _b < ruleSortedKeys_1.length; _b++) { | ||
| var ruleKey = ruleSortedKeys_1[_b]; | ||
| var _c = rules[ruleKey], before = _c.before, after = _c.after, breaks = _c.breaks; | ||
| // for debugging | ||
| // if (ruleKey === '16' && position === 4) { | ||
| // console.log({before, after, stringBeforeBreak, stringAfterBreak}) | ||
| // } | ||
| if (before) { | ||
| if (!before.test(stringBeforeBreak)) { | ||
| //didn't match the before part, therfore skipping | ||
| continue; | ||
| } | ||
| } | ||
| if (after) { | ||
| if (!after.test(stringAfterBreak)) { | ||
| //didn't match the after part, therfore skipping | ||
| continue; | ||
| } | ||
| } | ||
| return breaksAtResult(breaks, ruleKey); | ||
| } | ||
| //artificial rule 999: if no rule matched is Any ÷ Any so return true | ||
| return breaksAtResult(true, '999'); | ||
| }; | ||
| Segmenter.prototype.segment = function (input) { | ||
| checkReceiver(this, 'segment'); | ||
| return new SegmentIterator(this, input); | ||
| }; | ||
| Segmenter.prototype.resolvedOptions = function () { | ||
| checkReceiver(this, 'resolvedOptions'); | ||
| return __assign({}, getMultiInternalSlots(__INTERNAL_SLOT_MAP__, this, 'locale', 'granularity')); | ||
| }; | ||
| Segmenter.supportedLocalesOf = function (locales, options) { | ||
| return SupportedLocales(Segmenter.availableLocales, CanonicalizeLocaleList(locales), options); | ||
| }; | ||
| Segmenter.availableLocales = new Set(Object.keys(SegmentationRules).filter(function (key) { return key !== 'root'; })); | ||
| Segmenter.polyfilled = true; | ||
| return Segmenter; | ||
| }()); | ||
| export { Segmenter }; | ||
| const breaksAtResult = (breaks, matchingRule) => ({ | ||
| breaks, | ||
| matchingRule | ||
| }); | ||
| export class Segmenter { | ||
| rules; | ||
| ruleSortedKeys; | ||
| mergedSegmentationTypeValue; | ||
| constructor(locales, options) { | ||
| if (new.target === undefined) { | ||
| throw TypeError(`Constructor Intl.Segmenter requires 'new'`); | ||
| } | ||
| const requestedLocales = CanonicalizeLocaleList(locales); | ||
| options = GetOptionsObject(options); | ||
| const opt = Object.create(null); | ||
| const matcher = GetOption(options, "localeMatcher", "string", ["lookup", "best fit"], "best fit"); | ||
| opt.localeMatcher = matcher; | ||
| const granularity = GetOption(options, "granularity", "string", [ | ||
| "word", | ||
| "sentence", | ||
| "grapheme" | ||
| ], "grapheme"); | ||
| setSlot(this, "granularity", granularity); | ||
| //TODO: figure out correct availible locales | ||
| const r = ResolveLocale(Segmenter.availableLocales, requestedLocales, opt, [], {}, () => ""); | ||
| setSlot(this, "locale", r.locale); | ||
| //root rules based on granularity | ||
| this.mergedSegmentationTypeValue = SegmentationRules.root[granularity]; | ||
| //merge root rules with locale ones if locale is specified | ||
| if (r.locale.length) { | ||
| const localeOverrides = SegmentationRules[r.locale]; | ||
| if (granularity in localeOverrides) { | ||
| const localeSegmentationTypeValue = localeOverrides[granularity]; | ||
| this.mergedSegmentationTypeValue.variables = { | ||
| ...this.mergedSegmentationTypeValue.variables, | ||
| ...localeSegmentationTypeValue.variables | ||
| }; | ||
| this.mergedSegmentationTypeValue.segmentRules = { | ||
| ...this.mergedSegmentationTypeValue.segmentRules, | ||
| ...localeSegmentationTypeValue.segmentRules | ||
| }; | ||
| this.mergedSegmentationTypeValue.suppressions = [...this.mergedSegmentationTypeValue.suppressions, ...localeSegmentationTypeValue.suppressions]; | ||
| } | ||
| } | ||
| //prepare rules | ||
| this.rules = prepareLocaleSegmentationRules(this.mergedSegmentationTypeValue); | ||
| //order rule keys | ||
| this.ruleSortedKeys = Object.keys(this.rules).sort((a, b) => Number(a) - Number(b)); | ||
| } | ||
| breaksAt(position, input) { | ||
| const ruleSortedKeys = this.ruleSortedKeys; | ||
| const rules = this.rules; | ||
| const mergedSegmentationTypeValue = this.mergedSegmentationTypeValue; | ||
| //artificial rule 0.2 | ||
| if (position === 0) { | ||
| return breaksAtResult(true, "0.2"); | ||
| } | ||
| if (position === input.length) { | ||
| //rule 0.3 | ||
| return breaksAtResult(true, "0.3"); | ||
| } | ||
| //artificial rule 0.1: js specific, due to es5 regex not being unicode aware | ||
| //number 0.1 chosen to mimic java implementation, but needs to execute after 0.2 and 0.3 to be inside the string bounds | ||
| if (isSurrogate(input, position)) { | ||
| return breaksAtResult(false, "0.1"); | ||
| } | ||
| const stringBeforeBreak = input.substring(0, position); | ||
| const stringAfterBreak = input.substring(position); | ||
| //artificial rule 0.4: handle suppressions | ||
| if ("suppressions" in mergedSegmentationTypeValue) { | ||
| for (const suppressions of mergedSegmentationTypeValue.suppressions) { | ||
| if (stringBeforeBreak.trim().endsWith(suppressions)) { | ||
| return breaksAtResult(false, "0.4"); | ||
| } | ||
| } | ||
| } | ||
| // loop through rules and find a match | ||
| for (const ruleKey of ruleSortedKeys) { | ||
| const { before, after, breaks } = rules[ruleKey]; | ||
| // for debugging | ||
| // if (ruleKey === '16' && position === 4) { | ||
| // console.log({before, after, stringBeforeBreak, stringAfterBreak}) | ||
| // } | ||
| if (before) { | ||
| if (!before.test(stringBeforeBreak)) { | ||
| //didn't match the before part, therfore skipping | ||
| continue; | ||
| } | ||
| } | ||
| if (after) { | ||
| if (!after.test(stringAfterBreak)) { | ||
| //didn't match the after part, therfore skipping | ||
| continue; | ||
| } | ||
| } | ||
| return breaksAtResult(breaks, ruleKey); | ||
| } | ||
| //artificial rule 999: if no rule matched is Any ÷ Any so return true | ||
| return breaksAtResult(true, "999"); | ||
| } | ||
| segment(input) { | ||
| checkReceiver(this, "segment"); | ||
| return new SegmentIterator(this, input); | ||
| } | ||
| resolvedOptions() { | ||
| checkReceiver(this, "resolvedOptions"); | ||
| return { ...getMultiInternalSlots(__INTERNAL_SLOT_MAP__, this, "locale", "granularity") }; | ||
| } | ||
| static availableLocales = new Set(Object.keys(SegmentationRules).filter((key) => key !== "root")); | ||
| static supportedLocalesOf(locales, options) { | ||
| return SupportedLocales(Segmenter.availableLocales, CanonicalizeLocaleList(locales), options); | ||
| } | ||
| static polyfilled = true; | ||
| } | ||
| /** | ||
| * Determines if a segment is word-like according to Unicode Word Break rules. | ||
| * | ||
| * A segment is considered word-like if it contains alphabetic characters, | ||
| * numbers, or ideographs. Segments containing only whitespace, punctuation, | ||
| * or symbols are not word-like. | ||
| * | ||
| * Per Unicode Word Break (UAX #29) and native Intl.Segmenter implementations, | ||
| * this matches segments that contain characters from word character classes: | ||
| * ALetter, Hebrew_Letter, Numeric, Katakana, Hiragana, and Ideographic. | ||
| * | ||
| * @param segment - The text segment to check | ||
| * @param matchingRule - The word break rule that created this segment | ||
| * @returns true if the segment is word-like | ||
| */ | ||
| * Determines if a segment is word-like according to Unicode Word Break rules. | ||
| * | ||
| * A segment is considered word-like if it contains alphabetic characters, | ||
| * numbers, or ideographs. Segments containing only whitespace, punctuation, | ||
| * or symbols are not word-like. | ||
| * | ||
| * Per Unicode Word Break (UAX #29) and native Intl.Segmenter implementations, | ||
| * this matches segments that contain characters from word character classes: | ||
| * ALetter, Hebrew_Letter, Numeric, Katakana, Hiragana, and Ideographic. | ||
| * | ||
| * @param segment - The text segment to check | ||
| * @param matchingRule - The word break rule that created this segment | ||
| * @returns true if the segment is word-like | ||
| */ | ||
| function isSegmentWordLike(segment, matchingRule) { | ||
| // Primary check: Does the segment contain word characters? | ||
| // Word-like segments contain letters (including ideographs), numbers, | ||
| // or connecting characters like apostrophes within words | ||
| // | ||
| // Regex matches: | ||
| // - Letters: \p{L} (all Unicode letters) | ||
| // - Numbers: \p{N} (all Unicode numbers) | ||
| // - Marks: \p{M} (combining marks, typically part of letters) | ||
| // | ||
| // Note: Using Unicode property escapes which work in modern JS engines | ||
| // and are necessary for proper internationalization | ||
| // Lazy-initialize Unicode regex on first use | ||
| if (WORD_CHARACTERS_UNICODE_REGEX === undefined) { | ||
| try { | ||
| // Create Unicode property escape regex at runtime to avoid compile-time TS1501 error | ||
| WORD_CHARACTERS_UNICODE_REGEX = new RegExp('[\\p{L}\\p{N}\\p{M}]', 'u'); | ||
| } | ||
| catch (_a) { | ||
| // Environment doesn't support Unicode property escapes | ||
| WORD_CHARACTERS_UNICODE_REGEX = null; | ||
| } | ||
| } | ||
| var hasWordCharacters; | ||
| if (WORD_CHARACTERS_UNICODE_REGEX) { | ||
| // Check if segment contains word characters using Unicode property escapes | ||
| // This matches the behavior of native Intl.Segmenter in Chrome/Firefox | ||
| hasWordCharacters = WORD_CHARACTERS_UNICODE_REGEX.test(segment); | ||
| } | ||
| else { | ||
| // Fallback for environments without Unicode property escapes | ||
| // Match basic word characters: letters, numbers, underscores | ||
| hasWordCharacters = WORD_CHARACTERS_BASIC_REGEX.test(segment); | ||
| } | ||
| // If segment contains word characters, it's word-like | ||
| if (hasWordCharacters) { | ||
| return true; | ||
| } | ||
| // If no word characters, check if it's definitely not word-like via rules | ||
| // Non-word-like rules per Unicode Word Break specification (UAX #29): | ||
| // https://unicode.org/reports/tr29/#Word_Boundaries | ||
| // | ||
| // WB3a (3.1): Break before newlines (sot ÷ (Newline | CR | LF)) | ||
| // WB3b (3.2): Break after newlines ((Newline | CR | LF) ÷ eot) | ||
| // WB3d (3.4): Keep horizontal whitespace together (WSegSpace × WSegSpace) | ||
| // | ||
| // These rules specifically identify non-word segments like line breaks and whitespace | ||
| var definitelyNotWordLikeRules = ['3.1', '3.2', '3.4']; | ||
| if (definitelyNotWordLikeRules.includes(matchingRule)) { | ||
| return false; | ||
| } | ||
| // For segments without word characters and not matching specific non-word rules, | ||
| // return false (e.g., punctuation, symbols, whitespace via rule 999) | ||
| return false; | ||
| // Primary check: Does the segment contain word characters? | ||
| // Word-like segments contain letters (including ideographs), numbers, | ||
| // or connecting characters like apostrophes within words | ||
| // | ||
| // Regex matches: | ||
| // - Letters: \p{L} (all Unicode letters) | ||
| // - Numbers: \p{N} (all Unicode numbers) | ||
| // - Marks: \p{M} (combining marks, typically part of letters) | ||
| // | ||
| // Note: Using Unicode property escapes which work in modern JS engines | ||
| // and are necessary for proper internationalization | ||
| // Lazy-initialize Unicode regex on first use | ||
| if (WORD_CHARACTERS_UNICODE_REGEX === undefined) { | ||
| try { | ||
| // Create Unicode property escape regex at runtime to avoid compile-time TS1501 error | ||
| WORD_CHARACTERS_UNICODE_REGEX = new RegExp("[\\p{L}\\p{N}\\p{M}]", "u"); | ||
| } catch { | ||
| // Environment doesn't support Unicode property escapes | ||
| WORD_CHARACTERS_UNICODE_REGEX = null; | ||
| } | ||
| } | ||
| let hasWordCharacters; | ||
| if (WORD_CHARACTERS_UNICODE_REGEX) { | ||
| // Check if segment contains word characters using Unicode property escapes | ||
| // This matches the behavior of native Intl.Segmenter in Chrome/Firefox | ||
| hasWordCharacters = WORD_CHARACTERS_UNICODE_REGEX.test(segment); | ||
| } else { | ||
| // Fallback for environments without Unicode property escapes | ||
| // Match basic word characters: letters, numbers, underscores | ||
| hasWordCharacters = WORD_CHARACTERS_BASIC_REGEX.test(segment); | ||
| } | ||
| // If segment contains word characters, it's word-like | ||
| if (hasWordCharacters) { | ||
| return true; | ||
| } | ||
| // If no word characters, check if it's definitely not word-like via rules | ||
| // Non-word-like rules per Unicode Word Break specification (UAX #29): | ||
| // https://unicode.org/reports/tr29/#Word_Boundaries | ||
| // | ||
| // WB3a (3.1): Break before newlines (sot ÷ (Newline | CR | LF)) | ||
| // WB3b (3.2): Break after newlines ((Newline | CR | LF) ÷ eot) | ||
| // WB3d (3.4): Keep horizontal whitespace together (WSegSpace × WSegSpace) | ||
| // | ||
| // These rules specifically identify non-word segments like line breaks and whitespace | ||
| const definitelyNotWordLikeRules = [ | ||
| "3.1", | ||
| "3.2", | ||
| "3.4" | ||
| ]; | ||
| if (definitelyNotWordLikeRules.includes(matchingRule)) { | ||
| return false; | ||
| } | ||
| // For segments without word characters and not matching specific non-word rules, | ||
| // return false (e.g., punctuation, symbols, whitespace via rule 999) | ||
| return false; | ||
| } | ||
| var createSegmentDataObject = function (segmenter, segment, index, input, matchingRule) { | ||
| var returnValue = { | ||
| segment: segment, | ||
| index: index, | ||
| input: input, | ||
| }; | ||
| if (getSlot(segmenter, 'granularity') === 'word') { | ||
| returnValue.isWordLike = isSegmentWordLike(segment, matchingRule); | ||
| } | ||
| return returnValue; | ||
| const createSegmentDataObject = (segmenter, segment, index, input, matchingRule) => { | ||
| const returnValue = { | ||
| segment, | ||
| index, | ||
| input | ||
| }; | ||
| if (getSlot(segmenter, "granularity") === "word") { | ||
| returnValue.isWordLike = isSegmentWordLike(segment, matchingRule); | ||
| } | ||
| return returnValue; | ||
| }; | ||
| var SegmentIterator = /** @class */ (function () { | ||
| function SegmentIterator(segmenter, input) { | ||
| this.segmenter = segmenter; | ||
| this.lastSegmentIndex = 0; | ||
| if (typeof input == 'symbol') { | ||
| throw TypeError("Input must not be a symbol"); | ||
| } | ||
| this.input = String(input); | ||
| } | ||
| SegmentIterator.prototype[Symbol.iterator] = function () { | ||
| return new SegmentIterator(this.segmenter, this.input); | ||
| }; | ||
| SegmentIterator.prototype.next = function () { | ||
| //using only the relevant bit of the string | ||
| var checkString = this.input.substring(this.lastSegmentIndex); | ||
| //loop from the start of the checkString, until exactly length (breaksAt returns break at pos=== lenght) | ||
| for (var position = 1; position <= checkString.length; position++) { | ||
| var _a = this.segmenter.breaksAt(position, checkString), breaks = _a.breaks, matchingRule = _a.matchingRule; | ||
| if (breaks) { | ||
| var segment = checkString.substring(0, position); | ||
| var index = this.lastSegmentIndex; | ||
| this.lastSegmentIndex += position; | ||
| return { | ||
| done: false, | ||
| value: createSegmentDataObject(this.segmenter, segment, index, this.input, matchingRule), | ||
| }; | ||
| } | ||
| } | ||
| //no segment was found by the loop, therefore the segmentation is done | ||
| return { done: true, value: undefined }; | ||
| }; | ||
| SegmentIterator.prototype.containing = function (positionInput) { | ||
| if (typeof positionInput === 'bigint') { | ||
| throw TypeError('Index must not be a BigInt'); | ||
| } | ||
| var position = Number(positionInput); | ||
| //https://tc39.es/ecma262/#sec-tointegerorinfinity | ||
| // 2. If number is NaN, +0𝔽, or -0𝔽, return 0. | ||
| if (isNaN(position) || !position) { | ||
| position = 0; | ||
| } | ||
| // 5. Let integer be floor(abs(ℝ(number))). | ||
| // 6. If number < -0𝔽, set integer to -integer. | ||
| position = Math.floor(Math.abs(position)) * (position < 0 ? -1 : 1); | ||
| if (position < 0 || position >= this.input.length) { | ||
| return undefined; | ||
| } | ||
| //find previous break point | ||
| var previousBreakPoint = 0; | ||
| if (position === 0) { | ||
| previousBreakPoint = 0; | ||
| } | ||
| else { | ||
| var checkString_1 = this.input; | ||
| for (var cursor = position; cursor >= 0; cursor--) { | ||
| var breaks = this.segmenter.breaksAt(cursor, checkString_1).breaks; | ||
| if (breaks) { | ||
| previousBreakPoint = cursor; | ||
| break; | ||
| } | ||
| } | ||
| } | ||
| var checkString = this.input.substring(previousBreakPoint); | ||
| //find next break point | ||
| for (var cursor = 1; cursor <= checkString.length; cursor++) { | ||
| var _a = this.segmenter.breaksAt(cursor, checkString), breaks = _a.breaks, matchingRule = _a.matchingRule; | ||
| if (breaks) { | ||
| var segment = checkString.substring(0, cursor); | ||
| return createSegmentDataObject(this.segmenter, segment, previousBreakPoint, this.input, matchingRule); | ||
| } | ||
| } | ||
| }; | ||
| return SegmentIterator; | ||
| }()); | ||
| var __INTERNAL_SLOT_MAP__ = new WeakMap(); | ||
| class SegmentIterator { | ||
| segmenter; | ||
| lastSegmentIndex; | ||
| input; | ||
| constructor(segmenter, input) { | ||
| this.segmenter = segmenter; | ||
| this.lastSegmentIndex = 0; | ||
| if (typeof input == "symbol") { | ||
| throw TypeError(`Input must not be a symbol`); | ||
| } | ||
| this.input = String(input); | ||
| } | ||
| [Symbol.iterator]() { | ||
| return new SegmentIterator(this.segmenter, this.input); | ||
| } | ||
| next() { | ||
| //using only the relevant bit of the string | ||
| let checkString = this.input.substring(this.lastSegmentIndex); | ||
| //loop from the start of the checkString, until exactly length (breaksAt returns break at pos=== lenght) | ||
| for (let position = 1; position <= checkString.length; position++) { | ||
| const { breaks, matchingRule } = this.segmenter.breaksAt(position, checkString); | ||
| if (breaks) { | ||
| const segment = checkString.substring(0, position); | ||
| const index = this.lastSegmentIndex; | ||
| this.lastSegmentIndex += position; | ||
| return { | ||
| done: false, | ||
| value: createSegmentDataObject(this.segmenter, segment, index, this.input, matchingRule) | ||
| }; | ||
| } | ||
| } | ||
| //no segment was found by the loop, therefore the segmentation is done | ||
| return { | ||
| done: true, | ||
| value: undefined | ||
| }; | ||
| } | ||
| containing(positionInput) { | ||
| if (typeof positionInput === "bigint") { | ||
| throw TypeError("Index must not be a BigInt"); | ||
| } | ||
| let position = Number(positionInput); | ||
| //https://tc39.es/ecma262/#sec-tointegerorinfinity | ||
| // 2. If number is NaN, +0𝔽, or -0𝔽, return 0. | ||
| if (isNaN(position) || !position) { | ||
| position = 0; | ||
| } | ||
| // 5. Let integer be floor(abs(ℝ(number))). | ||
| // 6. If number < -0𝔽, set integer to -integer. | ||
| position = Math.floor(Math.abs(position)) * (position < 0 ? -1 : 1); | ||
| if (position < 0 || position >= this.input.length) { | ||
| return undefined; | ||
| } | ||
| //find previous break point | ||
| let previousBreakPoint = 0; | ||
| if (position === 0) { | ||
| previousBreakPoint = 0; | ||
| } else { | ||
| const checkString = this.input; | ||
| for (let cursor = position; cursor >= 0; cursor--) { | ||
| const { breaks } = this.segmenter.breaksAt(cursor, checkString); | ||
| if (breaks) { | ||
| previousBreakPoint = cursor; | ||
| break; | ||
| } | ||
| } | ||
| } | ||
| let checkString = this.input.substring(previousBreakPoint); | ||
| //find next break point | ||
| for (let cursor = 1; cursor <= checkString.length; cursor++) { | ||
| const { breaks, matchingRule } = this.segmenter.breaksAt(cursor, checkString); | ||
| if (breaks) { | ||
| const segment = checkString.substring(0, cursor); | ||
| return createSegmentDataObject(this.segmenter, segment, previousBreakPoint, this.input, matchingRule); | ||
| } | ||
| } | ||
| } | ||
| } | ||
| const __INTERNAL_SLOT_MAP__ = new WeakMap(); | ||
| function getSlot(instance, key) { | ||
| return getInternalSlot(__INTERNAL_SLOT_MAP__, instance, key); | ||
| return getInternalSlot(__INTERNAL_SLOT_MAP__, instance, key); | ||
| } | ||
| function setSlot(instance, key, value) { | ||
| setInternalSlot(__INTERNAL_SLOT_MAP__, instance, key, value); | ||
| setInternalSlot(__INTERNAL_SLOT_MAP__, instance, key, value); | ||
| } | ||
| function checkReceiver(receiver, methodName) { | ||
| if (!(receiver instanceof Segmenter)) { | ||
| throw TypeError("Method Intl.Segmenter.prototype.".concat(methodName, " called on incompatible receiver")); | ||
| } | ||
| if (!(receiver instanceof Segmenter)) { | ||
| throw TypeError(`Method Intl.Segmenter.prototype.${methodName} called on incompatible receiver`); | ||
| } | ||
| } | ||
| try { | ||
| // IE11 does not have Symbol | ||
| if (typeof Symbol !== 'undefined') { | ||
| Object.defineProperty(Segmenter.prototype, Symbol.toStringTag, { | ||
| value: 'Intl.Segmenter', | ||
| writable: false, | ||
| enumerable: false, | ||
| configurable: true, | ||
| }); | ||
| } | ||
| // https://github.com/tc39/test262/blob/main/test/intl402/Segmenter/constructor/length.js | ||
| Object.defineProperty(Segmenter.prototype.constructor, 'length', { | ||
| value: 0, | ||
| writable: false, | ||
| enumerable: false, | ||
| configurable: true, | ||
| }); | ||
| // https://github.com/tc39/test262/blob/main/test/intl402/Segmenter/constructor/supportedLocalesOf/length.js | ||
| Object.defineProperty(Segmenter.supportedLocalesOf, 'length', { | ||
| value: 1, | ||
| writable: false, | ||
| enumerable: false, | ||
| configurable: true, | ||
| }); | ||
| } | ||
| catch (_a) { | ||
| // Meta fix so we're test262-compliant, not important | ||
| } | ||
| // IE11 does not have Symbol | ||
| if (typeof Symbol !== "undefined") { | ||
| Object.defineProperty(Segmenter.prototype, Symbol.toStringTag, { | ||
| value: "Intl.Segmenter", | ||
| writable: false, | ||
| enumerable: false, | ||
| configurable: true | ||
| }); | ||
| } | ||
| // https://github.com/tc39/test262/blob/main/test/intl402/Segmenter/constructor/length.js | ||
| Object.defineProperty(Segmenter.prototype.constructor, "length", { | ||
| value: 0, | ||
| writable: false, | ||
| enumerable: false, | ||
| configurable: true | ||
| }); | ||
| // https://github.com/tc39/test262/blob/main/test/intl402/Segmenter/constructor/supportedLocalesOf/length.js | ||
| Object.defineProperty(Segmenter.supportedLocalesOf, "length", { | ||
| value: 1, | ||
| writable: false, | ||
| enumerable: false, | ||
| configurable: true | ||
| }); | ||
| } catch {} |
@@ -1,1 +0,4 @@ | ||
| import './polyfill-force.js'; | ||
| // @generated | ||
| // @ts-nocheck | ||
| import "./polyfill-force.js"; | ||
| export {}; |
+1
-1
| // @generated | ||
| // @ts-nocheck | ||
| import './polyfill-force.js'; | ||
| import "./polyfill-force.js"; |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
URL strings
Supply chain risk: Package contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Found 1 instance in 1 package
908543 (-5.15%)
12557 (-5.76%)
113 (0.89%)
+ Added
+ Added
+ Added
- Removed
- Removed
- Removed