@formatjs/intl-segmenter
Advanced tools
+3
-3
| { | ||
| "name": "@formatjs/intl-segmenter", | ||
| "description": "Polyfill for Intl.Segmenter", | ||
| "version": "12.0.6", | ||
| "version": "12.0.7", | ||
| "license": "MIT", | ||
@@ -15,4 +15,4 @@ "author": "Matija Gaspar <matijagaspar@gmail.com>", | ||
| "tslib": "^2.8.0", | ||
| "@formatjs/ecma402-abstract": "3.0.6", | ||
| "@formatjs/intl-localematcher": "0.7.3" | ||
| "@formatjs/ecma402-abstract": "3.0.7", | ||
| "@formatjs/intl-localematcher": "0.7.4" | ||
| }, | ||
@@ -19,0 +19,0 @@ "bugs": "https://github.com/formatjs/formatjs/issues", |
+77
-1
@@ -6,2 +6,8 @@ import { __assign, __spreadArray } from "tslib"; | ||
| import { isSurrogate, replaceVariables } from './segmentation-utils.js'; | ||
| // Cached regex patterns for word character detection (used by isSegmentWordLike). | ||
| // The basic /\w/ pattern below is the fallback; the Unicode property escape regex is | ||
| // created at runtime in a try-catch to avoid compile-time errors when targeting ES5. | ||
| var WORD_CHARACTERS_BASIC_REGEX = /\w/; | ||
| // Lazily initialized Unicode regex: undefined = not yet probed, null = unsupported, RegExp = supported | ||
| var WORD_CHARACTERS_UNICODE_REGEX = undefined; | ||
| /** | ||
@@ -147,2 +153,72 @@ * Adds $ to before rules and ^ to after rules for strictness | ||
| export { Segmenter }; | ||
/**
 * Determines whether a word-granularity segment is "word-like".
 *
 * A segment is word-like when it contains at least one Unicode letter
 * (\p{L}), number (\p{N}) or combining mark (\p{M}). Segments made up only
 * of whitespace, line breaks, punctuation or symbols are not word-like.
 * This approximates the UAX #29 word character classes (ALetter,
 * Hebrew_Letter, Numeric, Katakana, Hiragana, Ideographic).
 *
 * When the engine does not support Unicode property escapes, the check falls
 * back to the basic /\w/ pattern (letters, digits and underscore as matched
 * by \w without the `u` flag).
 *
 * @param segment - The text segment to check
 * @param matchingRule - The word break rule that created this segment.
 *   Currently not consulted: the result depends only on the segment's
 *   characters. Kept so existing callers do not need to change.
 * @returns true if the segment contains at least one word character
 */
function isSegmentWordLike(segment, matchingRule) {
    // Probe for Unicode property escape support once and cache the outcome in
    // the module-level variable: `undefined` means "not probed yet", `null`
    // means "unsupported", and a RegExp means "supported".
    if (WORD_CHARACTERS_UNICODE_REGEX === undefined) {
        try {
            // Built from a string at runtime so that a regex literal with \p{...}
            // never has to be parsed by tooling or engines targeting ES5.
            WORD_CHARACTERS_UNICODE_REGEX = new RegExp('[\\p{L}\\p{N}\\p{M}]', 'u');
        }
        catch (_a) {
            // Environment doesn't support Unicode property escapes
            WORD_CHARACTERS_UNICODE_REGEX = null;
        }
    }
    // Neither regex carries the `g` or `y` flag, so `test` is stateless and the
    // cached instances can safely be reused across calls.
    var wordCharacterRegex = WORD_CHARACTERS_UNICODE_REGEX || WORD_CHARACTERS_BASIC_REGEX;
    // A segment without any word character (whitespace, newlines, punctuation,
    // symbols) is never word-like, whichever rule produced it, so no
    // rule-specific check is needed.
    return wordCharacterRegex.test(segment);
}
| var createSegmentDataObject = function (segmenter, segment, index, input, matchingRule) { | ||
@@ -155,3 +231,3 @@ var returnValue = { | ||
| if (getSlot(segmenter, 'granularity') === 'word') { | ||
| returnValue.isWordLike = matchingRule !== '3.1' && matchingRule !== '3.2'; | ||
| returnValue.isWordLike = isSegmentWordLike(segment, matchingRule); | ||
| } | ||
@@ -158,0 +234,0 @@ return returnValue; |
Sorry, the diff of this file is too big to display
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
URL strings
Supply chain risk: Package contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Found 1 instance in 1 package
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
957825
0.37%13325
-0.3%112
2.75%+ Added
+ Added
+ Added
- Removed
- Removed
- Removed