{
		"name": "@formatjs/intl-segmenter",
		"description": "Polyfill for Intl.Segmenter",
		"version": "12.0.6",
		"version": "12.0.7",
		"license": "MIT",
		@@ -15,4 +15,4 @@ "author": "Matija Gaspar <matijagaspar@gmail.com>",
		"tslib": "^2.8.0",
		"@formatjs/ecma402-abstract": "3.0.6",
		"@formatjs/intl-localematcher": "0.7.3"
		"@formatjs/ecma402-abstract": "3.0.7",
		"@formatjs/intl-localematcher": "0.7.4"
		},
		@@ -19,0 +19,0 @@ "bugs": "https://github.com/formatjs/formatjs/issues",

+77

-1

src/segmenter.js

		@@ -6,2 +6,8 @@ import { __assign, __spreadArray } from "tslib";
		import { isSurrogate, replaceVariables } from './segmentation-utils.js';
		// Cached regex patterns for word character detection.
		// Note: the Unicode property escape regex is created at runtime inside a
		// try/catch (see isSegmentWordLike) to avoid compile-time errors when
		// targeting ES5.
		// ASCII-only fallback: matches Latin letters, digits, and underscore.
		var WORD_CHARACTERS_BASIC_REGEX = /\w/;
		// Lazily initialized Unicode word character regex. It has three states:
		//   undefined - not initialized yet (initialization happens on first use)
		//   null      - the engine rejected the Unicode property escape pattern
		//   RegExp    - ready to use
		var WORD_CHARACTERS_UNICODE_REGEX = undefined;
		/**
		@@ -147,2 +153,72 @@ * Adds $ to before rules and ^ to after rules for strictness
		export { Segmenter };
		/**
		 * Determines whether a word-granularity segment is "word-like".
		 *
		 * A segment is treated as word-like when it contains at least one letter,
		 * number, or combining mark (Unicode general categories L, N, M). This is
		 * an approximation of the word character classes used by Unicode Word
		 * Break (UAX #29). Segments made up only of whitespace, line breaks,
		 * punctuation, or symbols are not word-like.
		 *
		 * On engines that do not support Unicode property escapes the check falls
		 * back to the ASCII-only `\w` class (letters, digits, underscore).
		 *
		 * @param segment - The text segment to check
		 * @param matchingRule - The word break rule that created this segment.
		 *   Not consulted: the content check alone decides the result (newline and
		 *   whitespace segments contain no word characters). The parameter is kept
		 *   so existing callers do not need to change.
		 * @returns true if the segment is word-like
		 */
		function isSegmentWordLike(segment, matchingRule) {
			// Lazy-initialize the Unicode regex on first use. The pattern is built
			// from a string at runtime so that engines without property-escape
			// support raise a catchable error here instead of a parse-time error.
			if (WORD_CHARACTERS_UNICODE_REGEX === undefined) {
				try {
					WORD_CHARACTERS_UNICODE_REGEX = new RegExp('[\\p{L}\\p{N}\\p{M}]', 'u');
				}
				catch (_a) {
					// Environment doesn't support Unicode property escapes; remember
					// that with null so the construction is not retried on every call.
					WORD_CHARACTERS_UNICODE_REGEX = null;
				}
			}
			// Prefer the Unicode-aware pattern; otherwise use the basic \w fallback.
			var wordCharacterRegex = WORD_CHARACTERS_UNICODE_REGEX || WORD_CHARACTERS_BASIC_REGEX;
			// A segment without any word character is never word-like, whichever
			// rule produced it, so no rule lookup is needed. This also avoids
			// calling Array.prototype.includes, which ES5 engines do not provide.
			return wordCharacterRegex.test(segment);
		}
		var createSegmentDataObject = function (segmenter, segment, index, input, matchingRule) {
		@@ -155,3 +231,3 @@ var returnValue = {
		if (getSlot(segmenter, 'granularity') === 'word') {
		returnValue.isWordLike = matchingRule !== '3.1' && matchingRule !== '3.2';
		returnValue.isWordLike = isSegmentWordLike(segment, matchingRule);
		}
		@@ -158,0 +234,0 @@ return returnValue;

polyfill.iife.js

Sorry, the diff of this file is too big to display

@formatjs/intl-segmenter - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics

Dependency changes