Latest Threat Research: SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains. Details
Socket
Book a Demo | Install | Sign in
Socket

@formatjs/intl-segmenter

Package Overview
Dependencies
Maintainers
3
Versions
43
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@formatjs/intl-segmenter - npm Package Compare versions

Comparing version
12.0.6
to
12.0.7
+3
-3
package.json
{
"name": "@formatjs/intl-segmenter",
"description": "Polyfill for Intl.Segmenter",
"version": "12.0.6",
"version": "12.0.7",
"license": "MIT",

@@ -15,4 +15,4 @@ "author": "Matija Gaspar <matijagaspar@gmail.com>",

"tslib": "^2.8.0",
"@formatjs/ecma402-abstract": "3.0.6",
"@formatjs/intl-localematcher": "0.7.3"
"@formatjs/ecma402-abstract": "3.0.7",
"@formatjs/intl-localematcher": "0.7.4"
},

@@ -19,0 +19,0 @@ "bugs": "https://github.com/formatjs/formatjs/issues",

@@ -6,2 +6,8 @@ import { __assign, __spreadArray } from "tslib";

import { isSurrogate, replaceVariables } from './segmentation-utils.js';
// Cached regex patterns for word character detection
// Note: Unicode property escape regex is created at runtime in try-catch
// to avoid compile-time errors when targeting ES5
var WORD_CHARACTERS_BASIC_REGEX = /\w/;
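// Lazily initialized Unicode word character regex. Three states:
//   undefined - not initialized yet (set on first use)
//   RegExp    - the engine supports Unicode property escapes
//   null      - the engine does not support them; the basic regex is used instead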
var WORD_CHARACTERS_UNICODE_REGEX = undefined;
/**

@@ -147,2 +153,72 @@ * Adds $ to before rules and ^ to after rules for strictness

export { Segmenter };
/**
 * Determines if a segment is word-like.
 *
 * A segment is word-like when it contains at least one "word character".
 * Segments made up only of whitespace, punctuation, or symbols are not
 * word-like.
 *
 * Word characters are detected as follows:
 * - Preferred: the Unicode classes \p{L} (letters), \p{N} (numbers) and
 *   \p{M} (combining marks). This approximates the word character classes
 *   of Unicode Word Break (UAX #29):
 *   https://unicode.org/reports/tr29/#Word_Boundaries
 * - Fallback (engines without Unicode property escapes): the basic `\w`
 *   class. Note that `\w` also matches `_`, which the Unicode classes
 *   above do not, so a lone underscore is classified differently on the
 *   fallback path.
 *
 * @param segment - The text segment to check
 * @param matchingRule - The word break rule that created this segment.
 *   It does not influence the result: a segment without word characters is
 *   never word-like, whichever rule produced it. The parameter is kept so
 *   existing callers keep working.
 * @returns true if the segment is word-like
 */
function isSegmentWordLike(segment, matchingRule) {
    // Lazy-initialize the Unicode regex on first use. It is built from a
    // string at runtime so that the property escapes never appear as a regex
    // literal, and so that an engine rejecting them only throws here, where
    // it can be caught.
    if (WORD_CHARACTERS_UNICODE_REGEX === undefined) {
        try {
            WORD_CHARACTERS_UNICODE_REGEX = new RegExp('[\\p{L}\\p{N}\\p{M}]', 'u');
        }
        catch (_a) {
            // Environment doesn't support Unicode property escapes; remember
            // that with null so the construction is not retried on every call.
            WORD_CHARACTERS_UNICODE_REGEX = null;
        }
    }
    // Neither regex carries the g/y flag, so test() keeps no state between calls.
    var wordCharacterRegex = WORD_CHARACTERS_UNICODE_REGEX || WORD_CHARACTERS_BASIC_REGEX;
    // No rule-based check is needed: without word characters the segment is
    // never word-like (line breaks, whitespace, punctuation, symbols). Avoiding
    // Array.prototype.includes here also keeps the fallback path usable on
    // engines that predate ES2016.
    return wordCharacterRegex.test(segment);
}
var createSegmentDataObject = function (segmenter, segment, index, input, matchingRule) {

@@ -155,3 +231,3 @@ var returnValue = {

if (getSlot(segmenter, 'granularity') === 'word') {
returnValue.isWordLike = matchingRule !== '3.1' && matchingRule !== '3.2';
returnValue.isWordLike = isSegmentWordLike(segment, matchingRule);
}

@@ -158,0 +234,0 @@ return returnValue;

Sorry, the diff of this file is too big to display