Comparing version 0.3.1 to 0.4.0
@@ -69,2 +69,3 @@ import type { TransformerContainer } from '../../transformer/Transformers'; | ||
private compileTerms; | ||
private validateWhitelistedTerms; | ||
} | ||
@@ -71,0 +72,0 @@ /** |
"use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.RegExpMatcher = void 0; | ||
const Char_1 = require("../../util/Char"); | ||
const Util_1 = require("../../pattern/Util"); | ||
const TransformerSet_1 = require("../../transformer/TransformerSet"); | ||
const Char_1 = require("../../util/Char"); | ||
const CharacterIterator_1 = require("../../util/CharacterIterator"); | ||
@@ -66,2 +66,3 @@ const IntervalCollection_1 = require("../IntervalCollection"); | ||
this.blacklistedTerms = this.compileTerms(blacklistedTerms); | ||
this.validateWhitelistedTerms(whitelistedTerms); | ||
this.whitelistedTerms = whitelistedTerms; | ||
@@ -73,19 +74,22 @@ this.blacklistMatcherTransformers = new TransformerSet_1.TransformerSet(blacklistMatcherTransformers); | ||
const whitelistedIntervals = this.getWhitelistedIntervals(input); | ||
const [indices, transformed] = this.applyTransformers(input, this.blacklistMatcherTransformers); | ||
const [transformedToOrigIndex, transformed] = this.applyTransformers(input, this.blacklistMatcherTransformers); | ||
const matches = []; | ||
for (const blacklistedTerm of this.blacklistedTerms) { | ||
for (const match of transformed.matchAll(blacklistedTerm.regExp)) { | ||
const matchLength = [...match[0]].length; // spread so we count code points, not code units | ||
const startIndex = indices[match.index]; | ||
// eslint-disable-next-line @typescript-eslint/restrict-plus-operands | ||
let endIndex = indices[match.index + matchLength - 1]; | ||
// Adjust the end index if needed. | ||
if (endIndex < transformed.length - 1 && // not the last character | ||
(0, Char_1.isHighSurrogate)(transformed.charCodeAt(endIndex)) && // character is a high surrogate | ||
(0, Char_1.isLowSurrogate)(transformed.charCodeAt(endIndex + 1)) // next character is a low surrogate | ||
const origStartIndex = transformedToOrigIndex[match.index]; | ||
let origEndIndex = transformedToOrigIndex[match.index + match[0].length - 1]; | ||
// End index is (unfortunately) inclusive, so adjust as necessary. | ||
if (origEndIndex < input.length - 1 && // not the last character | ||
(0, Char_1.isHighSurrogate)(input.charCodeAt(origEndIndex)) && // character is a high surrogate | ||
(0, Char_1.isLowSurrogate)(input.charCodeAt(origEndIndex + 1)) // next character is a low surrogate | ||
) { | ||
endIndex++; | ||
origEndIndex++; | ||
} | ||
if (!whitelistedIntervals.query(startIndex, endIndex)) { | ||
matches.push({ termId: blacklistedTerm.id, startIndex, endIndex, matchLength }); | ||
if (!whitelistedIntervals.query(origStartIndex, origEndIndex)) { | ||
matches.push({ | ||
termId: blacklistedTerm.id, | ||
startIndex: origStartIndex, | ||
endIndex: origEndIndex, | ||
matchLength: [...match[0]].length, | ||
}); | ||
} | ||
@@ -100,17 +104,15 @@ } | ||
const whitelistedIntervals = this.getWhitelistedIntervals(input); | ||
const [indices, transformed] = this.applyTransformers(input, this.blacklistMatcherTransformers); | ||
const [transformedToOrigIndex, transformed] = this.applyTransformers(input, this.blacklistMatcherTransformers); | ||
for (const blacklistedTerm of this.blacklistedTerms) { | ||
for (const match of transformed.matchAll(blacklistedTerm.regExp)) { | ||
const matchLength = [...match[0]].length; // spread so we count code points, not code units | ||
const startIndex = indices[match.index]; | ||
// eslint-disable-next-line @typescript-eslint/restrict-plus-operands | ||
let endIndex = indices[match.index + matchLength - 1]; | ||
// Adjust the end index if needed. | ||
if (endIndex < transformed.length - 1 && // not the last character | ||
(0, Char_1.isHighSurrogate)(transformed.charCodeAt(endIndex)) && // character is a high surrogate | ||
(0, Char_1.isLowSurrogate)(transformed.charCodeAt(endIndex + 1)) // next character is a low surrogate | ||
const origStartIndex = transformedToOrigIndex[match.index]; | ||
let origEndIndex = transformedToOrigIndex[match.index + match[0].length - 1]; | ||
// End index is (unfortunately) inclusive, so adjust as necessary. | ||
if (origEndIndex < input.length - 1 && // not the last character | ||
(0, Char_1.isHighSurrogate)(input.charCodeAt(origEndIndex)) && // character is a high surrogate | ||
(0, Char_1.isLowSurrogate)(input.charCodeAt(origEndIndex + 1)) // next character is a low surrogate | ||
) { | ||
endIndex++; | ||
origEndIndex++; | ||
} | ||
if (!whitelistedIntervals.query(startIndex, endIndex)) | ||
if (!whitelistedIntervals.query(origStartIndex, origEndIndex)) | ||
return true; | ||
@@ -123,17 +125,16 @@ } | ||
const matches = new IntervalCollection_1.IntervalCollection(); | ||
const [indices, transformed] = this.applyTransformers(input, this.whitelistMatcherTransformers); | ||
const [transformedToOrigIndex, transformed] = this.applyTransformers(input, this.whitelistMatcherTransformers); | ||
for (const whitelistedTerm of this.whitelistedTerms) { | ||
const length = [...whitelistedTerm].length; | ||
let lastEnd = 0; | ||
for (let startIndex = transformed.indexOf(whitelistedTerm, lastEnd); startIndex !== -1; startIndex = transformed.indexOf(whitelistedTerm, lastEnd)) { | ||
let endIndex = indices[startIndex + length - 1]; | ||
// Adjust the end index if needed. | ||
if (endIndex < transformed.length - 1 && // not the last character | ||
(0, Char_1.isHighSurrogate)(transformed.charCodeAt(endIndex)) && // character is a high surrogate | ||
(0, Char_1.isLowSurrogate)(transformed.charCodeAt(endIndex + 1)) // next character is a low surrogate | ||
let origEndIndex = transformedToOrigIndex[startIndex + whitelistedTerm.length - 1]; | ||
// End index is (unfortunately) inclusive, so adjust as necessary. | ||
if (origEndIndex < input.length - 1 && // not the last character | ||
(0, Char_1.isHighSurrogate)(input.charCodeAt(origEndIndex)) && // character is a high surrogate | ||
(0, Char_1.isLowSurrogate)(input.charCodeAt(origEndIndex + 1)) // next character is a low surrogate | ||
) { | ||
endIndex++; | ||
origEndIndex++; | ||
} | ||
matches.insert(indices[startIndex], endIndex); | ||
lastEnd = endIndex + 1; | ||
matches.insert(transformedToOrigIndex[startIndex], origEndIndex); | ||
lastEnd = startIndex + whitelistedTerm.length; | ||
} | ||
@@ -144,3 +145,3 @@ } | ||
applyTransformers(input, transformers) { | ||
const indices = []; | ||
const transformedToOrigIndex = []; | ||
let transformed = ''; | ||
@@ -151,8 +152,9 @@ const iter = new CharacterIterator_1.CharacterIterator(input); | ||
if (transformedChar !== undefined) { | ||
indices.push(iter.position); | ||
transformed += String.fromCodePoint(transformedChar); | ||
while (transformedToOrigIndex.length < transformed.length) | ||
transformedToOrigIndex.push(iter.position); | ||
} | ||
} | ||
transformers.resetAll(); | ||
return [indices, transformed]; | ||
return [transformedToOrigIndex, transformed]; | ||
} | ||
@@ -176,3 +178,8 @@ compileTerms(terms) { | ||
} | ||
validateWhitelistedTerms(whitelist) { | ||
if (whitelist.some((term) => term.length === 0)) { | ||
throw new Error('Whitelisted term set contains empty string; this is unsupported.'); | ||
} | ||
} | ||
} | ||
exports.RegExpMatcher = RegExpMatcher; |
{ | ||
"name": "obscenity", | ||
"version": "0.3.1", | ||
"version": "0.4.0", | ||
"description": "Robust, extensible profanity filter.", | ||
@@ -54,9 +54,8 @@ "files": [ | ||
"@types/jest": "^29.5.2", | ||
"@typescript-eslint/eslint-plugin": "^6.0.0", | ||
"@typescript-eslint/parser": "^6.0.0", | ||
"@typescript-eslint/eslint-plugin": "^6.21.0", | ||
"@typescript-eslint/parser": "^6.21.0", | ||
"conventional-github-releaser": "^3.1.5", | ||
"eslint": "^8.42.0", | ||
"eslint-config-neon": "^0.1.47", | ||
"eslint-config-prettier": "^9.0.0", | ||
"eslint-plugin-jest": "^27.2.1", | ||
"eslint": "^8.57.0", | ||
"eslint-config-prettier": "^9.1.0", | ||
"eslint-plugin-jest": "^27.9.0", | ||
"eslint-plugin-prettier": "^4.2.1", | ||
@@ -69,3 +68,3 @@ "fast-check": "^2.25.0", | ||
"prettier": "^2.8.8", | ||
"rimraf": "^5.0.0", | ||
"rimraf": "^6.0.0", | ||
"standard-version": "^9.5.0", | ||
@@ -72,0 +71,0 @@ "ts-jest": "^29.1.1", |
153856
24
3450