text-annotator
Advanced tools
Comparing version 0.9.5 to 0.9.6
@@ -8,4 +8,2 @@ "use strict"; | ||
var _htmlEntities = require("html-entities"); | ||
var _sbd = _interopRequireDefault(require("./ext/sbd")); | ||
@@ -15,3 +13,7 @@ | ||
// div inside span is a bad idea | ||
/**
 * Escape the five HTML special characters in a string.
 *
 * `&` is replaced first so that the ampersands introduced by the other
 * replacements are not escaped a second time.
 *
 * @param {string} str - Plain text to escape.
 * @returns {string} The text with `&`, `"`, `'`, `<` and `>` replaced by
 *   `&amp;`, `&quot;`, `&apos;`, `&lt;` and `&gt;`.
 */
const encode = str => {
  return str.replace(/&/g, '&amp;').replace(/"/g, '&quot;').replace(/'/g, '&apos;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
}; // div inside span is a bad idea
const blockElements = ['address', 'article', 'aside', 'blockquote', 'canvas', 'dd', 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'main', 'nav', 'noscript', 'ol', 'output', 'p', 'pre', 'section', 'table', 'tfoot', 'ul', 'video']; | ||
@@ -202,3 +204,3 @@ | ||
if (ifEncode && index === -1) { | ||
const encodedStrWithFixes = (0, _htmlEntities.encode)(strWithFixes); | ||
const encodedStrWithFixes = encode(strWithFixes); | ||
const index = text.indexOf(encodedStrWithFixes, offset); | ||
@@ -208,4 +210,4 @@ | ||
const loc = []; | ||
loc[0] = index + (0, _htmlEntities.encode)(prefix).length; | ||
loc[1] = loc[0] + (0, _htmlEntities.encode)(str).length; | ||
loc[0] = index + encode(prefix).length; | ||
loc[1] = loc[0] + encode(str).length; | ||
highlightIndex = this.highlights.push({ | ||
@@ -320,104 +322,104 @@ loc | ||
else if (sentenceBased) { | ||
// step 1: sentenize the text if has not done so | ||
let sentences = []; | ||
// step 1: sentenize the text if has not done so | ||
let sentences = []; | ||
if (this.sentences.length) { | ||
sentences = this.sentences; | ||
} else { | ||
sentences = this.sentences = TextAnnotator.sentenize(text); | ||
} // step 2 (for efficiency only): filter sentences by words of the str | ||
if (this.sentences.length) { | ||
sentences = this.sentences; | ||
} else { | ||
sentences = this.sentences = TextAnnotator.sentenize(text); | ||
} // step 2 (for efficiency only): filter sentences by words of the str | ||
const words = str.split(/\s/); | ||
const filteredSentences = []; | ||
const words = str.split(/\s/); | ||
const filteredSentences = []; | ||
for (let i = 0; i < sentences.length; i++) { | ||
for (let j = 0; j < words.length; j++) { | ||
if (sentences[i].raw.includes(words[j])) { | ||
filteredSentences.push(sentences[i]); | ||
break; | ||
} | ||
for (let i = 0; i < sentences.length; i++) { | ||
for (let j = 0; j < words.length; j++) { | ||
if (sentences[i].raw.includes(words[j])) { | ||
filteredSentences.push(sentences[i]); | ||
break; | ||
} | ||
} //step 3 (optional) | ||
} | ||
} //step 3 (optional) | ||
if (processSentence) { | ||
let index = 0; // for each sentence | ||
if (processSentence) { | ||
let index = 0; // for each sentence | ||
for (let i = 0; i < filteredSentences.length; i++) { | ||
const fs = filteredSentences[i]; | ||
let raw = fs.raw; // loc without tags | ||
for (let i = 0; i < filteredSentences.length; i++) { | ||
const fs = filteredSentences[i]; | ||
let raw = fs.raw; // loc without tags | ||
const loc = [fs.index, fs.index + raw.length]; | ||
let locInc = 0; // add loc of all tags before the one being checked so as to derive the actual loc | ||
const loc = [fs.index, fs.index + raw.length]; | ||
let locInc = 0; // add loc of all tags before the one being checked so as to derive the actual loc | ||
const tagLocations = this.tagLocations; // for each loc of tag whose loc is larger than the last sentence | ||
const tagLocations = this.tagLocations; // for each loc of tag whose loc is larger than the last sentence | ||
for (let j = index; j < tagLocations.length; j++) { | ||
const tagLoc = tagLocations[j]; | ||
for (let j = index; j < tagLocations.length; j++) { | ||
const tagLoc = tagLocations[j]; | ||
if (tagLoc[0] >= loc[0] && tagLoc[0] <= loc[1]) { | ||
const tag = this.originalContent.substring(tagLoc[0] + tagLoc[2], tagLoc[0] + tagLoc[2] + tagLoc[1]); | ||
const insertIndex = tagLoc[0] + locInc - loc[0]; | ||
raw = raw.slice(0, insertIndex) + tag + raw.slice(insertIndex); | ||
locInc += tagLoc[1]; | ||
} else if (tagLoc[0] > loc[1]) { | ||
index = j; // not sure this part | ||
if (tagLoc[0] >= loc[0] && tagLoc[0] <= loc[1]) { | ||
const tag = this.originalContent.substring(tagLoc[0] + tagLoc[2], tagLoc[0] + tagLoc[2] + tagLoc[1]); | ||
const insertIndex = tagLoc[0] + locInc - loc[0]; | ||
raw = raw.slice(0, insertIndex) + tag + raw.slice(insertIndex); | ||
locInc += tagLoc[1]; | ||
} else if (tagLoc[0] > loc[1]) { | ||
index = j; // not sure this part | ||
break; | ||
} | ||
break; | ||
} | ||
} | ||
raw = processSentence(raw); | ||
raw = raw.replace(/(<([^>]+)>)/gi, ''); | ||
const copy = fs.raw; // update the sentence if it got reduced | ||
raw = processSentence(raw); | ||
raw = raw.replace(/(<([^>]+)>)/gi, ''); | ||
const copy = fs.raw; // update the sentence if it got reduced | ||
if (copy !== raw) { | ||
fs.raw = raw; | ||
fs.index = fs.index + copy.indexOf(raw); | ||
} | ||
if (copy !== raw) { | ||
fs.raw = raw; | ||
fs.index = fs.index + copy.indexOf(raw); | ||
} | ||
} // step 4: find the most possible sentence | ||
} | ||
} // step 4: find the most possible sentence | ||
let mostPossibleSentence = null; | ||
let mostPossibleSentence = null; | ||
for (let i = 0; i < filteredSentences.length; i++) { | ||
const sentence = filteredSentences[i]; | ||
const similarity = TextAnnotator.getSimilarity(sentence.raw, str, caseSensitive); | ||
for (let i = 0; i < filteredSentences.length; i++) { | ||
const sentence = filteredSentences[i]; | ||
const similarity = TextAnnotator.getSimilarity(sentence.raw, str, caseSensitive); | ||
if (similarity >= sbThreshold) { | ||
sbThreshold = similarity; | ||
mostPossibleSentence = sentence; | ||
} else if (i !== filteredSentences.length - 1) { | ||
// combine two sentences to reduce the inaccuracy of sentenizing text | ||
const newSentenceRaw = sentence.raw + filteredSentences[i + 1].raw; | ||
const lengthDiff = Math.abs(newSentenceRaw.length - str.length) / str.length; | ||
if (similarity >= sbThreshold) { | ||
sbThreshold = similarity; | ||
mostPossibleSentence = sentence; | ||
} else if (i !== filteredSentences.length - 1) { | ||
// combine two sentences to reduce the inaccuracy of sentenizing text | ||
const newSentenceRaw = sentence.raw + filteredSentences[i + 1].raw; | ||
const lengthDiff = Math.abs(newSentenceRaw.length - str.length) / str.length; | ||
if (lengthDiff <= maxLengthDiff) { | ||
const newSimilarity = TextAnnotator.getSimilarity(newSentenceRaw, str, caseSensitive); | ||
if (lengthDiff <= maxLengthDiff) { | ||
const newSimilarity = TextAnnotator.getSimilarity(newSentenceRaw, str, caseSensitive); | ||
if (newSimilarity >= sbThreshold) { | ||
sbThreshold = newSimilarity; | ||
mostPossibleSentence = { | ||
raw: newSentenceRaw, | ||
index: sentence.index | ||
}; | ||
} | ||
if (newSimilarity >= sbThreshold) { | ||
sbThreshold = newSimilarity; | ||
mostPossibleSentence = { | ||
raw: newSentenceRaw, | ||
index: sentence.index | ||
}; | ||
} | ||
} | ||
} // step 5: if the most possible sentence is found, derive and return the location of the most similar str from it | ||
} | ||
} // step 5: if the most possible sentence is found, derive and return the location of the most similar str from it | ||
if (mostPossibleSentence) { | ||
const result = TextAnnotator.getBestSubstring(mostPossibleSentence.raw, str, sbThreshold, lenRatio, caseSensitive, true); | ||
if (mostPossibleSentence) { | ||
const result = TextAnnotator.getBestSubstring(mostPossibleSentence.raw, str, sbThreshold, lenRatio, caseSensitive, true); | ||
if (result.loc) { | ||
let index = mostPossibleSentence.index; | ||
highlightIndex = this.highlights.push({ | ||
loc: [index + result.loc[0], index + result.loc[1]] | ||
}) - 1; | ||
} | ||
if (result.loc) { | ||
let index = mostPossibleSentence.index; | ||
highlightIndex = this.highlights.push({ | ||
loc: [index + result.loc[0], index + result.loc[1]] | ||
}) - 1; | ||
} | ||
} | ||
} | ||
@@ -460,23 +462,23 @@ return highlightIndex; | ||
else { | ||
for (let i2 = i + 1; i2 < this.tagLocations.length; i2++) { | ||
const tagLoc2 = this.tagLocations[i2]; | ||
for (let i2 = i + 1; i2 < this.tagLocations.length; i2++) { | ||
const tagLoc2 = this.tagLocations[i2]; | ||
if (highlightLoc[1] < tagLoc2[0]) { | ||
break; | ||
} else { | ||
const tag2 = this.originalContent.substring(tagLoc2[0] + tagLoc2[2], tagLoc2[0] + tagLoc2[2] + tagLoc2[1]); | ||
if (highlightLoc[1] < tagLoc2[0]) { | ||
break; | ||
} else { | ||
const tag2 = this.originalContent.substring(tagLoc2[0] + tagLoc2[2], tagLoc2[0] + tagLoc2[2] + tagLoc2[1]); | ||
if (tag2.startsWith('<' + tagName)) { | ||
requiredTagNumber++; | ||
} else if (tag2.startsWith('</' + tagName)) { | ||
requiredTagCount++; | ||
} | ||
if (tag2.startsWith('<' + tagName)) { | ||
requiredTagNumber++; | ||
} else if (tag2.startsWith('</' + tagName)) { | ||
requiredTagCount++; | ||
} | ||
if (requiredTagNumber === requiredTagCount) { | ||
included = true; | ||
break; | ||
} | ||
if (requiredTagNumber === requiredTagCount) { | ||
included = true; | ||
break; | ||
} | ||
} | ||
} | ||
} | ||
@@ -499,22 +501,22 @@ return included; | ||
else if (highlightLoc[1] === tagLoc[0]) { | ||
const tag = this.originalContent.substring(tagLoc[0] + tagLoc[2], tagLoc[0] + tagLoc[2] + tagLoc[1]); // if end tag, not block element and include the required close tag, add right to the tag | ||
const tag = this.originalContent.substring(tagLoc[0] + tagLoc[2], tagLoc[0] + tagLoc[2] + tagLoc[1]); // if end tag, not block element and include the required close tag, add right to the tag | ||
if (!tag.endsWith('/>') && tag.startsWith('</') && !blockElements.includes(tag.split('</')[1].split('>')[0]) && this.includeRequiredTag(i, highlightLoc, tag)) { | ||
locInc[1] += tagLoc[1]; | ||
} | ||
} // start tag end | ||
else if (highlightLoc[1] > tagLoc[0]) { | ||
locInc[1] += tagLoc[1]; // start&tag end | ||
if (!tag.endsWith('/>') && tag.startsWith('</') && !blockElements.includes(tag.split('</')[1].split('>')[0]) && this.includeRequiredTag(i, highlightLoc, tag)) { | ||
locInc[1] += tagLoc[1]; | ||
} | ||
} // start tag end | ||
else if (highlightLoc[1] > tagLoc[0]) { | ||
locInc[1] += tagLoc[1]; // start&tag end | ||
if (highlightLoc[0] === tagLoc[0]) { | ||
const tag = this.originalContent.substring(tagLoc[0] + tagLoc[2], tagLoc[0] + tagLoc[2] + tagLoc[1]); // if self close tag or end tag or block element or not include the required close tag, add right to the tag | ||
if (highlightLoc[0] === tagLoc[0]) { | ||
const tag = this.originalContent.substring(tagLoc[0] + tagLoc[2], tagLoc[0] + tagLoc[2] + tagLoc[1]); // if self close tag or end tag or block element or not include the required close tag, add right to the tag | ||
if (tag.startsWith('</') || tag.endsWith('/>') || blockElements.includes(tag.split(' ')[0].split('<')[1].split('>')[0]) || !this.includeRequiredTag(i, highlightLoc, tag)) { | ||
locInc[0] += tagLoc[1]; | ||
} | ||
} // tag start end | ||
else if (highlightLoc[0] > tagLoc[0]) { | ||
locInc[0] += tagLoc[1]; | ||
} | ||
if (tag.startsWith('</') || tag.endsWith('/>') || blockElements.includes(tag.split(' ')[0].split('<')[1].split('>')[0]) || !this.includeRequiredTag(i, highlightLoc, tag)) { | ||
locInc[0] += tagLoc[1]; | ||
} | ||
} // tag start end | ||
else if (highlightLoc[0] > tagLoc[0]) { | ||
locInc[0] += tagLoc[1]; | ||
} | ||
} | ||
} // step 2: check locations of other highlights | ||
@@ -539,13 +541,13 @@ // all span (no blocks) | ||
else if (highlightLoc[0] < loc[1] && highlightLoc[0] > loc[0] && highlightLoc[1] > loc[1]) { | ||
locInc[0] += openTagLength; | ||
locInc[1] += openTagLength + closeTagLength; | ||
} else if (highlightLoc[0] <= loc[0] && highlightLoc[1] >= loc[1]) { | ||
locInc[1] += openTagLength + closeTagLength; | ||
} // syntactical correct but semantical incorrect | ||
else if (highlightLoc[0] < loc[0] && highlightLoc[1] > loc[0] && highlightLoc[1] < loc[1]) { | ||
locInc[1] += openTagLength; | ||
} else if (highlightLoc[0] >= loc[0] && highlightLoc[1] <= loc[1]) { | ||
locInc[0] += openTagLength; | ||
locInc[1] += openTagLength; | ||
} | ||
locInc[0] += openTagLength; | ||
locInc[1] += openTagLength + closeTagLength; | ||
} else if (highlightLoc[0] <= loc[0] && highlightLoc[1] >= loc[1]) { | ||
locInc[1] += openTagLength + closeTagLength; | ||
} // syntactical correct but semantical incorrect | ||
else if (highlightLoc[0] < loc[0] && highlightLoc[1] > loc[0] && highlightLoc[1] < loc[1]) { | ||
locInc[1] += openTagLength; | ||
} else if (highlightLoc[0] >= loc[0] && highlightLoc[1] <= loc[1]) { | ||
locInc[0] += openTagLength; | ||
locInc[1] += openTagLength; | ||
} | ||
} | ||
@@ -552,0 +554,0 @@ } |
{ | ||
"name": "text-annotator", | ||
"version": "0.9.5", | ||
"version": "0.9.6", | ||
"description": "A JavaScript library for locating and annotating plain text in HTML", | ||
@@ -8,6 +8,4 @@ "main": "build/text-annotator.js", | ||
"lint": "./node_modules/.bin/eslint src/** test/** --fix", | ||
"lint:nofix": "./node_modules/.bin/eslint src/** test/** --max-warnings 0", | ||
"build": "babel src -d build", | ||
"build-min": "webpack --config webpack.config.js", | ||
"sync": "node sync.js", | ||
"test": "jest" | ||
@@ -31,17 +29,14 @@ }, | ||
"devDependencies": { | ||
"@babel/cli": "^7.13.0", | ||
"@babel/core": "^7.13.8", | ||
"@babel/preset-env": "^7.13.9", | ||
"babel-jest": "^26.6.3", | ||
"dotenv": "^8.0.0", | ||
"eslint": "^7.21.0", | ||
"eslint-config-prettier": "^8.1.0", | ||
"eslint-plugin-jest": "^24.1.5", | ||
"eslint-plugin-prettier": "^3.3.1", | ||
"jest": "^26.6.3", | ||
"pre-commit": "^1.2.2", | ||
"prettier": "2.2.1", | ||
"sync-directory": "^2.2.17", | ||
"webpack": "^4.46.0", | ||
"webpack-cli": "^3.3.12" | ||
"@babel/cli": "^7.17.10", | ||
"@babel/core": "^7.18.5", | ||
"@babel/preset-env": "^7.18.2", | ||
"babel-jest": "^28.1.1", | ||
"eslint": "^8.18.0", | ||
"eslint-config-prettier": "^8.5.0", | ||
"eslint-plugin-jest": "^26.5.3", | ||
"eslint-plugin-prettier": "^4.0.0", | ||
"jest": "^28.1.1", | ||
"prettier": "2.7.1", | ||
"webpack": "^5.73.0", | ||
"webpack-cli": "^4.10.0" | ||
}, | ||
@@ -86,10 +81,3 @@ "babel": { | ||
"singleQuote": true | ||
}, | ||
"pre-commit": [ | ||
"lint:nofix", | ||
"test" | ||
], | ||
"dependencies": { | ||
"html-entities": "^2.1.0" | ||
} | ||
} |
@@ -148,2 +148,2 @@ # text-annotator | ||
## Contact | ||
[Zhan Huang](mailto:z2hm@outlook.com "Zhan Huang") | ||
[Zhan Huang](mailto:z2hm@outlook.com "Zhan Huang") |
@@ -1,4 +0,12 @@ | ||
import { encode } from 'html-entities' | ||
import getSentences from './ext/sbd' | ||
/**
 * Escape the five HTML special characters in a string.
 *
 * `&` is replaced first so that the ampersands introduced by the other
 * replacements are not escaped a second time.
 *
 * @param {string} str - Plain text to escape.
 * @returns {string} The text with `&`, `"`, `'`, `<` and `>` replaced by
 *   `&amp;`, `&quot;`, `&apos;`, `&lt;` and `&gt;`.
 */
const encode = (str) => {
  return str
    .replace(/&/g, '&amp;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
}
// div inside span is a bad idea | ||
@@ -5,0 +13,0 @@ const blockElements = [ |
const path = require('path') | ||
module.exports = { | ||
entry: './src/index.js', | ||
entry: './build/index.js', | ||
target: 'web', | ||
mode: 'production', | ||
mode: 'development', | ||
output: { | ||
@@ -8,0 +8,0 @@ path: path.join(__dirname, 'public/js'), |
Sorry, the diff of this file is too big to display
Environment variable access
Supply chain risk: Package accesses environment variables, which may be a sign of credential stuffing or data theft.
Found 3 instances in 1 package
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
0
12
2398
0
132557
13
148
- Removed html-entities@^2.1.0
- Removed html-entities@2.5.2 (transitive)