Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

@ckirby/longest-common-substring

Package Overview
Dependencies
Maintainers
1
Versions
6
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@ckirby/longest-common-substring - npm Package Compare versions

Comparing version 1.0.1 to 1.1.0

151

index.js

@@ -1,6 +0,10 @@

function longestCommonSubstring(str1 = ``, str2 = ``) {
let arr2 = str2.split('');
const { memoizingStemmer: stemmer } = require('porter-stemmer');
const { assign } = Object;
function findCommonSubstrings(str1 = ``, str2 = ``, { threshold = 3 } = {}) {
let arr2 = str2.split(``);
let prev = [];
let best = ``;
str1.split('').forEach((char1, ii) => {
let substrings = [];
str1.split(``).forEach((char1, ii) => {
prev = arr2.map((char2, jj) => {

@@ -13,5 +17,11 @@ if (char1 !== char2) {

if (length > best.length) {
if (length >= threshold) {
let end = ii + 1;
best = str1.substring(end - length, end);
let offset1 = end - length;
let substring = str1.substring(offset1, end);
let curr = substrings[substrings.length - 1];
if (curr && curr.offset1 === offset1) {
substrings.pop();
}
substrings.push({ offset1, substring, offset2: jj + 1 - length });
}

@@ -21,7 +31,130 @@

});
}, '');
}, ``);
return best;
return substrings;
}
module.exports = { longestCommonSubstring };
function findCommonPhrases(str1 = ``, str2 = ``, options = {}) {
let { threshold = 3, trimOverlaps = false } = options;
let tk1 = tokenizer(str1, options);
let tk2 = tokenizer(str2, options);
let vocabulary = makeVocabulary(tk1);
let sequences = findCommonSubstrings(
vocabulary.fromTokens(tk1),
vocabulary.fromTokens(tk2),
{ threshold }
);
if (trimOverlaps) {
sequences = reduceOverlaps(sequences, threshold);
}
return [
makeOutput(sequences, tk1, `offset1`),
makeOutput(sequences, tk2, `offset2`)
];
}
// de-overlap sequences
function reduceOverlaps(inputs, threshold = 0) {
// sort by descending sequence length
let sequences = inputs.slice().sort((aa, bb) => bb.substring.length - aa.substring.length);
sequences.forEach((target, ii) => {
if (!target.substring.length) {
return;
}
let start = target.offset1;
let end = start + target.substring.length;
let candidates = sequences.slice(ii + 1);
// find sequences that start inside a longer sequence
let overlapping = candidates.filter(({ offset1 }) => offset1 > start && offset1 <= end);
// ensure the substring starts at the end of the target.substring
overlapping.forEach((seq) => assign(seq, {
offset1: end,
substring: seq.substring.slice(end - seq.offset1)
}));
// find sequences that end inside a longer sequence
overlapping = candidates.filter(
({ offset1, substring: { length } }) => (offset1 + length) > start && (offset1 + length) <= end
);
// ensure the substring ends at the target.offset1
overlapping.forEach((seq) => assign(seq, {
substring: seq.substring.slice(0, start - seq.offset1)
}));
});
// omit empties & sort by ascending offset1
sequences = sequences.filter(({ substring: { length } }) => length >= threshold);
return sequences.sort(({ offset1: aa }, { offset1: bb }) => aa - bb);
}
function makeOutput(sequences, { all: tokens }, offsetKey) {
return sequences.map((seq) => {
let index = seq[offsetKey] * 2 + 1;
let length = 2 * seq.substring.length;
let offset = tokens.slice(0, index).join(``).length;
let substring = tokens.slice(index, index + length - 1).join(``);
return { offset, substring };
});
}
function tokenizer(str, { caseInsensitive = false, conflateStems = false }) {
// split on word-chars & dashes with optional trailing "(s)"
let all = str.split(/([-\w]+(?:[(]s[)])?)/);
// odd-indexed items are the tokens
let tokens = all.filter((tk, ii) => ii % 2);
if (caseInsensitive) {
tokens = tokens.map((token) => token.toLowerCase());
}
if (conflateStems) {
tokens = tokens.map(stem);
}
return { all, tokens };
}
function makeVocabulary({ tokens }) {
const NOTFOUND = String.fromCharCode(33);
let uniqTokens = [ ...new Set(tokens) ];
let vocab = new Map(uniqTokens.map((token, ii) => [ token, String.fromCharCode(ii + 65) ]));
return assign(vocab, {
fromTokens({ tokens }) {
return tokens.map((tk) => vocab.get(tk) || NOTFOUND).join(``);
}
});
}
// list plurals that don't get stemmed such that
// stemmer(plural) === stemmer(singular)
const IRREGULARS = new Map([
[ 'apparatuses', stemmer('apparatus') ],
// the stemmer stems 'identification' differently from other forms of
// 'identify'
[ 'identification', stemmer('identify') ],
[ 'people', stemmer('person') ],
[ 'has', stemmer('have') ],
[ 'had', stemmer('have') ]
]);
function stem(word) {
if (!word) {
return '';
}
// strip trailing optional plural(s)
word = word.toLowerCase().replace(/[(]s[)]$/, '');
if (/\W/.test(word)) {
// individually stem each portion of a hyphenate (any any other word that
// contains a non-word char)
return word.split(/\W+/).map(stem).join(' ').trim().replace(/\s+/, ' ');
}
let irregularStem = IRREGULARS.get(word);
if (irregularStem) {
return irregularStem;
}
return stemmer(word);
}
module.exports = { findCommonSubstrings, findCommonPhrases, stem };

5

package.json
{
"name": "@ckirby/longest-common-substring",
"version": "1.0.1",
"version": "1.1.0",
"description": "find the longest string that is a substring of two strings",

@@ -20,3 +20,6 @@ "main": "index.js",

"lint": "eslint ."
},
"dependencies": {
"porter-stemmer": "^0.9.1"
}
}
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc