synonym-optimizer
Advanced tools
Comparing version 2.19.0 to 2.20.0
@@ -1,23 +0,1 @@ | ||
/**
 * Locale code of the text being analysed.
 *
 * The five listed locales have built-in stop words and a stemmer; any other
 * string is accepted, but then no standard stop words and no stemming apply.
 */
export declare type Languages = 'en_US' | 'fr_FR' | 'de_DE' | 'it_IT' | 'es_ES' | string;

/**
 * Returns the built-in stop word list of a language (empty for a language
 * without one).
 */
export declare function getStandardStopWords(lang: Languages): string[];

/**
 * Builds the effective, lower-cased stop word list: `stopWordsOverride`
 * (or the standard list when absent), minus `stopWordsToRemove`, plus
 * `stopWordsToAdd`.
 */
export declare function getStopWords(lang: Languages, stopWordsToAdd: string[], stopWordsToRemove: string[], stopWordsOverride: string[]): string[];

/** Tokenizes `input` into words, dropping HTML element names. */
export declare function extractWords(input: string, lang: Languages): string[];

/** Maps each word to the list of positions (indexes) where it occurs. */
interface WordsWithPos {
    [key: string]: number[];
}

/**
 * Groups `words` by value and records their positions; words declared as
 * identical (see `identicals`) are counted under one shared key.
 *
 * @throws an error named `InvalidArgumentError` when `identicals` is
 * provided but is not an array of arrays.
 */
export declare function getWordsWithPos(lang: Languages, words: string[], identicals: string[][], debugHolder: DebugHolder): WordsWithPos;

/**
 * Repetition score: for every word, the sum of `1 / distance` between each
 * pair of consecutive occurrences. 0 means no repetition.
 */
export declare function getScore(wordsWithPos: WordsWithPos): number;

/** Maps a stemmed "identical" word to the key shared by its whole group. */
interface IdenticalsMap {
    [key: string]: string;
}

/** Optional holder filled with intermediate results while scoring a text. */
export interface DebugHolder {
    /** Words left after lower-casing, stop word filtering and stemming. */
    filteredAlt?: string[];
    /** The `identicals` argument, as received. */
    identicals?: string[][];
    /** Lookup table derived from `identicals`. */
    identicalsMap?: IdenticalsMap;
    /** Positions of the words that occur more than once. */
    wordsWithPos?: WordsWithPos;
    /** Final repetition score. */
    score?: number;
}

/**
 * Extracts the words of `text`, lower-cases them, removes the given stop
 * words and stems what remains.
 */
export declare function getStemmedWords(text: string, stopwords: string[], lang: Languages): string[];

/** Computes the repetition score of a single text; lower is better. */
export declare function scoreAlternative(lang: Languages, alternative: string, stopWordsToAdd: string[], stopWordsToRemove: string[], stopWordsOverride: string[], identicals: string[][], debugHolder: DebugHolder): number;

/**
 * Returns the index of the alternative with the lowest repetition score
 * (the first one on a tie), or -1 when `alternatives` is empty.
 */
export declare function getBest(lang: Languages, alternatives: string[], stopWordsToAdd: string[], stopWordsToRemove: string[], stopWordsOverride: string[], identicals: string[][]): number;
export {};
export { DebugHolder, SynOptimizer } from './SynOptimizer'; |
"use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.getBest = exports.scoreAlternative = exports.getStemmedWords = exports.getScore = exports.getWordsWithPos = exports.extractWords = exports.getStopWords = exports.getStandardStopWords = void 0; | ||
const tokenizer = require("wink-tokenizer"); | ||
const stopwordsFr = require("stopwords-fr"); | ||
const stopwordsDe = require("stopwords-de"); | ||
const stopwordsEn = require("stopwords-en"); | ||
const stopwordsIt = require("stopwords-it"); | ||
const stopwordsEs = require("stopwords-es"); | ||
const englishStemmer = require("snowball-stemmer.jsx/dest/english-stemmer.common.js"); | ||
const frenchStemmer = require("snowball-stemmer.jsx/dest/french-stemmer.common.js"); | ||
const germanStemmer = require("snowball-stemmer.jsx/dest/german-stemmer.common.js"); | ||
const italianStemmer = require("snowball-stemmer.jsx/dest/italian-stemmer.common.js"); | ||
const spanishStemmer = require("snowball-stemmer.jsx/dest/spanish-stemmer.common.js"); | ||
const rosaenlg_filter_1 = require("rosaenlg-filter"); | ||
const fullySupportedLanguages = ['en_US', 'de_DE', 'fr_FR', 'it_IT', 'es_ES']; | ||
/**
 * Returns the built-in stop word list for a language.
 *
 * Exported for testing purposes.
 *
 * The returned array is the value loaded from the matching `stopwords-*`
 * package itself, not a copy, so callers must not mutate it.
 *
 * @param {string} lang - locale code ('en_US', 'fr_FR', 'de_DE', 'it_IT' or 'es_ES')
 * @returns {string[]} the stop words, or an empty array for any other language
 */
function getStandardStopWords(lang) {
    switch (lang) {
        case 'en_US':
            return stopwordsEn;
        case 'fr_FR':
            return stopwordsFr;
        case 'de_DE':
            return stopwordsDe;
        case 'it_IT':
            return stopwordsIt;
        case 'es_ES':
            return stopwordsEs;
        default:
            return [];
    }
}
exports.getStandardStopWords = getStandardStopWords; | ||
/**
 * Builds the effective stop word list for a language.
 *
 * The base list is `stopWordsOverride` when provided (an empty array counts
 * as provided), otherwise the standard list of the language. Words in
 * `stopWordsToRemove` are then dropped and words in `stopWordsToAdd`
 * appended. None of the input arrays is mutated.
 *
 * Everything is lower-cased before comparing, so removal is
 * case-insensitive: the result is lower-cased anyway, and a case-sensitive
 * removal would let e.g. 'The' survive a request to remove 'the'.
 *
 * @param {string} lang - locale code, used only to pick the standard list
 * @param {string[]} [stopWordsToAdd] - extra stop words
 * @param {string[]} [stopWordsToRemove] - stop words to drop from the base list
 * @param {string[]} [stopWordsOverride] - replaces the standard list
 * @returns {string[]} a new array of lower-cased stop words
 */
function getStopWords(lang, stopWordsToAdd, stopWordsToRemove, stopWordsOverride) {
    const toLower = (word) => word.toLowerCase();
    const source = stopWordsOverride ? stopWordsOverride : getStandardStopWords(lang);
    let baseList = source.map(toLower);
    if (stopWordsToRemove) {
        // a Set keeps the removal linear even with the large standard lists
        const removed = new Set(stopWordsToRemove.map(toLower));
        baseList = baseList.filter((word) => !removed.has(word));
    }
    if (stopWordsToAdd) {
        baseList = baseList.concat(stopWordsToAdd.map(toLower));
    }
    return baseList;
}
exports.getStopWords = getStopWords; | ||
/**
 * Splits a text into words.
 *
 * Tokens tagged 'alien' by the tokenizer are dropped, and so are tokens
 * whose value is listed in the block-level or inline HTML element lists of
 * rosaenlg-filter. For French, a leading elided article or pronoun
 * (d' q' l' s' j' t' m' n', either case) is stripped from each word.
 *
 * @param {string} input - the text to tokenize
 * @param {string} lang - locale code; only 'fr_FR' gets special treatment
 * @returns {string[]} the words, in order of appearance, case preserved
 */
function extractWords(input, lang) {
    const myTokenizer = new tokenizer();
    // NOTE(review): presumably turns off detection of these token kinds so
    // that they end up as 'alien' tokens and are filtered out below; confirm
    // against the wink-tokenizer documentation.
    myTokenizer.defineConfig({
        currency: false,
        number: false,
        punctuation: false,
        symbol: false,
        time: false,
    });
    const isHtmlElt = (value) => rosaenlg_filter_1.blockLevelHtmlElts.includes(value) || rosaenlg_filter_1.inlineHtmlElts.includes(value);
    // no alien tags and no html elements
    let res = myTokenizer
        .tokenize(input)
        .filter((elt) => elt.tag !== 'alien' && !isHtmlElt(elt.value))
        .map((elt) => elt.value);
    if (lang === 'fr_FR') {
        // we just leave [Pp]uisqu [Jj]usqu [Ll]orsqu as they are
        const elision = /^[DdQqLlSsJjTtMmNn]'/;
        res = res
            .map((word) => word.replace(elision, ''))
            // stripping the elision sometimes leaves empty elements
            .filter((word) => word.length > 0);
    }
    return res;
}
exports.extractWords = extractWords; | ||
// One stemmer instance per language, created on first use.
const stemmersCache = {};
/**
 * Stems a word with the Snowball stemmer of the given language.
 *
 * @param {string} word - the word to stem
 * @param {string} lang - locale code
 * @returns {string} the stemmed word, or `word` unchanged when the language
 * is not one of the fully supported ones
 */
function stemWordForLang(word, lang) {
    if (!fullySupportedLanguages.includes(lang)) {
        return word;
    }
    if (!stemmersCache[lang]) {
        switch (lang) {
            case 'en_US':
                stemmersCache[lang] = new englishStemmer.EnglishStemmer();
                break;
            case 'de_DE':
                stemmersCache[lang] = new germanStemmer.GermanStemmer();
                break;
            case 'fr_FR':
                stemmersCache[lang] = new frenchStemmer.FrenchStemmer();
                break;
            case 'it_IT':
                stemmersCache[lang] = new italianStemmer.ItalianStemmer();
                break;
            case 'es_ES':
                stemmersCache[lang] = new spanishStemmer.SpanishStemmer();
                break;
        }
    }
    return stemmersCache[lang].stemWord(word);
}
/**
 * Groups words by value and records the positions where each one occurs.
 *
 * Words declared as identical are merged: every element of an `identicals`
 * group is stemmed and mapped to a shared key (the group's elements joined
 * with '_'), and occurrences of any of them are counted under that key.
 *
 * Both lookup tables are plain objects, so only own properties are
 * consulted and written. Otherwise a word such as 'constructor' or
 * 'toString' would hit the member inherited from Object.prototype.
 *
 * @param {string} lang - locale code, used to stem the identicals
 * @param {string[]} words - the (already stemmed) words of the text
 * @param {string[][]} [identicals] - groups of words to treat as the same word
 * @param {object} [debugHolder] - when provided, receives `identicals`
 * (only if given) and `identicalsMap`
 * @returns {Object<string, number[]>} for each word or group key, the
 * ascending list of its positions in `words`
 * @throws {Error} named 'InvalidArgumentError' when `identicals` is provided
 * but is not an array of arrays
 */
function getWordsWithPos(lang, words, identicals, debugHolder) {
    const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
    // defineProperty always creates an own property, even for '__proto__'
    const setOwn = (obj, key, value) => {
        Object.defineProperty(obj, key, { value: value, writable: true, enumerable: true, configurable: true });
    };
    const identicalsMap = {};
    if (identicals) {
        // check type
        const isListOfLists = Array.isArray(identicals) && identicals.every((identicalList) => Array.isArray(identicalList));
        if (!isListOfLists) {
            const err = new Error();
            err.name = 'InvalidArgumentError';
            err.message = `identicals must be a string[][]`;
            throw err;
        }
        if (debugHolder) {
            debugHolder.identicals = identicals;
        }
        // do the job
        identicals.forEach((identicalList) => {
            const mapTo = identicalList.join('_');
            identicalList.forEach((identicalElt) => {
                setOwn(identicalsMap, stemWordForLang(identicalElt, lang), mapTo);
            });
        });
    }
    if (debugHolder) {
        debugHolder.identicalsMap = identicalsMap;
    }
    const wordsWithPos = {};
    for (let j = 0; j < words.length; j++) {
        const word = (hasOwn(identicalsMap, words[j]) && identicalsMap[words[j]]) || words[j];
        if (!hasOwn(wordsWithPos, word)) {
            setOwn(wordsWithPos, word, []);
        }
        wordsWithPos[word].push(j);
    }
    return wordsWithPos;
}
exports.getWordsWithPos = getWordsWithPos; | ||
/**
 * Computes the repetition score of a text from its word positions.
 *
 * Each pair of consecutive occurrences of the same word adds
 * `1 / distance` to the score, so close repetitions weigh more than distant
 * ones. Words that occur once add nothing.
 *
 * @param {Object<string, number[]>} wordsWithPos - for each word, the
 * ascending list of its positions
 * @returns {number} the score; 0 when no word is repeated
 */
function getScore(wordsWithPos) {
    return Object.keys(wordsWithPos).reduce((total, word) => {
        const positions = wordsWithPos[word];
        for (let j = 1; j < positions.length; j++) {
            total += 1 / (positions[j] - positions[j - 1]);
        }
        return total;
    }, 0);
}
exports.getScore = getScore; | ||
/**
 * Turns a text into the list of words that matter for repetition scoring:
 * the words are extracted, lower-cased, filtered against the stop words and
 * finally stemmed.
 *
 * @param {string} text - the text to analyse
 * @param {string[]} stopwords - lower-cased words to ignore
 * @param {string} lang - locale code, used for extraction and stemming
 * @returns {string[]} the stemmed words, in order of appearance
 */
function getStemmedWords(text, stopwords, lang) {
    // a Set avoids scanning the whole stop word list for every word
    const stopwordsSet = new Set(stopwords);
    return extractWords(text, lang)
        .map((word) => word.toLowerCase())
        .filter((word) => !stopwordsSet.has(word))
        .map((word) => stemWordForLang(word, lang));
}
exports.getStemmedWords = getStemmedWords; | ||
/**
 * Computes the repetition score of one text alternative; lower is better.
 *
 * @param {string} lang - locale code
 * @param {string} alternative - the text to score
 * @param {string[]} [stopWordsToAdd] - extra stop words
 * @param {string[]} [stopWordsToRemove] - stop words to drop from the base list
 * @param {string[]} [stopWordsOverride] - replaces the standard stop word list
 * @param {string[][]} [identicals] - groups of words to treat as the same word
 * @param {object} [debugHolder] - when provided, receives `filteredAlt`,
 * `wordsWithPos` (repeated words only) and `score`, in addition to what
 * `getWordsWithPos` stores in it
 * @returns {number} the score
 */
function scoreAlternative(lang, alternative, stopWordsToAdd, stopWordsToRemove, stopWordsOverride, identicals, debugHolder) {
    const stopwords = getStopWords(lang, stopWordsToAdd, stopWordsToRemove, stopWordsOverride);
    const filteredAlt = getStemmedWords(alternative, stopwords, lang);
    if (debugHolder) {
        // stored before the next step so it is available even if that step throws
        debugHolder.filteredAlt = filteredAlt;
    }
    const wordsWithPos = getWordsWithPos(lang, filteredAlt, identicals, debugHolder);
    const score = getScore(wordsWithPos);
    if (debugHolder) {
        // only keep ones with > 1 for readability
        debugHolder.wordsWithPos = {};
        Object.keys(wordsWithPos).forEach((word) => {
            if (wordsWithPos[word].length > 1) {
                debugHolder.wordsWithPos[word] = wordsWithPos[word];
            }
        });
        debugHolder.score = score;
    }
    return score;
}
exports.scoreAlternative = scoreAlternative; | ||
/**
 * Finds the alternative with the fewest repetitions.
 *
 * The minimum is tracked in a single pass rather than spreading all scores
 * into `Math.min`, which is bounded by the engine's argument-count limit.
 *
 * @param {string} lang - locale code
 * @param {string[]} alternatives - the candidate texts
 * @param {string[]} [stopWordsToAdd] - extra stop words
 * @param {string[]} [stopWordsToRemove] - stop words to drop from the base list
 * @param {string[]} [stopWordsOverride] - replaces the standard stop word list
 * @param {string[][]} [identicals] - groups of words to treat as the same word
 * @returns {number} index of the lowest-scoring alternative (the first one
 * on a tie), or -1 when `alternatives` is empty
 */
function getBest(lang, alternatives, stopWordsToAdd, stopWordsToRemove, stopWordsOverride, identicals) {
    let bestIndex = -1;
    let bestScore = Infinity;
    alternatives.forEach((alt, index) => {
        const score = scoreAlternative(lang, alt, stopWordsToAdd, stopWordsToRemove, stopWordsOverride, identicals, null);
        // strict comparison keeps the first alternative on a tie
        if (score < bestScore) {
            bestScore = score;
            bestIndex = index;
        }
    });
    return bestIndex;
}
exports.getBest = getBest; | ||
exports.SynOptimizer = void 0; | ||
var SynOptimizer_1 = require("./SynOptimizer"); | ||
Object.defineProperty(exports, "SynOptimizer", { enumerable: true, get: function () { return SynOptimizer_1.SynOptimizer; } }); | ||
//# sourceMappingURL=index.js.map |
"use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
const fs = require("fs"); | ||
const index_1 = require("./index"); | ||
const helper_1 = require("./helper"); | ||
function generateStopWordsDoc(dest) { | ||
@@ -12,3 +12,4 @@ const stream = fs.createWriteStream(dest, 'utf-8'); | ||
stream.write(`== ${language}\n\n`); | ||
const stopWords = index_1.getStandardStopWords(language); | ||
const languageSyn = helper_1.buildLanguageSyn(helper_1.getIso2fromLocale(language)); | ||
const stopWords = languageSyn.getStandardStopWords(); | ||
stream.write(stopWords.join(' - ')); | ||
@@ -15,0 +16,0 @@ stream.write(`\n\n\n`); |
{ | ||
"name": "synonym-optimizer", | ||
"version": "2.19.0", | ||
"version": "2.20.0", | ||
"description": "Finds the text which has the least number of repetitions", | ||
@@ -11,3 +11,3 @@ "main": "dist/index.js", | ||
"tsc": "tsc", | ||
"build": "tsc && node dist/stopWordsDoc.js" | ||
"build": "tsc && node ./dist/stopWordsDoc.js" | ||
}, | ||
@@ -56,3 +56,3 @@ "nyc": { | ||
"@types/wink-tokenizer": "^4.0.1", | ||
"rosaenlg-filter": "2.19.0", | ||
"rosaenlg-filter": "2.20.0", | ||
"snowball-stemmer.jsx": "^0.2.3", | ||
@@ -66,3 +66,3 @@ "stopwords-de": "^0.3.0", | ||
}, | ||
"gitHead": "ecf867234952618936b9710b363984bfe00632ba" | ||
"gitHead": "9223cdd558d1775f866f99921e3391d5d7cdc4f2" | ||
} |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
152945
56
934
+ Added english-a-an@0.12.0 (transitive)
+ Added english-a-an-list@0.12.0 (transitive)
+ Added french-contractions@2.20.0 (transitive)
+ Added rosaenlg-commons@0.4.0 (transitive)
+ Added rosaenlg-filter@2.20.0 (transitive)
- Removed english-a-an@0.11.0 (transitive)
- Removed english-a-an-list@0.11.0 (transitive)
- Removed french-contractions@2.19.0 (transitive)
- Removed rosaenlg-commons@0.3.0 (transitive)
- Removed rosaenlg-filter@2.19.0 (transitive)
Updated rosaenlg-filter@2.20.0