retext-keywords
Advanced tools
Comparing version 4.0.1 to 4.0.2
288
index.js
@@ -1,50 +0,50 @@ | ||
'use strict'; | ||
'use strict' | ||
var stemmer = require('stemmer'); | ||
var visit = require('unist-util-visit'); | ||
var nlcstToString = require('nlcst-to-string'); | ||
var pos = require('retext-pos'); | ||
var stemmer = require('stemmer') | ||
var visit = require('unist-util-visit') | ||
var nlcstToString = require('nlcst-to-string') | ||
var pos = require('retext-pos') | ||
module.exports = keywords; | ||
module.exports = keywords | ||
var own = {}.hasOwnProperty; | ||
var own = {}.hasOwnProperty | ||
function keywords(options) { | ||
this.use(pos).use(gatherKeywords, options); | ||
this.use(pos).use(gatherKeywords, options) | ||
} | ||
function gatherKeywords(options) { | ||
var maximum = (options || {}).maximum || 5; | ||
var maximum = (options || {}).maximum || 5 | ||
return transformer; | ||
return transformer | ||
function transformer(tree, file) { | ||
var important = getImportantWords(tree); | ||
var important = getImportantWords(tree) | ||
file.data.keywords = filterResults(cloneMatches(important), maximum); | ||
file.data.keyphrases = getKeyphrases(important, maximum); | ||
file.data.keywords = filterResults(cloneMatches(important), maximum) | ||
file.data.keyphrases = getKeyphrases(important, maximum) | ||
} | ||
} | ||
/* Get following or preceding important words or white space. */ | ||
// Get following or preceding important words or white space. | ||
function findPhraseInDirection(node, index, parent, offset) { | ||
var children = parent.children; | ||
var nodes = []; | ||
var stems = []; | ||
var words = []; | ||
var queue = []; | ||
var child; | ||
var children = parent.children | ||
var nodes = [] | ||
var stems = [] | ||
var words = [] | ||
var queue = [] | ||
var child | ||
while (children[index += offset]) { | ||
child = children[index]; | ||
while (children[(index += offset)]) { | ||
child = children[index] | ||
if (child.type === 'WhiteSpaceNode') { | ||
queue.push(child); | ||
queue.push(child) | ||
} else if (isImportant(child)) { | ||
nodes = nodes.concat(queue, [child]); | ||
words.push(child); | ||
stems.push(stemNode(child)); | ||
queue = []; | ||
nodes = nodes.concat(queue, [child]) | ||
words.push(child) | ||
stems.push(stemNode(child)) | ||
queue = [] | ||
} else { | ||
break; | ||
break | ||
} | ||
@@ -57,32 +57,32 @@ } | ||
nodes: nodes | ||
}; | ||
} | ||
} | ||
/* Get the top important phrases in `self`. */ | ||
// Get the top important phrases in `self`. | ||
function getKeyphrases(results, maximum) { | ||
var stemmedPhrases = {}; | ||
var initialWords = []; | ||
var stemmedPhrase; | ||
var index; | ||
var length; | ||
var otherIndex; | ||
var keyword; | ||
var matches; | ||
var phrase; | ||
var stems; | ||
var score; | ||
var first; | ||
var match; | ||
var stemmedPhrases = {} | ||
var initialWords = [] | ||
var stemmedPhrase | ||
var index | ||
var length | ||
var otherIndex | ||
var keyword | ||
var matches | ||
var phrase | ||
var stems | ||
var score | ||
var first | ||
var match | ||
/* Iterate over all grouped important words... */ | ||
// Iterate over all grouped important words... | ||
for (keyword in results) { | ||
matches = results[keyword].matches; | ||
length = matches.length; | ||
index = -1; | ||
matches = results[keyword].matches | ||
length = matches.length | ||
index = -1 | ||
/* Iterate over every occurence of a certain keyword... */ | ||
// Iterate over every occurence of a certain keyword... | ||
while (++index < length) { | ||
phrase = findPhrase(matches[index]); | ||
stemmedPhrase = stemmedPhrases[phrase.value]; | ||
first = phrase.nodes[0]; | ||
phrase = findPhrase(matches[index]) | ||
stemmedPhrase = stemmedPhrases[phrase.value] | ||
first = phrase.nodes[0] | ||
@@ -92,30 +92,26 @@ match = { | ||
parent: matches[index].parent | ||
}; | ||
} | ||
/* If we've detected the same stemmed | ||
* phrase somewhere. */ | ||
// If we've detected the same stemmed phrase somewhere. | ||
if (own.call(stemmedPhrases, phrase.value)) { | ||
/* Add weight per phrase to the score of | ||
* the phrase. */ | ||
stemmedPhrase.score += stemmedPhrase.weight; | ||
// Add weight per phrase to the score of the phrase. | ||
stemmedPhrase.score += stemmedPhrase.weight | ||
/* If this is the first time we walk over | ||
* the phrase (exact match but containing | ||
* another important word), add it to the | ||
* list of matching phrases. */ | ||
// If this is the first time we walk over the phrase (exact match but | ||
// containing another important word), add it to the list of matching | ||
// phrases. | ||
if (initialWords.indexOf(first) === -1) { | ||
initialWords.push(first); | ||
stemmedPhrase.matches.push(match); | ||
initialWords.push(first) | ||
stemmedPhrase.matches.push(match) | ||
} | ||
} else { | ||
otherIndex = -1; | ||
score = -1; | ||
stems = phrase.stems; | ||
otherIndex = -1 | ||
score = -1 | ||
stems = phrase.stems | ||
initialWords.push(first); | ||
initialWords.push(first) | ||
/* For every stem in phrase, add its | ||
* score to score. */ | ||
// For every stem in phrase, add its score to score. | ||
while (stems[++otherIndex]) { | ||
score += results[stems[otherIndex]].score; | ||
score += results[stems[otherIndex]].score | ||
} | ||
@@ -129,3 +125,3 @@ | ||
matches: [match] | ||
}; | ||
} | ||
} | ||
@@ -136,79 +132,80 @@ } | ||
for (stemmedPhrase in stemmedPhrases) { | ||
phrase = stemmedPhrases[stemmedPhrase]; | ||
phrase = stemmedPhrases[stemmedPhrase] | ||
/* Modify its score to be the rounded result of | ||
* multiplying it with the number of occurances, | ||
* and dividing it by the ammount of words in the | ||
* phrase. */ | ||
// Modify its score to be the rounded result of multiplying it with the | ||
// number of occurances, and dividing it by the ammount of words in the | ||
// phrase. | ||
phrase.score = Math.round( | ||
phrase.score * phrase.matches.length / phrase.stems.length | ||
); | ||
(phrase.score * phrase.matches.length) / phrase.stems.length | ||
) | ||
} | ||
return filterResults(stemmedPhrases, maximum); | ||
return filterResults(stemmedPhrases, maximum) | ||
} | ||
/* Get the top results from an occurance map. */ | ||
// Get the top results from an occurance map. | ||
function filterResults(results, maximum) { | ||
var filteredResults = []; | ||
var indices = []; | ||
var matrix = {}; | ||
var column; | ||
var key; | ||
var score; | ||
var interpolated; | ||
var index; | ||
var otherIndex; | ||
var maxScore; | ||
var filteredResults = [] | ||
var indices = [] | ||
var matrix = {} | ||
var column | ||
var key | ||
var score | ||
var interpolated | ||
var index | ||
var otherIndex | ||
var maxScore | ||
for (key in results) { | ||
score = results[key].score; | ||
score = results[key].score | ||
if (!matrix[score]) { | ||
matrix[score] = []; | ||
indices.push(score); | ||
matrix[score] = [] | ||
indices.push(score) | ||
} | ||
matrix[score].push(results[key]); | ||
matrix[score].push(results[key]) | ||
} | ||
indices.sort(reverse); | ||
indices.sort(reverse) | ||
maxScore = indices[0]; | ||
maxScore = indices[0] | ||
index = -1; | ||
index = -1 | ||
while (indices[++index]) { | ||
score = indices[index]; | ||
column = matrix[score]; | ||
score = indices[index] | ||
column = matrix[score] | ||
interpolated = score / maxScore; | ||
otherIndex = -1; | ||
interpolated = score / maxScore | ||
otherIndex = -1 | ||
while (column[++otherIndex]) { | ||
column[otherIndex].score = interpolated; | ||
column[otherIndex].score = interpolated | ||
} | ||
filteredResults = filteredResults.concat(column); | ||
filteredResults = filteredResults.concat(column) | ||
if (filteredResults.length >= maximum) { | ||
break; | ||
break | ||
} | ||
} | ||
return filteredResults; | ||
return filteredResults | ||
} | ||
/* Merge a previous array, with a current value, and | ||
* a following array. */ | ||
// Merge a previous array, with a current value, and a following array. | ||
function merge(prev, current, next) { | ||
return prev.concat().reverse().concat([current], next); | ||
return prev | ||
.concat() | ||
.reverse() | ||
.concat([current], next) | ||
} | ||
/* Find the phrase surrounding a node. */ | ||
// Find the phrase surrounding a node. | ||
function findPhrase(match) { | ||
var node = match.node; | ||
var prev = findPhraseInDirection(node, match.index, match.parent, -1); | ||
var next = findPhraseInDirection(node, match.index, match.parent, 1); | ||
var stems = merge(prev.stems, stemNode(node), next.stems); | ||
var node = match.node | ||
var prev = findPhraseInDirection(node, match.index, match.parent, -1) | ||
var next = findPhraseInDirection(node, match.index, match.parent, 1) | ||
var stems = merge(prev.stems, stemNode(node), next.stems) | ||
@@ -219,19 +216,19 @@ return { | ||
nodes: merge(prev.nodes, node, next.nodes) | ||
}; | ||
} | ||
} | ||
/* Get most important words in `node`. */ | ||
// Get most important words in `node`. | ||
function getImportantWords(node) { | ||
var words = {}; | ||
var words = {} | ||
visit(node, 'WordNode', visitor); | ||
visit(node, 'WordNode', visitor) | ||
return words; | ||
return words | ||
function visitor(word, index, parent) { | ||
var match; | ||
var stem; | ||
var match | ||
var stem | ||
if (isImportant(word)) { | ||
stem = stemNode(word); | ||
stem = stemNode(word) | ||
match = { | ||
@@ -241,3 +238,3 @@ node: word, | ||
parent: parent | ||
}; | ||
} | ||
@@ -249,6 +246,6 @@ if (!own.call(words, stem)) { | ||
score: 1 | ||
}; | ||
} | ||
} else { | ||
words[stem].matches.push(match); | ||
words[stem].score++; | ||
words[stem].matches.push(match) | ||
words[stem].score++ | ||
} | ||
@@ -259,11 +256,10 @@ } | ||
/* Clone the given map of words. | ||
* This is a two level-deep clone. */ | ||
// Clone the given map of words. This is a two level-deep clone. | ||
function cloneMatches(words) { | ||
var result = {}; | ||
var key; | ||
var match; | ||
var result = {} | ||
var key | ||
var match | ||
for (key in words) { | ||
match = words[key]; | ||
match = words[key] | ||
@@ -274,9 +270,9 @@ result[key] = { | ||
score: match.score | ||
}; | ||
} | ||
} | ||
return result; | ||
return result | ||
} | ||
/* Check if `node` is important. */ | ||
// Check if `node` is important. | ||
function isImportant(node) { | ||
@@ -287,25 +283,21 @@ return ( | ||
node.data.partOfSpeech && | ||
( | ||
node.data.partOfSpeech.indexOf('N') === 0 || | ||
( | ||
node.data.partOfSpeech === 'JJ' && | ||
isUpperCase(nlcstToString(node).charAt(0)) | ||
) | ||
) | ||
); | ||
(node.data.partOfSpeech.indexOf('N') === 0 || | ||
(node.data.partOfSpeech === 'JJ' && | ||
isUpperCase(nlcstToString(node).charAt(0)))) | ||
) | ||
} | ||
/* Check if `value` is upper-case. */ | ||
// Check if `value` is upper-case. | ||
function isUpperCase(value) { | ||
return value === String(value).toUpperCase(); | ||
return value === String(value).toUpperCase() | ||
} | ||
/* Reverse sort: from 9 to 0. */ | ||
// Reverse sort: from 9 to 0. | ||
function reverse(a, b) { | ||
return b - a; | ||
return b - a | ||
} | ||
/* Get the stem of a node. */ | ||
// Get the stem of a node. | ||
function stemNode(node) { | ||
return stemmer(nlcstToString(node)).toLowerCase(); | ||
return stemmer(nlcstToString(node)).toLowerCase() | ||
} |
{ | ||
"name": "retext-keywords", | ||
"version": "4.0.1", | ||
"version": "4.0.2", | ||
"description": "Keyword extraction with Retext", | ||
@@ -14,7 +14,7 @@ "license": "MIT", | ||
], | ||
"repository": "wooorm/retext-keywords", | ||
"bugs": "https://github.com/wooorm/retext-keywords/issues", | ||
"author": "Titus Wormer <tituswormer@gmail.com> (http://wooorm.com)", | ||
"repository": "retextjs/retext-keywords", | ||
"bugs": "https://github.com/retextjs/retext-keywords/issues", | ||
"author": "Titus Wormer <tituswormer@gmail.com> (https://wooorm.com)", | ||
"contributors": [ | ||
"Titus Wormer <tituswormer@gmail.com> (http://wooorm.com)", | ||
"Titus Wormer <tituswormer@gmail.com> (https://wooorm.com)", | ||
"Vladimir Starkov <iamstarkov@gmail.com>" | ||
@@ -32,20 +32,20 @@ ], | ||
"devDependencies": { | ||
"browserify": "^14.1.0", | ||
"esmangle": "^1.0.0", | ||
"nyc": "^11.0.0", | ||
"remark-cli": "^4.0.0", | ||
"remark-preset-wooorm": "^3.0.0", | ||
"browserify": "^16.0.0", | ||
"nyc": "^13.0.0", | ||
"prettier": "^1.14.3", | ||
"remark-cli": "^6.0.0", | ||
"remark-preset-wooorm": "^4.0.0", | ||
"retext": "^5.0.0", | ||
"tape": "^4.0.0", | ||
"xo": "^0.18.0" | ||
"tinyify": "^2.4.3", | ||
"xo": "^0.23.0" | ||
}, | ||
"scripts": { | ||
"build-md": "remark . -qfo", | ||
"build-bundle": "browserify index.js --ignore-missing --bare -s retextKeywords > retext-keywords.js", | ||
"build-mangle": "esmangle retext-keywords.js > retext-keywords.min.js", | ||
"build": "npm run build-md && npm run build-bundle && npm run build-mangle", | ||
"lint": "xo", | ||
"format": "remark . -qfo && prettier --write \"**/*.js\" && xo --fix", | ||
"build-bundle": "browserify . -s retextKeywords > retext-keywords.js", | ||
"build-mangle": "browserify . -s retextKeywords -p tinyify > retext-keywords.min.js", | ||
"build": "npm run build-bundle && npm run build-mangle", | ||
"test-api": "node test", | ||
"test-coverage": "nyc --reporter lcov tape test.js", | ||
"test": "npm run build && npm run lint && npm run test-coverage" | ||
"test": "npm run format && npm run build && npm run test-coverage" | ||
}, | ||
@@ -58,4 +58,12 @@ "nyc": { | ||
}, | ||
"prettier": { | ||
"tabWidth": 2, | ||
"useTabs": false, | ||
"singleQuote": true, | ||
"bracketSpacing": false, | ||
"semi": false, | ||
"trailingComma": "none" | ||
}, | ||
"xo": { | ||
"space": true, | ||
"prettier": true, | ||
"esnext": false, | ||
@@ -62,0 +70,0 @@ "rules": { |
@@ -1,2 +0,2 @@ | ||
# retext-keywords [![Build Status][travis-badge]][travis] [![Coverage Status][codecov-badge]][codecov] | ||
# retext-keywords [![Build][build-badge]][build] [![Coverage][coverage-badge]][coverage] [![Downloads][downloads-badge]][downloads] [![Chat][chat-badge]][chat] | ||
@@ -15,3 +15,3 @@ Keyword extraction with [**retext**][retext]. | ||
Say we have the following file, `example.txt`, with the first three paragraphs | ||
Say we have the following file, `example.txt`, with the first four paragraphs | ||
on [Term Extraction][term-extraction] from Wikipedia: | ||
@@ -32,24 +32,28 @@ | ||
```javascript | ||
var vfile = require('to-vfile'); | ||
var retext = require('retext'); | ||
var keywords = require('retext-keywords'); | ||
var nlcstToString = require('nlcst-to-string'); | ||
var vfile = require('to-vfile') | ||
var retext = require('retext') | ||
var keywords = require('retext-keywords') | ||
var toString = require('nlcst-to-string') | ||
retext() | ||
.use(keywords) | ||
.process(vfile.readSync('example.txt'), function (err, file) { | ||
if (err) throw err; | ||
.process(vfile.readSync('example.txt'), done) | ||
console.log('Keywords:'); | ||
file.data.keywords.forEach(function (keyword) { | ||
console.log(nlcstToString(keyword.matches[0].node)); | ||
}); | ||
function done(err, file) { | ||
if (err) throw err | ||
console.log(); | ||
console.log('Key-phrases:'); | ||
file.data.keyphrases.forEach(function (phrase) { | ||
console.log(phrase.matches[0].nodes.map(nlcstToString).join('')); | ||
}); | ||
} | ||
); | ||
console.log('Keywords:') | ||
file.data.keywords.forEach(function(keyword) { | ||
console.log(toString(keyword.matches[0].node)) | ||
}) | ||
console.log() | ||
console.log('Key-phrases:') | ||
file.data.keyphrases.forEach(function(phrase) { | ||
console.log(phrase.matches[0].nodes.map(stringify).join('')) | ||
function stringify(value) { | ||
return toString(value) | ||
} | ||
}) | ||
} | ||
``` | ||
@@ -122,2 +126,10 @@ | ||
## Contribute | ||
See [`contributing.md` in `retextjs/retext`][contributing] for ways to get | ||
started. | ||
This organisation has a [Code of Conduct][coc]. By interacting with this | ||
repository, organisation, or community you agree to abide by its terms. | ||
## License | ||
@@ -129,18 +141,30 @@ | ||
[travis-badge]: https://img.shields.io/travis/wooorm/retext-keywords.svg | ||
[build-badge]: https://img.shields.io/travis/retextjs/retext-keywords.svg | ||
[travis]: https://travis-ci.org/wooorm/retext-keywords | ||
[build]: https://travis-ci.org/retextjs/retext-keywords | ||
[codecov-badge]: https://img.shields.io/codecov/c/github/wooorm/retext-keywords.svg | ||
[coverage-badge]: https://img.shields.io/codecov/c/github/retextjs/retext-keywords.svg | ||
[codecov]: https://codecov.io/github/wooorm/retext-keywords | ||
[coverage]: https://codecov.io/github/retextjs/retext-keywords | ||
[downloads-badge]: https://img.shields.io/npm/dm/retext-keywords.svg | ||
[downloads]: https://www.npmjs.com/package/retext-keywords | ||
[chat-badge]: https://img.shields.io/badge/join%20the%20community-on%20spectrum-7b16ff.svg | ||
[chat]: https://spectrum.chat/unified/retext | ||
[npm]: https://docs.npmjs.com/cli/install | ||
[license]: LICENSE | ||
[license]: license | ||
[author]: http://wooorm.com | ||
[author]: https://wooorm.com | ||
[retext]: https://github.com/wooorm/retext | ||
[retext]: https://github.com/retextjs/retext | ||
[term-extraction]: http://en.wikipedia.org/wiki/Terminology_extraction | ||
[term-extraction]: https://en.wikipedia.org/wiki/Terminology_extraction | ||
[contributing]: https://github.com/retextjs/retext/blob/master/contributing.md | ||
[coc]: https://github.com/retextjs/retext/blob/master/code-of-conduct.md |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
No bug tracker
Maintenance: Package does not have a linked bug tracker in package.json.
Found 1 instance in 1 package
No repository
Supply chain risk: Package does not have a linked source code repository. Without this field, a package will have no reference to the location of the source code used to generate the package.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
No bug tracker
Maintenance: Package does not have a linked bug tracker in package.json.
Found 1 instance in 1 package
No repository
Supply chain risk: Package does not have a linked source code repository. Without this field, a package will have no reference to the location of the source code used to generate the package.
Found 1 instance in 1 package
14895
167
9
239
1