retext-keywords
Advanced tools
Comparing version 0.2.1 to 1.0.0
419
index.js
@@ -0,102 +1,56 @@ | ||
/** | ||
* @author Titus Wormer | ||
* @copyright 2014-2015 Titus Wormer | ||
* @license MIT | ||
* @module retext:keywords | ||
* @fileoverview Keyword extraction with Retext. | ||
*/ | ||
'use strict'; | ||
/* | ||
* Module dependencies. | ||
* Dependencies. | ||
*/ | ||
var pos, | ||
stemmer, | ||
visit; | ||
var stemmer = require('stemmer'); | ||
var visit = require('unist-util-visit'); | ||
var nlcstToString = require('nlcst-to-string'); | ||
var pos = require('retext-pos'); | ||
pos = require('retext-pos'); | ||
stemmer = require('retext-porter-stemmer'); | ||
visit = require('retext-visit'); | ||
/**
 * Get the stem of a node.
 *
 * The node is serialised with `nlcstToString`, the resulting
 * text is passed through `stemmer`, and the stem is
 * lower-cased so that differently cased occurrences of one
 * word share a single key.
 *
 * @param {Node} node - Node to stem.
 * @return {string} - Lower-cased stem of `node`.
 */
function stemNode(node) {
    return stemmer(nlcstToString(node)).toLowerCase();
}
var has; | ||
has = Object.prototype.hasOwnProperty; | ||
/**
 * Check whether `value` is upper-case.
 *
 * A value only counts as upper-case when it is cased at all:
 * digits, punctuation, and the empty string are equal to
 * their own upper-case form, but are not upper-case letters.
 *
 * @param {string} value - Value to check.
 * @return {boolean} - Whether `value` is upper-case.
 */
function isUpperCase(value) {
    var text = String(value);

    /*
     * The strict comparison against `value` keeps non-string
     * input from ever being reported as upper-case.
     */
    return value === text.toUpperCase() && text !== text.toLowerCase();
}
/**
 * Reverse sort: from 9 to 0.
 *
 * Comparator for `Array#sort` that orders numbers from high
 * to low.
 *
 * @param {number} a - First.
 * @param {number} b - Second.
 * @return {number} - Difference: positive when `b` is larger
 *   than `a`, negative when it is smaller, `0` when equal.
 */
function reverse(a, b) {
    return b - a;
}
/** | ||
* Get whether or not a `node` is important. | ||
* Check whether or not a `node` is important. | ||
* | ||
* @param {Node} node | ||
* @return {boolean} | ||
* @param {Node} node - Node to check. | ||
* @return {boolean} - Whether `node` is important. | ||
*/ | ||
@@ -106,3 +60,4 @@ function isImportant(node) { | ||
node && | ||
node.type === 'WordNode' && | ||
node.data && | ||
node.data.partOfSpeech && | ||
( | ||
@@ -112,3 +67,3 @@ node.data.partOfSpeech.indexOf('N') === 0 || | ||
node.data.partOfSpeech === 'JJ' && | ||
node.toString().charAt(0).match(/[A-Z]/) | ||
isUpperCase(nlcstToString(node).charAt(0)) | ||
) | ||
@@ -122,19 +77,23 @@ ) | ||
* | ||
* @param {Node} node | ||
* @param {Node} node - Parent to search in. | ||
* @return {Object.<string, Object>} - Map of stems to word info. | ||
*/ | ||
function getImportantWords(node) { | ||
var importantWords; | ||
var words = {}; | ||
importantWords = {}; | ||
node.visit(node.WORD_NODE, function (word) { | ||
visit(node, 'WordNode', function (word, index, parent) { | ||
var match; | ||
var stem; | ||
if (isImportant(word)) { | ||
stem = word.data.stem.toLowerCase(); | ||
stem = stemNode(word); | ||
match = { | ||
'node': word, | ||
'index': index, | ||
'parent': parent | ||
}; | ||
if (!has.call(importantWords, stem)) { | ||
importantWords[stem] = { | ||
'nodes': [word], | ||
if (!words[stem]) { | ||
words[stem] = { | ||
'matches': [match], | ||
'stem': stem, | ||
@@ -144,4 +103,4 @@ 'score': 1 | ||
} else { | ||
importantWords[stem].nodes.push(word); | ||
importantWords[stem].score++; | ||
words[stem].matches.push(match); | ||
words[stem].score++; | ||
} | ||
@@ -151,19 +110,63 @@ } | ||
return importantWords; | ||
return words; | ||
} | ||
/**
 * Get the top results from an occurrence map.
 *
 * Results are grouped by score and whole groups are added,
 * from the highest score down, until at least `maximum`
 * results are collected. Results sharing a score are never
 * split up, so more than `maximum` results can be returned.
 *
 * The `score` of every returned result is overwritten, in
 * place, with its score relative to the highest score (so
 * the best results get `1`).
 *
 * @param {Object.<string, Object>} results - Map of keys
 *   mapping to objects with a numeric `score` property.
 * @param {number} maximum - Try to get at least `maximum`
 *   results.
 * @return {Array.<Object>} - Top results, best first.
 */
function filterResults(results, maximum) {
    var filteredResults = [];
    var indices = [];
    var matrix = {};
    var column;
    var key;
    var score;
    var interpolated;
    var index;
    var otherIndex;
    var maxScore;

    /*
     * Group results by score, remembering each distinct
     * score once.
     */
    for (key in results) {
        score = results[key].score;

        if (!matrix[score]) {
            matrix[score] = [];
            indices.push(score);
        }

        matrix[score].push(results[key]);
    }

    indices.sort(reverse);

    maxScore = indices[0];

    index = -1;

    while (indices[++index]) {
        score = indices[index];
        column = matrix[score];

        interpolated = score / maxScore;
        otherIndex = -1;

        while (column[++otherIndex]) {
            column[otherIndex].score = interpolated;
        }

        filteredResults = filteredResults.concat(column);

        /* Stop only after a whole score group was added. */
        if (filteredResults.length >= maximum) {
            break;
        }
    }

    return filteredResults;
}
@@ -174,26 +177,26 @@ | ||
* | ||
* @param {Node} node | ||
* @param {string} direction - either "prev" or "next". | ||
* @param {Node} node - Node to start search at. | ||
* @param {number} index - Position of `node` in `parent`. | ||
* @param {Node} parent - Parent of `node`. | ||
* @param {number} offset - Offset to the next node. `-1` | ||
* when iterating backwards, `1` when iterating forwards. | ||
* @return {Object} | ||
*/ | ||
function findPhraseInDirection(node, direction) { | ||
var nodes, | ||
stems, | ||
words, | ||
queue; | ||
function findPhraseInDirection(node, index, parent, offset) { | ||
var children = parent.children; | ||
var nodes = []; | ||
var stems = []; | ||
var words = []; | ||
var queue = []; | ||
var child; | ||
nodes = []; | ||
stems = []; | ||
words = []; | ||
queue = []; | ||
while (children[index += offset]) { | ||
child = children[index]; | ||
node = node[direction]; | ||
while (node) { | ||
if (node.type === node.WHITE_SPACE_NODE) { | ||
queue.push(node); | ||
} else if (isImportant(node)) { | ||
nodes = nodes.concat(queue, [node]); | ||
words.push(node); | ||
stems.push(node.data.stem.toLowerCase()); | ||
if (child.type === 'WhiteSpaceNode') { | ||
queue.push(child); | ||
} else if (isImportant(child)) { | ||
nodes = nodes.concat(queue, [child]); | ||
words.push(child); | ||
stems.push(stemNode(child)); | ||
queue = []; | ||
@@ -203,4 +206,2 @@ } else { | ||
} | ||
node = node[direction]; | ||
} | ||
@@ -219,9 +220,9 @@ | ||
* | ||
* @param {Array.<*>} prev | ||
* @param {*} current | ||
* @param {Array.<*>} next | ||
* @param {Array.<*>} prev - Reversed array before `current`. | ||
* @param {*} current - Current thing. | ||
* @param {Array.<*>} next - Things after `current`. | ||
* @return {Array.<*>} | ||
*/ | ||
function merge(prev, current, next) {
    /*
     * `prev` was collected while walking backwards, so it is
     * flipped to restore document order. `Array#reverse`
     * works in place, so a copy (`concat()`) is reversed
     * instead of the caller's array.
     */
    return prev.concat().reverse().concat([current], next);
}
@@ -232,13 +233,14 @@ | ||
* | ||
* @param {Node} node | ||
* @param {Object} match - Single match. | ||
* @return {Object} | ||
*/ | ||
function findPhrase(node) { | ||
var prev = findPhraseInDirection(node, 'prev'), | ||
next = findPhraseInDirection(node, 'next'), | ||
stems = merge(prev.stems, node.data.stem.toLowerCase(), next.stems); | ||
function findPhrase(match) { | ||
var node = match.node; | ||
var prev = findPhraseInDirection(node, match.index, match.parent, -1); | ||
var next = findPhraseInDirection(node, match.index, match.parent, 1); | ||
var stems = merge(prev.stems, stemNode(node), next.stems); | ||
return { | ||
'stems': stems, | ||
'value': stems.join(' ').toLowerCase(), | ||
'value': stems.join(' '), | ||
'nodes': merge(prev.nodes, node, next.nodes) | ||
@@ -251,28 +253,24 @@ }; | ||
* | ||
* @param {Object?} options | ||
* @param {number?} options.minimum | ||
* @this {Node} node | ||
* @param {Object.<string, Object>} results - Map of stems | ||
* mapping to objects containing `matches`, `stem`, and | ||
* `score` properties. | ||
* @param {number} maximum - Try to get at least `maximum` | ||
* results. | ||
* @return {Array.<Object>} | ||
*/ | ||
function getKeyphrases(options) { | ||
var stemmedPhrases, | ||
initialWords, | ||
stemmedPhrase, | ||
index, | ||
otherIndex, | ||
importantWords, | ||
keyword, | ||
nodes, | ||
phrase, | ||
stems, | ||
minimum, | ||
score; | ||
function getKeyphrases(results, maximum) { | ||
var stemmedPhrases = {}; | ||
var initialWords = []; | ||
var stemmedPhrase; | ||
var index; | ||
var length; | ||
var otherIndex; | ||
var keyword; | ||
var matches; | ||
var phrase; | ||
var stems; | ||
var score; | ||
var first; | ||
var match; | ||
stemmedPhrases = {}; | ||
initialWords = []; | ||
minimum = options && has.call(options, 'minimum') ? options.minimum : 5; | ||
importantWords = getImportantWords(this); | ||
/* | ||
@@ -282,5 +280,5 @@ * Iterate over all grouped important words... | ||
for (keyword in importantWords) { | ||
nodes = importantWords[keyword].nodes; | ||
for (keyword in results) { | ||
matches = results[keyword].matches; | ||
length = matches.length; | ||
index = -1; | ||
@@ -292,5 +290,12 @@ | ||
while (nodes[++index]) { | ||
phrase = findPhrase(nodes[index]); | ||
while (++index < length) { | ||
phrase = findPhrase(matches[index]); | ||
stemmedPhrase = stemmedPhrases[phrase.value]; | ||
first = phrase.nodes[0]; | ||
match = { | ||
'nodes': phrase.nodes, | ||
'parent': matches[index].parent | ||
}; | ||
/* | ||
@@ -301,5 +306,3 @@ * If we've detected the same stemmed | ||
if (has.call(stemmedPhrases, phrase.value)) { | ||
stemmedPhrase = stemmedPhrases[phrase.value]; | ||
if (stemmedPhrase) { | ||
/* | ||
@@ -319,5 +322,5 @@ * Add weight per phrase to the score of | ||
if (initialWords.indexOf(phrase.nodes[0]) === -1) { | ||
initialWords.push(phrase.nodes[0]); | ||
stemmedPhrase.nodes.push(phrase.nodes); | ||
if (initialWords.indexOf(first) === -1) { | ||
initialWords.push(first); | ||
stemmedPhrase.matches.push(match); | ||
} | ||
@@ -329,3 +332,3 @@ } else { | ||
initialWords.push(phrase.nodes[0]); | ||
initialWords.push(first); | ||
@@ -338,3 +341,3 @@ /* | ||
while (stems[++otherIndex]) { | ||
score += importantWords[stems[otherIndex]].score; | ||
score += results[stems[otherIndex]].score; | ||
} | ||
@@ -347,3 +350,3 @@ | ||
'value': phrase.value, | ||
'nodes': [phrase.nodes] | ||
'matches': [match] | ||
}; | ||
@@ -365,39 +368,69 @@ } | ||
phrase.score = Math.round( | ||
phrase.score * phrase.nodes.length / phrase.stems.length | ||
phrase.score * phrase.matches.length / phrase.stems.length | ||
); | ||
} | ||
return filterResults(stemmedPhrases, minimum); | ||
return filterResults(stemmedPhrases, maximum); | ||
} | ||
/**
 * Clone the given map of words.
 *
 * This is a two level-deep clone: the map itself and every
 * entry in it are new objects, so an entry's `score` can be
 * overwritten without touching the original. The `matches`
 * array of an entry is shared with the original, not copied.
 *
 * @param {Object} words - Important words.
 * @return {Object} - Cloned words.
 */
function cloneMatches(words) {
    var result = {};
    var key;
    var match;

    for (key in words) {
        match = words[key];

        result[key] = {
            'matches': match.matches,
            'stem': match.stem,
            'score': match.score
        };
    }

    return result;
}
/**
 * Attach.
 *
 * Registers `retext-pos` on `retext` and returns a
 * transformer which stores the found keywords and
 * key-phrases in the `retext` namespace of the processed
 * file.
 *
 * @param {Retext} retext - Instance.
 * @param {Object?} [options] - Configuration.
 * @param {number?} [options.maximum] - Try to get at
 *   least `maximum` results. Defaults to `5` when missing
 *   or falsey.
 * @return {Function} - `transformer`.
 */
function attacher(retext, options) {
    var maximum = (options || {}).maximum || 5;

    retext.use(pos);

    /**
     * Attach keywords in `cst` to `file`.
     *
     * @param {NLCSTNode} cst - Node.
     * @param {VFile} file - Virtual file.
     */
    function transformer(cst, file) {
        var space = file.namespace('retext');
        var important = getImportantWords(cst);

        /*
         * `filterResults` overwrites scores in place, while
         * `getKeyphrases` still needs the raw word scores:
         * keywords are therefore computed on a clone.
         */
        space.keywords = filterResults(cloneMatches(important), maximum);
        space.keyphrases = getKeyphrases(important, maximum);
    }

    return transformer;
}
/*
 * Expose.
 */

module.exports = attacher;
{ | ||
"name": "retext-keywords", | ||
"version": "0.2.1", | ||
"version": "1.0.0", | ||
"description": "Keyword extraction with Retext", | ||
@@ -15,6 +15,10 @@ "license": "MIT", | ||
"dependencies": { | ||
"retext-porter-stemmer": "^0.2.2", | ||
"retext-pos": "^0.2.1", | ||
"retext-visit": "^0.2.2" | ||
"nlcst-to-string": "^0.1.5", | ||
"retext-pos": "^1.0.0", | ||
"stemmer": "^0.1.4", | ||
"unist-util-visit": "^1.0.0" | ||
}, | ||
"files": [ | ||
"index.js" | ||
], | ||
"repository": { | ||
@@ -26,24 +30,32 @@ "type": "git", | ||
"devDependencies": { | ||
"eslint": "^0.12.0", | ||
"browserify": "^11.0.1", | ||
"chalk": "^1.0.0", | ||
"eslint": "^1.0.0", | ||
"esmangle": "^1.0.1", | ||
"istanbul": "^0.3.0", | ||
"jscs": "^1.0.0", | ||
"jscs-jsdoc": "^0.4.0", | ||
"matcha": "^0.6.0", | ||
"jscs": "^2.0.0", | ||
"jscs-jsdoc": "^1.0.0", | ||
"mdast": "^0.28.0", | ||
"mdast-comment-config": "^0.1.2", | ||
"mdast-github": "^0.3.2", | ||
"mdast-lint": "^0.4.2", | ||
"mdast-slug": "^0.1.1", | ||
"mdast-validate-links": "^0.3.1", | ||
"mocha": "^2.0.0", | ||
"retext": "^0.5.0" | ||
"retext": "^1.0.0-rc.2" | ||
}, | ||
"scripts": { | ||
"test-api": "_mocha --check-leaks test.js", | ||
"test-coveralls": "istanbul cover _mocha --report lcovonly -- --check-leaks test.js", | ||
"test-coverage": "istanbul cover _mocha -- --check-leaks test.js", | ||
"test-travis": "npm run test-coveralls", | ||
"test-api": "mocha --check-leaks test.js", | ||
"test-coverage": "istanbul cover _mocha -- test.js", | ||
"test-travis": "npm run test-coverage", | ||
"test": "npm run test-api", | ||
"lint-api": "eslint index.js", | ||
"lint-benchmark": "eslint --global bench,before,suite,set benchmark.js", | ||
"lint-test": "eslint --env mocha test.js", | ||
"lint-style": "jscs --reporter inline index.js benchmark.js test.js", | ||
"lint": "npm run lint-api && npm run lint-benchmark && npm run lint-test && npm run lint-style", | ||
"lint-api": "eslint .", | ||
"lint-style": "jscs --reporter inline .", | ||
"lint": "npm run lint-api && npm run lint-style", | ||
"make": "npm run lint && npm run test-coverage", | ||
"benchmark": "matcha benchmark.js" | ||
"build-bundle": "browserify index.js --ignore-missing --no-builtins --standalone retextPOS > retext-keywords.js", | ||
"postbuild-bundle": "esmangle retext-keywords.js > retext-keywords.min.js", | ||
"build-md": "mdast . --quiet", | ||
"build": "npm run build-bundle && npm run build-md" | ||
} | ||
} |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
No v1
Quality: Package is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
25766
5
357
1
4
15
156
1
+ Added nlcst-to-string@^0.1.5
+ Added stemmer@^0.1.4
+ Added unist-util-visit@^1.0.0
+ Added nlcst-to-string@0.1.5 (transitive)
+ Added pos@0.2.3 (transitive)
+ Added retext-pos@1.0.0 (transitive)
+ Added unist-util-is@3.0.0 (transitive)
+ Added unist-util-visit@1.4.1 (transitive)
+ Added unist-util-visit-parents@2.1.2 (transitive)
- Removed retext-porter-stemmer@^0.2.2
- Removed retext-visit@^0.2.2
- Removed pos@0.1.9 (transitive)
- Removed retext-porter-stemmer@0.2.5 (transitive)
- Removed retext-pos@0.2.1 (transitive)
- Removed retext-visit@0.2.6 (transitive)
Updated retext-pos@^1.0.0