Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

retext-keywords

Package Overview
Dependencies
Maintainers
1
Versions
28
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

retext-keywords - npm Package Compare versions

Comparing version 0.2.1 to 1.0.0

history.md

419

index.js

@@ -0,102 +1,56 @@

/**
* @author Titus Wormer
* @copyright 2014-2015 Titus Wormer
* @license MIT
* @module retext:keywords
* @fileoverview Keyword extraction with Retext.
*/
'use strict';
/*
* Module dependencies.
* Dependencies.
*/
var pos,
stemmer,
visit;
var stemmer = require('stemmer');
var visit = require('unist-util-visit');
var nlcstToString = require('nlcst-to-string');
var pos = require('retext-pos');
pos = require('retext-pos');
stemmer = require('retext-porter-stemmer');
visit = require('retext-visit');
/*
* Constants.
/**
* Get the stem of a node.
*
* @param {Node} node - Node to stem.
* @return {string} - Stemmed node.
*/
function stemNode(node) {
  // Serialize the node to plain text first, then stem that text.
  var text = nlcstToString(node);
  var stem = stemmer(text);

  // Lower-case so differently-cased occurrences share one stem.
  return stem.toLowerCase();
}
var has;
has = Object.prototype.hasOwnProperty;
/**
* Reverse sort: from 9 to 0.
* Check whether `value` is upper-case.
*
* @param {number} a
* @param {number} b
* @param {string} value - Value to check.
* @return {boolean} - Whether `value` is upper-case.
*/
function reverseSort(a, b) {
return b - a;
function isUpperCase(value) {
  var upper = String(value).toUpperCase();

  // Strict comparison: a non-string `value` never equals its upper-cased
  // string form, so only strings can pass.
  return upper === value;
}
/**
* Get the top results from an occurrence map.
* Reverse sort: from 9 to 0.
*
* @param {Object.<string, Object>} results - Dictionary of
* stems mapping to objects containing `nodes`, `stem`,
* and `score` properties.
* @param {number} minimum - Minimum number of results to
* return.
* @return {Array.<Object>}
* @param {number} a - First.
* @param {number} b - Second.
* @return {number} - Difference.
*/
function filterResults(results, minimum) {
var filteredResults,
matrix,
indices,
column,
key,
score,
interpolatedScore,
index,
otherIndex,
maxScore;
filteredResults = [];
indices = [];
matrix = {};
for (key in results) {
score = results[key].score;
if (!has.call(matrix, score)) {
matrix[score] = [];
indices.push(score);
}
matrix[score].push(results[key]);
}
indices.sort(reverseSort);
maxScore = indices[0];
index = -1;
while (indices[++index]) {
score = indices[index];
column = matrix[score];
interpolatedScore = score / maxScore;
otherIndex = -1;
while (column[++otherIndex]) {
column[otherIndex].score = interpolatedScore;
}
filteredResults = filteredResults.concat(column);
if (filteredResults.length >= minimum) {
break;
}
}
return filteredResults;
function reverse(a, b) {
  // Positive when `b` is the larger one, so bigger numbers sort first.
  var difference = b - a;

  return difference;
}
/**
* Get whether or not a `node` is important.
* Check whether or not a `node` is important.
*
* @param {Node} node
* @return {boolean}
* @param {Node} node - Node to check.
* @return {boolean} - Whether `node` is important.
*/

@@ -106,3 +60,4 @@ function isImportant(node) {

node &&
node.type === 'WordNode' &&
node.data &&
node.data.partOfSpeech &&
(

@@ -112,3 +67,3 @@ node.data.partOfSpeech.indexOf('N') === 0 ||

node.data.partOfSpeech === 'JJ' &&
node.toString().charAt(0).match(/[A-Z]/)
isUpperCase(nlcstToString(node).charAt(0))
)

@@ -122,19 +77,23 @@ )

*
* @param {Node} node
* @param {Node} node - Parent to search in.
* @return {Array.<Object>}
*/
function getImportantWords(node) {
var importantWords;
var words = {};
importantWords = {};
node.visit(node.WORD_NODE, function (word) {
visit(node, 'WordNode', function (word, index, parent) {
var match;
var stem;
if (isImportant(word)) {
stem = word.data.stem.toLowerCase();
stem = stemNode(word);
match = {
'node': word,
'index': index,
'parent': parent
};
if (!has.call(importantWords, stem)) {
importantWords[stem] = {
'nodes': [word],
if (!words[stem]) {
words[stem] = {
'matches': [match],
'stem': stem,

@@ -144,4 +103,4 @@ 'score': 1

} else {
importantWords[stem].nodes.push(word);
importantWords[stem].score++;
words[stem].matches.push(match);
words[stem].score++;
}

@@ -151,19 +110,63 @@ }

return importantWords;
return words;
}
/**
 * Get the top results from an occurrence map.
 *
 * Entries are grouped by score and collected one whole group at a
 * time, highest score first, until at least `maximum` entries are
 * gathered. Groups are never split, so ties can push the result
 * past `maximum`.
 *
 * Note: the `score` of every returned entry is overwritten in place
 * with its score divided by the highest score (so the top group
 * gets `1`). Pass copies when the raw scores are still needed.
 *
 * @param {Object.<string, Object>} results - Map of keys to
 *   objects with a numeric `score` property.
 * @param {number} maximum - Try to get at least `maximum`
 *   results.
 * @return {Array.<Object>} - Top entries, best first.
 */
function filterResults(results, maximum) {
  var filteredResults = [];
  var indices = [];
  var matrix = {};
  var column;
  var key;
  var score;
  var interpolated;
  var index;
  var otherIndex;
  var maxScore;

  /*
   * Group entries by score; `indices` remembers each distinct score.
   */
  for (key in results) {
    score = results[key].score;

    if (!matrix[score]) {
      matrix[score] = [];
      indices.push(score);
    }

    matrix[score].push(results[key]);
  }

  indices.sort(reverse);

  maxScore = indices[0];

  index = -1;

  while (indices[++index]) {
    score = indices[index];
    column = matrix[score];

    /* Normalise against the best score. */
    interpolated = score / maxScore;
    otherIndex = -1;

    while (column[++otherIndex]) {
      column[otherIndex].score = interpolated;
    }

    filteredResults = filteredResults.concat(column);

    /* Only stop between groups, so equal scores stay together. */
    if (filteredResults.length >= maximum) {
      break;
    }
  }

  return filteredResults;
}

@@ -174,26 +177,26 @@

*
* @param {Node} node
* @param {string} direction - either "prev" or "next".
* @param {Node} node - Node to start search at.
* @param {number} index - Position of `node` in `parent`.
* @param {Node} parent - Parent of `node`.
* @param {number} offset - Offset to the next node. `-1`
* when iterating backwards, `1` when iterating forwards.
* @return {Object}
*/
function findPhraseInDirection(node, direction) {
var nodes,
stems,
words,
queue;
function findPhraseInDirection(node, index, parent, offset) {
var children = parent.children;
var nodes = [];
var stems = [];
var words = [];
var queue = [];
var child;
nodes = [];
stems = [];
words = [];
queue = [];
while (children[index += offset]) {
child = children[index];
node = node[direction];
while (node) {
if (node.type === node.WHITE_SPACE_NODE) {
queue.push(node);
} else if (isImportant(node)) {
nodes = nodes.concat(queue, [node]);
words.push(node);
stems.push(node.data.stem.toLowerCase());
if (child.type === 'WhiteSpaceNode') {
queue.push(child);
} else if (isImportant(child)) {
nodes = nodes.concat(queue, [child]);
words.push(child);
stems.push(stemNode(child));
queue = [];

@@ -203,4 +206,2 @@ } else {

}
node = node[direction];
}

@@ -219,9 +220,9 @@

*
* @param {Array.<*>} prev
* @param {*} current
* @param {Array.<*>} next
* @param {Array.<*>} prev - Reversed array before `current`.
* @param {*} current - Current thing.
* @param {Array.<*>} next - Things after `current`.
* @return {Array.<*>}
*/
function merge(prev, current, next) {
  // `prev` holds the things before `current` in reversed order.
  // Flip a copy (`concat()` clones) rather than `prev` itself:
  // `reverse()` works in place and would otherwise scramble the
  // caller's array.
  return prev.concat().reverse().concat([current], next);
}

@@ -232,13 +233,14 @@

*
* @param {Node} node
* @param {Object} match - Single match.
* @return {Object}
*/
function findPhrase(node) {
var prev = findPhraseInDirection(node, 'prev'),
next = findPhraseInDirection(node, 'next'),
stems = merge(prev.stems, node.data.stem.toLowerCase(), next.stems);
function findPhrase(match) {
var node = match.node;
var prev = findPhraseInDirection(node, match.index, match.parent, -1);
var next = findPhraseInDirection(node, match.index, match.parent, 1);
var stems = merge(prev.stems, stemNode(node), next.stems);
return {
'stems': stems,
'value': stems.join(' ').toLowerCase(),
'value': stems.join(' '),
'nodes': merge(prev.nodes, node, next.nodes)

@@ -251,28 +253,24 @@ };

*
* @param {Object?} options
* @param {number?} options.minimum
* @this {Node} node
* @param {Object.<string, Object>} results - Map of stems
* mapping to objects containing `nodes`, `stem`, and
* `score` properties.
* @param {number} maximum - Try to get at least `maximum`
* results.
* @return {Array.<Object>}
*/
function getKeyphrases(options) {
var stemmedPhrases,
initialWords,
stemmedPhrase,
index,
otherIndex,
importantWords,
keyword,
nodes,
phrase,
stems,
minimum,
score;
function getKeyphrases(results, maximum) {
var stemmedPhrases = {};
var initialWords = [];
var stemmedPhrase;
var index;
var length;
var otherIndex;
var keyword;
var matches;
var phrase;
var stems;
var score;
var first;
var match;
stemmedPhrases = {};
initialWords = [];
minimum = options && has.call(options, 'minimum') ? options.minimum : 5;
importantWords = getImportantWords(this);
/*

@@ -282,5 +280,5 @@ * Iterate over all grouped important words...

for (keyword in importantWords) {
nodes = importantWords[keyword].nodes;
for (keyword in results) {
matches = results[keyword].matches;
length = matches.length;
index = -1;

@@ -292,5 +290,12 @@

while (nodes[++index]) {
phrase = findPhrase(nodes[index]);
while (++index < length) {
phrase = findPhrase(matches[index]);
stemmedPhrase = stemmedPhrases[phrase.value];
first = phrase.nodes[0];
match = {
'nodes': phrase.nodes,
'parent': matches[index].parent
};
/*

@@ -301,5 +306,3 @@ * If we've detected the same stemmed

if (has.call(stemmedPhrases, phrase.value)) {
stemmedPhrase = stemmedPhrases[phrase.value];
if (stemmedPhrase) {
/*

@@ -319,5 +322,5 @@ * Add weight per phrase to the score of

if (initialWords.indexOf(phrase.nodes[0]) === -1) {
initialWords.push(phrase.nodes[0]);
stemmedPhrase.nodes.push(phrase.nodes);
if (initialWords.indexOf(first) === -1) {
initialWords.push(first);
stemmedPhrase.matches.push(match);
}

@@ -329,3 +332,3 @@ } else {

initialWords.push(phrase.nodes[0]);
initialWords.push(first);

@@ -338,3 +341,3 @@ /*

while (stems[++otherIndex]) {
score += importantWords[stems[otherIndex]].score;
score += results[stems[otherIndex]].score;
}

@@ -347,3 +350,3 @@

'value': phrase.value,
'nodes': [phrase.nodes]
'matches': [match]
};

@@ -365,39 +368,69 @@ }

phrase.score = Math.round(
phrase.score * phrase.nodes.length / phrase.stems.length
phrase.score * phrase.matches.length / phrase.stems.length
);
}
return filterResults(stemmedPhrases, minimum);
return filterResults(stemmedPhrases, maximum);
}
/**
 * Clone the given map of words.
 *
 * This is a two level-deep clone: the map and every entry in it
 * are fresh objects, while each entry's `matches` array is shared
 * with the original.
 *
 * @param {Object} words - Important words.
 * @return {Object} - Cloned words.
 */
function cloneMatches(words) {
  var result = {};
  var key;
  var match;

  for (key in words) {
    match = words[key];

    result[key] = {
      'matches': match.matches,
      'stem': match.stem,
      'score': match.score
    };
  }

  return result;
}
/**
 * Attach.
 *
 * @param {Retext} retext - Instance.
 * @param {Object?} [options] - Configuration.
 * @param {number?} [options.maximum] - Try to get at
 *   least `maximum` results; defaults to `5` when missing
 *   or falsy.
 * @return {Function} - `transformer`.
 */
function attacher(retext, options) {
  var maximum = (options || {}).maximum || 5;

  /* Words are judged important by their part-of-speech tag. */
  retext.use(pos);

  /**
   * Attach keywords in `cst` to `file`.
   *
   * Results are stored as `keywords` and `keyphrases` on the
   * file's `retext` namespace.
   *
   * @param {NLCSTNode} cst - Node.
   * @param {VFile} file - Virtual file.
   */
  function transformer(cst, file) {
    var space = file.namespace('retext');
    var important = getImportantWords(cst);

    /*
     * `filterResults` rewrites scores in place, so hand it
     * copies and let the key-phrases use the raw counts.
     */
    space.keywords = filterResults(cloneMatches(important), maximum);
    space.keyphrases = getKeyphrases(important, maximum);
  }

  return transformer;
}
/*
 * Expose.
 */
module.exports = attacher;
{
"name": "retext-keywords",
"version": "0.2.1",
"version": "1.0.0",
"description": "Keyword extraction with Retext",

@@ -15,6 +15,10 @@ "license": "MIT",

"dependencies": {
"retext-porter-stemmer": "^0.2.2",
"retext-pos": "^0.2.1",
"retext-visit": "^0.2.2"
"nlcst-to-string": "^0.1.5",
"retext-pos": "^1.0.0",
"stemmer": "^0.1.4",
"unist-util-visit": "^1.0.0"
},
"files": [
"index.js"
],
"repository": {

@@ -26,24 +30,32 @@ "type": "git",

"devDependencies": {
"eslint": "^0.12.0",
"browserify": "^11.0.1",
"chalk": "^1.0.0",
"eslint": "^1.0.0",
"esmangle": "^1.0.1",
"istanbul": "^0.3.0",
"jscs": "^1.0.0",
"jscs-jsdoc": "^0.4.0",
"matcha": "^0.6.0",
"jscs": "^2.0.0",
"jscs-jsdoc": "^1.0.0",
"mdast": "^0.28.0",
"mdast-comment-config": "^0.1.2",
"mdast-github": "^0.3.2",
"mdast-lint": "^0.4.2",
"mdast-slug": "^0.1.1",
"mdast-validate-links": "^0.3.1",
"mocha": "^2.0.0",
"retext": "^0.5.0"
"retext": "^1.0.0-rc.2"
},
"scripts": {
"test-api": "_mocha --check-leaks test.js",
"test-coveralls": "istanbul cover _mocha --report lcovonly -- --check-leaks test.js",
"test-coverage": "istanbul cover _mocha -- --check-leaks test.js",
"test-travis": "npm run test-coveralls",
"test-api": "mocha --check-leaks test.js",
"test-coverage": "istanbul cover _mocha -- test.js",
"test-travis": "npm run test-coverage",
"test": "npm run test-api",
"lint-api": "eslint index.js",
"lint-benchmark": "eslint --global bench,before,suite,set benchmark.js",
"lint-test": "eslint --env mocha test.js",
"lint-style": "jscs --reporter inline index.js benchmark.js test.js",
"lint": "npm run lint-api && npm run lint-benchmark && npm run lint-test && npm run lint-style",
"lint-api": "eslint .",
"lint-style": "jscs --reporter inline .",
"lint": "npm run lint-api && npm run lint-style",
"make": "npm run lint && npm run test-coverage",
"benchmark": "matcha benchmark.js"
"build-bundle": "browserify index.js --ignore-missing --no-builtins --standalone retextPOS > retext-keywords.js",
"postbuild-bundle": "esmangle retext-keywords.js > retext-keywords.min.js",
"build-md": "mdast . --quiet",
"build": "npm run build-bundle && npm run build-md"
}
}
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc