Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

retext-keywords

Package Overview
Dependencies
Maintainers
1
Versions
28
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

retext-keywords - npm Package Compare versions

Comparing version 2.0.2 to 3.0.0

535

index.js

@@ -11,5 +11,4 @@ /**

/* eslint-env commonjs */
/* Dependencies. */
var has = require('has');
var stemmer = require('stemmer');

@@ -20,92 +19,169 @@ var visit = require('unist-util-visit');

/* Methods. */
var own = Object.prototype.hasOwnProperty;
/* Expose. */
module.exports = attacher;
/**
* Get the stem of a node.
* Attach.
*
* @param {Node} node - Node to stem.
* @return {string} - Stemmed node.
* @param {Retext} retext - Instance.
* @param {Object?} [options] - Configuration.
* @param {number?} [options.maximum] - Try to get at
* least `maximum` results.
* @return {Function} - `transformer`.
*/
/**
 * Get the stem of a node.
 *
 * Serialises the node to plain text, runs the Porter
 * stemmer over it, and lower-cases the result so stems
 * can be compared case-insensitively.
 *
 * @param {Node} node - Node to stem.
 * @return {string} - Stemmed node.
 */
function stemNode(node) {
  var value = nlcstToString(node);
  var stem = stemmer(value);
  return stem.toLowerCase();
}
function attacher(retext, options) {
var maximum = (options || {}).maximum || 5;
/**
* Check whether `value` is upper-case.
*
* @param {string} value - Value to check.
* @return {boolean} - Whether `value` is upper-case.
*/
function isUpperCase(value) {
return value === String(value).toUpperCase();
retext.use(pos);
return transformer;
/**
* Attach keywords in `cst` to `file`.
*
* @param {NLCSTNode} cst - Node.
* @param {VFile} file - Virtual file.
*/
function transformer(cst, file) {
var important = getImportantWords(cst);
file.data.keywords = filterResults(cloneMatches(important), maximum);
file.data.keyphrases = getKeyphrases(important, maximum);
}
}
/**
* Reverse sort: from 9 to 0.
* Get following or preceding important words or white space.
*
* @param {number} a - First.
* @param {number} b - Second.
* @return {number} - Difference.
* @param {Node} node - Node to start search at.
* @param {number} index - Position of `node` in `parent`.
* @param {Node} parent - Parent of `node`.
* @param {number} offset - Offset to the next node. `-1`
* when iterating backwards, `1` when iterating forwards.
* @return {Object} - Phrase.
*/
/**
 * Sort comparator: orders numbers from high to low
 * (reverse sort: from 9 to 0).
 *
 * @param {number} a - First.
 * @param {number} b - Second.
 * @return {number} - Difference (`b - a`).
 */
function reverse(a, b) {
  var difference = b - a;
  return difference;
}
function findPhraseInDirection(node, index, parent, offset) {
var children = parent.children;
var nodes = [];
var stems = [];
var words = [];
var queue = [];
var child;
/**
* Check whether or not a `node` is important.
*
* @param {Node} node - Node to check.
* @return {boolean} - Whether `node` is important.
*/
function isImportant(node) {
return (
node &&
node.data &&
node.data.partOfSpeech &&
(
node.data.partOfSpeech.indexOf('N') === 0 ||
(
node.data.partOfSpeech === 'JJ' &&
isUpperCase(nlcstToString(node).charAt(0))
)
)
);
while (children[index += offset]) {
child = children[index];
if (child.type === 'WhiteSpaceNode') {
queue.push(child);
} else if (isImportant(child)) {
nodes = nodes.concat(queue, [child]);
words.push(child);
stems.push(stemNode(child));
queue = [];
} else {
break;
}
}
return {
stems: stems,
words: words,
nodes: nodes
};
}
/**
* Get most important words in `node`.
* Get the top important phrases in `self`.
*
* @param {Node} node - Parent to search in.
* @return {Array.<Object>} - Important words.
* @param {Object.<string, Object>} results - Map of stems
* mapping to objects containing `nodes`, `stem`, and
* `score` properties.
* @param {number} maximum - Try to get at least `maximum`
* results.
* @return {Array.<Object>} - Keyphrases.
*/
function getImportantWords(node) {
var words = {};
function getKeyphrases(results, maximum) {
var stemmedPhrases = {};
var initialWords = [];
var stemmedPhrase;
var index;
var length;
var otherIndex;
var keyword;
var matches;
var phrase;
var stems;
var score;
var first;
var match;
visit(node, 'WordNode', function (word, index, parent) {
var match;
var stem;
/* Iterate over all grouped important words... */
for (keyword in results) {
matches = results[keyword].matches;
length = matches.length;
index = -1;
if (isImportant(word)) {
stem = stemNode(word);
match = {
node: word,
index: index,
parent: parent
};
/* Iterate over every occurrence of a certain keyword... */
while (++index < length) {
phrase = findPhrase(matches[index]);
stemmedPhrase = stemmedPhrases[phrase.value];
first = phrase.nodes[0];
if (!own.call(words, stem)) {
words[stem] = {
matches: [match],
stem: stem,
score: 1
};
} else {
words[stem].matches.push(match);
words[stem].score++;
}
match = {
nodes: phrase.nodes,
parent: matches[index].parent
};
/* If we've detected the same stemmed
* phrase somewhere. */
if (has(stemmedPhrases, phrase.value)) {
/* Add weight per phrase to the score of
* the phrase. */
stemmedPhrase.score += stemmedPhrase.weight;
/* If this is the first time we walk over
* the phrase (exact match but containing
* another important word), add it to the
* list of matching phrases. */
if (initialWords.indexOf(first) === -1) {
initialWords.push(first);
stemmedPhrase.matches.push(match);
}
});
} else {
otherIndex = -1;
score = -1;
stems = phrase.stems;
return words;
initialWords.push(first);
/* For every stem in phrase, add its
* score to score. */
while (stems[++otherIndex]) {
score += results[stems[otherIndex]].score;
}
stemmedPhrases[phrase.value] = {
score: score,
weight: score,
stems: stems,
value: phrase.value,
matches: [match]
};
}
}
}
for (stemmedPhrase in stemmedPhrases) {
phrase = stemmedPhrases[stemmedPhrase];
/* Modify its score to be the rounded result of
 * multiplying it by the number of occurrences,
 * and dividing it by the number of words in the
 * phrase. */
phrase.score = Math.round(
phrase.score * phrase.matches.length / phrase.stems.length
);
}
return filterResults(stemmedPhrases, maximum);
}

@@ -124,89 +200,49 @@

function filterResults(results, maximum) {
var filteredResults = [];
var indices = [];
var matrix = {};
var column;
var key;
var score;
var interpolated;
var index;
var otherIndex;
var maxScore;
var filteredResults = [];
var indices = [];
var matrix = {};
var column;
var key;
var score;
var interpolated;
var index;
var otherIndex;
var maxScore;
for (key in results) {
score = results[key].score;
for (key in results) {
score = results[key].score;
if (!matrix[score]) {
matrix[score] = [];
indices.push(score);
}
matrix[score].push(results[key]);
if (!matrix[score]) {
matrix[score] = [];
indices.push(score);
}
indices.sort(reverse);
matrix[score].push(results[key]);
}
maxScore = indices[0];
indices.sort(reverse);
index = -1;
maxScore = indices[0];
while (indices[++index]) {
score = indices[index];
column = matrix[score];
index = -1;
interpolated = score / maxScore;
otherIndex = -1;
while (indices[++index]) {
score = indices[index];
column = matrix[score];
while (column[++otherIndex]) {
column[otherIndex].score = interpolated;
}
interpolated = score / maxScore;
otherIndex = -1;
filteredResults = filteredResults.concat(column);
if (filteredResults.length >= maximum) {
break;
}
while (column[++otherIndex]) {
column[otherIndex].score = interpolated;
}
return filteredResults;
}
filteredResults = filteredResults.concat(column);
/**
* Get following or preceding important words or white space.
*
* @param {Node} node - Node to start search at.
* @param {number} index - Position of `node` in `parent`.
* @param {Node} parent - Parent of `node`.
* @param {number} offset - Offset to the next node. `-1`
* when iterating backwards, `1` when iterating forwards.
* @return {Object} - Phrase.
*/
function findPhraseInDirection(node, index, parent, offset) {
var children = parent.children;
var nodes = [];
var stems = [];
var words = [];
var queue = [];
var child;
while (children[index += offset]) {
child = children[index];
if (child.type === 'WhiteSpaceNode') {
queue.push(child);
} else if (isImportant(child)) {
nodes = nodes.concat(queue, [child]);
words.push(child);
stems.push(stemNode(child));
queue = [];
} else {
break;
}
if (filteredResults.length >= maximum) {
break;
}
}
return {
stems: stems,
words: words,
nodes: nodes
};
return filteredResults;
}

@@ -224,3 +260,3 @@

/**
 * Merge a previous array, a current value, and a
 * following array into one list, in reading order.
 *
 * `prev` is collected while walking backwards, so a
 * copy of it is reversed first; `prev` itself is not
 * mutated. (The duplicated, unreachable second
 * `return` — a diff-render artifact — is removed.)
 *
 * @param {Array.<*>} prev - Values before `current`.
 * @param {*} current - Current value.
 * @param {Array.<*>} next - Values after `current`.
 * @return {Array.<*>} - Merged list.
 */
function merge(prev, current, next) {
  return prev.concat().reverse().concat([current], next);
}

@@ -235,108 +271,51 @@

/**
 * Find the phrase surrounding a keyword `match`:
 * preceding and following important words (and the
 * white space between them), with the match itself in
 * the middle.
 *
 * The diff-render artifact that duplicated every
 * statement (redundantly re-running both directional
 * searches and leaving an unreachable second `return`)
 * is removed; each search now runs exactly once.
 *
 * @param {Object} match - Match with `node`, `index`,
 *   and `parent` properties.
 * @return {Object} - Phrase with `stems` (array of
 *   stems), `value` (space-joined stems), and `nodes`
 *   (all nodes in the phrase, in order).
 */
function findPhrase(match) {
  var node = match.node;
  var prev = findPhraseInDirection(node, match.index, match.parent, -1);
  var next = findPhraseInDirection(node, match.index, match.parent, 1);
  var stems = merge(prev.stems, stemNode(node), next.stems);

  return {
    stems: stems,
    value: stems.join(' '),
    nodes: merge(prev.nodes, node, next.nodes)
  };
}
/**
* Get the top important phrases in `self`.
* Get most important words in `node`.
*
* @param {Object.<string, Object>} results - Map of stems
* mapping to objects containing `nodes`, `stem`, and
* `score` properties.
* @param {number} maximum - Try to get at least `maximum`
* results.
* @return {Array.<Object>} - Keyphrases.
* @param {Node} node - Parent to search in.
* @return {Array.<Object>} - Important words.
*/
function getKeyphrases(results, maximum) {
var stemmedPhrases = {};
var initialWords = [];
var stemmedPhrase;
var index;
var length;
var otherIndex;
var keyword;
var matches;
var phrase;
var stems;
var score;
var first;
var match;
function getImportantWords(node) {
var words = {};
/* Iterate over all grouped important words... */
for (keyword in results) {
matches = results[keyword].matches;
length = matches.length;
index = -1;
visit(node, 'WordNode', visitor);
/* Iterate over every occurrence of a certain keyword... */
while (++index < length) {
phrase = findPhrase(matches[index]);
stemmedPhrase = stemmedPhrases[phrase.value];
first = phrase.nodes[0];
return words;
match = {
nodes: phrase.nodes,
parent: matches[index].parent
};
function visitor(word, index, parent) {
var match;
var stem;
/* If we've detected the same stemmed
* phrase somewhere. */
if (own.call(stemmedPhrases, phrase.value)) {
/* Add weight per phrase to the score of
* the phrase. */
stemmedPhrase.score += stemmedPhrase.weight;
if (isImportant(word)) {
stem = stemNode(word);
match = {
node: word,
index: index,
parent: parent
};
/* If this is the first time we walk over
* the phrase (exact match but containing
* another important word), add it to the
* list of matching phrases. */
if (initialWords.indexOf(first) === -1) {
initialWords.push(first);
stemmedPhrase.matches.push(match);
}
} else {
otherIndex = -1;
score = -1;
stems = phrase.stems;
initialWords.push(first);
/* For every stem in phrase, add its
* score to score. */
while (stems[++otherIndex]) {
score += results[stems[otherIndex]].score;
}
stemmedPhrases[phrase.value] = {
score: score,
weight: score,
stems: stems,
value: phrase.value,
matches: [match]
};
}
}
if (!has(words, stem)) {
words[stem] = {
matches: [match],
stem: stem,
score: 1
};
} else {
words[stem].matches.push(match);
words[stem].score++;
}
}
for (stemmedPhrase in stemmedPhrases) {
phrase = stemmedPhrases[stemmedPhrase];
/* Modify its score to be the rounded result of
 * multiplying it by the number of occurrences,
 * and dividing it by the number of words in the
 * phrase. */
phrase.score = Math.round(
phrase.score * phrase.matches.length / phrase.stems.length
);
}
return filterResults(stemmedPhrases, maximum);
}
}

@@ -353,50 +332,48 @@

/**
 * Shallow-clone a keyword map so later score
 * normalisation (in `filterResults`) does not mutate
 * the caller's objects.
 *
 * Fixes the diff-render artifact that placed
 * `return result;` inside the loop (which would stop
 * cloning after the first key) and the missing
 * semicolon after the object literal.
 *
 * @param {Object.<string, Object>} words - Map of stems
 *   to objects with `matches`, `stem`, and `score`.
 * @return {Object.<string, Object>} - New map with new
 *   per-keyword objects; `matches` arrays are shared.
 */
function cloneMatches(words) {
  var result = {};
  var key;
  var match;

  for (key in words) {
    match = words[key];
    result[key] = {
      matches: match.matches,
      stem: match.stem,
      score: match.score
    };
  }

  return result;
}
/**
* Attach.
*
* @param {Retext} retext - Instance.
* @param {Object?} [options] - Configuration.
* @param {number?} [options.maximum] - Try to get at
* least `maximum` results.
* @return {Function} - `transformer`.
*/
function attacher(retext, options) {
var maximum = (options || {}).maximum || 5;
/* Check if `node` is important. */
function isImportant(node) {
return (
node &&
node.data &&
node.data.partOfSpeech &&
(
node.data.partOfSpeech.indexOf('N') === 0 ||
(
node.data.partOfSpeech === 'JJ' &&
isUpperCase(nlcstToString(node).charAt(0))
)
)
);
}
retext.use(pos);
/* Check if `value` is upper-case. */
function isUpperCase(value) {
return value === String(value).toUpperCase();
}
/**
* Attach keywords in `cst` to `file`.
*
* @param {NLCSTNode} cst - Node.
* @param {VFile} file - Virtual file.
*/
function transformer(cst, file) {
var space = file.namespace('retext');
var important = getImportantWords(cst);
/* Reverse sort: from 9 to 0. */
function reverse(a, b) {
return b - a;
}
space.keywords = filterResults(cloneMatches(important), maximum);
space.keyphrases = getKeyphrases(important, maximum);
}
return transformer;
/* Get the stem of a node. */
function stemNode(node) {
return stemmer(nlcstToString(node)).toLowerCase();
}
/* Expose. */
module.exports = attacher;
{
"name": "retext-keywords",
"version": "2.0.2",
"version": "3.0.0",
"description": "Keyword extraction with Retext",

@@ -14,9 +14,3 @@ "license": "MIT",

],
"files": [
"index.js"
],
"repository": {
"type": "git",
"url": "https://github.com/wooorm/retext-keywords.git"
},
"repository": "https://github.com/wooorm/retext-keywords",
"bugs": "https://github.com/wooorm/retext-keywords/issues",

@@ -28,3 +22,7 @@ "author": "Titus Wormer <tituswormer@gmail.com> (http://wooorm.com)",

],
"files": [
"index.js"
],
"dependencies": {
"has": "^1.0.1",
"nlcst-to-string": "^2.0.0",

@@ -37,14 +35,9 @@ "retext-pos": "^1.0.0",

"browserify": "^13.0.1",
"eslint": "^2.0.0",
"esmangle": "^1.0.1",
"istanbul": "^0.4.0",
"jscs": "^3.0.0",
"jscs-jsdoc": "^2.0.0",
"remark-cli": "^1.0.0",
"remark-comment-config": "^4.0.0",
"remark-github": "^5.0.0",
"remark-lint": "^4.0.0",
"remark-validate-links": "^4.0.0",
"retext": "^3.0.0",
"tape": "^4.0.0"
"nyc": "^8.3.0",
"remark-cli": "^2.0.0",
"remark-preset-wooorm": "^1.0.0",
"retext": "^4.0.0",
"tape": "^4.0.0",
"xo": "^0.16.0"
},

@@ -56,9 +49,29 @@ "scripts": {

"build": "npm run build-md && npm run build-bundle && npm run build-mangle",
"lint-api": "eslint .",
"lint-style": "jscs --reporter inline .",
"lint": "npm run lint-api && npm run lint-style",
"lint": "xo",
"test-api": "node test.js",
"test-coverage": "istanbul cover test.js",
"test-coverage": "nyc --reporter lcov tape test.js",
"test": "npm run build && npm run lint && npm run test-coverage"
},
"nyc": {
"check-coverage": true,
"lines": 100,
"functions": 100,
"branches": 100
},
"xo": {
"space": true,
"rules": {
"no-negated-condition": "off",
"guard-for-in": "off",
"max-lines": "off",
"max-nested-callbacks": "off"
},
"ignores": [
"retext-keywords.js"
]
},
"remarkConfig": {
"output": true,
"presets": "wooorm"
}
}
# retext-keywords [![Build Status][travis-badge]][travis] [![Coverage Status][codecov-badge]][codecov]
<!--lint disable heading-increment list-item-spacing-->
Keyword extraction with [**retext**][retext].

@@ -9,3 +7,3 @@

[npm][npm-install]:
[npm][]:

@@ -16,5 +14,2 @@ ```bash

**retext-keywords** is also available as an AMD, CommonJS, and
globals module, [uncompressed and compressed][releases].
## Usage

@@ -28,55 +23,53 @@

retext().use(keywords).process(
/* First three paragraphs on Term Extraction from Wikipedia:
* http://en.wikipedia.org/wiki/Terminology_extraction */
'Terminology mining, term extraction, term recognition, or ' +
'glossary extraction, is a subtask of information extraction. ' +
'The goal of terminology extraction is to automatically extract ' +
'relevant terms from a given corpus.' +
'\n\n' +
'In the semantic web era, a growing number of communities and ' +
'networked enterprises started to access and interoperate through ' +
'the internet. Modeling these communities and their information ' +
'needs is important for several web applications, like ' +
'topic-driven web crawlers, web services, recommender systems, ' +
'etc. The development of terminology extraction is essential to ' +
'the language industry.' +
'\n\n' +
'One of the first steps to model the knowledge domain of a ' +
'virtual community is to collect a vocabulary of domain-relevant ' +
'terms, constituting the linguistic surface manifestation of ' +
'domain concepts. Several methods to automatically extract ' +
'technical terms from domain-specific document warehouses have ' +
'been described in the literature.' +
'\n\n' +
'Typically, approaches to automatic term extraction make use of ' +
'linguistic processors (part of speech tagging, phrase chunking) ' +
'to extract terminological candidates, i.e. syntactically ' +
'plausible terminological noun phrases, NPs (e.g. compounds ' +
'"credit card", adjective-NPs "local tourist information office", ' +
'and prepositional-NPs "board of directors" - in English, the ' +
'first two constructs are the most frequent). Terminological ' +
'entries are then filtered from the candidate list using ' +
'statistical and machine learning methods. Once filtered, ' +
'because of their low ambiguity and high specificity, these terms ' +
'are particularly useful for conceptualizing a knowledge domain ' +
'or for supporting the creation of a domain ontology. Furthermore, ' +
'terminology extraction is a very useful starting point for ' +
'semantic similarity, knowledge management, human translation ' +
'and machine translation, etc.',
function (err, file) {
var space = file.namespace('retext');
/* First three paragraphs on Term Extraction from Wikipedia:
* http://en.wikipedia.org/wiki/Terminology_extraction */
'Terminology mining, term extraction, term recognition, or ' +
'glossary extraction, is a subtask of information extraction. ' +
'The goal of terminology extraction is to automatically extract ' +
'relevant terms from a given corpus.' +
'\n\n' +
'In the semantic web era, a growing number of communities and ' +
'networked enterprises started to access and interoperate through ' +
'the internet. Modeling these communities and their information ' +
'needs is important for several web applications, like ' +
'topic-driven web crawlers, web services, recommender systems, ' +
'etc. The development of terminology extraction is essential to ' +
'the language industry.' +
'\n\n' +
'One of the first steps to model the knowledge domain of a ' +
'virtual community is to collect a vocabulary of domain-relevant ' +
'terms, constituting the linguistic surface manifestation of ' +
'domain concepts. Several methods to automatically extract ' +
'technical terms from domain-specific document warehouses have ' +
'been described in the literature.' +
'\n\n' +
'Typically, approaches to automatic term extraction make use of ' +
'linguistic processors (part of speech tagging, phrase chunking) ' +
'to extract terminological candidates, i.e. syntactically ' +
'plausible terminological noun phrases, NPs (e.g. compounds ' +
'"credit card", adjective-NPs "local tourist information office", ' +
'and prepositional-NPs "board of directors" - in English, the ' +
'first two constructs are the most frequent). Terminological ' +
'entries are then filtered from the candidate list using ' +
'statistical and machine learning methods. Once filtered, ' +
'because of their low ambiguity and high specificity, these terms ' +
'are particularly useful for conceptualizing a knowledge domain ' +
'or for supporting the creation of a domain ontology. Furthermore, ' +
'terminology extraction is a very useful starting point for ' +
'semantic similarity, knowledge management, human translation ' +
'and machine translation, etc.',
function (err, file) {
console.log('Keywords:');
console.log('Keywords:');
file.data.keywords.forEach(function (keyword) {
console.log(nlcstToString(keyword.matches[0].node));
});
space.keywords.forEach(function (keyword) {
console.log(nlcstToString(keyword.matches[0].node));
});
console.log();
console.log('Key-phrases:');
console.log();
console.log('Key-phrases:');
space.keyphrases.forEach(function (phrase) {
console.log(phrase.matches[0].nodes.map(nlcstToString).join(''));
});
}
file.data.keyphrases.forEach(function (phrase) {
console.log(phrase.matches[0].nodes.map(nlcstToString).join(''));
});
}
);

@@ -109,5 +102,4 @@ ```

The results are stored in the `retext` namespace on the virtual file:
keywords at `file.namespace('retext').keywords` and key-phrases at
`file.namespace('retext').keyphrases`. Both are lists.
The results are stored on `file.data`: keywords at `file.data.keywords`
and key-phrases at `file.data.keyphrases`. Both are lists.

@@ -118,9 +110,9 @@ A single keyword looks as follows:

{
'stem': 'term',
'score': 1,
'matches': [
{ 'node': Node, 'index': 5, 'parent': Node },
// ...
],
stem: 'term',
score: 1,
matches: [
{node: Node, index: 5, parent: Node},
// ...
],
// ...
}

@@ -133,10 +125,10 @@ ```

{
'score': 1,
'weight': 11,
'stems': [ 'terminolog', 'extract' ],
'value': 'terminolog extract',
'matches': [
{ 'nodes': [Node, Node, Node], 'parent': Node },
// ...
]
score: 1,
weight: 11,
stems: ['terminolog', 'extract'],
value: 'terminolog extract',
matches: [
{nodes: [Node, Node, Node], parent: Node},
// ...
]
}

@@ -150,4 +142,4 @@ ```

Note that actual counts may differ. For example, when two words
have the same score, both will be returned. Or when too few words
Note that actual counts may differ. For example, when two words
have the same score, both will be returned. Or when too few words
exist, fewer will be returned. The same goes for phrases.

@@ -169,6 +161,4 @@

[npm-install]: https://docs.npmjs.com/cli/install
[npm]: https://docs.npmjs.com/cli/install
[releases]: https://github.com/wooorm/retext-keywords/releases
[license]: LICENSE

@@ -175,0 +165,0 @@

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc