retext-keywords - npm Package Compare versions

Comparing version 0.0.1 to 0.1.0

benchmark/index.js

		'use strict';

		var Retext, retext, keywords, source,
		sourceSmall, sourceMedium,
		tiny, small, medium,
		wordCount, sentenceCount, paragraphCount;
		/**
		* Dependencies.
		*/

		var Retext,
		keywords;

		Retext = require('retext');
		keywords = require('..');

		/* First paragraph on term extraction from Wikipedia:
		* http://en.wikipedia.org/wiki/Terminology_extraction
		/**
		* Fixtures.
		*
		* First paragraph on term extraction from Wikipedia:
		*
		* http://en.wikipedia.org/wiki/Terminology_extraction
		*/
		source = 'Terminology mining, term extraction, term recognition, or ' +

		var source,
		sourceSmall,
		sourceMedium;

		source =
		'Terminology mining, term extraction, term recognition, or ' +
		'glossary extraction, is a subtask of information extraction. ' +
		@@ -19,58 +31,53 @@ 'The goal of terminology extraction is to automatically extract ' +

		/* Test data */
		sourceSmall = Array(11).join(source);
		sourceMedium = Array(11).join(sourceSmall);

		/**
		* Retext.
		*/

		var retext;

		retext = new Retext().use(keywords);

		tiny = retext.parse(source);
		small = retext.parse(sourceSmall);
		medium = retext.parse(sourceMedium);
		/**
		* Benchmarks.
		*/

		wordCount = sentenceCount = paragraphCount = 0;
		suite('A big section (10 paragraphs)', function () {
		var tree;

		tiny.visitType(tiny.WORD_NODE, function () {
		wordCount++;
		});
		before(function(next) {
		retext.parse(sourceSmall, function (err, node) {
		tree = node;
		next();
		});
		});

		tiny.visitType(tiny.SENTENCE_NODE, function () {
		sentenceCount++;
		});
		bench('Finding keywords', function () {
		tree.keywords();
		});

		tiny.visitType(tiny.PARAGRAPH_NODE, function () {
		paragraphCount++;
		bench('Finding keyphrases', function () {
		tree.keyphrases();
		});
		});

		if (wordCount !== 30) {
		console.error('Word count should be 300!');
		}
		suite('A big article (100 paragraphs)', function () {
		var tree;

		if (sentenceCount !== 2) {
		console.error('Sentence count should be 300!');
		}

		if (paragraphCount !== 1) {
		console.error('Paragraph count should be 300!');
		}

		/* Benchmarks */
		suite('Finding keywords in English', function () {
		bench('small (10 paragraphs, 20 sentences, 300 words)', function () {
		small.keywords();
		before(function(next) {
		retext.parse(sourceMedium, function (err, node) {
		tree = node;
		next();
		});
		});

		bench('medium (100 paragraphs, 200 sentences, 3000 words)', function () {
		medium.keywords();
		bench('Finding keywords', function () {
		tree.keywords();
		});
		});

		/* Benchmarks */
		suite('Finding keyphrases in English', function () {
		bench('small (10 paragraphs, 20 sentences, 300 words)', function () {
		small.keyphrases();
		bench('Finding keyphrases', function () {
		tree.keyphrases();
		});

		bench('medium (100 paragraphs, 200 sentences, 3000 words)', function () {
		medium.keyphrases();
		});
		});

346

index.js

		'use strict';

		var pos = require('retext-pos'),
		stemmer = require('retext-porter-stemmer'),
		visit = require('retext-visit');
		/**
		* Module dependencies.
		*/

		exports = module.exports = function () {};
		var pos,
		stemmer,
		visit;

		pos = require('retext-pos');
		stemmer = require('retext-porter-stemmer');
		visit = require('retext-visit');

		/**
		* Constants.
		*/

		var has;

		has = Object.prototype.hasOwnProperty;

		/**
		* Define `keywords`.
		*
		* The function itself does nothing when called; it is
		* the value exported by this module, and the `attach`
		* method is assigned to it further down in this file.
		*/

		function keywords() {}

		/**
		* Reverse sort: from 9 to 0.
		*
		* @param {number} a
		* @param {number} b
		*/

		function reverseSort(a, b) {
		@@ -13,17 +40,33 @@ return b - a;

		/**
		* Linearly interpolate between `min` and `max`.
		*
		* @param {number} value - Factor applied to the distance
		*   between the bounds; 0 yields `min`, 1 yields `max`.
		* @param {number} min - Result when `value` is 0.
		* @param {number} max - Result when `value` is 1.
		* @return {number}
		*/

		function interpolate(value, min, max) {
		    var span;

		    span = max - min;

		    return min + value * span;
		}
		/**
		* Get the top results from an occurrence map.
		*
		* @param {Object.<string, Object>} results - Dictionary of
		* stems mapping to objects containing `nodes`, `stem`,
		* and `score` properties.
		* @param {number} minimum - Minimum number of results to
		* return.
		* @return {Array.<Object>}
		*/

		function filterResults(results, minimum) {
		var filteredResults = [],
		matrix = {},
		indices = [],
		column, key, score, interpolatedScore, iterator, otherIterator,
		var filteredResults,
		matrix,
		indices,
		column,
		key,
		score,
		interpolatedScore,
		index,
		otherIndex,
		maxScore;

		filteredResults = [];
		indices = [];
		matrix = {};

		for (key in results) {
		score = results[key].score;

		if (!(score in matrix)) {
		if (!has.call(matrix, score)) {
		matrix[score] = [];
		@@ -37,15 +80,16 @@ indices.push(score);
		indices.sort(reverseSort);

		maxScore = indices[0];

		iterator = -1;
		index = -1;

		while (indices[++iterator]) {
		score = indices[iterator];
		while (indices[++index]) {
		score = indices[index];
		column = matrix[score];

		interpolatedScore = interpolate(score / maxScore, 0, 1);
		otherIterator = -1;
		interpolatedScore = score / maxScore;
		otherIndex = -1;

		while (column[++otherIterator]) {
		column[otherIterator].score = interpolatedScore;
		while (column[++otherIndex]) {
		column[otherIndex].score = interpolatedScore;
		}
		@@ -63,3 +107,10 @@

		function isKeyWord(node) {
		/**
		* Get whether or not a `node` is important.
		*
		* @param {Node} node
		* @return {boolean}
		*/

		function isImportant(node) {
		return (
		@@ -78,13 +129,22 @@ node &&

		function getKeywords(node) {
		var keywords = {};
		/**
		* Get most important words in `node`.
		*
		* @param {Node} node
		* @return {Array.<Object>}
		*/

		function getImportantWords(node) {
		var importantWords;

		importantWords = {};

		node.visitType(node.WORD_NODE, function (word) {
		var stem;

		if (isKeyWord(word)) {
		if (isImportant(word)) {
		stem = word.data.stem.toLowerCase();

		if (!(stem in keywords)) {
		keywords[stem] = {
		if (!has.call(importantWords, stem)) {
		importantWords[stem] = {
		'nodes' : [word],
		@@ -95,4 +155,4 @@ 'stem' : stem,
		} else {
		keywords[stem].nodes.push(word);
		keywords[stem].score++;
		importantWords[stem].nodes.push(word);
		importantWords[stem].score++;
		}
		@@ -102,25 +162,47 @@ }

		return keywords;
		return importantWords;
		}

		function getFilteredKeywords(options) {
		if (!options) {
		options = {};
		}
		/**
		* Get the top important words in `self`.
		*
		* @param {Object?} options
		* @param {number?} options.minimum
		* @this {Node} node
		* @return {Array.<Object>}
		*/

		return filterResults(
		getKeywords(this),
		'minimum' in options ? options.minimum : 5
		);
		function getKeywords(options) {
		    /* Fall back to five results unless the caller supplied its own limit. */
		    var minimum = 5;

		    /* Only an own `minimum` property counts; inherited ones are ignored. */
		    if (options && has.call(options, 'minimum')) {
		        minimum = options.minimum;
		    }

		    /* `this` is the node the method was invoked on. */
		    return filterResults(getImportantWords(this), minimum);
		}

		function findPhraseInDirection(node, property) {
		var nodes = [], stems = [], words = [], queue = [];
		/**
		* Get following or preceding important words or white space.
		*
		* @param {Node} node
		* @param {string} direction - either "prev" or "next".
		* @return {Object}
		*/

		node = node[property];
		function findPhraseInDirection(node, direction) {
		var nodes,
		stems,
		words,
		queue;

		nodes = [];
		stems = [];
		words = [];
		queue = [];

		node = node[direction];

		while (node) {
		if (node.type === node.WHITE_SPACE_NODE) {
		queue.push(node);
		} else if (isKeyWord(node)) {
		} else if (isImportant(node)) {
		nodes = nodes.concat(queue, [node]);
		@@ -134,3 +216,3 @@ words.push(node);

		node = node[property];
		node = node[direction];
		}
		@@ -145,6 +227,23 @@

		function merge(prev, value, next) {
		return prev.reverse().concat([value], next);
		/**
		* Merge a previous array, with a current value, and
		* a following array.
		*
		* The result is `prev` in reversed order, followed by
		* `current`, followed by the items of `next`. A copy of
		* `prev` is reversed, so the array passed in by the
		* caller keeps its original order.
		*
		* @param {Array.<*>} prev
		* @param {*} current
		* @param {Array.<*>} next
		* @return {Array.<*>} A new array; no argument is modified.
		*/

		function merge(prev, current, next) {
		    /* `reverse()` works in place, hence the `slice()` first. */
		    return prev.slice().reverse().concat([current], next);
		}

		/**
		* Find the phrase surrounding a node.
		*
		* @param {Node} node
		* @return {Object}
		*/

		function findPhrase(node) {
		@@ -162,53 +261,91 @@ var prev = findPhraseInDirection(node, 'prev'),

		/**
		* Get the top important phrases in `self`.
		*
		* @param {Object?} options
		* @param {number?} options.minimum
		* @this {Node} node
		* @return {Array.<Object>}
		*/

		function getKeyphrases(options) {
		var simplePhrases = {},
		initialWords = [],
		simplePhrase, iterator, otherIterator, keywords, keyword, nodes,
		phrase, stems, score;
		var stemmedPhrases,
		initialWords,
		stemmedPhrase,
		index,
		otherIndex,
		importantWords,
		keyword,
		nodes,
		phrase,
		stems,
		minimum,
		score;

		if (!options) {
		options = {};
		}
		stemmedPhrases = {};
		initialWords = [];

		keywords = getKeywords(this);
		minimum = options && has.call(options, 'minimum') ? options.minimum : 5;

		/* Iterate over all grouped keywords... */
		for (keyword in keywords) {
		nodes = keywords[keyword].nodes;
		importantWords = getImportantWords(this);

		iterator = -1;
		/**
		* Iterate over all grouped important words...
		*/

		/* Iterate over every occurence of a certain keyword... */
		while (nodes[++iterator]) {
		/* Detect the phrase the node is in. */
		phrase = findPhrase(nodes[iterator]);
		for (keyword in importantWords) {
		nodes = importantWords[keyword].nodes;

		/* If we've already detected the same (simplified) phrase
		* somewhere... */
		if (phrase.value in simplePhrases) {
		simplePhrase = simplePhrases[phrase.value];
		index = -1;

		/* Add weight per phrase to the score of the phrase. */
		simplePhrase.score += simplePhrase.weight;
		/**
		* Iterate over every occurrence of a certain keyword...
		*/

		/* If this is the first time we walk over the phrase (exact
		* match, at another position), add it to the list of
		* matching phrases. */
		while (nodes[++index]) {
		phrase = findPhrase(nodes[index]);

		/**
		* If we've detected the same stemmed
		* phrase somewhere.
		*/

		if (has.call(stemmedPhrases, phrase.value)) {
		stemmedPhrase = stemmedPhrases[phrase.value];

		/**
		* Add weight per phrase to the score of
		* the phrase.
		*/

		stemmedPhrase.score += stemmedPhrase.weight;

		/**
		* If this is the first time we walk over
		* the phrase (exact match but containing
		* another important word), add it to the
		* list of matching phrases.
		*/

		if (initialWords.indexOf(phrase.nodes[0]) === -1) {
		initialWords.push(phrase.nodes[0]);
		simplePhrase.nodes.push(phrase.nodes);
		stemmedPhrase.nodes.push(phrase.nodes);
		}
		/* Otherwise... */
		} else {
		otherIterator = -1;
		otherIndex = -1;
		score = -1;
		stems = phrase.stems;

		initialWords.push(phrase.nodes[0]);

		/* For every stem in phrase, add its score to score. */
		while (stems[++otherIterator]) {
		score += keywords[stems[otherIterator]].score;
		/**
		* For every stem in phrase, add its
		* score to score.
		*/

		while (stems[++otherIndex]) {
		score += importantWords[stems[otherIndex]].score;
		}

		simplePhrases[phrase.value] = {
		stemmedPhrases[phrase.value] = {
		'score' : score,
		@@ -224,9 +361,12 @@ 'weight' : score,

		/* Iterate over all grouped phrases... */
		for (simplePhrase in simplePhrases) {
		phrase = simplePhrases[simplePhrase];
		for (stemmedPhrase in stemmedPhrases) {
		phrase = stemmedPhrases[stemmedPhrase];

		/* Modify its score to be the rounded result of multiplying it with
		* the number of occurances, and dividing it by the ammount of words
		* in the phrase. */
		/**
		* Modify its score to be the rounded result of
		* multiplying it with the number of occurrences,
		* and dividing it by the amount of words in the
		* phrase.
		*/

		phrase.score = Math.round(
		@@ -237,20 +377,42 @@ phrase.score * phrase.nodes.length / phrase.stems.length

		return filterResults(
		simplePhrases,
		'minimum' in options ? options.minimum : 5
		);
		return filterResults(stemmedPhrases, minimum);
		}

		/**
		* Define `attach`.
		*
		* @param {Retext} retext
		*/

		function attach(retext) {
		var TextOM = retext.parser.TextOM;
		var TextOM,
		parentPrototype,
		elementPrototype;

		retext.use(stemmer).use(pos).use(visit);
		TextOM = retext.TextOM;
		parentPrototype = TextOM.Parent.prototype;
		elementPrototype = TextOM.Element.prototype;

		TextOM.Parent.prototype.keywords = TextOM.Element.prototype.keywords =
		getFilteredKeywords;
		retext
		.use(stemmer)
		.use(pos)
		.use(visit);

		TextOM.Parent.prototype.keyphrases = TextOM.Element.prototype.keyphrases =
		getKeyphrases;
		parentPrototype.keywords = getKeywords;
		elementPrototype.keywords = getKeywords;

		parentPrototype.keyphrases = getKeyphrases;
		elementPrototype.keyphrases = getKeyphrases;
		}

		exports.attach = attach;
		/**
		* Expose `attach`.
		*/

		keywords.attach = attach;

		/**
		* Expose `keywords`.
		*/

		module.exports = keywords;

package.json

		{
		"name": "retext-keywords",
		"version": "0.0.1",
		"version": "0.1.0",
		"description": "Keyword extraction with Retext",
		"license": "MIT",
		"keywords": [
		@@ -13,21 +14,20 @@ "keyword",
		],
		"author": "Titus Wormer <tituswormer@gmail.com>",
		"license": "MIT",
		"dependencies": {
		"retext-porter-stemmer": "^0.1.0",
		"retext-pos": "^0.1.0",
		"retext-visit": "^0.1.0"
		"retext-porter-stemmer": "^0.1.1",
		"retext-pos": "^0.1.3",
		"retext-visit": "^0.1.1"
		},
		"repository": {
		"type": "git",
		"url": "https://github.com/wooorm/retext-keywords.git"
		},
		"author": "Titus Wormer <tituswormer@gmail.com>",
		"devDependencies": {
		"eslint": "^0.7.4",
		"eslint": "^0.8.0",
		"istanbul": "^0.3.0",
		"jscs": "^1.5.4",
		"jscs": "^1.5.0",
		"matcha": "^0.5.0",
		"mocha": "~1.20.1",
		"retext": "^0.1.0-rc.4"
		"mocha": "^1.21.0",
		"retext": "^0.2.0-rc.2"
		},
		"repository": {
		"type": "git",
		"url": "https://github.com/wooorm/retext-keywords.git"
		},
		"scripts": {
		@@ -42,23 +42,5 @@ "test": "node_modules/.bin/_mocha --reporter spec --check-leaks -u exports spec/retext-keywords.spec.js",
		"coverage": "node_modules/.bin/istanbul cover node_modules/.bin/_mocha -- -- spec/retext-keywords.spec.js",
		"install-browser-test": "npm install browserify",
		"build-browser-test": "node_modules/.bin/browserify spec/retext-keywords.spec.js -o spec/browser.spec.js",
		"benchmark": "node_modules/.bin/matcha",
		"make": "npm run lint && npm run coverage"
		},
		"testling": {
		"files": "spec/retext-keywords.spec.js",
		"harness": "mocha",
		"browsers": [
		"iexplore/latest",
		"chrome/latest",
		"chrome/canary",
		"firefox/latest",
		"firefox/nightly",
		"opera/latest",
		"opera/next",
		"safari/latest",
		"iphone/latest",
		"android-browser/latest"
		]
		}
		}

202

Readme.md

		# retext-keywords [![Build Status](https://travis-ci.org/wooorm/retext-keywords.svg?branch=master)](https://travis-ci.org/wooorm/retext-keywords) [![Coverage Status](https://img.shields.io/coveralls/wooorm/retext-keywords.svg)](https://coveralls.io/r/wooorm/retext-keywords?branch=master)

		[![browser support](https://ci.testling.com/wooorm/retext-keywords.png) ](https://ci.testling.com/wooorm/retext-keywords)

		---

		Keyword extraction with [Retext](https://github.com/wooorm/retext).
		@@ -11,3 +7,3 @@

		NPM:
		npm:
		```sh
		@@ -17,7 +13,2 @@ $ npm install retext-keywords

		Component.js:
		```sh
		$ component install wooorm/retext-keywords
		```

		## Usage
		@@ -28,87 +19,84 @@
		keywords = require('retext-keywords'),
		root;
		retext;

		var root = new Retext()
		.use(keywords)
		.parse(
		/* First three paragraphs on Term Extraction from Wikipedia:
		* http://en.wikipedia.org/wiki/Terminology_extraction */
		'Terminology mining, term extraction, term recognition, or ' +
		'glossary extraction, is a subtask of information extraction. ' +
		'The goal of terminology extraction is to automatically extract ' +
		'relevant terms from a given corpus.' +
		'\n\n' +
		'In the semantic web era, a growing number of communities and ' +
		'networked enterprises started to access and interoperate through ' +
		'the internet. Modeling these communities and their information ' +
		'needs is important for several web applications, like ' +
		'topic-driven web crawlers, web services, recommender systems, ' +
		'etc. The development of terminology extraction is essential to ' +
		'the language industry.' +
		'\n\n' +
		'One of the first steps to model the knowledge domain of a ' +
		'virtual community is to collect a vocabulary of domain-relevant ' +
		'terms, constituting the linguistic surface manifestation of ' +
		'domain concepts. Several methods to automatically extract ' +
		'technical terms from domain-specific document warehouses have ' +
		'been described in the literature.' +
		'\n\n' +
		'Typically, approaches to automatic term extraction make use of ' +
		'linguistic processors (part of speech tagging, phrase chunking) ' +
		'to extract terminological candidates, i.e. syntactically ' +
		'plausible terminological noun phrases, NPs (e.g. compounds ' +
		'"credit card", adjective-NPs "local tourist information office", ' +
		'and prepositional-NPs "board of directors" - in English, the ' +
		'first two constructs are the most frequent). Terminological ' +
		'entries are then filtered from the candidate list using ' +
		'statistical and machine learning methods. Once filtered, ' +
		'because of their low ambiguity and high specificity, these terms ' +
		'are particularly useful for conceptualizing a knowledge domain ' +
		'or for supporting the creation of a domain ontology. Furthermore, ' +
		'terminology extraction is a very useful starting point for ' +
		'semantic similarity, knowledge management, human translation ' +
		'and machine translation, etc.'
		);
		retext = new Retext().use(keywords);

		root.keywords();
		/*
		* Array[5]
		* ├─ 0: Object
		* \| ├─ stem: "terminolog"
		* \| ├─ score: 1
		* \| └─ nodes: Array[7]
		* ├─ 1: Object
		* \| ├─ stem: "term"
		* \| ├─ score: 1
		* \| └─ nodes: Array[7]
		* ├─ 2: Object
		* \| ├─ stem: "extract"
		* \| ├─ score: 1
		* \| └─ nodes: Array[7]
		* ├─ 3: Object
		* \| ├─ stem: "web"
		* \| ├─ score: 0.5714285714285714
		* \| └─ nodes: Array[4]
		* └─ 4: Object
		* ├─ stem: "domain"
		* ├─ score: 0.5714285714285714
		* └─ nodes: Array[4]
		*/
		retext.parse(
		/* First three paragraphs on Term Extraction from Wikipedia:
		* http://en.wikipedia.org/wiki/Terminology_extraction */
		'Terminology mining, term extraction, term recognition, or ' +
		'glossary extraction, is a subtask of information extraction. ' +
		'The goal of terminology extraction is to automatically extract ' +
		'relevant terms from a given corpus.' +
		'\n\n' +
		'In the semantic web era, a growing number of communities and ' +
		'networked enterprises started to access and interoperate through ' +
		'the internet. Modeling these communities and their information ' +
		'needs is important for several web applications, like ' +
		'topic-driven web crawlers, web services, recommender systems, ' +
		'etc. The development of terminology extraction is essential to ' +
		'the language industry.' +
		'\n\n' +
		'One of the first steps to model the knowledge domain of a ' +
		'virtual community is to collect a vocabulary of domain-relevant ' +
		'terms, constituting the linguistic surface manifestation of ' +
		'domain concepts. Several methods to automatically extract ' +
		'technical terms from domain-specific document warehouses have ' +
		'been described in the literature.' +
		'\n\n' +
		'Typically, approaches to automatic term extraction make use of ' +
		'linguistic processors (part of speech tagging, phrase chunking) ' +
		'to extract terminological candidates, i.e. syntactically ' +
		'plausible terminological noun phrases, NPs (e.g. compounds ' +
		'"credit card", adjective-NPs "local tourist information office", ' +
		'and prepositional-NPs "board of directors" - in English, the ' +
		'first two constructs are the most frequent). Terminological ' +
		'entries are then filtered from the candidate list using ' +
		'statistical and machine learning methods. Once filtered, ' +
		'because of their low ambiguity and high specificity, these terms ' +
		'are particularly useful for conceptualizing a knowledge domain ' +
		'or for supporting the creation of a domain ontology. Furthermore, ' +
		'terminology extraction is a very useful starting point for ' +
		'semantic similarity, knowledge management, human translation ' +
		'and machine translation, etc.',
		function (err, tree) {
		tree.keywords();
		/**
		* Array[5]
		* ├─ 0: Object
		* \| ├─ stem: "terminolog"
		* \| ├─ score: 1
		* \| └─ nodes: Array[7]
		* ├─ 1: Object
		* \| ├─ stem: "term"
		* \| ├─ score: 1
		* \| └─ nodes: Array[7]
		* ├─ 2: Object
		* \| ├─ stem: "extract"
		* \| ├─ score: 1
		* \| └─ nodes: Array[7]
		* ├─ 3: Object
		* \| ├─ stem: "web"
		* \| ├─ score: 0.5714285714285714
		* \| └─ nodes: Array[4]
		* └─ 4: Object
		* ├─ stem: "domain"
		* ├─ score: 0.5714285714285714
		* └─ nodes: Array[4]
		*/
		}
		);
		```

		## API
		retext-keywords depends on the following plugins:

		- [retext-pos](https://github.com/wooorm/retext-pos) — for part-of-speech tagging;
		- [retext-porter-stemmer](https://github.com/wooorm/retext-porter-stemmer) — for stemming;
		- [retext-visit](https://github.com/wooorm/retext-visit)

		### Parent#keywords({minimum=5}?)
		Extract keywords, based on how many times they (nouns) occur in text.

		Extract keywords, based on the number of times they (nouns) occur in text.

		```js
		// See above for an example, and output.
		/* See above for an example, and output. */

		// Do not limit keyword-count.
		root.keywords({'minimum' : Infinity});
		/* To not limit keyword-count: */
		tree.keywords({'minimum' : Infinity});
		```
		@@ -118,19 +106,16 @@

		* minimum: Return at least (when possible) `minimum` keywords.
		- minimum: Return at least (when possible) `minimum` keywords.

		Results: An array, containing match-objects:

		* stem: The stem of the word (using [retext-porter-stemm](https://github.com/wooorm/retext-porter-stemmer/));
		* score: A value between 0 and (including) 1. the first match always has a score of 1;
		* nodes: An array containing all matched word nodes.
		- stem: The stem of the word (see [retext-porter-stemmer](https://github.com/wooorm/retext-porter-stemmer/));
		- score: A value between 0 and (including) 1. The first match has a score of 1;
		- nodes: An array containing all matched word nodes.

		### Parent#keyphrases({minimum=5}?)
		Extract keywords, based on how many times they (nouns) occur in text.

		Extract keyphrases, based on the number of times they (multiple nouns) occur in text.

		```js
		// Do not limit phrase-count.
		root.keywords({'minimum' : Infinity});

		// Default values:
		root.keyphrases();
		tree.keyphrases();
		/*
		@@ -167,2 +152,5 @@ * Array[6]
		*/

		/* To not limit phrase-count: */
		tree.keyphrases({'minimum' : Infinity});
		```
		@@ -172,9 +160,9 @@

		* minimum: Return at least (when possible) `minimum` phrases.
		- minimum: Return at least (when possible) `minimum` phrases.

		Results: An array, containing match-objects:

		* stems: An array containing the stemms of all matched word nodes inside the phrase(s);
		* score: A value between 0 and (including) 1. the first match always has a score of 1;
		* nodes: An matrix containing array-phrases, each in turn containing word nodes.
		- stems: An array containing the stems of all matched word nodes inside the phrase(s);
		- score: A value between 0 and (including) 1. The first match has a score of 1;
		- nodes: An array containing array-phrases, each containing word nodes.

		@@ -189,12 +177,12 @@ ## Benchmark

		On a MacBook Air, `keywords()` runs about 3,041 op/s on a section / small article.
		On a MacBook Air, `keywords()` runs about 3,784 op/s on a big section / small article.

		```
		Finding keywords in English
		3,041 op/s » small (10 paragraphs, 20 sentences, 300 words)
		349 op/s » medium (100 paragraphs, 200 sentences, 3000 words)
		A big section (10 paragraphs)
		3,784 op/s » Finding keywords
		788 op/s » Finding keyphrases

		Finding keyphrases in English
		738 op/s » small (10 paragraphs, 20 sentences, 300 words)
		47 op/s » medium (100 paragraphs, 200 sentences, 3000 words)
		A big article (100 paragraphs)
		401 op/s » Finding keywords
		48 op/s » Finding keyphrases
		```
		@@ -204,2 +192,2 @@

		MIT
		MIT © Titus Wormer

272

spec/retext-keywords.spec.js

		'use strict';

		var keywords, Retext, assert, tree;
		/**
		* Module dependencies.
		*/

		var keywords,
		Retext,
		assert;

		keywords = require('..');
		@@ -9,51 +15,73 @@ Retext = require('retext');

		tree = new Retext()
		.use(keywords)
		.parse(
		/* First three paragraphs on term extraction from Wikipedia:
		* http://en.wikipedia.org/wiki/Terminology_extraction
		*/
		'Terminology mining, term extraction, term recognition, or ' +
		'glossary extraction, is a subtask of information extraction. ' +
		'The goal of terminology extraction is to automatically extract ' +
		'relevant terms from a given corpus.' +
		'\n\n' +
		'In the semantic web era, a growing number of communities and ' +
		'networked enterprises started to access and interoperate through ' +
		'the internet. Modeling these communities and their information ' +
		'needs is important for several web applications, like ' +
		'topic-driven web crawlers, web services, recommender systems, ' +
		'etc. The development of terminology extraction is essential to ' +
		'the language industry.' +
		'\n\n' +
		'One of the first steps to model the knowledge domain of a ' +
		'virtual community is to collect a vocabulary of domain-relevant ' +
		'terms, constituting the linguistic surface manifestation of ' +
		'domain concepts. Several methods to automatically extract ' +
		'technical terms from domain-specific document warehouses have ' +
		'been described in the literature.' +
		'\n\n' +
		'Typically, approaches to automatic term extraction make use of ' +
		'linguistic processors (part of speech tagging, phrase chunking) ' +
		'to extract terminological candidates, i.e. syntactically ' +
		'plausible terminological noun phrases, NPs (e.g. compounds ' +
		'"credit card", adjective-NPs "local tourist information office", ' +
		'and prepositional-NPs "board of directors" - in English, the ' +
		'first two constructs are the most frequent). Terminological ' +
		'entries are then filtered from the candidate list using ' +
		'statistical and machine learning methods. Once filtered, ' +
		'because of their low ambiguity and high specificity, these terms ' +
		'are particularly useful for conceptualizing a knowledge domain ' +
		'or for supporting the creation of a domain ontology. Furthermore, ' +
		'terminology extraction is a very useful starting point for ' +
		'semantic similarity, knowledge management, human translation ' +
		'and machine translation, etc.'
		);
		/**
		* Retext.
		*/

		describe('retext-keywords()', function () {
		it('should be of type `function`', function () {
		var retext,
		TextOM;

		retext = new Retext().use(keywords);

		TextOM = retext.TextOM;

		/**
		* Value.
		*
		* First three paragraphs on term extraction from
		* Wikipedia:
		*
		* http://en.wikipedia.org/wiki/Terminology_extraction
		*/

		var value;

		value =
		'Terminology mining, term extraction, term recognition, or ' +
		'glossary extraction, is a subtask of information extraction. ' +
		'The goal of terminology extraction is to automatically extract ' +
		'relevant terms from a given corpus.' +
		'\n\n' +
		'In the semantic web era, a growing number of communities and ' +
		'networked enterprises started to access and interoperate through ' +
		'the internet. Modeling these communities and their information ' +
		'needs is important for several web applications, like ' +
		'topic-driven web crawlers, web services, recommender systems, ' +
		'etc. The development of terminology extraction is essential to ' +
		'the language industry.' +
		'\n\n' +
		'One of the first steps to model the knowledge domain of a ' +
		'virtual community is to collect a vocabulary of domain-relevant ' +
		'terms, constituting the linguistic surface manifestation of ' +
		'domain concepts. Several methods to automatically extract ' +
		'technical terms from domain-specific document warehouses have ' +
		'been described in the literature.' +
		'\n\n' +
		'Typically, approaches to automatic term extraction make use of ' +
		'linguistic processors (part of speech tagging, phrase chunking) ' +
		'to extract terminological candidates, i.e. syntactically ' +
		'plausible terminological noun phrases, NPs (e.g. compounds ' +
		'"credit card", adjective-NPs "local tourist information office", ' +
		'and prepositional-NPs "board of directors" - in English, the ' +
		'first two constructs are the most frequent). Terminological ' +
		'entries are then filtered from the candidate list using ' +
		'statistical and machine learning methods. Once filtered, ' +
		'because of their low ambiguity and high specificity, these terms ' +
		'are particularly useful for conceptualizing a knowledge domain ' +
		'or for supporting the creation of a domain ontology. Furthermore, ' +
		'terminology extraction is a very useful starting point for ' +
		'semantic similarity, knowledge management, human translation ' +
		'and machine translation, etc.';

		/**
		* Tests.
		*/

		describe('keywords()', function () {
		it('should be a `function`', function () {
		assert(typeof keywords === 'function');
		});
		});

		it('should export an `attach` method', function () {
		describe('keywords.attach()', function () {
		it('should be a `function`', function () {
		assert(typeof keywords.attach === 'function');
		@@ -63,90 +91,114 @@ });

		describe('TextOM.Parent#keywords()', function () {
		it('should be of type `function`', function () {
		assert(typeof tree.TextOM.Parent.prototype.keywords === 'function');
		describe('TextOM.Parent#keywords(options?)', function () {
		it('should be a `function`', function () {
		assert(typeof TextOM.Parent.prototype.keywords === 'function');
		});

		it('should work', function () {
		var terms = tree.keywords();
		it('should work', function (done) {
		retext.parse(value, function (err, tree) {
		var terms;

		assert(terms[0].stem === 'terminolog');
		assert(terms[1].stem === 'term');
		assert(terms[2].stem === 'extract');
		assert(terms[3].stem === 'web');
		assert(terms[4].stem === 'domain');
		terms = tree.keywords();

		assert(terms[0].nodes.length === 7);
		assert(terms[1].nodes.length === 7);
		assert(terms[2].nodes.length === 7);
		assert(terms[3].nodes.length === 4);
		assert(terms[4].nodes.length === 4);
		assert(terms[0].stem === 'terminolog');
		assert(terms[1].stem === 'term');
		assert(terms[2].stem === 'extract');
		assert(terms[3].stem === 'web');
		assert(terms[4].stem === 'domain');

		assert(terms.length >= 5);
		});
		assert(terms[0].nodes.length === 7);
		assert(terms[1].nodes.length === 7);
		assert(terms[2].nodes.length === 7);
		assert(terms[3].nodes.length === 4);
		assert(terms[4].nodes.length === 4);

		it('should accept a `minimum` option', function () {
		var terms = tree.keywords({
		'minimum' : 7
		assert(terms.length >= 5);

		done(err);
		});
		});

		assert(terms[0].stem === 'terminolog');
		assert(terms[1].stem === 'term');
		assert(terms[2].stem === 'extract');
		assert(terms[3].stem === 'web');
		assert(terms[4].stem === 'domain');
		assert(terms[5].stem === 'inform');
		assert(terms[6].stem === 'commun');
		assert(terms[7].stem === 'knowledg');
		it('should accept a `minimum` option', function (done) {
		retext.parse(value, function (err, tree) {
		var terms;

		assert(terms[0].nodes.length === 7);
		assert(terms[1].nodes.length === 7);
		assert(terms[2].nodes.length === 7);
		assert(terms[3].nodes.length === 4);
		assert(terms[4].nodes.length === 4);
		assert(terms[5].nodes.length === 3);
		assert(terms[6].nodes.length === 3);
		assert(terms[7].nodes.length === 3);
		terms = tree.keywords({
		'minimum' : 7
		});

		assert(terms.length >= 7);
		assert(terms[0].stem === 'terminolog');
		assert(terms[1].stem === 'term');
		assert(terms[2].stem === 'extract');
		assert(terms[3].stem === 'web');
		assert(terms[4].stem === 'domain');
		assert(terms[5].stem === 'inform');
		assert(terms[6].stem === 'commun');
		assert(terms[7].stem === 'knowledg');

		assert(terms[0].nodes.length === 7);
		assert(terms[1].nodes.length === 7);
		assert(terms[2].nodes.length === 7);
		assert(terms[3].nodes.length === 4);
		assert(terms[4].nodes.length === 4);
		assert(terms[5].nodes.length === 3);
		assert(terms[6].nodes.length === 3);
		assert(terms[7].nodes.length === 3);

		assert(terms.length >= 7);

		done(err);
		});
		});
		});

		describe('TextOM.Parent#keyphrases()', function () {
		it('should be of type `function`', function () {
		assert(typeof tree.TextOM.Parent.prototype.keywords === 'function');
		describe('TextOM.Parent#keyphrases(options?)', function () {
		it('should be a `function`', function () {
		    /*
		    * This case sits in the `keyphrases` suite, so it must
		    * check `keyphrases`; asserting on `keywords` would pass
		    * even if `keyphrases` was never installed.
		    */
		    assert(typeof TextOM.Parent.prototype.keyphrases === 'function');
		});

		it('should work', function () {
		var phrases = tree.keyphrases();
		it('should work', function (done) {
		retext.parse(value, function (err, tree) {
		var phrases;

		assert(phrases[0].value === 'terminolog extract');
		assert(phrases[1].value === 'term');
		assert(phrases[2].value === 'term extract');
		assert(phrases[3].value === 'knowledg domain');
		assert(phrases[4].value === 'commun');
		phrases = tree.keyphrases();

		assert(phrases[0].nodes.length === 3);
		assert(phrases[1].nodes.length === 3);
		assert(phrases[2].nodes.length === 2);
		assert(phrases[3].nodes.length === 2);
		assert(phrases[4].nodes.length === 3);
		assert(phrases[0].value === 'terminolog extract');
		assert(phrases[1].value === 'term');
		assert(phrases[2].value === 'term extract');
		assert(phrases[3].value === 'knowledg domain');
		assert(phrases[4].value === 'commun');

		assert(phrases.length >= 5);
		});
		assert(phrases[0].nodes.length === 3);
		assert(phrases[1].nodes.length === 3);
		assert(phrases[2].nodes.length === 2);
		assert(phrases[3].nodes.length === 2);
		assert(phrases[4].nodes.length === 3);

		it('should accept a `minimum` option', function () {
		var phrases = tree.keyphrases({
		'minimum' : 3
		assert(phrases.length >= 5);

		done(err);
		});
		});

		assert(phrases[0].value === 'terminolog extract');
		assert(phrases[1].value === 'term');
		assert(phrases[2].value === 'term extract');
		it('should accept a `minimum` option', function (done) {
		retext.parse(value, function (err, tree) {
		var phrases;

		assert(phrases[0].nodes.length === 3);
		assert(phrases[1].nodes.length === 3);
		assert(phrases[2].nodes.length === 2);
		phrases = tree.keyphrases({
		'minimum' : 3
		});

		assert(phrases.length >= 3);
		assert(phrases[0].value === 'terminolog extract');
		assert(phrases[1].value === 'term');
		assert(phrases[2].value === 'term extract');

		assert(phrases[0].nodes.length === 3);
		assert(phrases[1].nodes.length === 3);
		assert(phrases[2].nodes.length === 2);

		assert(phrases.length >= 3);

		done(err);
		});
		});
		});

spec/index.html

History.md→History.md

.npmignore

Sorry, the diff of this file is not supported yet

retext-keywords - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics

Dependency changes