Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

retext-keywords

Package Overview
Dependencies
Maintainers
1
Versions
28
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

retext-keywords - npm Package Compare versions

Comparing version 0.0.1 to 0.1.0

99

benchmark/index.js
'use strict';
var Retext, retext, keywords, source,
sourceSmall, sourceMedium,
tiny, small, medium,
wordCount, sentenceCount, paragraphCount;
/**
* Dependencies.
*/
var Retext,
keywords;
Retext = require('retext');
keywords = require('..');
/* First paragraph on term extraction from Wikipedia:
* http://en.wikipedia.org/wiki/Terminology_extraction
/**
* Fixtures.
*
* First paragraph on term extraction from Wikipedia:
*
* http://en.wikipedia.org/wiki/Terminology_extraction
*/
source = 'Terminology mining, term extraction, term recognition, or ' +
var source,
sourceSmall,
sourceMedium;
source =
'Terminology mining, term extraction, term recognition, or ' +
'glossary extraction, is a subtask of information extraction. ' +

@@ -19,58 +31,53 @@ 'The goal of terminology extraction is to automatically extract ' +

/* Test data */
sourceSmall = Array(11).join(source);
sourceMedium = Array(11).join(sourceSmall);
/**
* Retext.
*/
var retext;
retext = new Retext().use(keywords);
tiny = retext.parse(source);
small = retext.parse(sourceSmall);
medium = retext.parse(sourceMedium);
/**
* Benchmarks.
*/
wordCount = sentenceCount = paragraphCount = 0;
suite('A big section (10 paragraphs)', function () {
var tree;
tiny.visitType(tiny.WORD_NODE, function () {
wordCount++;
});
before(function(next) {
retext.parse(sourceSmall, function (err, node) {
tree = node;
next();
});
});
tiny.visitType(tiny.SENTENCE_NODE, function () {
sentenceCount++;
});
bench('Finding keywords', function () {
tree.keywords();
});
tiny.visitType(tiny.PARAGRAPH_NODE, function () {
paragraphCount++;
bench('Finding keyphrases', function () {
tree.keyphrases();
});
});
if (wordCount !== 30) {
console.error('Word count should be 300!');
}
suite('A big article (100 paragraphs)', function () {
var tree;
if (sentenceCount !== 2) {
console.error('Sentence count should be 300!');
}
if (paragraphCount !== 1) {
console.error('Paragraph count should be 300!');
}
/* Benchmarks */
suite('Finding keywords in English', function () {
bench('small (10 paragraphs, 20 sentences, 300 words)', function () {
small.keywords();
before(function(next) {
retext.parse(sourceMedium, function (err, node) {
tree = node;
next();
});
});
bench('medium (100 paragraphs, 200 sentences, 3000 words)', function () {
medium.keywords();
bench('Finding keywords', function () {
tree.keywords();
});
});
/* Benchmarks */
suite('Finding keyphrases in English', function () {
bench('small (10 paragraphs, 20 sentences, 300 words)', function () {
small.keyphrases();
bench('Finding keyphrases', function () {
tree.keyphrases();
});
bench('medium (100 paragraphs, 200 sentences, 3000 words)', function () {
medium.keyphrases();
});
});
'use strict';
var pos = require('retext-pos'),
stemmer = require('retext-porter-stemmer'),
visit = require('retext-visit');
/**
* Module dependencies.
*/
exports = module.exports = function () {};
var pos,
stemmer,
visit;
pos = require('retext-pos');
stemmer = require('retext-porter-stemmer');
visit = require('retext-visit');
/**
* Constants.
*/
var has;
has = Object.prototype.hasOwnProperty;
/**
* Define `keywords`.
*/
function keywords() {} // Intentionally a no-op: the plugin's behaviour is installed via `keywords.attach` (exposed below).
/**
* Reverse sort: from 9 to 0.
*
* @param {number} a
* @param {number} b
*/
function reverseSort(a, b) {

@@ -13,17 +40,33 @@ return b - a;

/**
 * Linear interpolation: map `value` onto the range
 * between `min` and `max`.
 *
 * @param {number} value
 * @param {number} min
 * @param {number} max
 * @return {number}
 */
function interpolate(value, min, max) {
    return (max - min) * value + min;
}
/**
* Get the top results from an occurrence map.
*
* @param {Object.<string, Object>} results - Dictionary of
* stems mapping to objects containing `nodes`, `stem`,
* and `score` properties.
* @param {number} minimum - Minimum number of results to
* return.
* @return {Array.<Object>}
*/
function filterResults(results, minimum) {
var filteredResults = [],
matrix = {},
indices = [],
column, key, score, interpolatedScore, iterator, otherIterator,
var filteredResults,
matrix,
indices,
column,
key,
score,
interpolatedScore,
index,
otherIndex,
maxScore;
filteredResults = [];
indices = [];
matrix = {};
for (key in results) {
score = results[key].score;
if (!(score in matrix)) {
if (!has.call(matrix, score)) {
matrix[score] = [];

@@ -37,15 +80,16 @@ indices.push(score);

indices.sort(reverseSort);
maxScore = indices[0];
iterator = -1;
index = -1;
while (indices[++iterator]) {
score = indices[iterator];
while (indices[++index]) {
score = indices[index];
column = matrix[score];
interpolatedScore = interpolate(score / maxScore, 0, 1);
otherIterator = -1;
interpolatedScore = score / maxScore;
otherIndex = -1;
while (column[++otherIterator]) {
column[otherIterator].score = interpolatedScore;
while (column[++otherIndex]) {
column[otherIndex].score = interpolatedScore;
}

@@ -63,3 +107,10 @@

function isKeyWord(node) {
/**
* Get whether or not a `node` is important.
*
* @param {Node} node
* @return {boolean}
*/
function isImportant(node) {
return (

@@ -78,13 +129,22 @@ node &&

function getKeywords(node) {
var keywords = {};
/**
* Get most important words in `node`.
*
* @param {Node} node
* @return {Array.<Object>}
*/
function getImportantWords(node) {
var importantWords;
importantWords = {};
node.visitType(node.WORD_NODE, function (word) {
var stem;
if (isKeyWord(word)) {
if (isImportant(word)) {
stem = word.data.stem.toLowerCase();
if (!(stem in keywords)) {
keywords[stem] = {
if (!has.call(importantWords, stem)) {
importantWords[stem] = {
'nodes' : [word],

@@ -95,4 +155,4 @@ 'stem' : stem,

} else {
keywords[stem].nodes.push(word);
keywords[stem].score++;
importantWords[stem].nodes.push(word);
importantWords[stem].score++;
}

@@ -102,25 +162,47 @@ }

return keywords;
return importantWords;
}
/**
 * Get the top important words in `self`.
 *
 * @param {Object?} options
 * @param {number?} options.minimum - Minimum number of
 *   results to return (defaults to `5`).
 * @this {Node} node
 * @return {Array.<Object>}
 */
function getKeywords(options) {
    var minimum;

    /* Own-property check avoids picking up an inherited `minimum`. */
    minimum = options && has.call(options, 'minimum') ? options.minimum : 5;

    return filterResults(getImportantWords(this), minimum);
}
function findPhraseInDirection(node, property) {
var nodes = [], stems = [], words = [], queue = [];
/**
* Get following or preceding important words or white space.
*
* @param {Node} node
* @param {string} direction - either "prev" or "next".
* @return {Object}
*/
node = node[property];
function findPhraseInDirection(node, direction) {
var nodes,
stems,
words,
queue;
nodes = [];
stems = [];
words = [];
queue = [];
node = node[direction];
while (node) {
if (node.type === node.WHITE_SPACE_NODE) {
queue.push(node);
} else if (isKeyWord(node)) {
} else if (isImportant(node)) {
nodes = nodes.concat(queue, [node]);

@@ -134,3 +216,3 @@ words.push(node);

node = node[property];
node = node[direction];
}

@@ -145,6 +227,23 @@

/**
 * Merge a previous array, with a current value, and
 * a following array.
 *
 * Note: `prev` is reversed in place before merging,
 * as it arrives in reverse document order.
 *
 * @param {Array.<*>} prev
 * @param {*} current
 * @param {Array.<*>} next
 * @return {Array.<*>}
 */
function merge(prev, current, next) {
    return prev.reverse().concat([current], next);
}
/**
* Find the phrase surrounding a node.
*
* @param {Node} node
* @return {Object}
*/
function findPhrase(node) {

@@ -162,53 +261,91 @@ var prev = findPhraseInDirection(node, 'prev'),

/**
* Get the top important phrases in `self`.
*
* @param {Object?} options
* @param {number?} options.minimum
* @this {Node} node
* @return {Array.<Object>}
*/
function getKeyphrases(options) {
var simplePhrases = {},
initialWords = [],
simplePhrase, iterator, otherIterator, keywords, keyword, nodes,
phrase, stems, score;
var stemmedPhrases,
initialWords,
stemmedPhrase,
index,
otherIndex,
importantWords,
keyword,
nodes,
phrase,
stems,
minimum,
score;
if (!options) {
options = {};
}
stemmedPhrases = {};
initialWords = [];
keywords = getKeywords(this);
minimum = options && has.call(options, 'minimum') ? options.minimum : 5;
/* Iterate over all grouped keywords... */
for (keyword in keywords) {
nodes = keywords[keyword].nodes;
importantWords = getImportantWords(this);
iterator = -1;
/**
* Iterate over all grouped important words...
*/
/* Iterate over every occurrence of a certain keyword... */
while (nodes[++iterator]) {
/* Detect the phrase the node is in. */
phrase = findPhrase(nodes[iterator]);
for (keyword in importantWords) {
nodes = importantWords[keyword].nodes;
/* If we've already detected the same (simplified) phrase
* somewhere... */
if (phrase.value in simplePhrases) {
simplePhrase = simplePhrases[phrase.value];
index = -1;
/* Add weight per phrase to the score of the phrase. */
simplePhrase.score += simplePhrase.weight;
/**
 * Iterate over every occurrence of a certain keyword...
*/
/* If this is the first time we walk over the phrase (exact
* match, at another position), add it to the list of
* matching phrases. */
while (nodes[++index]) {
phrase = findPhrase(nodes[index]);
/**
* If we've detected the same stemmed
* phrase somewhere.
*/
if (has.call(stemmedPhrases, phrase.value)) {
stemmedPhrase = stemmedPhrases[phrase.value];
/**
* Add weight per phrase to the score of
* the phrase.
*/
stemmedPhrase.score += stemmedPhrase.weight;
/**
* If this is the first time we walk over
* the phrase (exact match but containing
* another important word), add it to the
* list of matching phrases.
*/
if (initialWords.indexOf(phrase.nodes[0]) === -1) {
initialWords.push(phrase.nodes[0]);
simplePhrase.nodes.push(phrase.nodes);
stemmedPhrase.nodes.push(phrase.nodes);
}
/* Otherwise... */
} else {
otherIterator = -1;
otherIndex = -1;
score = -1;
stems = phrase.stems;
initialWords.push(phrase.nodes[0]);
/* For every stem in phrase, add its score to score. */
while (stems[++otherIterator]) {
score += keywords[stems[otherIterator]].score;
/**
* For every stem in phrase, add its
* score to score.
*/
while (stems[++otherIndex]) {
score += importantWords[stems[otherIndex]].score;
}
simplePhrases[phrase.value] = {
stemmedPhrases[phrase.value] = {
'score' : score,

@@ -224,9 +361,12 @@ 'weight' : score,

/* Iterate over all grouped phrases... */
for (simplePhrase in simplePhrases) {
phrase = simplePhrases[simplePhrase];
for (stemmedPhrase in stemmedPhrases) {
phrase = stemmedPhrases[stemmedPhrase];
/* Modify its score to be the rounded result of multiplying it with
* the number of occurrences, and dividing it by the amount of words
* in the phrase. */
/**
* Modify its score to be the rounded result of
* multiplying it with the number of occurrences,
* and dividing it by the amount of words in the
* phrase.
*/
phrase.score = Math.round(

@@ -237,20 +377,42 @@ phrase.score * phrase.nodes.length / phrase.stems.length

return filterResults(
simplePhrases,
'minimum' in options ? options.minimum : 5
);
return filterResults(stemmedPhrases, minimum);
}
/**
* Define `attach`.
*
* @param {Retext}
*/
function attach(retext) {
var TextOM = retext.parser.TextOM;
var TextOM,
parentPrototype,
elementPrototype;
retext.use(stemmer).use(pos).use(visit);
TextOM = retext.TextOM;
parentPrototype = TextOM.Parent.prototype;
elementPrototype = TextOM.Element.prototype;
TextOM.Parent.prototype.keywords = TextOM.Element.prototype.keywords =
getFilteredKeywords;
retext
.use(stemmer)
.use(pos)
.use(visit);
TextOM.Parent.prototype.keyphrases = TextOM.Element.prototype.keyphrases =
getKeyphrases;
parentPrototype.keywords = getKeywords;
elementPrototype.keywords = getKeywords;
parentPrototype.keyphrases = getKeyphrases;
elementPrototype.keyphrases = getKeyphrases;
}
exports.attach = attach;
/**
* Expose `attach`.
*/
keywords.attach = attach;
/**
* Expose `keywords`.
*/
module.exports = keywords;
{
"name": "retext-keywords",
"version": "0.0.1",
"version": "0.1.0",
"description": "Keyword extraction with Retext",
"license": "MIT",
"keywords": [

@@ -13,21 +14,20 @@ "keyword",

],
"author": "Titus Wormer <tituswormer@gmail.com>",
"license": "MIT",
"dependencies": {
"retext-porter-stemmer": "^0.1.0",
"retext-pos": "^0.1.0",
"retext-visit": "^0.1.0"
"retext-porter-stemmer": "^0.1.1",
"retext-pos": "^0.1.3",
"retext-visit": "^0.1.1"
},
"repository": {
"type": "git",
"url": "https://github.com/wooorm/retext-keywords.git"
},
"author": "Titus Wormer <tituswormer@gmail.com>",
"devDependencies": {
"eslint": "^0.7.4",
"eslint": "^0.8.0",
"istanbul": "^0.3.0",
"jscs": "^1.5.4",
"jscs": "^1.5.0",
"matcha": "^0.5.0",
"mocha": "~1.20.1",
"retext": "^0.1.0-rc.4"
"mocha": "^1.21.0",
"retext": "^0.2.0-rc.2"
},
"repository": {
"type": "git",
"url": "https://github.com/wooorm/retext-keywords.git"
},
"scripts": {

@@ -42,23 +42,5 @@ "test": "node_modules/.bin/_mocha --reporter spec --check-leaks -u exports spec/retext-keywords.spec.js",

"coverage": "node_modules/.bin/istanbul cover node_modules/.bin/_mocha -- -- spec/retext-keywords.spec.js",
"install-browser-test": "npm install browserify",
"build-browser-test": "node_modules/.bin/browserify spec/retext-keywords.spec.js -o spec/browser.spec.js",
"benchmark": "node_modules/.bin/matcha",
"make": "npm run lint && npm run coverage"
},
"testling": {
"files": "spec/retext-keywords.spec.js",
"harness": "mocha",
"browsers": [
"iexplore/latest",
"chrome/latest",
"chrome/canary",
"firefox/latest",
"firefox/nightly",
"opera/latest",
"opera/next",
"safari/latest",
"iphone/latest",
"android-browser/latest"
]
}
}
# retext-keywords [![Build Status](https://travis-ci.org/wooorm/retext-keywords.svg?branch=master)](https://travis-ci.org/wooorm/retext-keywords) [![Coverage Status](https://img.shields.io/coveralls/wooorm/retext-keywords.svg)](https://coveralls.io/r/wooorm/retext-keywords?branch=master)
[![browser support](https://ci.testling.com/wooorm/retext-keywords.png) ](https://ci.testling.com/wooorm/retext-keywords)
---
Keyword extraction with **[Retext](https://github.com/wooorm/retext)**.

@@ -11,3 +7,3 @@

NPM:
npm:
```sh

@@ -17,7 +13,2 @@ $ npm install retext-keywords

Component.js:
```sh
$ component install wooorm/retext-keywords
```
## Usage

@@ -28,87 +19,84 @@

keywords = require('retext-keywords'),
root;
retext;
var root = new Retext()
.use(keywords)
.parse(
/* First three paragraphs on Term Extraction from Wikipedia:
* http://en.wikipedia.org/wiki/Terminology_extraction */
'Terminology mining, term extraction, term recognition, or ' +
'glossary extraction, is a subtask of information extraction. ' +
'The goal of terminology extraction is to automatically extract ' +
'relevant terms from a given corpus.' +
'\n\n' +
'In the semantic web era, a growing number of communities and ' +
'networked enterprises started to access and interoperate through ' +
'the internet. Modeling these communities and their information ' +
'needs is important for several web applications, like ' +
'topic-driven web crawlers, web services, recommender systems, ' +
'etc. The development of terminology extraction is essential to ' +
'the language industry.' +
'\n\n' +
'One of the first steps to model the knowledge domain of a ' +
'virtual community is to collect a vocabulary of domain-relevant ' +
'terms, constituting the linguistic surface manifestation of ' +
'domain concepts. Several methods to automatically extract ' +
'technical terms from domain-specific document warehouses have ' +
'been described in the literature.' +
'\n\n' +
'Typically, approaches to automatic term extraction make use of ' +
'linguistic processors (part of speech tagging, phrase chunking) ' +
'to extract terminological candidates, i.e. syntactically ' +
'plausible terminological noun phrases, NPs (e.g. compounds ' +
'"credit card", adjective-NPs "local tourist information office", ' +
'and prepositional-NPs "board of directors" - in English, the ' +
'first two constructs are the most frequent). Terminological ' +
'entries are then filtered from the candidate list using ' +
'statistical and machine learning methods. Once filtered, ' +
'because of their low ambiguity and high specificity, these terms ' +
'are particularly useful for conceptualizing a knowledge domain ' +
'or for supporting the creation of a domain ontology. Furthermore, ' +
'terminology extraction is a very useful starting point for ' +
'semantic similarity, knowledge management, human translation ' +
'and machine translation, etc.'
);
retext = new Retext().use(keywords);
root.keywords();
/*
* Array[5]
* ├─ 0: Object
* | ├─ stem: "terminolog"
* | ├─ score: 1
* | └─ nodes: Array[7]
* ├─ 1: Object
* | ├─ stem: "term"
* | ├─ score: 1
* | └─ nodes: Array[7]
* ├─ 2: Object
* | ├─ stem: "extract"
* | ├─ score: 1
* | └─ nodes: Array[7]
* ├─ 3: Object
* | ├─ stem: "web"
* | ├─ score: 0.5714285714285714
* | └─ nodes: Array[4]
* └─ 4: Object
* ├─ stem: "domain"
* ├─ score: 0.5714285714285714
* └─ nodes: Array[4]
*/
retext.parse(
/* First three paragraphs on Term Extraction from Wikipedia:
* http://en.wikipedia.org/wiki/Terminology_extraction */
'Terminology mining, term extraction, term recognition, or ' +
'glossary extraction, is a subtask of information extraction. ' +
'The goal of terminology extraction is to automatically extract ' +
'relevant terms from a given corpus.' +
'\n\n' +
'In the semantic web era, a growing number of communities and ' +
'networked enterprises started to access and interoperate through ' +
'the internet. Modeling these communities and their information ' +
'needs is important for several web applications, like ' +
'topic-driven web crawlers, web services, recommender systems, ' +
'etc. The development of terminology extraction is essential to ' +
'the language industry.' +
'\n\n' +
'One of the first steps to model the knowledge domain of a ' +
'virtual community is to collect a vocabulary of domain-relevant ' +
'terms, constituting the linguistic surface manifestation of ' +
'domain concepts. Several methods to automatically extract ' +
'technical terms from domain-specific document warehouses have ' +
'been described in the literature.' +
'\n\n' +
'Typically, approaches to automatic term extraction make use of ' +
'linguistic processors (part of speech tagging, phrase chunking) ' +
'to extract terminological candidates, i.e. syntactically ' +
'plausible terminological noun phrases, NPs (e.g. compounds ' +
'"credit card", adjective-NPs "local tourist information office", ' +
'and prepositional-NPs "board of directors" - in English, the ' +
'first two constructs are the most frequent). Terminological ' +
'entries are then filtered from the candidate list using ' +
'statistical and machine learning methods. Once filtered, ' +
'because of their low ambiguity and high specificity, these terms ' +
'are particularly useful for conceptualizing a knowledge domain ' +
'or for supporting the creation of a domain ontology. Furthermore, ' +
'terminology extraction is a very useful starting point for ' +
'semantic similarity, knowledge management, human translation ' +
'and machine translation, etc.',
function (err, tree) {
tree.keywords();
/**
* Array[5]
* ├─ 0: Object
* | ├─ stem: "terminolog"
* | ├─ score: 1
* | └─ nodes: Array[7]
* ├─ 1: Object
* | ├─ stem: "term"
* | ├─ score: 1
* | └─ nodes: Array[7]
* ├─ 2: Object
* | ├─ stem: "extract"
* | ├─ score: 1
* | └─ nodes: Array[7]
* ├─ 3: Object
* | ├─ stem: "web"
* | ├─ score: 0.5714285714285714
* | └─ nodes: Array[4]
* └─ 4: Object
* ├─ stem: "domain"
* ├─ score: 0.5714285714285714
* └─ nodes: Array[4]
*/
}
);
```
## API
retext-keywords depends on the following plugins:
- [retext-pos](https://github.com/wooorm/retext-pos) — for part-of-speech;
- [retext-porter-stemmer](https://github.com/wooorm/retext-porter-stemmer) — for stemming;
- [retext-visit](https://github.com/wooorm/retext-visit)
### Parent#keywords({minimum=5}?)
Extract keywords, based on how many times they (nouns) occur in text.
Extract keywords, based on the number of times they (nouns) occur in text.
```js
// **See above for an example, and output.**
/* See above for an example, and output. */
// Do not limit keyword-count.
root.keywords({'minimum' : Infinity});
/* To *not* limit keyword-count: */
tree.keywords({'minimum' : Infinity});
```

@@ -118,19 +106,16 @@

* minimum: Return at least (when possible) `minimum` keywords.
- minimum: Return at least (when possible) `minimum` keywords.
Results: An array, containing match-objects:
* stem: The stem of the word (using [retext-porter-stemmer](https://github.com/wooorm/retext-porter-stemmer/));
* score: A value between 0 and (including) 1. the first match always has a score of 1;
* nodes: An array containing all matched word nodes.
- stem: The stem of the word (see [retext-porter-stemmer](https://github.com/wooorm/retext-porter-stemmer/));
- score: A value between 0 and (including) 1. the first match has a score of 1;
- nodes: An array containing all matched word nodes.
### Parent#keyphrases({minimum=5}?)
Extract keywords, based on how many times they (nouns) occur in text.
Extract keyphrases, based on the number of times they (multiple nouns) occur in text.
```js
// Do not limit phrase-count.
root.keywords({'minimum' : Infinity});
// Default values:
root.keyphrases();
tree.keyphrases();
/*

@@ -167,2 +152,5 @@ * Array[6]

*/
/* To *not* limit phrase-count: */
tree.keyphrases({'minimum' : Infinity});
```

@@ -172,9 +160,9 @@

* minimum: Return at least (when possible) `minimum` phrases.
- minimum: Return at least (when possible) `minimum` phrases.
Results: An array, containing match-objects:
* stems: An array containing the stems of all matched word nodes inside the phrase(s);
* score: A value between 0 and (including) 1. the first match always has a score of 1;
* nodes: An matrix containing array-phrases, each in turn containing word nodes.
- stems: An array containing the stems of all matched word nodes inside the phrase(s);
- score: A value between 0 and (including) 1. the first match has a score of 1;
- nodes: An array containing array-phrases, each containing word nodes.

@@ -189,12 +177,12 @@ ## Benchmark

On a MacBook Air, `keywords()` runs about 3,041 op/s on a section / small article.
On a MacBook Air, `keywords()` runs about 3,784 op/s on a big section / small article.
```
Finding keywords in English
3,041 op/s » small (10 paragraphs, 20 sentences, 300 words)
349 op/s » medium (100 paragraphs, 200 sentences, 3000 words)
A big section (10 paragraphs)
3,784 op/s » Finding keywords
788 op/s » Finding keyphrases
Finding keyphrases in English
738 op/s » small (10 paragraphs, 20 sentences, 300 words)
47 op/s » medium (100 paragraphs, 200 sentences, 3000 words)
A big article (100 paragraphs)
401 op/s » Finding keywords
48 op/s » Finding keyphrases
```

@@ -204,2 +192,2 @@

MIT
MIT © Titus Wormer
'use strict';
var keywords, Retext, assert, tree;
/**
* Module dependencies.
*/
var keywords,
Retext,
assert;
keywords = require('..');

@@ -9,51 +15,73 @@ Retext = require('retext');

tree = new Retext()
.use(keywords)
.parse(
/* First three paragraphs on term extraction from Wikipedia:
* http://en.wikipedia.org/wiki/Terminology_extraction
*/
'Terminology mining, term extraction, term recognition, or ' +
'glossary extraction, is a subtask of information extraction. ' +
'The goal of terminology extraction is to automatically extract ' +
'relevant terms from a given corpus.' +
'\n\n' +
'In the semantic web era, a growing number of communities and ' +
'networked enterprises started to access and interoperate through ' +
'the internet. Modeling these communities and their information ' +
'needs is important for several web applications, like ' +
'topic-driven web crawlers, web services, recommender systems, ' +
'etc. The development of terminology extraction is essential to ' +
'the language industry.' +
'\n\n' +
'One of the first steps to model the knowledge domain of a ' +
'virtual community is to collect a vocabulary of domain-relevant ' +
'terms, constituting the linguistic surface manifestation of ' +
'domain concepts. Several methods to automatically extract ' +
'technical terms from domain-specific document warehouses have ' +
'been described in the literature.' +
'\n\n' +
'Typically, approaches to automatic term extraction make use of ' +
'linguistic processors (part of speech tagging, phrase chunking) ' +
'to extract terminological candidates, i.e. syntactically ' +
'plausible terminological noun phrases, NPs (e.g. compounds ' +
'"credit card", adjective-NPs "local tourist information office", ' +
'and prepositional-NPs "board of directors" - in English, the ' +
'first two constructs are the most frequent). Terminological ' +
'entries are then filtered from the candidate list using ' +
'statistical and machine learning methods. Once filtered, ' +
'because of their low ambiguity and high specificity, these terms ' +
'are particularly useful for conceptualizing a knowledge domain ' +
'or for supporting the creation of a domain ontology. Furthermore, ' +
'terminology extraction is a very useful starting point for ' +
'semantic similarity, knowledge management, human translation ' +
'and machine translation, etc.'
);
/**
* Retext.
*/
describe('retext-keywords()', function () {
it('should be of type `function`', function () {
var retext,
TextOM;
retext = new Retext().use(keywords);
TextOM = retext.TextOM;
/**
* Value.
*
* First three paragraphs on term extraction from
* Wikipedia:
*
* http://en.wikipedia.org/wiki/Terminology_extraction
*/
var value;
value =
'Terminology mining, term extraction, term recognition, or ' +
'glossary extraction, is a subtask of information extraction. ' +
'The goal of terminology extraction is to automatically extract ' +
'relevant terms from a given corpus.' +
'\n\n' +
'In the semantic web era, a growing number of communities and ' +
'networked enterprises started to access and interoperate through ' +
'the internet. Modeling these communities and their information ' +
'needs is important for several web applications, like ' +
'topic-driven web crawlers, web services, recommender systems, ' +
'etc. The development of terminology extraction is essential to ' +
'the language industry.' +
'\n\n' +
'One of the first steps to model the knowledge domain of a ' +
'virtual community is to collect a vocabulary of domain-relevant ' +
'terms, constituting the linguistic surface manifestation of ' +
'domain concepts. Several methods to automatically extract ' +
'technical terms from domain-specific document warehouses have ' +
'been described in the literature.' +
'\n\n' +
'Typically, approaches to automatic term extraction make use of ' +
'linguistic processors (part of speech tagging, phrase chunking) ' +
'to extract terminological candidates, i.e. syntactically ' +
'plausible terminological noun phrases, NPs (e.g. compounds ' +
'"credit card", adjective-NPs "local tourist information office", ' +
'and prepositional-NPs "board of directors" - in English, the ' +
'first two constructs are the most frequent). Terminological ' +
'entries are then filtered from the candidate list using ' +
'statistical and machine learning methods. Once filtered, ' +
'because of their low ambiguity and high specificity, these terms ' +
'are particularly useful for conceptualizing a knowledge domain ' +
'or for supporting the creation of a domain ontology. Furthermore, ' +
'terminology extraction is a very useful starting point for ' +
'semantic similarity, knowledge management, human translation ' +
'and machine translation, etc.';
/**
* Tests.
*/
describe('keywords()', function () {
it('should be a `function`', function () {
assert(typeof keywords === 'function');
});
});
it('should export an `attach` method', function () {
describe('keywords.attach()', function () {
it('should be a `function`', function () {
assert(typeof keywords.attach === 'function');

@@ -63,90 +91,114 @@ });

describe('TextOM.Parent#keywords()', function () {
it('should be of type `function`', function () {
assert(typeof tree.TextOM.Parent.prototype.keywords === 'function');
describe('TextOM.Parent#keywords(options?)', function () {
it('should be a `function`', function () {
assert(typeof TextOM.Parent.prototype.keywords === 'function');
});
it('should work', function () {
var terms = tree.keywords();
it('should work', function (done) {
retext.parse(value, function (err, tree) {
var terms;
assert(terms[0].stem === 'terminolog');
assert(terms[1].stem === 'term');
assert(terms[2].stem === 'extract');
assert(terms[3].stem === 'web');
assert(terms[4].stem === 'domain');
terms = tree.keywords();
assert(terms[0].nodes.length === 7);
assert(terms[1].nodes.length === 7);
assert(terms[2].nodes.length === 7);
assert(terms[3].nodes.length === 4);
assert(terms[4].nodes.length === 4);
assert(terms[0].stem === 'terminolog');
assert(terms[1].stem === 'term');
assert(terms[2].stem === 'extract');
assert(terms[3].stem === 'web');
assert(terms[4].stem === 'domain');
assert(terms.length >= 5);
});
assert(terms[0].nodes.length === 7);
assert(terms[1].nodes.length === 7);
assert(terms[2].nodes.length === 7);
assert(terms[3].nodes.length === 4);
assert(terms[4].nodes.length === 4);
it('should accept a `minimum` option', function () {
var terms = tree.keywords({
'minimum' : 7
assert(terms.length >= 5);
done(err);
});
});
assert(terms[0].stem === 'terminolog');
assert(terms[1].stem === 'term');
assert(terms[2].stem === 'extract');
assert(terms[3].stem === 'web');
assert(terms[4].stem === 'domain');
assert(terms[5].stem === 'inform');
assert(terms[6].stem === 'commun');
assert(terms[7].stem === 'knowledg');
it('should accept a `minimum` option', function (done) {
retext.parse(value, function (err, tree) {
var terms;
assert(terms[0].nodes.length === 7);
assert(terms[1].nodes.length === 7);
assert(terms[2].nodes.length === 7);
assert(terms[3].nodes.length === 4);
assert(terms[4].nodes.length === 4);
assert(terms[5].nodes.length === 3);
assert(terms[6].nodes.length === 3);
assert(terms[7].nodes.length === 3);
terms = tree.keywords({
'minimum' : 7
});
assert(terms.length >= 7);
assert(terms[0].stem === 'terminolog');
assert(terms[1].stem === 'term');
assert(terms[2].stem === 'extract');
assert(terms[3].stem === 'web');
assert(terms[4].stem === 'domain');
assert(terms[5].stem === 'inform');
assert(terms[6].stem === 'commun');
assert(terms[7].stem === 'knowledg');
assert(terms[0].nodes.length === 7);
assert(terms[1].nodes.length === 7);
assert(terms[2].nodes.length === 7);
assert(terms[3].nodes.length === 4);
assert(terms[4].nodes.length === 4);
assert(terms[5].nodes.length === 3);
assert(terms[6].nodes.length === 3);
assert(terms[7].nodes.length === 3);
assert(terms.length >= 7);
done(err);
});
});
});
describe('TextOM.Parent#keyphrases()', function () {
it('should be of type `function`', function () {
assert(typeof tree.TextOM.Parent.prototype.keywords === 'function');
describe('TextOM.Parent#keyphrases(options?)', function () {
it('should be a `function`', function () {
assert(typeof TextOM.Parent.prototype.keywords === 'function');
});
it('should work', function () {
var phrases = tree.keyphrases();
it('should work', function (done) {
retext.parse(value, function (err, tree) {
var phrases;
assert(phrases[0].value === 'terminolog extract');
assert(phrases[1].value === 'term');
assert(phrases[2].value === 'term extract');
assert(phrases[3].value === 'knowledg domain');
assert(phrases[4].value === 'commun');
phrases = tree.keyphrases();
assert(phrases[0].nodes.length === 3);
assert(phrases[1].nodes.length === 3);
assert(phrases[2].nodes.length === 2);
assert(phrases[3].nodes.length === 2);
assert(phrases[4].nodes.length === 3);
assert(phrases[0].value === 'terminolog extract');
assert(phrases[1].value === 'term');
assert(phrases[2].value === 'term extract');
assert(phrases[3].value === 'knowledg domain');
assert(phrases[4].value === 'commun');
assert(phrases.length >= 5);
});
assert(phrases[0].nodes.length === 3);
assert(phrases[1].nodes.length === 3);
assert(phrases[2].nodes.length === 2);
assert(phrases[3].nodes.length === 2);
assert(phrases[4].nodes.length === 3);
it('should accept a `minimum` option', function () {
var phrases = tree.keyphrases({
'minimum' : 3
assert(phrases.length >= 5);
done(err);
});
});
assert(phrases[0].value === 'terminolog extract');
assert(phrases[1].value === 'term');
assert(phrases[2].value === 'term extract');
it('should accept a `minimum` option', function (done) {
retext.parse(value, function (err, tree) {
var phrases;
assert(phrases[0].nodes.length === 3);
assert(phrases[1].nodes.length === 3);
assert(phrases[2].nodes.length === 2);
phrases = tree.keyphrases({
'minimum' : 3
});
assert(phrases.length >= 3);
assert(phrases[0].value === 'terminolog extract');
assert(phrases[1].value === 'term');
assert(phrases[2].value === 'term extract');
assert(phrases[0].nodes.length === 3);
assert(phrases[1].nodes.length === 3);
assert(phrases[2].nodes.length === 2);
assert(phrases.length >= 3);
done(err);
});
});
});

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc