retext-keywords
Advanced tools
Comparing version 0.0.1 to 0.1.0
'use strict'; | ||
var Retext, retext, keywords, source, | ||
sourceSmall, sourceMedium, | ||
tiny, small, medium, | ||
wordCount, sentenceCount, paragraphCount; | ||
/** | ||
* Dependencies. | ||
*/ | ||
var Retext, | ||
keywords; | ||
Retext = require('retext'); | ||
keywords = require('..'); | ||
/* First paragraph on term extraction from Wikipedia: | ||
* http://en.wikipedia.org/wiki/Terminology_extraction | ||
/** | ||
* Fixtures. | ||
* | ||
* First paragraph on term extraction from Wikipedia: | ||
* | ||
* http://en.wikipedia.org/wiki/Terminology_extraction | ||
*/ | ||
source = 'Terminology mining, term extraction, term recognition, or ' + | ||
var source, | ||
sourceSmall, | ||
sourceMedium; | ||
source = | ||
'Terminology mining, term extraction, term recognition, or ' + | ||
'glossary extraction, is a subtask of information extraction. ' + | ||
@@ -19,58 +31,53 @@ 'The goal of terminology extraction is to automatically extract ' + | ||
/* Test data */ | ||
sourceSmall = Array(11).join(source); | ||
sourceMedium = Array(11).join(sourceSmall); | ||
/** | ||
* Retext. | ||
*/ | ||
var retext; | ||
retext = new Retext().use(keywords); | ||
tiny = retext.parse(source); | ||
small = retext.parse(sourceSmall); | ||
medium = retext.parse(sourceMedium); | ||
/** | ||
* Benchmarks. | ||
*/ | ||
wordCount = sentenceCount = paragraphCount = 0; | ||
suite('A big section (10 paragraphs)', function () { | ||
var tree; | ||
tiny.visitType(tiny.WORD_NODE, function () { | ||
wordCount++; | ||
}); | ||
before(function(next) { | ||
retext.parse(sourceSmall, function (err, node) { | ||
tree = node; | ||
next(); | ||
}); | ||
}); | ||
tiny.visitType(tiny.SENTENCE_NODE, function () { | ||
sentenceCount++; | ||
}); | ||
bench('Finding keywords', function () { | ||
tree.keywords(); | ||
}); | ||
tiny.visitType(tiny.PARAGRAPH_NODE, function () { | ||
paragraphCount++; | ||
bench('Finding keyphrases', function () { | ||
tree.keyphrases(); | ||
}); | ||
}); | ||
if (wordCount !== 30) { | ||
console.error('Word count should be 300!'); | ||
} | ||
suite('A big article (100 paragraphs)', function () { | ||
var tree; | ||
if (sentenceCount !== 2) { | ||
console.error('Sentence count should be 300!'); | ||
} | ||
if (paragraphCount !== 1) { | ||
console.error('Paragraph count should be 300!'); | ||
} | ||
/* Benchmarks */ | ||
suite('Finding keywords in English', function () { | ||
bench('small (10 paragraphs, 20 sentences, 300 words)', function () { | ||
small.keywords(); | ||
before(function(next) { | ||
retext.parse(sourceMedium, function (err, node) { | ||
tree = node; | ||
next(); | ||
}); | ||
}); | ||
bench('medium (100 paragraphs, 200 sentences, 3000 words)', function () { | ||
medium.keywords(); | ||
bench('Finding keywords', function () { | ||
tree.keywords(); | ||
}); | ||
}); | ||
/* Benchmarks */ | ||
suite('Finding keyphrases in English', function () { | ||
bench('small (10 paragraphs, 20 sentences, 300 words)', function () { | ||
small.keyphrases(); | ||
bench('Finding keyphrases', function () { | ||
tree.keyphrases(); | ||
}); | ||
bench('medium (100 paragraphs, 200 sentences, 3000 words)', function () { | ||
medium.keyphrases(); | ||
}); | ||
}); |
346
index.js
'use strict'; | ||
var pos = require('retext-pos'), | ||
stemmer = require('retext-porter-stemmer'), | ||
visit = require('retext-visit'); | ||
/** | ||
* Module dependencies. | ||
*/ | ||
exports = module.exports = function () {}; | ||
var pos, | ||
stemmer, | ||
visit; | ||
pos = require('retext-pos'); | ||
stemmer = require('retext-porter-stemmer'); | ||
visit = require('retext-visit'); | ||
/** | ||
* Constants. | ||
*/ | ||
var has; | ||
has = Object.prototype.hasOwnProperty; | ||
/** | ||
* Define `keywords`. | ||
*/ | ||
function keywords() {} | ||
/** | ||
* Reverse sort: from 9 to 0. | ||
* | ||
* @param {number} a | ||
* @param {number} b | ||
*/ | ||
function reverseSort(a, b) { | ||
@@ -13,17 +40,33 @@ return b - a; | ||
function interpolate(value, min, max) { | ||
return min + value * (max - min); | ||
} | ||
/** | ||
* Get the top results from an occurrence map. | ||
* | ||
* @param {Object.<string, Object>} results - Dictionary of | ||
* stems mapping to objects containing `nodes`, `stem`, | ||
* and `score` properties. | ||
* @param {number} minimum - Minimum number of results to | ||
* return. | ||
* @return {Array.<Object>} | ||
*/ | ||
function filterResults(results, minimum) { | ||
var filteredResults = [], | ||
matrix = {}, | ||
indices = [], | ||
column, key, score, interpolatedScore, iterator, otherIterator, | ||
var filteredResults, | ||
matrix, | ||
indices, | ||
column, | ||
key, | ||
score, | ||
interpolatedScore, | ||
index, | ||
otherIndex, | ||
maxScore; | ||
filteredResults = []; | ||
indices = []; | ||
matrix = {}; | ||
for (key in results) { | ||
score = results[key].score; | ||
if (!(score in matrix)) { | ||
if (!has.call(matrix, score)) { | ||
matrix[score] = []; | ||
@@ -37,15 +80,16 @@ indices.push(score); | ||
indices.sort(reverseSort); | ||
maxScore = indices[0]; | ||
iterator = -1; | ||
index = -1; | ||
while (indices[++iterator]) { | ||
score = indices[iterator]; | ||
while (indices[++index]) { | ||
score = indices[index]; | ||
column = matrix[score]; | ||
interpolatedScore = interpolate(score / maxScore, 0, 1); | ||
otherIterator = -1; | ||
interpolatedScore = score / maxScore; | ||
otherIndex = -1; | ||
while (column[++otherIterator]) { | ||
column[otherIterator].score = interpolatedScore; | ||
while (column[++otherIndex]) { | ||
column[otherIndex].score = interpolatedScore; | ||
} | ||
@@ -63,3 +107,10 @@ | ||
function isKeyWord(node) { | ||
/** | ||
* Get whether or not a `node` is important. | ||
* | ||
* @param {Node} node | ||
* @return {boolean} | ||
*/ | ||
function isImportant(node) { | ||
return ( | ||
@@ -78,13 +129,22 @@ node && | ||
function getKeywords(node) { | ||
var keywords = {}; | ||
/** | ||
* Get most important words in `node`. | ||
* | ||
* @param {Node} node | ||
* @return {Array.<Object>} | ||
*/ | ||
function getImportantWords(node) { | ||
var importantWords; | ||
importantWords = {}; | ||
node.visitType(node.WORD_NODE, function (word) { | ||
var stem; | ||
if (isKeyWord(word)) { | ||
if (isImportant(word)) { | ||
stem = word.data.stem.toLowerCase(); | ||
if (!(stem in keywords)) { | ||
keywords[stem] = { | ||
if (!has.call(importantWords, stem)) { | ||
importantWords[stem] = { | ||
'nodes' : [word], | ||
@@ -95,4 +155,4 @@ 'stem' : stem, | ||
} else { | ||
keywords[stem].nodes.push(word); | ||
keywords[stem].score++; | ||
importantWords[stem].nodes.push(word); | ||
importantWords[stem].score++; | ||
} | ||
@@ -102,25 +162,47 @@ } | ||
return keywords; | ||
return importantWords; | ||
} | ||
function getFilteredKeywords(options) { | ||
if (!options) { | ||
options = {}; | ||
} | ||
/** | ||
* Get the top important words in `self`. | ||
* | ||
* @param {Object?} options | ||
* @param {number?} options.minimum | ||
* @this {Node} node | ||
* @return {Array.<Object>} | ||
*/ | ||
return filterResults( | ||
getKeywords(this), | ||
'minimum' in options ? options.minimum : 5 | ||
); | ||
function getKeywords(options) { | ||
var minimum; | ||
minimum = options && has.call(options, 'minimum') ? options.minimum : 5; | ||
return filterResults(getImportantWords(this), minimum); | ||
} | ||
function findPhraseInDirection(node, property) { | ||
var nodes = [], stems = [], words = [], queue = []; | ||
/** | ||
* Get following or preceding important words or white space. | ||
* | ||
* @param {Node} node | ||
* @param {string} direction - either "prev" or "next". | ||
* @return {Object} | ||
*/ | ||
node = node[property]; | ||
function findPhraseInDirection(node, direction) { | ||
var nodes, | ||
stems, | ||
words, | ||
queue; | ||
nodes = []; | ||
stems = []; | ||
words = []; | ||
queue = []; | ||
node = node[direction]; | ||
while (node) { | ||
if (node.type === node.WHITE_SPACE_NODE) { | ||
queue.push(node); | ||
} else if (isKeyWord(node)) { | ||
} else if (isImportant(node)) { | ||
nodes = nodes.concat(queue, [node]); | ||
@@ -134,3 +216,3 @@ words.push(node); | ||
node = node[property]; | ||
node = node[direction]; | ||
} | ||
@@ -145,6 +227,23 @@ | ||
function merge(prev, value, next) { | ||
return prev.reverse().concat([value], next); | ||
/**
 * Merge a previous array, with a current value, and
 * a following array.
 *
 * `prev` is expected in "walking backwards" order (closest
 * item first), so it is reversed before being joined. The
 * reversal is done on a copy: `Array#reverse` works in
 * place and would otherwise reorder the caller's array.
 *
 * @param {Array.<*>} prev - Items preceding `current`,
 *   nearest first. Not modified.
 * @param {*} current - The value placed between both arrays.
 * @param {Array.<*>} next - Items following `current`.
 * @return {Array.<*>} A new array: reversed `prev`, then
 *   `current`, then `next`.
 */

function merge(prev, current, next) {
    return prev.slice().reverse().concat([current], next);
}
/** | ||
* Find the phrase surrounding a node. | ||
* | ||
* @param {Node} node | ||
* @return {Object} | ||
*/ | ||
function findPhrase(node) { | ||
@@ -162,53 +261,91 @@ var prev = findPhraseInDirection(node, 'prev'), | ||
/** | ||
* Get the top important phrases in `self`. | ||
* | ||
* @param {Object?} options | ||
* @param {number?} options.minimum | ||
* @this {Node} node | ||
* @return {Array.<Object>} | ||
*/ | ||
function getKeyphrases(options) { | ||
var simplePhrases = {}, | ||
initialWords = [], | ||
simplePhrase, iterator, otherIterator, keywords, keyword, nodes, | ||
phrase, stems, score; | ||
var stemmedPhrases, | ||
initialWords, | ||
stemmedPhrase, | ||
index, | ||
otherIndex, | ||
importantWords, | ||
keyword, | ||
nodes, | ||
phrase, | ||
stems, | ||
minimum, | ||
score; | ||
if (!options) { | ||
options = {}; | ||
} | ||
stemmedPhrases = {}; | ||
initialWords = []; | ||
keywords = getKeywords(this); | ||
minimum = options && has.call(options, 'minimum') ? options.minimum : 5; | ||
/* Iterate over all grouped keywords... */ | ||
for (keyword in keywords) { | ||
nodes = keywords[keyword].nodes; | ||
importantWords = getImportantWords(this); | ||
iterator = -1; | ||
/** | ||
* Iterate over all grouped important words... | ||
*/ | ||
/* Iterate over every occurence of a certain keyword... */ | ||
while (nodes[++iterator]) { | ||
/* Detect the phrase the node is in. */ | ||
phrase = findPhrase(nodes[iterator]); | ||
for (keyword in importantWords) { | ||
nodes = importantWords[keyword].nodes; | ||
/* If we've already detected the same (simplified) phrase | ||
* somewhere... */ | ||
if (phrase.value in simplePhrases) { | ||
simplePhrase = simplePhrases[phrase.value]; | ||
index = -1; | ||
/* Add weight per phrase to the score of the phrase. */ | ||
simplePhrase.score += simplePhrase.weight; | ||
/** | ||
* Iterate over every occurrence of a certain keyword... | ||
*/ | ||
/* If this is the first time we walk over the phrase (exact | ||
* match, at another position), add it to the list of | ||
* matching phrases. */ | ||
while (nodes[++index]) { | ||
phrase = findPhrase(nodes[index]); | ||
/** | ||
* If we've detected the same stemmed | ||
* phrase somewhere. | ||
*/ | ||
if (has.call(stemmedPhrases, phrase.value)) { | ||
stemmedPhrase = stemmedPhrases[phrase.value]; | ||
/** | ||
* Add weight per phrase to the score of | ||
* the phrase. | ||
*/ | ||
stemmedPhrase.score += stemmedPhrase.weight; | ||
/** | ||
* If this is the first time we walk over | ||
* the phrase (exact match but containing | ||
* another important word), add it to the | ||
* list of matching phrases. | ||
*/ | ||
if (initialWords.indexOf(phrase.nodes[0]) === -1) { | ||
initialWords.push(phrase.nodes[0]); | ||
simplePhrase.nodes.push(phrase.nodes); | ||
stemmedPhrase.nodes.push(phrase.nodes); | ||
} | ||
/* Otherwise... */ | ||
} else { | ||
otherIterator = -1; | ||
otherIndex = -1; | ||
score = -1; | ||
stems = phrase.stems; | ||
initialWords.push(phrase.nodes[0]); | ||
/* For every stem in phrase, add its score to score. */ | ||
while (stems[++otherIterator]) { | ||
score += keywords[stems[otherIterator]].score; | ||
/** | ||
* For every stem in phrase, add its | ||
* score to score. | ||
*/ | ||
while (stems[++otherIndex]) { | ||
score += importantWords[stems[otherIndex]].score; | ||
} | ||
simplePhrases[phrase.value] = { | ||
stemmedPhrases[phrase.value] = { | ||
'score' : score, | ||
@@ -224,9 +361,12 @@ 'weight' : score, | ||
/* Iterate over all grouped phrases... */ | ||
for (simplePhrase in simplePhrases) { | ||
phrase = simplePhrases[simplePhrase]; | ||
for (stemmedPhrase in stemmedPhrases) { | ||
phrase = stemmedPhrases[stemmedPhrase]; | ||
/* Modify its score to be the rounded result of multiplying it with | ||
* the number of occurances, and dividing it by the ammount of words | ||
* in the phrase. */ | ||
/** | ||
* Modify its score to be the rounded result of | ||
* multiplying it with the number of occurrences, | ||
* and dividing it by the amount of words in the | ||
* phrase. | ||
*/ | ||
phrase.score = Math.round( | ||
@@ -237,20 +377,42 @@ phrase.score * phrase.nodes.length / phrase.stems.length | ||
return filterResults( | ||
simplePhrases, | ||
'minimum' in options ? options.minimum : 5 | ||
); | ||
return filterResults(stemmedPhrases, minimum); | ||
} | ||
/** | ||
* Define `attach`. | ||
* | ||
* @param {Retext} retext | ||
*/ | ||
function attach(retext) { | ||
var TextOM = retext.parser.TextOM; | ||
var TextOM, | ||
parentPrototype, | ||
elementPrototype; | ||
retext.use(stemmer).use(pos).use(visit); | ||
TextOM = retext.TextOM; | ||
parentPrototype = TextOM.Parent.prototype; | ||
elementPrototype = TextOM.Element.prototype; | ||
TextOM.Parent.prototype.keywords = TextOM.Element.prototype.keywords = | ||
getFilteredKeywords; | ||
retext | ||
.use(stemmer) | ||
.use(pos) | ||
.use(visit); | ||
TextOM.Parent.prototype.keyphrases = TextOM.Element.prototype.keyphrases = | ||
getKeyphrases; | ||
parentPrototype.keywords = getKeywords; | ||
elementPrototype.keywords = getKeywords; | ||
parentPrototype.keyphrases = getKeyphrases; | ||
elementPrototype.keyphrases = getKeyphrases; | ||
} | ||
exports.attach = attach; | ||
/** | ||
* Expose `attach`. | ||
*/ | ||
keywords.attach = attach; | ||
/** | ||
* Expose `keywords`. | ||
*/ | ||
module.exports = keywords; |
{ | ||
"name": "retext-keywords", | ||
"version": "0.0.1", | ||
"version": "0.1.0", | ||
"description": "Keyword extraction with Retext", | ||
"license": "MIT", | ||
"keywords": [ | ||
@@ -13,21 +14,20 @@ "keyword", | ||
], | ||
"author": "Titus Wormer <tituswormer@gmail.com>", | ||
"license": "MIT", | ||
"dependencies": { | ||
"retext-porter-stemmer": "^0.1.0", | ||
"retext-pos": "^0.1.0", | ||
"retext-visit": "^0.1.0" | ||
"retext-porter-stemmer": "^0.1.1", | ||
"retext-pos": "^0.1.3", | ||
"retext-visit": "^0.1.1" | ||
}, | ||
"repository": { | ||
"type": "git", | ||
"url": "https://github.com/wooorm/retext-keywords.git" | ||
}, | ||
"author": "Titus Wormer <tituswormer@gmail.com>", | ||
"devDependencies": { | ||
"eslint": "^0.7.4", | ||
"eslint": "^0.8.0", | ||
"istanbul": "^0.3.0", | ||
"jscs": "^1.5.4", | ||
"jscs": "^1.5.0", | ||
"matcha": "^0.5.0", | ||
"mocha": "~1.20.1", | ||
"retext": "^0.1.0-rc.4" | ||
"mocha": "^1.21.0", | ||
"retext": "^0.2.0-rc.2" | ||
}, | ||
"repository": { | ||
"type": "git", | ||
"url": "https://github.com/wooorm/retext-keywords.git" | ||
}, | ||
"scripts": { | ||
@@ -42,23 +42,5 @@ "test": "node_modules/.bin/_mocha --reporter spec --check-leaks -u exports spec/retext-keywords.spec.js", | ||
"coverage": "node_modules/.bin/istanbul cover node_modules/.bin/_mocha -- -- spec/retext-keywords.spec.js", | ||
"install-browser-test": "npm install browserify", | ||
"build-browser-test": "node_modules/.bin/browserify spec/retext-keywords.spec.js -o spec/browser.spec.js", | ||
"benchmark": "node_modules/.bin/matcha", | ||
"make": "npm run lint && npm run coverage" | ||
}, | ||
"testling": { | ||
"files": "spec/retext-keywords.spec.js", | ||
"harness": "mocha", | ||
"browsers": [ | ||
"iexplore/latest", | ||
"chrome/latest", | ||
"chrome/canary", | ||
"firefox/latest", | ||
"firefox/nightly", | ||
"opera/latest", | ||
"opera/next", | ||
"safari/latest", | ||
"iphone/latest", | ||
"android-browser/latest" | ||
] | ||
} | ||
} |
202
Readme.md
# retext-keywords [![Build Status](https://travis-ci.org/wooorm/retext-keywords.svg?branch=master)](https://travis-ci.org/wooorm/retext-keywords) [![Coverage Status](https://img.shields.io/coveralls/wooorm/retext-keywords.svg)](https://coveralls.io/r/wooorm/retext-keywords?branch=master) | ||
[![browser support](https://ci.testling.com/wooorm/retext-keywords.png) ](https://ci.testling.com/wooorm/retext-keywords) | ||
--- | ||
Keyword extraction with **[Retext](https://github.com/wooorm/retext)**. | ||
@@ -11,3 +7,3 @@ | ||
NPM: | ||
npm: | ||
```sh | ||
@@ -17,7 +13,2 @@ $ npm install retext-keywords | ||
Component.js: | ||
```sh | ||
$ component install wooorm/retext-keywords | ||
``` | ||
## Usage | ||
@@ -28,87 +19,84 @@ | ||
keywords = require('retext-keywords'), | ||
root; | ||
retext; | ||
var root = new Retext() | ||
.use(keywords) | ||
.parse( | ||
/* First three paragraphs on Term Extraction from Wikipedia: | ||
* http://en.wikipedia.org/wiki/Terminology_extraction */ | ||
'Terminology mining, term extraction, term recognition, or ' + | ||
'glossary extraction, is a subtask of information extraction. ' + | ||
'The goal of terminology extraction is to automatically extract ' + | ||
'relevant terms from a given corpus.' + | ||
'\n\n' + | ||
'In the semantic web era, a growing number of communities and ' + | ||
'networked enterprises started to access and interoperate through ' + | ||
'the internet. Modeling these communities and their information ' + | ||
'needs is important for several web applications, like ' + | ||
'topic-driven web crawlers, web services, recommender systems, ' + | ||
'etc. The development of terminology extraction is essential to ' + | ||
'the language industry.' + | ||
'\n\n' + | ||
'One of the first steps to model the knowledge domain of a ' + | ||
'virtual community is to collect a vocabulary of domain-relevant ' + | ||
'terms, constituting the linguistic surface manifestation of ' + | ||
'domain concepts. Several methods to automatically extract ' + | ||
'technical terms from domain-specific document warehouses have ' + | ||
'been described in the literature.' + | ||
'\n\n' + | ||
'Typically, approaches to automatic term extraction make use of ' + | ||
'linguistic processors (part of speech tagging, phrase chunking) ' + | ||
'to extract terminological candidates, i.e. syntactically ' + | ||
'plausible terminological noun phrases, NPs (e.g. compounds ' + | ||
'"credit card", adjective-NPs "local tourist information office", ' + | ||
'and prepositional-NPs "board of directors" - in English, the ' + | ||
'first two constructs are the most frequent). Terminological ' + | ||
'entries are then filtered from the candidate list using ' + | ||
'statistical and machine learning methods. Once filtered, ' + | ||
'because of their low ambiguity and high specificity, these terms ' + | ||
'are particularly useful for conceptualizing a knowledge domain ' + | ||
'or for supporting the creation of a domain ontology. Furthermore, ' + | ||
'terminology extraction is a very useful starting point for ' + | ||
'semantic similarity, knowledge management, human translation ' + | ||
'and machine translation, etc.' | ||
); | ||
retext = new Retext().use(keywords); | ||
root.keywords(); | ||
/* | ||
* Array[5] | ||
* ├─ 0: Object | ||
* | ├─ stem: "terminolog" | ||
* | ├─ score: 1 | ||
* | └─ nodes: Array[7] | ||
* ├─ 1: Object | ||
* | ├─ stem: "term" | ||
* | ├─ score: 1 | ||
* | └─ nodes: Array[7] | ||
* ├─ 2: Object | ||
* | ├─ stem: "extract" | ||
* | ├─ score: 1 | ||
* | └─ nodes: Array[7] | ||
* ├─ 3: Object | ||
* | ├─ stem: "web" | ||
* | ├─ score: 0.5714285714285714 | ||
* | └─ nodes: Array[4] | ||
* └─ 4: Object | ||
* ├─ stem: "domain" | ||
* ├─ score: 0.5714285714285714 | ||
* └─ nodes: Array[4] | ||
*/ | ||
retext.parse( | ||
/* First three paragraphs on Term Extraction from Wikipedia: | ||
* http://en.wikipedia.org/wiki/Terminology_extraction */ | ||
'Terminology mining, term extraction, term recognition, or ' + | ||
'glossary extraction, is a subtask of information extraction. ' + | ||
'The goal of terminology extraction is to automatically extract ' + | ||
'relevant terms from a given corpus.' + | ||
'\n\n' + | ||
'In the semantic web era, a growing number of communities and ' + | ||
'networked enterprises started to access and interoperate through ' + | ||
'the internet. Modeling these communities and their information ' + | ||
'needs is important for several web applications, like ' + | ||
'topic-driven web crawlers, web services, recommender systems, ' + | ||
'etc. The development of terminology extraction is essential to ' + | ||
'the language industry.' + | ||
'\n\n' + | ||
'One of the first steps to model the knowledge domain of a ' + | ||
'virtual community is to collect a vocabulary of domain-relevant ' + | ||
'terms, constituting the linguistic surface manifestation of ' + | ||
'domain concepts. Several methods to automatically extract ' + | ||
'technical terms from domain-specific document warehouses have ' + | ||
'been described in the literature.' + | ||
'\n\n' + | ||
'Typically, approaches to automatic term extraction make use of ' + | ||
'linguistic processors (part of speech tagging, phrase chunking) ' + | ||
'to extract terminological candidates, i.e. syntactically ' + | ||
'plausible terminological noun phrases, NPs (e.g. compounds ' + | ||
'"credit card", adjective-NPs "local tourist information office", ' + | ||
'and prepositional-NPs "board of directors" - in English, the ' + | ||
'first two constructs are the most frequent). Terminological ' + | ||
'entries are then filtered from the candidate list using ' + | ||
'statistical and machine learning methods. Once filtered, ' + | ||
'because of their low ambiguity and high specificity, these terms ' + | ||
'are particularly useful for conceptualizing a knowledge domain ' + | ||
'or for supporting the creation of a domain ontology. Furthermore, ' + | ||
'terminology extraction is a very useful starting point for ' + | ||
'semantic similarity, knowledge management, human translation ' + | ||
'and machine translation, etc.', | ||
function (err, tree) { | ||
tree.keywords(); | ||
/** | ||
* Array[5] | ||
* ├─ 0: Object | ||
* | ├─ stem: "terminolog" | ||
* | ├─ score: 1 | ||
* | └─ nodes: Array[7] | ||
* ├─ 1: Object | ||
* | ├─ stem: "term" | ||
* | ├─ score: 1 | ||
* | └─ nodes: Array[7] | ||
* ├─ 2: Object | ||
* | ├─ stem: "extract" | ||
* | ├─ score: 1 | ||
* | └─ nodes: Array[7] | ||
* ├─ 3: Object | ||
* | ├─ stem: "web" | ||
* | ├─ score: 0.5714285714285714 | ||
* | └─ nodes: Array[4] | ||
* └─ 4: Object | ||
* ├─ stem: "domain" | ||
* ├─ score: 0.5714285714285714 | ||
* └─ nodes: Array[4] | ||
*/ | ||
} | ||
); | ||
``` | ||
## API | ||
retext-keywords depends on the following plugins: | ||
- [retext-pos](https://github.com/wooorm/retext-pos) — for part-of-speech; | ||
- [retext-porter-stemmer](https://github.com/wooorm/retext-porter-stemmer) — for stemming; | ||
- [retext-visit](https://github.com/wooorm/retext-visit) | ||
### Parent#keywords({minimum=5}?) | ||
Extract keywords, based on how many times they (nouns) occur in text. | ||
Extract keywords, based on the number of times they (nouns) occur in text. | ||
```js | ||
// **See above for an example, and output.** | ||
/* See above for an example, and output. */ | ||
// Do not limit keyword-count. | ||
root.keywords({'minimum' : Infinity}); | ||
/* To *not* limit keyword-count: */ | ||
tree.keywords({'minimum' : Infinity}); | ||
``` | ||
@@ -118,19 +106,16 @@ | ||
* minimum: Return at least (when possible) `minimum` keywords. | ||
- minimum: Return at least (when possible) `minimum` keywords. | ||
Results: An array, containing match-objects: | ||
* stem: The stem of the word (using [retext-porter-stemm](https://github.com/wooorm/retext-porter-stemmer/)); | ||
* score: A value between 0 and (including) 1. the first match always has a score of 1; | ||
* nodes: An array containing all matched word nodes. | ||
- stem: The stem of the word (see [retext-porter-stemmer](https://github.com/wooorm/retext-porter-stemmer/)); | ||
- score: A value between 0 and 1 (inclusive). The first match has a score of 1; | ||
- nodes: An array containing all matched word nodes. | ||
### Parent#keyphrases({minimum=5}?) | ||
Extract keywords, based on how many times they (nouns) occur in text. | ||
Extract keyphrases, based on the number of times they (multiple nouns) occur in text. | ||
```js | ||
// Do not limit phrase-count. | ||
root.keywords({'minimum' : Infinity}); | ||
// Default values: | ||
root.keyphrases(); | ||
tree.keyphrases(); | ||
/* | ||
@@ -167,2 +152,5 @@ * Array[6] | ||
*/ | ||
/* To *not* limit phrase-count: */ | ||
tree.keyphrases({'minimum' : Infinity}); | ||
``` | ||
@@ -172,9 +160,9 @@ | ||
* minimum: Return at least (when possible) `minimum` phrases. | ||
- minimum: Return at least (when possible) `minimum` phrases. | ||
Results: An array, containing match-objects: | ||
* stems: An array containing the stemms of all matched word nodes inside the phrase(s); | ||
* score: A value between 0 and (including) 1. the first match always has a score of 1; | ||
* nodes: An matrix containing array-phrases, each in turn containing word nodes. | ||
- stems: An array containing the stems of all matched word nodes inside the phrase(s); | ||
- score: A value between 0 and 1 (inclusive). The first match has a score of 1; | ||
- nodes: An array containing array-phrases, each containing word nodes. | ||
@@ -189,12 +177,12 @@ ## Benchmark | ||
On a MacBook Air, `keywords()` runs about 3,041 op/s on a section / small article. | ||
On a MacBook Air, `keywords()` runs about 3,784 op/s on a big section / small article. | ||
``` | ||
Finding keywords in English | ||
3,041 op/s » small (10 paragraphs, 20 sentences, 300 words) | ||
349 op/s » medium (100 paragraphs, 200 sentences, 3000 words) | ||
A big section (10 paragraphs) | ||
3,784 op/s » Finding keywords | ||
788 op/s » Finding keyphrases | ||
Finding keyphrases in English | ||
738 op/s » small (10 paragraphs, 20 sentences, 300 words) | ||
47 op/s » medium (100 paragraphs, 200 sentences, 3000 words) | ||
A big article (100 paragraphs) | ||
401 op/s » Finding keywords | ||
48 op/s » Finding keyphrases | ||
``` | ||
@@ -204,2 +192,2 @@ | ||
MIT | ||
MIT © Titus Wormer |
'use strict'; | ||
var keywords, Retext, assert, tree; | ||
/** | ||
* Module dependencies. | ||
*/ | ||
var keywords, | ||
Retext, | ||
assert; | ||
keywords = require('..'); | ||
@@ -9,51 +15,73 @@ Retext = require('retext'); | ||
tree = new Retext() | ||
.use(keywords) | ||
.parse( | ||
/* First three paragraphs on term extraction from Wikipedia: | ||
* http://en.wikipedia.org/wiki/Terminology_extraction | ||
*/ | ||
'Terminology mining, term extraction, term recognition, or ' + | ||
'glossary extraction, is a subtask of information extraction. ' + | ||
'The goal of terminology extraction is to automatically extract ' + | ||
'relevant terms from a given corpus.' + | ||
'\n\n' + | ||
'In the semantic web era, a growing number of communities and ' + | ||
'networked enterprises started to access and interoperate through ' + | ||
'the internet. Modeling these communities and their information ' + | ||
'needs is important for several web applications, like ' + | ||
'topic-driven web crawlers, web services, recommender systems, ' + | ||
'etc. The development of terminology extraction is essential to ' + | ||
'the language industry.' + | ||
'\n\n' + | ||
'One of the first steps to model the knowledge domain of a ' + | ||
'virtual community is to collect a vocabulary of domain-relevant ' + | ||
'terms, constituting the linguistic surface manifestation of ' + | ||
'domain concepts. Several methods to automatically extract ' + | ||
'technical terms from domain-specific document warehouses have ' + | ||
'been described in the literature.' + | ||
'\n\n' + | ||
'Typically, approaches to automatic term extraction make use of ' + | ||
'linguistic processors (part of speech tagging, phrase chunking) ' + | ||
'to extract terminological candidates, i.e. syntactically ' + | ||
'plausible terminological noun phrases, NPs (e.g. compounds ' + | ||
'"credit card", adjective-NPs "local tourist information office", ' + | ||
'and prepositional-NPs "board of directors" - in English, the ' + | ||
'first two constructs are the most frequent). Terminological ' + | ||
'entries are then filtered from the candidate list using ' + | ||
'statistical and machine learning methods. Once filtered, ' + | ||
'because of their low ambiguity and high specificity, these terms ' + | ||
'are particularly useful for conceptualizing a knowledge domain ' + | ||
'or for supporting the creation of a domain ontology. Furthermore, ' + | ||
'terminology extraction is a very useful starting point for ' + | ||
'semantic similarity, knowledge management, human translation ' + | ||
'and machine translation, etc.' | ||
); | ||
/** | ||
* Retext. | ||
*/ | ||
describe('retext-keywords()', function () { | ||
it('should be of type `function`', function () { | ||
var retext, | ||
TextOM; | ||
retext = new Retext().use(keywords); | ||
TextOM = retext.TextOM; | ||
/** | ||
* Value. | ||
* | ||
* First three paragraphs on term extraction from | ||
* Wikipedia: | ||
* | ||
* http://en.wikipedia.org/wiki/Terminology_extraction | ||
*/ | ||
var value; | ||
value = | ||
'Terminology mining, term extraction, term recognition, or ' + | ||
'glossary extraction, is a subtask of information extraction. ' + | ||
'The goal of terminology extraction is to automatically extract ' + | ||
'relevant terms from a given corpus.' + | ||
'\n\n' + | ||
'In the semantic web era, a growing number of communities and ' + | ||
'networked enterprises started to access and interoperate through ' + | ||
'the internet. Modeling these communities and their information ' + | ||
'needs is important for several web applications, like ' + | ||
'topic-driven web crawlers, web services, recommender systems, ' + | ||
'etc. The development of terminology extraction is essential to ' + | ||
'the language industry.' + | ||
'\n\n' + | ||
'One of the first steps to model the knowledge domain of a ' + | ||
'virtual community is to collect a vocabulary of domain-relevant ' + | ||
'terms, constituting the linguistic surface manifestation of ' + | ||
'domain concepts. Several methods to automatically extract ' + | ||
'technical terms from domain-specific document warehouses have ' + | ||
'been described in the literature.' + | ||
'\n\n' + | ||
'Typically, approaches to automatic term extraction make use of ' + | ||
'linguistic processors (part of speech tagging, phrase chunking) ' + | ||
'to extract terminological candidates, i.e. syntactically ' + | ||
'plausible terminological noun phrases, NPs (e.g. compounds ' + | ||
'"credit card", adjective-NPs "local tourist information office", ' + | ||
'and prepositional-NPs "board of directors" - in English, the ' + | ||
'first two constructs are the most frequent). Terminological ' + | ||
'entries are then filtered from the candidate list using ' + | ||
'statistical and machine learning methods. Once filtered, ' + | ||
'because of their low ambiguity and high specificity, these terms ' + | ||
'are particularly useful for conceptualizing a knowledge domain ' + | ||
'or for supporting the creation of a domain ontology. Furthermore, ' + | ||
'terminology extraction is a very useful starting point for ' + | ||
'semantic similarity, knowledge management, human translation ' + | ||
'and machine translation, etc.'; | ||
/** | ||
* Tests. | ||
*/ | ||
describe('keywords()', function () { | ||
it('should be a `function`', function () { | ||
assert(typeof keywords === 'function'); | ||
}); | ||
}); | ||
it('should export an `attach` method', function () { | ||
describe('keywords.attach()', function () { | ||
it('should be a `function`', function () { | ||
assert(typeof keywords.attach === 'function'); | ||
@@ -63,90 +91,114 @@ }); | ||
describe('TextOM.Parent#keywords()', function () { | ||
it('should be of type `function`', function () { | ||
assert(typeof tree.TextOM.Parent.prototype.keywords === 'function'); | ||
describe('TextOM.Parent#keywords(options?)', function () { | ||
it('should be a `function`', function () { | ||
assert(typeof TextOM.Parent.prototype.keywords === 'function'); | ||
}); | ||
it('should work', function () { | ||
var terms = tree.keywords(); | ||
it('should work', function (done) { | ||
retext.parse(value, function (err, tree) { | ||
var terms; | ||
assert(terms[0].stem === 'terminolog'); | ||
assert(terms[1].stem === 'term'); | ||
assert(terms[2].stem === 'extract'); | ||
assert(terms[3].stem === 'web'); | ||
assert(terms[4].stem === 'domain'); | ||
terms = tree.keywords(); | ||
assert(terms[0].nodes.length === 7); | ||
assert(terms[1].nodes.length === 7); | ||
assert(terms[2].nodes.length === 7); | ||
assert(terms[3].nodes.length === 4); | ||
assert(terms[4].nodes.length === 4); | ||
assert(terms[0].stem === 'terminolog'); | ||
assert(terms[1].stem === 'term'); | ||
assert(terms[2].stem === 'extract'); | ||
assert(terms[3].stem === 'web'); | ||
assert(terms[4].stem === 'domain'); | ||
assert(terms.length >= 5); | ||
}); | ||
assert(terms[0].nodes.length === 7); | ||
assert(terms[1].nodes.length === 7); | ||
assert(terms[2].nodes.length === 7); | ||
assert(terms[3].nodes.length === 4); | ||
assert(terms[4].nodes.length === 4); | ||
it('should accept a `minimum` option', function () { | ||
var terms = tree.keywords({ | ||
'minimum' : 7 | ||
assert(terms.length >= 5); | ||
done(err); | ||
}); | ||
}); | ||
assert(terms[0].stem === 'terminolog'); | ||
assert(terms[1].stem === 'term'); | ||
assert(terms[2].stem === 'extract'); | ||
assert(terms[3].stem === 'web'); | ||
assert(terms[4].stem === 'domain'); | ||
assert(terms[5].stem === 'inform'); | ||
assert(terms[6].stem === 'commun'); | ||
assert(terms[7].stem === 'knowledg'); | ||
it('should accept a `minimum` option', function (done) { | ||
retext.parse(value, function (err, tree) { | ||
var terms; | ||
assert(terms[0].nodes.length === 7); | ||
assert(terms[1].nodes.length === 7); | ||
assert(terms[2].nodes.length === 7); | ||
assert(terms[3].nodes.length === 4); | ||
assert(terms[4].nodes.length === 4); | ||
assert(terms[5].nodes.length === 3); | ||
assert(terms[6].nodes.length === 3); | ||
assert(terms[7].nodes.length === 3); | ||
terms = tree.keywords({ | ||
'minimum' : 7 | ||
}); | ||
assert(terms.length >= 7); | ||
assert(terms[0].stem === 'terminolog'); | ||
assert(terms[1].stem === 'term'); | ||
assert(terms[2].stem === 'extract'); | ||
assert(terms[3].stem === 'web'); | ||
assert(terms[4].stem === 'domain'); | ||
assert(terms[5].stem === 'inform'); | ||
assert(terms[6].stem === 'commun'); | ||
assert(terms[7].stem === 'knowledg'); | ||
assert(terms[0].nodes.length === 7); | ||
assert(terms[1].nodes.length === 7); | ||
assert(terms[2].nodes.length === 7); | ||
assert(terms[3].nodes.length === 4); | ||
assert(terms[4].nodes.length === 4); | ||
assert(terms[5].nodes.length === 3); | ||
assert(terms[6].nodes.length === 3); | ||
assert(terms[7].nodes.length === 3); | ||
assert(terms.length >= 7); | ||
done(err); | ||
}); | ||
}); | ||
}); | ||
describe('TextOM.Parent#keyphrases()', function () { | ||
it('should be of type `function`', function () { | ||
assert(typeof tree.TextOM.Parent.prototype.keywords === 'function'); | ||
describe('TextOM.Parent#keyphrases(options?)', function () { | ||
it('should be a `function`', function () { | ||
assert(typeof TextOM.Parent.prototype.keywords === 'function'); | ||
}); | ||
it('should work', function () { | ||
var phrases = tree.keyphrases(); | ||
it('should work', function (done) { | ||
retext.parse(value, function (err, tree) { | ||
var phrases; | ||
assert(phrases[0].value === 'terminolog extract'); | ||
assert(phrases[1].value === 'term'); | ||
assert(phrases[2].value === 'term extract'); | ||
assert(phrases[3].value === 'knowledg domain'); | ||
assert(phrases[4].value === 'commun'); | ||
phrases = tree.keyphrases(); | ||
assert(phrases[0].nodes.length === 3); | ||
assert(phrases[1].nodes.length === 3); | ||
assert(phrases[2].nodes.length === 2); | ||
assert(phrases[3].nodes.length === 2); | ||
assert(phrases[4].nodes.length === 3); | ||
assert(phrases[0].value === 'terminolog extract'); | ||
assert(phrases[1].value === 'term'); | ||
assert(phrases[2].value === 'term extract'); | ||
assert(phrases[3].value === 'knowledg domain'); | ||
assert(phrases[4].value === 'commun'); | ||
assert(phrases.length >= 5); | ||
}); | ||
assert(phrases[0].nodes.length === 3); | ||
assert(phrases[1].nodes.length === 3); | ||
assert(phrases[2].nodes.length === 2); | ||
assert(phrases[3].nodes.length === 2); | ||
assert(phrases[4].nodes.length === 3); | ||
it('should accept a `minimum` option', function () { | ||
var phrases = tree.keyphrases({ | ||
'minimum' : 3 | ||
assert(phrases.length >= 5); | ||
done(err); | ||
}); | ||
}); | ||
assert(phrases[0].value === 'terminolog extract'); | ||
assert(phrases[1].value === 'term'); | ||
assert(phrases[2].value === 'term extract'); | ||
it('should accept a `minimum` option', function (done) { | ||
retext.parse(value, function (err, tree) { | ||
var phrases; | ||
assert(phrases[0].nodes.length === 3); | ||
assert(phrases[1].nodes.length === 3); | ||
assert(phrases[2].nodes.length === 2); | ||
phrases = tree.keyphrases({ | ||
'minimum' : 3 | ||
}); | ||
assert(phrases.length >= 3); | ||
assert(phrases[0].value === 'terminolog extract'); | ||
assert(phrases[1].value === 'term'); | ||
assert(phrases[2].value === 'term extract'); | ||
assert(phrases[0].nodes.length === 3); | ||
assert(phrases[1].nodes.length === 3); | ||
assert(phrases[2].nodes.length === 2); | ||
assert(phrases.length >= 3); | ||
done(err); | ||
}); | ||
}); | ||
}); |
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
29695
685
9
186
Updated retext-porter-stemmer@^0.1.1
Updated retext-pos@^0.1.3
Updated retext-visit@^0.1.1