retext-keywords
Advanced tools
Comparing version 2.0.2 to 3.0.0
535
index.js
@@ -11,5 +11,4 @@ /** | ||
/* eslint-env commonjs */ | ||
/* Dependencies. */ | ||
var has = require('has'); | ||
var stemmer = require('stemmer'); | ||
@@ -20,92 +19,169 @@ var visit = require('unist-util-visit'); | ||
/* Methods. */ | ||
var own = Object.prototype.hasOwnProperty; | ||
/* Expose. */ | ||
module.exports = attacher; | ||
/** | ||
* Get the stem of a node. | ||
* Attach. | ||
* | ||
* @param {Node} node - Node to stem. | ||
* @return {string} - Stemmed node. | ||
* @param {Retext} retext - Instance. | ||
* @param {Object?} [options] - Configuration. | ||
* @param {number?} [options.maximum] - Try to get at | ||
* least `maximum` results. | ||
* @return {Function} - `transformer`. | ||
*/ | ||
function stemNode(node) { | ||
return stemmer(nlcstToString(node)).toLowerCase(); | ||
} | ||
function attacher(retext, options) { | ||
var maximum = (options || {}).maximum || 5; | ||
/** | ||
* Check whether `value` is upper-case. | ||
* | ||
* @param {string} value - Value to check. | ||
* @return {boolean} - Whether `value` is upper-case. | ||
*/ | ||
function isUpperCase(value) { | ||
return value === String(value).toUpperCase(); | ||
retext.use(pos); | ||
return transformer; | ||
/** | ||
* Attach keywords in `cst` to `file`. | ||
* | ||
* @param {NLCSTNode} cst - Node. | ||
* @param {VFile} file - Virtual file. | ||
*/ | ||
function transformer(cst, file) { | ||
var important = getImportantWords(cst); | ||
file.data.keywords = filterResults(cloneMatches(important), maximum); | ||
file.data.keyphrases = getKeyphrases(important, maximum); | ||
} | ||
} | ||
/** | ||
* Reverse sort: from 9 to 0. | ||
* Get following or preceding important words or white space. | ||
* | ||
* @param {number} a - First. | ||
* @param {number} b - Second. | ||
* @return {number} - Difference. | ||
* @param {Node} node - Node to start search at. | ||
* @param {number} index - Position of `node` in `parent`. | ||
* @param {Node} parent - Parent of `node`. | ||
* @param {number} offset - Offset to the next node. `-1` | ||
* when iterating backwards, `1` when iterating forwards. | ||
* @return {Object} - Phrase. | ||
*/ | ||
function reverse(a, b) { | ||
return b - a; | ||
} | ||
function findPhraseInDirection(node, index, parent, offset) { | ||
var children = parent.children; | ||
var nodes = []; | ||
var stems = []; | ||
var words = []; | ||
var queue = []; | ||
var child; | ||
/** | ||
* Check whether or not a `node` is important. | ||
* | ||
* @param {Node} node - Node to check. | ||
* @return {boolean} - Whether `node` is important. | ||
*/ | ||
function isImportant(node) { | ||
return ( | ||
node && | ||
node.data && | ||
node.data.partOfSpeech && | ||
( | ||
node.data.partOfSpeech.indexOf('N') === 0 || | ||
( | ||
node.data.partOfSpeech === 'JJ' && | ||
isUpperCase(nlcstToString(node).charAt(0)) | ||
) | ||
) | ||
); | ||
while (children[index += offset]) { | ||
child = children[index]; | ||
if (child.type === 'WhiteSpaceNode') { | ||
queue.push(child); | ||
} else if (isImportant(child)) { | ||
nodes = nodes.concat(queue, [child]); | ||
words.push(child); | ||
stems.push(stemNode(child)); | ||
queue = []; | ||
} else { | ||
break; | ||
} | ||
} | ||
return { | ||
stems: stems, | ||
words: words, | ||
nodes: nodes | ||
}; | ||
} | ||
/** | ||
* Get most important words in `node`. | ||
* Get the top important phrases in `self`. | ||
* | ||
* @param {Node} node - Parent to search in. | ||
* @return {Array.<Object>} - Important words. | ||
* @param {Object.<string, Object>} results - Map of stems | ||
* mapping to objects containing `nodes`, `stem`, and | ||
* `score` properties. | ||
* @param {number} maximum - Try to get at least `maximum` | ||
* results. | ||
* @return {Array.<Object>} - Keyphrases. | ||
*/ | ||
function getImportantWords(node) { | ||
var words = {}; | ||
function getKeyphrases(results, maximum) { | ||
var stemmedPhrases = {}; | ||
var initialWords = []; | ||
var stemmedPhrase; | ||
var index; | ||
var length; | ||
var otherIndex; | ||
var keyword; | ||
var matches; | ||
var phrase; | ||
var stems; | ||
var score; | ||
var first; | ||
var match; | ||
visit(node, 'WordNode', function (word, index, parent) { | ||
var match; | ||
var stem; | ||
/* Iterate over all grouped important words... */ | ||
for (keyword in results) { | ||
matches = results[keyword].matches; | ||
length = matches.length; | ||
index = -1; | ||
if (isImportant(word)) { | ||
stem = stemNode(word); | ||
match = { | ||
node: word, | ||
index: index, | ||
parent: parent | ||
}; | ||
/* Iterate over every occurrence of a certain keyword... */ | |
while (++index < length) { | ||
phrase = findPhrase(matches[index]); | ||
stemmedPhrase = stemmedPhrases[phrase.value]; | ||
first = phrase.nodes[0]; | ||
if (!own.call(words, stem)) { | ||
words[stem] = { | ||
matches: [match], | ||
stem: stem, | ||
score: 1 | ||
}; | ||
} else { | ||
words[stem].matches.push(match); | ||
words[stem].score++; | ||
} | ||
match = { | ||
nodes: phrase.nodes, | ||
parent: matches[index].parent | ||
}; | ||
/* If we've detected the same stemmed | ||
* phrase somewhere. */ | ||
if (has(stemmedPhrases, phrase.value)) { | ||
/* Add weight per phrase to the score of | ||
* the phrase. */ | ||
stemmedPhrase.score += stemmedPhrase.weight; | ||
/* If this is the first time we walk over | ||
* the phrase (exact match but containing | ||
* another important word), add it to the | ||
* list of matching phrases. */ | ||
if (initialWords.indexOf(first) === -1) { | ||
initialWords.push(first); | ||
stemmedPhrase.matches.push(match); | ||
} | ||
}); | ||
} else { | ||
otherIndex = -1; | ||
score = -1; | ||
stems = phrase.stems; | ||
return words; | ||
initialWords.push(first); | ||
/* For every stem in phrase, add its | ||
* score to score. */ | ||
while (stems[++otherIndex]) { | ||
score += results[stems[otherIndex]].score; | ||
} | ||
stemmedPhrases[phrase.value] = { | ||
score: score, | ||
weight: score, | ||
stems: stems, | ||
value: phrase.value, | ||
matches: [match] | ||
}; | ||
} | ||
} | ||
} | ||
for (stemmedPhrase in stemmedPhrases) { | ||
phrase = stemmedPhrases[stemmedPhrase]; | ||
/* Modify its score to be the rounded result of | ||
* multiplying it with the number of occurrences, | |
* and dividing it by the number of words in the | |
* phrase. */ | ||
phrase.score = Math.round( | ||
phrase.score * phrase.matches.length / phrase.stems.length | ||
); | ||
} | ||
return filterResults(stemmedPhrases, maximum); | ||
} | ||
@@ -124,89 +200,49 @@ | ||
function filterResults(results, maximum) { | ||
var filteredResults = []; | ||
var indices = []; | ||
var matrix = {}; | ||
var column; | ||
var key; | ||
var score; | ||
var interpolated; | ||
var index; | ||
var otherIndex; | ||
var maxScore; | ||
var filteredResults = []; | ||
var indices = []; | ||
var matrix = {}; | ||
var column; | ||
var key; | ||
var score; | ||
var interpolated; | ||
var index; | ||
var otherIndex; | ||
var maxScore; | ||
for (key in results) { | ||
score = results[key].score; | ||
for (key in results) { | ||
score = results[key].score; | ||
if (!matrix[score]) { | ||
matrix[score] = []; | ||
indices.push(score); | ||
} | ||
matrix[score].push(results[key]); | ||
if (!matrix[score]) { | ||
matrix[score] = []; | ||
indices.push(score); | ||
} | ||
indices.sort(reverse); | ||
matrix[score].push(results[key]); | ||
} | ||
maxScore = indices[0]; | ||
indices.sort(reverse); | ||
index = -1; | ||
maxScore = indices[0]; | ||
while (indices[++index]) { | ||
score = indices[index]; | ||
column = matrix[score]; | ||
index = -1; | ||
interpolated = score / maxScore; | ||
otherIndex = -1; | ||
while (indices[++index]) { | ||
score = indices[index]; | ||
column = matrix[score]; | ||
while (column[++otherIndex]) { | ||
column[otherIndex].score = interpolated; | ||
} | ||
interpolated = score / maxScore; | ||
otherIndex = -1; | ||
filteredResults = filteredResults.concat(column); | ||
if (filteredResults.length >= maximum) { | ||
break; | ||
} | ||
while (column[++otherIndex]) { | ||
column[otherIndex].score = interpolated; | ||
} | ||
return filteredResults; | ||
} | ||
filteredResults = filteredResults.concat(column); | ||
/** | ||
* Get following or preceding important words or white space. | ||
* | ||
* @param {Node} node - Node to start search at. | ||
* @param {number} index - Position of `node` in `parent`. | ||
* @param {Node} parent - Parent of `node`. | ||
* @param {number} offset - Offset to the next node. `-1` | ||
* when iterating backwards, `1` when iterating forwards. | ||
* @return {Object} - Phrase. | ||
*/ | ||
function findPhraseInDirection(node, index, parent, offset) { | ||
var children = parent.children; | ||
var nodes = []; | ||
var stems = []; | ||
var words = []; | ||
var queue = []; | ||
var child; | ||
while (children[index += offset]) { | ||
child = children[index]; | ||
if (child.type === 'WhiteSpaceNode') { | ||
queue.push(child); | ||
} else if (isImportant(child)) { | ||
nodes = nodes.concat(queue, [child]); | ||
words.push(child); | ||
stems.push(stemNode(child)); | ||
queue = []; | ||
} else { | ||
break; | ||
} | ||
if (filteredResults.length >= maximum) { | ||
break; | ||
} | ||
} | ||
return { | ||
stems: stems, | ||
words: words, | ||
nodes: nodes | ||
}; | ||
return filteredResults; | ||
} | ||
@@ -224,3 +260,3 @@ | ||
function merge(prev, current, next) { | ||
return prev.concat().reverse().concat([current], next); | ||
return prev.concat().reverse().concat([current], next); | ||
} | ||
@@ -235,108 +271,51 @@ | ||
function findPhrase(match) { | ||
var node = match.node; | ||
var prev = findPhraseInDirection(node, match.index, match.parent, -1); | ||
var next = findPhraseInDirection(node, match.index, match.parent, 1); | ||
var stems = merge(prev.stems, stemNode(node), next.stems); | ||
var node = match.node; | ||
var prev = findPhraseInDirection(node, match.index, match.parent, -1); | ||
var next = findPhraseInDirection(node, match.index, match.parent, 1); | ||
var stems = merge(prev.stems, stemNode(node), next.stems); | ||
return { | ||
stems: stems, | ||
value: stems.join(' '), | ||
nodes: merge(prev.nodes, node, next.nodes) | ||
}; | ||
return { | ||
stems: stems, | ||
value: stems.join(' '), | ||
nodes: merge(prev.nodes, node, next.nodes) | ||
}; | ||
} | ||
/** | ||
* Get the top important phrases in `self`. | ||
* Get most important words in `node`. | ||
* | ||
* @param {Object.<string, Object>} results - Map of stems | ||
* mapping to objects containing `nodes`, `stem`, and | ||
* `score` properties. | ||
* @param {number} maximum - Try to get at least `maximum` | ||
* results. | ||
* @return {Array.<Object>} - Keyphrases. | ||
* @param {Node} node - Parent to search in. | ||
* @return {Array.<Object>} - Important words. | ||
*/ | ||
function getKeyphrases(results, maximum) { | ||
var stemmedPhrases = {}; | ||
var initialWords = []; | ||
var stemmedPhrase; | ||
var index; | ||
var length; | ||
var otherIndex; | ||
var keyword; | ||
var matches; | ||
var phrase; | ||
var stems; | ||
var score; | ||
var first; | ||
var match; | ||
function getImportantWords(node) { | ||
var words = {}; | ||
/* Iterate over all grouped important words... */ | ||
for (keyword in results) { | ||
matches = results[keyword].matches; | ||
length = matches.length; | ||
index = -1; | ||
visit(node, 'WordNode', visitor); | ||
/* Iterate over every occurrence of a certain keyword... */ | |
while (++index < length) { | ||
phrase = findPhrase(matches[index]); | ||
stemmedPhrase = stemmedPhrases[phrase.value]; | ||
first = phrase.nodes[0]; | ||
return words; | ||
match = { | ||
nodes: phrase.nodes, | ||
parent: matches[index].parent | ||
}; | ||
function visitor(word, index, parent) { | ||
var match; | ||
var stem; | ||
/* If we've detected the same stemmed | ||
* phrase somewhere. */ | ||
if (own.call(stemmedPhrases, phrase.value)) { | ||
/* Add weight per phrase to the score of | ||
* the phrase. */ | ||
stemmedPhrase.score += stemmedPhrase.weight; | ||
if (isImportant(word)) { | ||
stem = stemNode(word); | ||
match = { | ||
node: word, | ||
index: index, | ||
parent: parent | ||
}; | ||
/* If this is the first time we walk over | ||
* the phrase (exact match but containing | ||
* another important word), add it to the | ||
* list of matching phrases. */ | ||
if (initialWords.indexOf(first) === -1) { | ||
initialWords.push(first); | ||
stemmedPhrase.matches.push(match); | ||
} | ||
} else { | ||
otherIndex = -1; | ||
score = -1; | ||
stems = phrase.stems; | ||
initialWords.push(first); | ||
/* For every stem in phrase, add its | ||
* score to score. */ | ||
while (stems[++otherIndex]) { | ||
score += results[stems[otherIndex]].score; | ||
} | ||
stemmedPhrases[phrase.value] = { | ||
score: score, | ||
weight: score, | ||
stems: stems, | ||
value: phrase.value, | ||
matches: [match] | ||
}; | ||
} | ||
} | ||
if (!has(words, stem)) { | ||
words[stem] = { | ||
matches: [match], | ||
stem: stem, | ||
score: 1 | ||
}; | ||
} else { | ||
words[stem].matches.push(match); | ||
words[stem].score++; | ||
} | ||
} | ||
for (stemmedPhrase in stemmedPhrases) { | ||
phrase = stemmedPhrases[stemmedPhrase]; | ||
/* Modify its score to be the rounded result of | ||
* multiplying it with the number of occurrences, | |
* and dividing it by the number of words in the | |
* phrase. */ | ||
phrase.score = Math.round( | ||
phrase.score * phrase.matches.length / phrase.stems.length | ||
); | ||
} | ||
return filterResults(stemmedPhrases, maximum); | ||
} | ||
} | ||
@@ -353,50 +332,48 @@ | ||
function cloneMatches(words) { | ||
var result = {}; | ||
var key; | ||
var match; | ||
var result = {}; | ||
var key; | ||
var match; | ||
for (key in words) { | ||
match = words[key]; | ||
result[key] = { | ||
matches: match.matches, | ||
stem: match.stem, | ||
score: match.score | ||
} | ||
} | ||
for (key in words) { | ||
match = words[key]; | ||
return result; | ||
result[key] = { | ||
matches: match.matches, | ||
stem: match.stem, | ||
score: match.score | ||
}; | ||
} | ||
return result; | ||
} | ||
/** | ||
* Attach. | ||
* | ||
* @param {Retext} retext - Instance. | ||
* @param {Object?} [options] - Configuration. | ||
* @param {number?} [options.maximum] - Try to get at | ||
* least `maximum` results. | ||
* @return {Function} - `transformer`. | ||
*/ | ||
function attacher(retext, options) { | ||
var maximum = (options || {}).maximum || 5; | ||
/* Check if `node` is important. */ | ||
function isImportant(node) { | ||
return ( | ||
node && | ||
node.data && | ||
node.data.partOfSpeech && | ||
( | ||
node.data.partOfSpeech.indexOf('N') === 0 || | ||
( | ||
node.data.partOfSpeech === 'JJ' && | ||
isUpperCase(nlcstToString(node).charAt(0)) | ||
) | ||
) | ||
); | ||
} | ||
retext.use(pos); | ||
/* Check if `value` is upper-case. */ | ||
function isUpperCase(value) { | ||
return value === String(value).toUpperCase(); | ||
} | ||
/** | ||
* Attach keywords in `cst` to `file`. | ||
* | ||
* @param {NLCSTNode} cst - Node. | ||
* @param {VFile} file - Virtual file. | ||
*/ | ||
function transformer(cst, file) { | ||
var space = file.namespace('retext'); | ||
var important = getImportantWords(cst); | ||
/* Reverse sort: from 9 to 0. */ | ||
function reverse(a, b) { | ||
return b - a; | ||
} | ||
space.keywords = filterResults(cloneMatches(important), maximum); | ||
space.keyphrases = getKeyphrases(important, maximum); | ||
} | ||
return transformer; | ||
/* Get the stem of a node. */ | ||
function stemNode(node) { | ||
return stemmer(nlcstToString(node)).toLowerCase(); | ||
} | ||
/* Expose. */ | ||
module.exports = attacher; |
{ | ||
"name": "retext-keywords", | ||
"version": "2.0.2", | ||
"version": "3.0.0", | ||
"description": "Keyword extraction with Retext", | ||
@@ -14,9 +14,3 @@ "license": "MIT", | ||
], | ||
"files": [ | ||
"index.js" | ||
], | ||
"repository": { | ||
"type": "git", | ||
"url": "https://github.com/wooorm/retext-keywords.git" | ||
}, | ||
"repository": "https://github.com/wooorm/retext-keywords", | ||
"bugs": "https://github.com/wooorm/retext-keywords/issues", | ||
@@ -28,3 +22,7 @@ "author": "Titus Wormer <tituswormer@gmail.com> (http://wooorm.com)", | ||
], | ||
"files": [ | ||
"index.js" | ||
], | ||
"dependencies": { | ||
"has": "^1.0.1", | ||
"nlcst-to-string": "^2.0.0", | ||
@@ -37,14 +35,9 @@ "retext-pos": "^1.0.0", | ||
"browserify": "^13.0.1", | ||
"eslint": "^2.0.0", | ||
"esmangle": "^1.0.1", | ||
"istanbul": "^0.4.0", | ||
"jscs": "^3.0.0", | ||
"jscs-jsdoc": "^2.0.0", | ||
"remark-cli": "^1.0.0", | ||
"remark-comment-config": "^4.0.0", | ||
"remark-github": "^5.0.0", | ||
"remark-lint": "^4.0.0", | ||
"remark-validate-links": "^4.0.0", | ||
"retext": "^3.0.0", | ||
"tape": "^4.0.0" | ||
"nyc": "^8.3.0", | ||
"remark-cli": "^2.0.0", | ||
"remark-preset-wooorm": "^1.0.0", | ||
"retext": "^4.0.0", | ||
"tape": "^4.0.0", | ||
"xo": "^0.16.0" | ||
}, | ||
@@ -56,9 +49,29 @@ "scripts": { | ||
"build": "npm run build-md && npm run build-bundle && npm run build-mangle", | ||
"lint-api": "eslint .", | ||
"lint-style": "jscs --reporter inline .", | ||
"lint": "npm run lint-api && npm run lint-style", | ||
"lint": "xo", | ||
"test-api": "node test.js", | ||
"test-coverage": "istanbul cover test.js", | ||
"test-coverage": "nyc --reporter lcov tape test.js", | ||
"test": "npm run build && npm run lint && npm run test-coverage" | ||
}, | ||
"nyc": { | ||
"check-coverage": true, | ||
"lines": 100, | ||
"functions": 100, | ||
"branches": 100 | ||
}, | ||
"xo": { | ||
"space": true, | ||
"rules": { | ||
"no-negated-condition": "off", | ||
"guard-for-in": "off", | ||
"max-lines": "off", | ||
"max-nested-callbacks": "off" | ||
}, | ||
"ignores": [ | ||
"retext-keywords.js" | ||
] | ||
}, | ||
"remarkConfig": { | ||
"output": true, | ||
"presets": "wooorm" | ||
} | ||
} |
146
readme.md
# retext-keywords [![Build Status][travis-badge]][travis] [![Coverage Status][codecov-badge]][codecov] | ||
<!--lint disable heading-increment list-item-spacing--> | ||
Keyword extraction with [**retext**][retext]. | ||
@@ -9,3 +7,3 @@ | ||
[npm][npm-install]: | ||
[npm][]: | ||
@@ -16,5 +14,2 @@ ```bash | ||
**retext-keywords** is also available as an AMD, CommonJS, and | ||
globals module, [uncompressed and compressed][releases]. | ||
## Usage | ||
@@ -28,55 +23,53 @@ | ||
retext().use(keywords).process( | ||
/* First three paragraphs on Term Extraction from Wikipedia: | ||
* http://en.wikipedia.org/wiki/Terminology_extraction */ | ||
'Terminology mining, term extraction, term recognition, or ' + | ||
'glossary extraction, is a subtask of information extraction. ' + | ||
'The goal of terminology extraction is to automatically extract ' + | ||
'relevant terms from a given corpus.' + | ||
'\n\n' + | ||
'In the semantic web era, a growing number of communities and ' + | ||
'networked enterprises started to access and interoperate through ' + | ||
'the internet. Modeling these communities and their information ' + | ||
'needs is important for several web applications, like ' + | ||
'topic-driven web crawlers, web services, recommender systems, ' + | ||
'etc. The development of terminology extraction is essential to ' + | ||
'the language industry.' + | ||
'\n\n' + | ||
'One of the first steps to model the knowledge domain of a ' + | ||
'virtual community is to collect a vocabulary of domain-relevant ' + | ||
'terms, constituting the linguistic surface manifestation of ' + | ||
'domain concepts. Several methods to automatically extract ' + | ||
'technical terms from domain-specific document warehouses have ' + | ||
'been described in the literature.' + | ||
'\n\n' + | ||
'Typically, approaches to automatic term extraction make use of ' + | ||
'linguistic processors (part of speech tagging, phrase chunking) ' + | ||
'to extract terminological candidates, i.e. syntactically ' + | ||
'plausible terminological noun phrases, NPs (e.g. compounds ' + | ||
'"credit card", adjective-NPs "local tourist information office", ' + | ||
'and prepositional-NPs "board of directors" - in English, the ' + | ||
'first two constructs are the most frequent). Terminological ' + | ||
'entries are then filtered from the candidate list using ' + | ||
'statistical and machine learning methods. Once filtered, ' + | ||
'because of their low ambiguity and high specificity, these terms ' + | ||
'are particularly useful for conceptualizing a knowledge domain ' + | ||
'or for supporting the creation of a domain ontology. Furthermore, ' + | ||
'terminology extraction is a very useful starting point for ' + | ||
'semantic similarity, knowledge management, human translation ' + | ||
'and machine translation, etc.', | ||
function (err, file) { | ||
var space = file.namespace('retext'); | ||
/* First three paragraphs on Term Extraction from Wikipedia: | ||
* http://en.wikipedia.org/wiki/Terminology_extraction */ | ||
'Terminology mining, term extraction, term recognition, or ' + | ||
'glossary extraction, is a subtask of information extraction. ' + | ||
'The goal of terminology extraction is to automatically extract ' + | ||
'relevant terms from a given corpus.' + | ||
'\n\n' + | ||
'In the semantic web era, a growing number of communities and ' + | ||
'networked enterprises started to access and interoperate through ' + | ||
'the internet. Modeling these communities and their information ' + | ||
'needs is important for several web applications, like ' + | ||
'topic-driven web crawlers, web services, recommender systems, ' + | ||
'etc. The development of terminology extraction is essential to ' + | ||
'the language industry.' + | ||
'\n\n' + | ||
'One of the first steps to model the knowledge domain of a ' + | ||
'virtual community is to collect a vocabulary of domain-relevant ' + | ||
'terms, constituting the linguistic surface manifestation of ' + | ||
'domain concepts. Several methods to automatically extract ' + | ||
'technical terms from domain-specific document warehouses have ' + | ||
'been described in the literature.' + | ||
'\n\n' + | ||
'Typically, approaches to automatic term extraction make use of ' + | ||
'linguistic processors (part of speech tagging, phrase chunking) ' + | ||
'to extract terminological candidates, i.e. syntactically ' + | ||
'plausible terminological noun phrases, NPs (e.g. compounds ' + | ||
'"credit card", adjective-NPs "local tourist information office", ' + | ||
'and prepositional-NPs "board of directors" - in English, the ' + | ||
'first two constructs are the most frequent). Terminological ' + | ||
'entries are then filtered from the candidate list using ' + | ||
'statistical and machine learning methods. Once filtered, ' + | ||
'because of their low ambiguity and high specificity, these terms ' + | ||
'are particularly useful for conceptualizing a knowledge domain ' + | ||
'or for supporting the creation of a domain ontology. Furthermore, ' + | ||
'terminology extraction is a very useful starting point for ' + | ||
'semantic similarity, knowledge management, human translation ' + | ||
'and machine translation, etc.', | ||
function (err, file) { | ||
console.log('Keywords:'); | ||
console.log('Keywords:'); | ||
file.data.keywords.forEach(function (keyword) { | ||
console.log(nlcstToString(keyword.matches[0].node)); | ||
}); | ||
space.keywords.forEach(function (keyword) { | ||
console.log(nlcstToString(keyword.matches[0].node)); | ||
}); | ||
console.log(); | ||
console.log('Key-phrases:'); | ||
console.log(); | ||
console.log('Key-phrases:'); | ||
space.keyphrases.forEach(function (phrase) { | ||
console.log(phrase.matches[0].nodes.map(nlcstToString).join('')); | ||
}); | ||
} | ||
file.data.keyphrases.forEach(function (phrase) { | ||
console.log(phrase.matches[0].nodes.map(nlcstToString).join('')); | ||
}); | ||
} | ||
); | ||
@@ -109,5 +102,4 @@ ``` | ||
The results are stored in the `retext` namespace on the virtual file: | ||
keywords at `file.namespace('retext').keywords` and key-phrases at | ||
`file.namespace('retext').keyphrases`. Both are lists. | ||
The results are stored on `file.data`: keywords at `file.data.keywords` | ||
and key-phrases at `file.data.keyphrases`. Both are lists. | ||
@@ -118,9 +110,9 @@ A single keyword looks as follows: | ||
{ | ||
'stem': 'term', | ||
'score': 1, | ||
'matches': [ | ||
{ 'node': Node, 'index': 5, 'parent': Node }, | ||
// ... | ||
], | ||
stem: 'term', | ||
score: 1, | ||
matches: [ | ||
{node: Node, index: 5, parent: Node}, | ||
// ... | ||
], | ||
// ... | ||
} | ||
@@ -133,10 +125,10 @@ ``` | ||
{ | ||
'score': 1, | ||
'weight': 11, | ||
'stems': [ 'terminolog', 'extract' ], | ||
'value': 'terminolog extract', | ||
'matches': [ | ||
{ 'nodes': [Node, Node, Node], 'parent': Node }, | ||
// ... | ||
] | ||
score: 1, | ||
weight: 11, | ||
stems: ['terminolog', 'extract'], | ||
value: 'terminolog extract', | ||
matches: [ | ||
{nodes: [Node, Node, Node], parent: Node}, | ||
// ... | ||
] | ||
} | ||
@@ -150,4 +142,4 @@ ``` | ||
Note that actual counts may differ. For example, when two words | ||
have the same score, both will be returned. Or when too few words | ||
Note that actual counts may differ. For example, when two words | ||
have the same score, both will be returned. Or when too few words | ||
exist, fewer will be returned. The same goes for phrases. | |
@@ -169,6 +161,4 @@ | ||
[npm-install]: https://docs.npmjs.com/cli/install | ||
[npm]: https://docs.npmjs.com/cli/install | ||
[releases]: https://github.com/wooorm/retext-keywords/releases | ||
[license]: LICENSE | ||
@@ -175,0 +165,0 @@ |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
No repository
Supply chain risk: Package does not have a linked source code repository. Without this field, a package will have no reference to the location of the source code used to generate the package.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
8
15984
5
4
319
161
1
1
+ Addedhas@^1.0.1
+ Addedhas@1.0.4(transitive)