Comparing version 0.0.9 to 1.0.0
@@ -1,25 +0,27 @@ | ||
import rake from '../app' | ||
import rake from '../app'; | ||
describe('rake', () => { | ||
it('can be imported', () => { | ||
expect(rake).toBeTruthy() | ||
}) | ||
expect(rake).toBeTruthy(); | ||
}); | ||
describe('generate', () => { | ||
const text = 'LDA stands for Latent Dirichlet Allocation. As already mentioned it is one of the more popular topic models which was initially proposed by Blei, Ng and Jordan in 2003. It is a generative model which, according to Wikipedia, allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar.'; | ||
let text = "LDA stands for Latent Dirichlet Allocation. As already mentioned it is one of the more popular topic models which was initially proposed by Blei, Ng and Jordan in 2003. It is a generative model which, according to Wikipedia, allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar." | ||
it('extracts keywords from text', () => { | ||
let results = rake.generate(text) | ||
expect(results.length).toEqual(18) | ||
}) | ||
const results = rake.generate(text); | ||
expect(results.length).toEqual(18); | ||
}); | ||
it('extracts keywords from text using a custom stopwords list', () => { | ||
const opts = { stopwords: ['for', 'the', 'a', 'stands', 'test', 'man', 'woman'] }; | ||
const keywords = rake.generate(text, opts); | ||
expect(keywords.length).toEqual(7); | ||
}); | ||
it('trims leading and trailing spaces from keywords', () => { | ||
let [firstKeyword, ...rest] = rake.generate(text) | ||
expect(firstKeyword).toEqual("Latent Dirichlet Allocation") | ||
}) | ||
}) | ||
}) | ||
const firstKeyword = rake.generate(text)[0]; | ||
expect(firstKeyword).toEqual('Latent Dirichlet Allocation'); | ||
}); | ||
}); | ||
}); |
21
app.js
@@ -1,10 +0,15 @@ | ||
var Rake = require('./index.js') | ||
var path = require("path"); | ||
var stopwords_path = path.resolve(__dirname+'/'+'stopWords.txt') | ||
const Rake = require('./index.js'); | ||
const path = require('path'); | ||
const fs = require('fs'); | ||
const stopwordsPath = path.resolve(`${__dirname}/stopWords.txt`); | ||
module.exports = { | ||
generate: function(content){ | ||
let instance = new Rake(content,stopwords_path) | ||
return instance.generate() | ||
} | ||
} | ||
generate(content, opts = {}) { | ||
const fileData = fs.readFileSync(stopwordsPath).toString().split('\n'); | ||
const stopwordsList = opts.stopwords || fileData; | ||
const instance = new Rake(content, stopwordsList); | ||
return instance.generate(); | ||
}, | ||
}; |
166
index.js
@@ -1,123 +0,95 @@ | ||
var fs = require('fs'); | ||
class Rake { | ||
constructor(text,stop_words_path){ | ||
constructor(text, stopwordsList) { | ||
this.text = text; | ||
this.stop_words_path = stop_words_path | ||
this.regex_expression = this.buildRegex() | ||
this.stopwords = stopwordsList; | ||
this.regexExpression = this.buildRegex(); | ||
} | ||
getStopWordsFromFile() { | ||
var stopwords = fs.readFileSync(this.stop_words_path).toString().split("\n"); | ||
return stopwords | ||
buildRegex() { | ||
return this.stopwords.join('|'); | ||
} | ||
buildRegex(){ | ||
var reg = '' | ||
var stopwords_list = this.getStopWordsFromFile(); | ||
for(var i in stopwords_list){ | ||
var stopword = stopwords_list[i]; | ||
if(i!=stopwords_list.length-1){reg = reg + stopword + '|';} | ||
else{reg = reg + stopword;} | ||
} | ||
return reg; | ||
} | ||
removeStopWords(sentence) { | ||
var reg_exp = this.regex_expression | ||
var r = reg_exp.substring(0, reg_exp.length - 1); | ||
var reg = new RegExp('\\b(?:' + r + ')\\b','ig') | ||
var filtered_sentence = sentence.replace(reg,'|').split('|') | ||
return filtered_sentence | ||
const regExp = this.regexExpression; | ||
const r = regExp.substring(0, regExp.length - 1); | ||
const reg = new RegExp(`\\b(?:${r})\\b`, 'ig'); | ||
const filteredSentence = sentence.replace(reg, '|').split('|'); | ||
return filteredSentence; | ||
} | ||
splitTextToSentences(text){ | ||
var sentences = text.match( /[^\.!\?\:\\]+/g ); | ||
var filtered_sentences = [] | ||
for(var i in sentences){ | ||
var s = sentences[i].replace(/ +/g, ""); | ||
if(s != ""){filtered_sentences.push(s)} | ||
} | ||
return filtered_sentences | ||
splitTextToSentences(text) { | ||
const sentences = text.match(/[^.!?:\\]+/g); | ||
const filteredSentences = sentences.filter(s => s.replace(/ +/g, '') !== ''); | ||
return filteredSentences; | ||
} | ||
generatePhrases(sentence_list) { | ||
var phrase_list = [] | ||
for (var s in sentence_list) { | ||
var phrases = this.removeStopWords(sentence_list[s]); | ||
for(var phrase in phrases) { | ||
var phr = phrases[phrase].replace(/['!"“”’#$%&()\*+,\-\.\/:;<=>?@\[\\\]\^_`{|}~']/g,'') | ||
if(phr != ' ' && phr != '') { | ||
phrase_list.push(phr.trim()) | ||
} | ||
} | ||
} | ||
return phrase_list | ||
generatePhrases(sentenceList) { | ||
const reg = /['!"“”’#$%&()*+,\-./:;<=>?@[\\\]^_`{|}~']/g; | ||
const phrases = sentenceList.map(s => this.removeStopWords(s)); | ||
const phraseList = phrases.map(phrase => phrase | ||
.filter(phr => (phr.replace(reg, '') !== ' ' && phr.replace(reg, '') !== '')) | ||
.map(phr => phr.trim()), | ||
); | ||
const flattenedList = [].concat(...phraseList); | ||
return flattenedList; | ||
} | ||
//Generates score for each word. | ||
calculateKeywordScores(phrase_list) { | ||
var word_freq = {} | ||
var word_degree = {} | ||
var word_score = {} | ||
for(var phrase in phrase_list) { | ||
var word_list = phrase_list[phrase].match(/[,.!?;:/‘’“”]|\b[0-9a-z']+\b/gi) | ||
var word_list_degree = word_list.length | ||
for(var word in word_list){ | ||
word_freq[word_list[word]] = 0; | ||
word_freq[word_list[word]] +=1; | ||
word_degree[word_list[word]] = 0; | ||
word_degree[word_list[word]] += word_list_degree; | ||
// Generates score for each word. | ||
calculateKeywordScores(phraseList) { | ||
const wordFreq = {}; | ||
const wordDegree = {}; | ||
const wordScore = {}; | ||
phraseList.forEach((phrase) => { | ||
const wordList = phrase.match(/[,.!?;:/‘’“”]|\b[0-9a-z']+\b/gi); | ||
if(wordList){ | ||
const wordListDegree = wordList.length; | ||
wordList.forEach((word) => { | ||
if (wordFreq[word]) { | ||
wordFreq[word] += 1; | ||
} | ||
else { | ||
wordFreq[word] = 1; | ||
} | ||
if (wordDegree[word]) { | ||
wordDegree[word] += wordListDegree; | ||
} | ||
else { | ||
wordDegree[word] = wordListDegree; | ||
} | ||
}); | ||
} | ||
} | ||
}); | ||
for(var i in word_freq) { | ||
var freq = word_freq[i]; | ||
word_degree[freq] = word_degree[freq] + word_freq[freq]; | ||
} | ||
for(var i in word_freq){ | ||
word_score[i] = 0; | ||
word_score[i] = word_degree[i] / (word_freq[i] * 1.0); | ||
} | ||
return word_score | ||
Object.values(wordFreq).forEach((freq) => { wordDegree[freq] += wordFreq[freq]; }); | ||
Object.keys(wordFreq).forEach((i) => { wordScore[i] = wordDegree[i] / (wordFreq[i] * 1.0); }); | ||
return wordScore; | ||
} | ||
//Generates score for each phrase based on the word scores. | ||
calculatePhraseScores(phrase_list, word_score) { | ||
var phrase_scores = {} | ||
for(var p in phrase_list){ | ||
var phrase = phrase_list[p]; | ||
phrase_scores[phrase] = 0; | ||
var word_list = phrase.match(/(\b[^\s]+\b)/g) | ||
var candidate_score = 0; | ||
for(var w in word_list){ | ||
var word = word_list[w]; | ||
candidate_score += word_score[word]; | ||
} | ||
phrase_scores[phrase] = candidate_score; | ||
} | ||
return phrase_scores | ||
// Generates score for each phrase based on the word scores. | ||
calculatePhraseScores(phraseList, wordScore) { | ||
const phraseScores = {}; | ||
phraseList.forEach((phrase) => { | ||
phraseScores[phrase] = 0; | ||
let candidateScore = 0; | ||
const wordList = phrase.match(/(\b[^\s]+\b)/g); | ||
wordList.forEach((word) => { candidateScore += wordScore[word]; }); | ||
phraseScores[phrase] = candidateScore; | ||
}); | ||
return phraseScores; | ||
} | ||
sortPhrases(obj) { | ||
var keys = []; for(var key in obj) keys.push(key); | ||
return keys.sort(function(a,b){return obj[b]-obj[a]}); | ||
return Object.keys(obj).sort((a, b) => obj[b] - obj[a]); | ||
} | ||
generate() { | ||
var sentence_list = this.splitTextToSentences(this.text); | ||
var phrases_list = this.generatePhrases(sentence_list); | ||
var word_scores = this.calculateKeywordScores(phrases_list) | ||
var phrase_scores = this.calculatePhraseScores(phrases_list, word_scores) | ||
var result = this.sortPhrases(phrase_scores) | ||
return result | ||
const sentenceList = this.splitTextToSentences(this.text); | ||
const phrasesList = this.generatePhrases(sentenceList); | ||
const wordScores = this.calculateKeywordScores(phrasesList); | ||
const phraseScores = this.calculatePhraseScores(phrasesList, wordScores); | ||
const result = this.sortPhrases(phraseScores); | ||
return result; | ||
} | ||
} | ||
module.exports = Rake | ||
module.exports = Rake; |
{ | ||
"name": "node-rake", | ||
"version": "0.0.9", | ||
"version": "1.0.0", | ||
"description": "A NodeJS implementation of the Rapid Automatic Keyword Extraction algorithm.", | ||
@@ -26,4 +26,7 @@ "main": "app.js", | ||
"babel-preset-es2015": "^6.22.0", | ||
"eslint": "^4.4.0", | ||
"eslint-config-airbnb-base": "^11.3.1", | ||
"eslint-plugin-import": "^2.7.0", | ||
"jest": "^18.1.0" | ||
} | ||
} |
# node-rake | ||
[![npm](https://img.shields.io/npm/dm/node-rake.svg)](https://www.npmjs.com/package/node-rake) [![npm](https://img.shields.io/npm/v/node-rake.svg)]() | ||
[![Build Status](https://travis-ci.org/waseem18/node-rake.svg?branch=master)](https://travis-ci.org/waseem18/node-rake) [![npm](https://img.shields.io/npm/dm/node-rake.svg)](https://www.npmjs.com/package/node-rake) [![npm](https://img.shields.io/npm/v/node-rake.svg)]() | ||
@@ -13,6 +13,23 @@ | ||
```javascript | ||
import rake from 'node-rake' | ||
rake.generate(text, opts); | ||
``` | ||
The `opts` param is an object that lets you pass custom parameters to the `generate` method. Options: | ||
- `stopwords`: Optional. An `array` containing a custom stopwords list. By default, the method uses the stopwords list bundled with the package (take a look at [Stopwords source](#stopwords-source)). | ||
## Example of usage: | ||
```javascript | ||
const rake = require('node-rake') | ||
const keywords = rake.generate("LDA stands for Latent Dirichlet Allocation") | ||
// it'll output: [ 'Latent Dirichlet Allocation', 'LDA stands' ] | ||
//or | ||
let rake = require('node-rake') | ||
let keywords = rake.generate("LDA stands for Latent Dirichlet Allocation") | ||
const myStopwords = ['for', 'the', 'a', 'stands', 'test', 'man', 'woman']; | ||
const opts = {stopwords: myStopwords}; | ||
const keywords = rake.generate("LDA stands for Latent Dirichlet Allocation", opts); | ||
// it'll output: [ 'Latent Dirichlet Allocation', 'LDA' ] | ||
``` | ||
@@ -19,0 +36,0 @@ |
Sorry, the diff of this file is not supported yet
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
No v1
Quality: Package is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
12412
12
0
50
7
126
1