Comparing version 0.1.0 to 0.1.1
package.json

{
    "author": "alex.topliceanu <alext@vibetrace.com> (https://github.com/topliceanu)",
    "name": "sum",
    "description": "text summarization utility",
    "version": "0.1.0",
    "homepage": "https://github.com/topliceanu/text-summarization",
    "repository": {
        "type": "git",
        "url": "git@github.com:topliceanu/text-summarization.git"
    },
    "main": "./sum.js",
    "engines": {
        "node": "~0.4.12"
    },
    "dependencies": {
        "underscore": "~1.2.2",
        "underscore.string": "~2.0.0",
        "porter-stemmer": "~0.9.1",
        "vows": "~0.5.13"
    }
}

{
    "author": "alex.topliceanu <alext@vibetrace.com> (https://github.com/topliceanu)",
    "name": "sum",
    "description": "text summarization utility",
    "version": "0.1.1",
    "homepage": "https://github.com/topliceanu/text-summarization",
    "repository": {
        "type": "git",
        "url": "git@github.com:topliceanu/text-summarization.git"
    },
    "main": "./sum.js",
    "engines": {
        "node": ">0.4.12"
    },
    "dependencies": {
        "underscore": "~1.3.1",
        "underscore.string": "~2.0.0",
        "porter-stemmer": "~0.9.1",
        "vows": "0.6.1"
    }
}
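For reference, `"main": "./sum.js"` means the whole library ships as a single file and can be pulled in directly from node. A minimal sketch, assuming the package is installed under its published name `sum` and a node version matching the `engines` field:

```js
// assumes `npm install sum` has been run
var sum = require('sum'); // resolves to ./sum.js via the "main" field

var result = sum({
    corpus: 'Polar bears live in the Arctic. Magic tricks amaze people. Some filler sentence.',
    nSentences: 1
});
console.log(result.summary);
```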
@@ -40,6 +40,26 @@
 var abstract = sum({
-    'corpus': anotherBigString,   // `corpus` is the string you want to summarize
-    'nSentences': 3,              // `nSentences` controls the number of sentences from the original text included in the abstract
-    'exclude': ['polar', 'bear'], // sum.js allows you to exclude sentences that contain any of the words in the `exclude` array param
-    'emphasise': ['magic']        // forces sum.js to include in the summary the sentences that contain the words specified by the `emphasise` param
+    /**
+     * `corpus`: String - the string you want to summarize
+     */
+    'corpus': anotherBigString,
+    /**
+     * `nSentences`: Number - controls the number of sentences from the original text included in the abstract
+     */
+    'nSentences': 3,
+    /**
+     * `nWords`: Number - controls the length, in words, of the nGram output. The output might be longer because some words (for example prepositions) are ignored by the algorithm but still appear in the abstract. When `nWords` is set, `nSentences` is ignored
+     */
+    'nWords': 5,
+    /**
+     * `exclude`: Array[String] - sum.js allows you to exclude from the final abstract any sentences or nGrams that contain any of the words in the `exclude` param
+     */
+    'exclude': ['polar', 'bear'],
+    /**
+     * `emphasise`: Array[String] - forces sum.js to include in the summary the sentences or nGrams that contain any of the words specified by the `emphasise` param
+     */
+    'emphasise': ['magic']
 });
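The call above returns both the joined abstract and the individual picks, matching the return value built in sum.js; a small illustrative sketch (the output values are hypothetical):

```js
var result = sum({ 'corpus': anotherBigString, 'nSentences': 2 });

// result.summary   -> the selected sentences joined with '.'
// result.sentences -> an array holding the original text of each selected sentence
console.log(result.summary);
console.log(result.sentences.length); // at most 2; fewer if the corpus runs out of sentences
```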
@@ -50,5 +70,18 @@
 =============
-Run /tests/specrunner.html in your favourite browser.
-Run node.js tests with the command
+Run /tests/browser/specrunner.html in your favourite browser.
+To run the node tests, make sure you have [vows.js](http://vowsjs.org) installed, then run
+    vows ./tests/node/sum.js
+
+Goals
+=====
+This library is intended to be fully `embeddable`. Its purpose is to be used primarily on the `client-side`.
+It should be `self-contained`, so no API calls to external services.
+It should be as `light` as possible, both in terms of code size and dependencies, and above all it must be `fast`.
+Because of these constraints, the algorithm used is purely statistical, using [TF IDF](http://en.wikipedia.org/wiki/Tf*idf) to calculate abstracts.
+Other methods of text summarization proposed by researchers in [NLP](http://en.wikipedia.org/wiki/Natural_language_processing) and [ML](http://en.wikipedia.org/wiki/Machine_learning) produce better results, but they are not (to the best of my knowledge) practical in the browser context, as many of them require intense computation to produce their output.
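To make the scoring concrete: for each candidate sentence, sum.js sums, over the sentence's stemmed and stop-word-filtered words, a local term-frequency factor times a global rarity factor, then repeatedly picks the highest-scoring sentence. A simplified standalone sketch of that idea (no stemming or stop-word removal here):

```js
// relevance(sentence) = sum over its words of log(1 + freqInSentence) * log(N / sentencesContainingWord)
// `sentenceWords` is a word -> count map; `allSentences` is an array of such maps.
function relevance(sentenceWords, allSentences) {
    var N = allSentences.length;
    return Object.keys(sentenceWords).reduce(function (score, w) {
        var local = Math.log(1 + sentenceWords[w]); // term frequency in this sentence
        var containing = allSentences.filter(function (s) {
            return s[w] !== undefined;
        }).length;
        var global = Math.log(N / containing);      // rarer words weigh more
        return score + local * global;
    }, 0);
}
```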
@@ -61,2 +94,4 @@ TODO
 4. better control for the length of the summary, by words, by letters
 5. add more algorithms to calculate abstracts
+6. make the library's inner utility functions available, as some of them might be useful
sum.js
@@ -1,164 +0,226 @@
(function (_undef) {
    "use strict";

    var wrapper = function (_, stemmer) {

        // default values
        var defaults = {
            nSentences: 1,
            exclude: [],
            emphasise: []
        };

        // regexes
        var sentenceDelimiter = /[.!?;]/;
        var nGramDelimiter = /[.,!?;]/;
        var wordDelimiter = /\s/mg;
        var matchJunk = /["#$%&'()*+,\-\/:<=>@\[\\\]\^_`{|}]/mg;

        var stopWords = ["", "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount", "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as", "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"];

        // function used to clean sentences before splitting into words
        var clean = function (str) {
            return _(str).chain()
                .unescapeHTML()
                .stripTags()
                .clean()
                .value()
                .replace( matchJunk, '' )
                .toLowerCase();
        };
        // Sentence Module
        var Sentence = function (s) {
            var c = clean( s );
            var all = _.words( c, wordDelimiter );
            var words = _(all).chain()
                // remove stop words
                .filter( function (w) {
                    return (stopWords.indexOf( w ) === -1);
                })
                // apply stemmer
                .map( function (w) {
                    return stemmer( w );
                })
                // collect word frequencies
                .reduce( function (collect, w) {
                    collect[w] = collect[w] ? collect[w] + 1 : 1;
                    return collect;
                }, {}).value();
            // remove a word from this sentence to reduce redundancy in results
            var remove = function (w) {
                return delete words[w];
            };
            return {
                orig: s,
                words: words,
                remove: remove
            };
        };
        var sum = function (opts) {
            // handle options
            opts = _.extend( {}, defaults, opts );
            opts.corpus = opts.corpus || _undef;
            if (opts.corpus === _undef) throw Error( 'No input corpus' );
            if (opts.nWords !== _undef && !_.isNumber(opts.nWords)) throw Error( 'Bad value for nWords' );

            // clean corpus
            var s = opts.corpus.split( sentenceDelimiter ); // TODO: keep the sentence ending chars
            var sentences = _(s).map( function (s) {
                return new Sentence(s);
            });

            // splits the sentences into nGrams then applies the same algorithm
            if (opts.nWords) {
                // builds a hash of all words with global frequencies
                var words = _(sentences).reduce( function (collect, s) {
                    _(s.words).each( function (count, w) {
                        collect[w] = collect[w] ? collect[w] + count : count;
                    });
                    return collect;
                }, {});
                // `opts.nSentences` is ignored, output size is determined by lexeme size
                opts.nSentences = 1;
                var nGrams = _(sentences).reduce( function (collect, s) {
                    var orig = s.orig;
                    var partials = _(s.words).reduce( function (memo, freq, w) {
                        var pos = orig.indexOf(' ');
                        if (pos === -1) pos = orig.length;
                        var partial = orig.substr(0, pos);
                        orig = orig.substr(pos + 1);
                        if (partial !== '') memo.push(partial);
                        return memo;
                    }, []);
                    if (partials.length <= opts.nWords) {
                        var newSentence = new Sentence( partials.join(' ') );
                        collect.push( newSentence );
                        return collect;
                    }
                    var i = 0, j = 0, n = partials.length - opts.nWords, m = partials.length, tmp;
                    for (i = 0; i < n; i ++) {
                        tmp = '';
                        for (j = i; j < i + opts.nWords; j ++) {
                            tmp += partials[j] + ' ';
                        }
                        var newSentence = new Sentence( tmp );
                        collect.push( newSentence );
                    }
                    return collect;
                }, []);
                sentences = nGrams;
            }

            // return all sentences that contain a given word
            var containing = function (w) {
                return _(sentences).filter( function (s) {
                    return (s.words[w] !== undefined);
                });
            };
            // if summary must exclude words in opts.exclude, remove sentences that contain those words
            if ( _.isArray(opts.exclude) && opts.exclude.length !== 0) {
                var excludes = _(opts.exclude).map( function (w) {
                    return stemmer(clean(w));
                });
                sentences = _(sentences).filter( function (s) {
                    var words = _(s.words).keys();
                    return (_.intersection( words, excludes ).length === 0);
                });
            }

            var summary = [];
            var counter = 0;
            // extract sentences in order of their relevance
            while (true) {
                var N = sentences.length;
                // builds a hash of all words with global frequencies
                var words = _(sentences).reduce( function (collect, s) {
                    _(s.words).each( function (count, w) {
                        collect[w] = collect[w] ? collect[w] + count : count;
                    });
                    return collect;
                }, {});
                // if summary must have the words in opts.emphasise
                var emphasise = [];
                if ( _.isArray(opts.emphasise) && opts.emphasise.length !== 0) {
                    emphasise = _(opts.emphasise).map( function (w) {
                        return stemmer(clean(w));
                    });
                }
                // calculate relevance for each sentence
                _(sentences).each( function (s) {
                    var relevance = _(s.words).reduce( function (memo, freq, w) {
                        var local = Math.log( 1 + freq );
                        var global = Math.log( N / containing(w).length );
                        return memo + (local * global);
                    }, 0);
                    // if the current sentence contains emphasised words, bump up the relevance
                    var bump = _.intersection(emphasise, _(s.words).keys()).length;
                    relevance += bump * 1000; // big enough to push it in front
                    s.relevance = relevance;
                });
                // highest relevance sentence
                var highest = _(sentences).max( function (s) {
                    return s.relevance;
                });
                // remove words from the remaining sentences to reduce redundancy
                sentences = _(sentences).chain()
                    .without(highest)
                    .map( function (s) {
                        _(highest.words).each( function (w) {
                            s.remove( w );
                        });
                        return s;
                    })
                    .value();
                summary.push( highest.orig );
                counter += 1;
                var stop = (counter === opts.nSentences || sentences.length === 0);
                if (stop) break;
            } //~ end while
            return {
                'summary': summary.join('.'),
                'sentences': summary
            };
        };
        return sum;
    };

    // exports the `sum` function in node.js
    if (typeof exports !== 'undefined' && typeof module !== 'undefined' && module.exports && typeof require !== 'undefined') {
        var stemmer = require( 'porter-stemmer' ).stemmer;
        var _ = require( 'underscore' );
        _.str = require( 'underscore.string' );
        _.mixin( _.str.exports() );
        module.exports = wrapper(_, stemmer);
    }
    // exports `sum` as an AMD module, defining its dependencies
    else if (typeof define === 'function' && define.amd) {
        define('sum', [
            'underscore',
            'underscore.string',
            'porter-stemmer'
        ], function(_, str, stemmer) {
            return wrapper(_, stemmer);
        });
    }
    // export in browser
    else if (typeof this !== 'undefined' && this._ && this.stemmer) {
        this._.mixin( this._.str.exports() );
        this.sum = wrapper(this._, this.stemmer);
    }
    else {
        throw Error( 'unsupported js environment detected' );
    }
}).call(this);
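Given the export shims above, the same `wrapper(_, stemmer)` factory is handed to node, AMD, or a browser global. A brief usage sketch; the AMD and browser variants assume underscore, underscore.string and the porter stemmer are already loadable or defined the way the shim expects:

```js
// node.js: pulls underscore, underscore.string and porter-stemmer from node_modules
var sum = require('sum');

// AMD: the file registers a named module 'sum' depending on
// 'underscore', 'underscore.string' and 'porter-stemmer' (their paths must be configured)
require(['sum'], function (sum) {
    console.log(sum({ corpus: 'First point. Second point. Third point.' }).summary);
});

// browser globals: the shim expects `_` (with `_.str` attached) and `stemmer`
// on the global object before sum.js runs, and then exposes a global `sum`
var abstract = sum({ corpus: document.body.textContent, nSentences: 2 });
```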
@@ -35,3 +35,11 @@ describe( 'test sum\' params', function () {
         });
     }) ;
 });
+describe( 'test nWords params in action', function () {
+    corpora.forEach( function (corpus) {
+        it( 'should calculate the summary', function () {
+            var actual = sum({ 'corpus': corpus.text, 'nWords': 5 });
+            expect(actual.sentences.length).not.toEqual('');
+        });
+    });
+});
//TODO add tests to validate correctness of the actual output
@@ -24,2 +24,17 @@ var vows = require( 'vows' );
 })
+.addBatch({
+    'when summarizing a text': {
+        topic: function () {
+            var s = sum({
+                'corpus': corpus,
+                'nWords': 5
+            });
+            return s.summary;
+        },
+        'it should output the abstract containing the most relevant sentences for the meaning of the initial text': function (error, summary) {
+            assert.ifError( error );
+            assert.isString( summary );
+        }
+    }
+})
 .export(module);
+ Added underscore@1.3.3 (transitive)
+ Added vows@0.6.1 (transitive)
- Removed underscore@1.2.4 (transitive)
- Removed vows@0.5.13 (transitive)
Updated underscore@~1.3.1
Updated vows@0.6.1