Comparing version 0.1.0 to 0.1.1
package.json

{
    "author": "alex.topliceanu <alext@vibetrace.com> (https://github.com/topliceanu)",
    "name": "sum",
    "description": "text summarization utility",
    "version": "0.1.0",
    "homepage": "https://github.com/topliceanu/text-summarization",
    "repository": {
        "type": "git",
        "url": "git@github.com:topliceanu/text-summarization.git"
    },
    "main": "./sum.js",
    "engines": {
        "node": "~0.4.12"
    },
    "dependencies": {
        "underscore": "~1.2.2",
        "underscore.string": "~2.0.0",
        "porter-stemmer": "~0.9.1",
        "vows": "~0.5.13"
    }
}

{
    "author": "alex.topliceanu <alext@vibetrace.com> (https://github.com/topliceanu)",
    "name": "sum",
    "description": "text summarization utility",
    "version": "0.1.1",
    "homepage": "https://github.com/topliceanu/text-summarization",
    "repository": {
        "type": "git",
        "url": "git@github.com:topliceanu/text-summarization.git"
    },
    "main": "./sum.js",
    "engines": {
        "node": ">0.4.12"
    },
    "dependencies": {
        "underscore": "~1.3.1",
        "underscore.string": "~2.0.0",
        "porter-stemmer": "~0.9.1",
        "vows": "0.6.1"
    }
}
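For reference, `"main": "./sum.js"` means the whole library ships as a single file and can be pulled in directly from node. A minimal sketch, assuming the package is installed under its published name `sum` and a node version matching the `engines` field:

```js
// assumes `npm install sum` has been run
var sum = require('sum'); // resolves to ./sum.js via the "main" field

var result = sum({
    corpus: 'Polar bears live in the Arctic. Magic tricks amaze people. Some filler sentence.',
    nSentences: 1
});
console.log(result.summary);
```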
@@ -40,6 +40,26 @@
 var abstract = sum({
-    'corpus': anotherBigString,   // `corpus` is the string you want to summarize
-    'nSentences': 3,              // `nSentences` controls the number of sentences from the original text included in the abstract
-    'exclude': ['polar', 'bear'], // sum.js allows you to exclude sentences that contain any of the words in the `exclude` array param
-    'emphasise': ['magic']        // forces sum.js to include in the summary the sentences that contain the words specified by the `emphasise` param
+    /**
+     * `corpus`: String - the string you want to summarize
+     */
+    'corpus': anotherBigString,
+    /**
+     * `nSentences`: Number - controls the number of sentences from the original text included in the abstract
+     */
+    'nSentences': 3,
+    /**
+     * `nWords`: Number - controls the length, in words, of the nGram output. The output might be longer because some words (for example prepositions) are ignored by the algorithm but still appear in the abstract. When `nWords` is set, `nSentences` is ignored
+     */
+    'nWords': 5,
+    /**
+     * `exclude`: Array[String] - sum.js allows you to exclude from the final abstract any sentences or nGrams that contain any of the words in the `exclude` param
+     */
+    'exclude': ['polar', 'bear'],
+    /**
+     * `emphasise`: Array[String] - forces sum.js to include in the summary the sentences or nGrams that contain any of the words specified by the `emphasise` param
+     */
+    'emphasise': ['magic']
 });
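The call above returns both the joined abstract and the individual picks, matching the return value built in sum.js; a small illustrative sketch (the output values are hypothetical):

```js
var result = sum({ 'corpus': anotherBigString, 'nSentences': 2 });

// result.summary   -> the selected sentences joined with '.'
// result.sentences -> an array holding the original text of each selected sentence
console.log(result.summary);
console.log(result.sentences.length); // at most 2; fewer if the corpus runs out of sentences
```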
@@ -50,5 +70,18 @@
 =============
-Run /tests/specrunner.html in your favourite browser.
-Run node.js tests with the command
+Run /tests/browser/specrunner.html in your favourite browser.
+To run the node tests, make sure you have [vows.js](http://vowsjs.org) installed, then run
+    vows ./tests/node/sum.js
+
+Goals
+=====
+This library is intended to be fully `embeddable`. Its purpose is to be used primarily on the `client-side`.
+It should be `self-contained`, so no API calls to external services.
+It should be as `light` as possible, both in terms of code size and dependencies, and above all it must be `fast`.
+Because of these constraints, the algorithm used is purely statistical, using [TF IDF](http://en.wikipedia.org/wiki/Tf*idf) to calculate abstracts.
+Other methods of text summarization proposed by researchers in [NLP](http://en.wikipedia.org/wiki/Natural_language_processing) and [ML](http://en.wikipedia.org/wiki/Machine_learning) produce better results, but they are not (to the best of my knowledge) practical in the browser context, as many of them require intense computation to produce their output.
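To make the scoring concrete: for each candidate sentence, sum.js sums, over the sentence's stemmed and stop-word-filtered words, a local term-frequency factor times a global rarity factor, then repeatedly picks the highest-scoring sentence. A simplified standalone sketch of that idea (no stemming or stop-word removal here):

```js
// relevance(sentence) = sum over its words of log(1 + freqInSentence) * log(N / sentencesContainingWord)
// `sentenceWords` is a word -> count map; `allSentences` is an array of such maps.
function relevance(sentenceWords, allSentences) {
    var N = allSentences.length;
    return Object.keys(sentenceWords).reduce(function (score, w) {
        var local = Math.log(1 + sentenceWords[w]); // term frequency in this sentence
        var containing = allSentences.filter(function (s) {
            return s[w] !== undefined;
        }).length;
        var global = Math.log(N / containing);      // rarer words weigh more
        return score + local * global;
    }, 0);
}
```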
@@ -61,2 +94,4 @@ TODO
 4. better control for the length of the summary, by words, by letters
 5. add more algorithms to calculate abstracts
+6. make the library's inner utility functions available, as some of them might be useful
sum.js
@@ -1,164 +0,226 @@
(function (_undef) {
    "use strict";

    var wrapper = function (_, stemmer) {

        // default values
        var defaults = {
            nSentences: 1,
            exclude: [],
            emphasise: []
        };

        // regexes
        var sentenceDelimiter = /[.!?;]/;
        var nGramDelimiter = /[.,!?;]/;
        var wordDelimiter = /\s/mg;
        var matchJunk = /["#$%&'()*+,\-\/:<=>@\[\\\]\^_`{|}]/mg;

        var stopWords = ["", "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount", "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as", "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"];

        // function used to clean sentences before splitting into words
        var clean = function (str) {
            return _(str).chain()
                .unescapeHTML()
                .stripTags()
                .clean()
                .value()
                .replace( matchJunk, '' )
                .toLowerCase();
        };
        // Sentence Module
        var Sentence = function (s) {
            var c = clean( s );
            var all = _.words( c, wordDelimiter );
            var words = _(all).chain()
                // remove stop words
                .filter( function (w) {
                    return (stopWords.indexOf( w ) === -1);
                })
                // apply stemmer
                .map( function (w) {
                    return stemmer( w );
                })
                // collect word frequencies
                .reduce( function (collect, w) {
                    collect[w] = collect[w] ? collect[w] + 1 : 1;
                    return collect;
                }, {}).value();
            // remove a word from this sentence to reduce redundancy in results
            var remove = function (w) {
                return delete words[w];
            };
            return {
                orig: s,
                words: words,
                remove: remove
            };
        };
        var sum = function (opts) {
            // handle options
            opts = _.extend( {}, defaults, opts );
            opts.corpus = opts.corpus || _undef;
            if (opts.corpus === _undef) throw Error( 'No input corpus' );
            if (opts.nWords !== _undef && !_.isNumber(opts.nWords)) throw Error( 'Bad value for nWords' );

            // clean corpus
            var s = opts.corpus.split( sentenceDelimiter ); // TODO: keep the sentence ending chars
            var sentences = _(s).map( function (s) {
                return new Sentence(s);
            });

            // splits the sentences into nGrams then applies the same algorithm
            if (opts.nWords) {
                // builds a hash of all words with global frequencies
                var words = _(sentences).reduce( function (collect, s) {
                    _(s.words).each( function (count, w) {
                        collect[w] = collect[w] ? collect[w] + count : count;
                    });
                    return collect;
                }, {});
                // `opts.nSentences` is ignored, output size is determined by lexeme size
                opts.nSentences = 1;
                var nGrams = _(sentences).reduce( function (collect, s) {
                    var orig = s.orig;
                    var partials = _(s.words).reduce( function (memo, freq, w) {
                        var pos = orig.indexOf(' ');
                        if (pos === -1) pos = orig.length;
                        var partial = orig.substr(0, pos);
                        orig = orig.substr(pos + 1);
                        if (partial !== '') memo.push(partial);
                        return memo;
                    }, []);
                    if (partials.length <= opts.nWords) {
                        var newSentence = new Sentence( partials.join(' ') );
                        collect.push( newSentence );
                        return collect;
                    }
                    var i = 0, j = 0, n = partials.length - opts.nWords, m = partials.length, tmp;
                    for (i = 0; i < n; i ++) {
                        tmp = '';
                        for (j = i; j < i + opts.nWords; j ++) {
                            tmp += partials[j] + ' ';
                        }
                        var newSentence = new Sentence( tmp );
                        collect.push( newSentence );
                    }
                    return collect;
                }, []);
                sentences = nGrams;
            }

            // return all sentences that contain a given word
            var containing = function (w) {
                return _(sentences).filter( function (s) {
                    return (s.words[w] !== undefined);
                });
            };
            // if summary must exclude words in opts.exclude, remove sentences that contain those words
            if ( _.isArray(opts.exclude) && opts.exclude.length !== 0) {
                var excludes = _(opts.exclude).map( function (w) {
                    return stemmer(clean(w));
                });
                sentences = _(sentences).filter( function (s) {
                    var words = _(s.words).keys();
                    return (_.intersection( words, excludes ).length === 0);
                });
            }

            var summary = [];
            var counter = 0;
            // extract sentences in order of their relevance
            while (true) {
                var N = sentences.length;
                // builds a hash of all words with global frequencies
                var words = _(sentences).reduce( function (collect, s) {
                    _(s.words).each( function (count, w) {
                        collect[w] = collect[w] ? collect[w] + count : count;
                    });
                    return collect;
                }, {});
                // if summary must have the words in opts.emphasise
                var emphasise = [];
                if ( _.isArray(opts.emphasise) && opts.emphasise.length !== 0) {
                    emphasise = _(opts.emphasise).map( function (w) {
                        return stemmer(clean(w));
                    });
                }
                // calculate relevance for each sentence
                _(sentences).each( function (s) {
                    var relevance = _(s.words).reduce( function (memo, freq, w) {
                        var local = Math.log( 1 + freq );
                        var global = Math.log( N / containing(w).length );
                        return memo + (local * global);
                    }, 0);
                    // if the current sentence contains emphasised words, bump up the relevance
                    var bump = _.intersection(emphasise, _(s.words).keys()).length;
                    relevance += bump * 1000; // big enough to push it in front
                    s.relevance = relevance;
                });
                // highest relevance sentence
                var highest = _(sentences).max( function (s) {
                    return s.relevance;
                });
                // remove words from the remaining sentences to reduce redundancy
                sentences = _(sentences).chain()
                    .without(highest)
                    .map( function (s) {
                        _(highest.words).each( function (w) {
                            s.remove( w );
                        });
                        return s;
                    })
                    .value();
                summary.push( highest.orig );
                counter += 1;
                var stop = (counter === opts.nSentences || sentences.length === 0);
                if (stop) break;
            } //~ end while
            return {
                'summary': summary.join('.'),
                'sentences': summary
            };
        };
        return sum;
    };

    // exports the `sum` function in node.js
    if (typeof exports !== 'undefined' && typeof module !== 'undefined' && module.exports && typeof require !== 'undefined') {
        var stemmer = require( 'porter-stemmer' ).stemmer;
        var _ = require( 'underscore' );
        _.str = require( 'underscore.string' );
        _.mixin( _.str.exports() );
        module.exports = wrapper(_, stemmer);
    }
    // exports `sum` as an AMD module, defining its dependencies
    else if (typeof define === 'function' && define.amd) {
        define('sum', [
            'underscore',
            'underscore.string',
            'porter-stemmer'
        ], function(_, str, stemmer) {
            return wrapper(_, stemmer);
        });
    }
    // export in browser
    else if (typeof this !== 'undefined' && this._ && this.stemmer) {
        this._.mixin( this._.str.exports() );
        this.sum = wrapper(this._, this.stemmer);
    }
    else {
        throw Error( 'unsupported js environment detected' );
    }
}).call(this);
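Given the export shims above, the same `wrapper(_, stemmer)` factory is handed to node, AMD, or a browser global. A brief usage sketch; the AMD and browser variants assume underscore, underscore.string and the porter stemmer are already loadable or defined the way the shim expects:

```js
// node.js: pulls underscore, underscore.string and porter-stemmer from node_modules
var sum = require('sum');

// AMD: the file registers a named module 'sum' depending on
// 'underscore', 'underscore.string' and 'porter-stemmer' (their paths must be configured)
require(['sum'], function (sum) {
    console.log(sum({ corpus: 'First point. Second point. Third point.' }).summary);
});

// browser globals: the shim expects `_` (with `_.str` attached) and `stemmer`
// on the global object before sum.js runs, and then exposes a global `sum`
var abstract = sum({ corpus: document.body.textContent, nSentences: 2 });
```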
@@ -35,3 +35,11 @@ describe( 'test sum\' params', function () {
         });
     }) ;
 });
+describe( 'test nWords params in action', function () {
+    corpora.forEach( function (corpus) {
+        it( 'should calculate the summary', function () {
+            var actual = sum({ 'corpus': corpus.text, 'nWords': 5 });
+            expect(actual.sentences.length).not.toEqual('');
+        });
+    });
+});
//TODO add tests to validate correctness of the actual output
@@ -24,2 +24,17 @@ var vows = require( 'vows' );
 })
+.addBatch({
+    'when summarizing a text': {
+        topic: function () {
+            var s = sum({
+                'corpus': corpus,
+                'nWords': 5
+            });
+            return s.summary;
+        },
+        'it should output the abstract containing the most relevant sentences for the meaning of the initial text': function (error, summary) {
+            assert.ifError( error );
+            assert.isString( summary );
+        }
+    }
+})
 .export(module);
+ Added underscore@1.3.3 (transitive)
+ Added vows@0.6.1 (transitive)
- Removed underscore@1.2.4 (transitive)
- Removed vows@0.5.13 (transitive)
Updated underscore@~1.3.1
Updated vows@0.6.1