Comparing version 0.1.1 to 0.1.2
{ | ||
"author": "alex.topliceanu <alext@vibetrace.com> (https://github.com/topliceanu)", | ||
"name": "sum", | ||
"description": "text summarization utility", | ||
"version": "0.1.1", | ||
"homepage": "https://github.com/topliceanu/text-summarization", | ||
"repository": { | ||
"type": "git", | ||
"url": "git@github.com:topliceanu/text-summarization.git" | ||
}, | ||
"main": "./sum.js", | ||
"engines": { | ||
"node": ">0.4.12" | ||
}, | ||
"dependencies": { | ||
"underscore": "~1.3.1", | ||
"underscore.string": "~2.0.0", | ||
"porter-stemmer": "~0.9.1", | ||
"vows": "0.6.1" | ||
} | ||
"name": "sum", | ||
"version": "0.1.2", | ||
"description": "text summarization utility", | ||
"homepage": "https://github.com/topliceanu/text-summarization", | ||
"license": "MIT", | ||
"keywords": [ | ||
"summarization", | ||
"nlp", | ||
"stemmer", | ||
"stop-words", | ||
"express" | ||
], | ||
"author": "alex.topliceanu <alexandru.topliceanu@gmail.com> (https://github.com/topliceanu)", | ||
"repository": { | ||
"type": "git", | ||
"url": "git://github.com:topliceanu/text-summarization.git" | ||
}, | ||
"bugs": { | ||
"url": "https://github.com/topliceanu/text-summarization/issues" | ||
}, | ||
"main": "./sum.js", | ||
"scripts": { | ||
"test": "./node_modules/.bin/vows --spec --isolate ./tests/node/sum.js", | ||
"lint": "./node_modules/.bin/jshint sum.js" | ||
}, | ||
"dependencies": { | ||
"underscore": "1.7.0", | ||
"underscore.string": "3.0.3", | ||
"porter-stemmer": "0.9.1" | ||
}, | ||
"devDependencies": { | ||
"vows": "0.8.1", | ||
"jshint": "2.6.0" | ||
}, | ||
"optionalDependencies": {}, | ||
"engines": { | ||
"node": ">0.10.0" | ||
} | ||
} |
144
README.md
@@ -1,80 +0,96 @@ | ||
_____ _ | ||
/ ____| (_) | ||
| (___ _ _ _ __ ___ _ ___ | ||
_____ _ | ||
/ ____| (_) | ||
| (___ _ _ _ __ ___ _ ___ | ||
\___ \ | | | || '_ ` _ \ | |/ __| | ||
____) || |_| || | | | | | _ | |\__ \ | ||
|_____/ \__,_||_| |_| |_|(_)| ||___/ | ||
_/ | | ||
|__/ | ||
_/ | | ||
|__/ | ||
Sum.js | ||
============ | ||
## Sum.js | ||
[![NPM](https://nodei.co/npm/sum.png?downloads=true&stars=true)](https://nodei.co/npm/sum/) | ||
[![NPM](https://nodei.co/npm-dl/sum.png?months=12)](https://nodei.co/npm-dl/sum/) | ||
| Indicator | | | ||
|:-----------------------|:-------------------------------------------------------------------------| | ||
| continuous integration | [![Build Status](https://travis-ci.org/topliceanu/sum.svg?branch=master)](https://travis-ci.org/topliceanu/sum) | | ||
| dependency management | [![Dependency Status](https://david-dm.org/topliceanu/sum.svg?style=flat)](https://david-dm.org/topliceanu/sum) [![devDependency Status](https://david-dm.org/topliceanu/sum/dev-status.svg?style=flat)](https://david-dm.org/topliceanu/sum#info=devDependencies) | | ||
| change log | [CHANGELOG](https://github.com/topliceanu/sum/blob/master/CHANGELOG.md) [Releases](https://github.com/topliceanu/sum/releases) | | ||
A simple function for summarizing text e.g. for automatically determining the sentences that are most relevant to the context of the corpus. | ||
This library depends on the [underscore](http://documentcloud.github.com/underscore/), [underscore.string](http://epeli.github.com/underscore.string/) and [porter-stemmer](https://github.com/jedp/porter-stemmer) for the moment | ||
This library depends on the [underscore](http://documentcloud.github.com/underscore/), [underscore.string](http://epeli.github.com/underscore.string/) and [porter-stemmer](https://github.com/jedp/porter-stemmer). | ||
Install in node.js | ||
================== | ||
sudo npm install -g sum | ||
## Install in node.js | ||
Install in browser | ||
================== | ||
<script src="/lib/underscore.js"></script> | ||
<script src="/lib/underscore.string.js"></script> | ||
<script src="/lib/porter-stemmer.js"></script> | ||
<script src="/sum.browser.js"></script> | ||
```bash | ||
sudo npm install -g sum | ||
``` | ||
Quick Start | ||
=========== | ||
var sum = require( 'sum' ); | ||
var bigString = "...."; | ||
var abstract = sum({ 'corpus': bigString }); | ||
## Install in browser | ||
Further Options | ||
=============== | ||
var sum = require( 'sum' ); | ||
var anotherBigString = "..."; | ||
var abstract = sum({ | ||
/** | ||
* `corpus`: String - is the string you want to summarize | ||
*/ | ||
'corpus': anotherBigString, | ||
```html | ||
<script src="/lib/underscore.js"></script> | ||
<script src="/lib/underscore.string.js"></script> | ||
<script src="/lib/porter-stemmer.js"></script> | ||
<script src="/sum.js"></script> | ||
``` | ||
/** | ||
* `nSentences`: Number - controls the number of sentences from the original text included in the abstract | ||
*/ | ||
'nSentences': 3, | ||
## Quick Start | ||
/** | ||
* `nWords`: Number - controls the length in words of the nGram output. Output might be larger as some words are ignored in the algorithm but present in the abstract, for ex. prepositions. When `nWords` is set, `nSentences` is ignored | ||
*/ | ||
'nWords': 5, | ||
/** | ||
* `exclude`: Array[String] - sum.js allows you to exclude from the final abstract, sentences or nGrams that contain any of the words in the `exclude` param | ||
*/ | ||
'exclude': ['polar', 'bear'], | ||
```javascript | ||
var sum = require( 'sum' ); | ||
var bigString = "...."; | ||
var abstract = sum({ 'corpus': bigString }); | ||
// `abstract` is an object w/ format `{"summary":String, "sentences":Array<String>}` | ||
// where summary is the concatenation of the array of sentences. | ||
``` | ||
/** | ||
* `emphasise`: Array[String] - forces sum.js to include in the summary the sentences or nGrams that contain any of the words specified by `emphasise` param. | ||
*/ | ||
'emphasise': ['magic'] | ||
}); | ||
## Further Options | ||
```javascript | ||
var sum = require( 'sum' ); | ||
var anotherBigString = "..."; | ||
var abstract = sum({ | ||
/** | ||
* `corpus`: String - is the string you want to summarize | ||
*/ | ||
'corpus': anotherBigString, | ||
Running tests | ||
============= | ||
Run /tests/browser/specrunner.html in your favourite browser. | ||
/** | ||
* `nSentences`: Number - controls the number of sentences from the original text included in the abstract | ||
*/ | ||
'nSentences': 3, | ||
To run node tests, make sure you have [vows.js](http://vowsjs.org) installed then run | ||
/** | ||
* `nWords`: Number - controls the length in words of the nGram output. Output might be larger as some words are ignored in the algorithm but present in the abstract, for ex. prepositions. When `nWords` is set, `nSentences` is ignored | ||
*/ | ||
'nWords': 5, | ||
vows ./tests/node/sum.js | ||
/** | ||
* `exclude`: Array[String] - sum.js allows you to exclude from the final abstract, sentences or nGrams that contain any of the words in the `exclude` param | ||
*/ | ||
'exclude': ['polar', 'bear'], | ||
/** | ||
* `emphasise`: Array[String] - forces sum.js to include in the summary the sentences or nGrams that contain any of the words specified by `emphasise` param. | ||
*/ | ||
'emphasise': ['magic'] | ||
}); | ||
Goals | ||
===== | ||
//`abstract` is an object with format {'sentences':Array<String>, 'summary':String} where summary is just the concatenation of the sentences, for convenience. | ||
console.log("The short version of corpus is ", abstract.summary); | ||
``` | ||
This library is intended to be fully `embeddable`. Its purpose is to be used primarily on the `client-side`. | ||
## Running tests | ||
Run `/tests/browser/specrunner.html` in your favourite browser. | ||
To run node tests, run `npm run test`. | ||
## Goals | ||
This library is intended to be fully `embeddable`. Its purpose is to be used primarily on the `client-side`. | ||
It should be `self-contained` so no API calls to external services. | ||
@@ -85,5 +101,4 @@ It should be as `light` as possible, both in terms of code size and dependencies and above all it must be `fast`. | ||
TODO | ||
==== | ||
## TODO | ||
1. add tests to verify the correctness of the actual output | ||
@@ -97,8 +112,7 @@ 2. currently the output does not preserve the ending chars of the original sentences | ||
Licence | ||
======= | ||
## Licence | ||
(The MIT License) | ||
Copyright (c) 2009-2011 Alex Topliceanu <alext@vibetrace.com> | ||
Copyright (c) Alex Topliceanu <alexandru.topliceanu@gmail.com> | ||
@@ -105,0 +119,0 @@ Permission is hereby granted, free of charge, to any person obtaining |
493
sum.js
(function (_undef) { | ||
"use strict"; | ||
"use strict"; | ||
var wrapper = function (_, stemmer) { | ||
//default values | ||
var defaults = { | ||
nSentences: 1, | ||
exclude: [], | ||
emphasise: [] | ||
}; | ||
// regexes | ||
var sentenceDelimiter = /[.!?;]/; | ||
var nGramDelimiter = /[.,!?;]/; | ||
var wordDelimiter = /\s/mg; | ||
var matchJunk = /["#$%&'()*+,\-\/:<=>@\[\\\]\^_`{|}]/mg ; | ||
/** | ||
* Function wraps the library code to allow passing in the | ||
* dependencies easily. | ||
* @param {Object} _ - Reference to underscore.js | ||
* @param {Object} stemmer - Porter stemmer implementation in js. | ||
* @return {Function} Sumarization function. | ||
*/ | ||
var wrapper = function (_, stemmer) { | ||
var stopWords = ["", "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount", "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as", "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", 
"some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"]; | ||
// Params default values. | ||
var defaults = { | ||
nSentences: 1, | ||
exclude: [], | ||
emphasise: [] | ||
}; | ||
// function used to clean sentences before splitting into words | ||
var clean = function (str) { | ||
return _(str).chain() | ||
.unescapeHTML() | ||
.stripTags() | ||
.clean() | ||
.value() | ||
.replace( matchJunk, '' ) | ||
.toLowerCase(); | ||
}; | ||
// regexes | ||
var sentenceDelimiter = /[.!?;]/; | ||
var nGramDelimiter = /[.,!?;]/; | ||
var wordDelimiter = /\s/mg; | ||
var matchJunk = /["#$%&'()*+,\-\/:<=>@\[\\\]\^_`{|}]/mg ; | ||
// Sentence Module | ||
var Sentence = function (s) { | ||
var c = clean( s ); | ||
var all = _.words( c, wordDelimiter ); | ||
var words = _(all).chain() | ||
// remove stop words | ||
.filter( function (w) { | ||
return (stopWords.indexOf( w ) === -1) ; | ||
}) | ||
// apply stemmer | ||
.map( function (w) { | ||
return stemmer( w ); | ||
}) | ||
// collect word frequencies | ||
.reduce( function (collect, w) { | ||
collect[w] = collect[w] ? collect[w] + 1 : 1 ; | ||
return collect; | ||
}, {}).value(); | ||
// remove a word from this sentence to reduce redundancy in results | ||
var remove = function (w) { | ||
return delete words[w]; | ||
}; | ||
return { | ||
orig: s, | ||
words: words, | ||
remove: remove | ||
}; | ||
}; | ||
// List of words which are ignored when computing top relevant sentences. | ||
var stopWords = ["", "a", "about", "above", "above", "across", "after", | ||
"afterwards", "again", "against", "all", "almost", "alone", "along", | ||
"already", "also","although","always","am","among", "amongst", | ||
"amoungst", "amount", "an", "and", "another", "any","anyhow", | ||
"anyone","anything","anyway", "anywhere", "are", "around", "as", | ||
"at", "back","be","became", "because","become","becomes", | ||
"becoming", "been", "before", "beforehand", "behind", "being", | ||
"below", "beside", "besides", "between", "beyond", "bill", "both", | ||
"bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", | ||
"could", "couldnt", "cry", "de", "describe", "detail", "do", "done", | ||
"down", "due", "during", "each", "eg", "eight", "either", "eleven", | ||
"else", "elsewhere", "empty", "enough", "etc", "even", "ever", | ||
"every", "everyone", "everything", "everywhere", "except", "few", | ||
"fifteen", "fify", "fill", "find", "fire", "first", "five", "for", | ||
"former", "formerly", "forty", "found", "four", "from", "front", | ||
"full", "further", "get", "give", "go", "had", "has", "hasnt", | ||
"have", "he", "hence", "her", "here", "hereafter", "hereby", | ||
"herein", "hereupon", "hers", "herself", "him", "himself", "his", | ||
"how", "however", "hundred", "ie", "if", "in", "inc", "indeed", | ||
"interest", "into", "is", "it", "its", "itself", "keep", "last", | ||
"latter", "latterly", "least", "less", "ltd", "made", "many", "may", | ||
"me", "meanwhile", "might", "mill", "mine", "more", "moreover", | ||
"most", "mostly", "move", "much", "must", "my", "myself", "name", | ||
"namely", "neither", "never", "nevertheless", "next", "nine", "no", | ||
"nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", | ||
"of", "off", "often", "on", "once", "one", "only", "onto", "or", | ||
"other", "others", "otherwise", "our", "ours", "ourselves", "out", | ||
"over", "own","part", "per", "perhaps", "please", "put", "rather", | ||
"re", "same", "see", "seem", "seemed", "seeming", "seems", | ||
"serious", "several", "she", "should", "show", "side", "since", | ||
"sincere", "six", "sixty", "so", "some", "somehow", "someone", | ||
"something", "sometime", "sometimes", "somewhere", "still", "such", | ||
"system", "take", "ten", "than", "that", "the", "their", "them", | ||
"themselves", "then", "thence", "there", "thereafter", "thereby", | ||
"therefore", "therein", "thereupon", "these", "they", "thickv", | ||
"thin", "third", "this", "those", "though", "three", "through", | ||
"throughout", "thru", "thus", "to", "together", "too", "top", | ||
"toward", "towards", "twelve", "twenty", "two", "un", "under", | ||
"until", "up", "upon", "us", "very", "via", "was", "we", "well", | ||
"were", "what", "whatever", "when", "whence", "whenever", "where", | ||
"whereafter", "whereas", "whereby", "wherein", "whereupon", | ||
"wherever", "whether", "which", "while", "whither", "who", | ||
"whoever", "whole", "whom", "whose", "why", "will", "with", | ||
"within", "without", "would", "yet", "you", "your", "yours", | ||
"yourself", "yourselves", "the"]; | ||
var sum = function (opts){ | ||
/** | ||
* Function used to clean sentences before splitting into words | ||
* @param {String} str | ||
* @return {String} | ||
*/ | ||
var clean = function (str) { | ||
return _(str).chain() | ||
.unescapeHTML() | ||
.stripTags() | ||
.clean() | ||
.value() | ||
.replace( matchJunk, '' ) | ||
.toLowerCase(); | ||
}; | ||
// handle options | ||
opts = _.extend( {}, defaults, opts ); | ||
opts.corpus = opts.corpus || _undef; | ||
if (opts.corpus === _undef) throw Error( 'No input corpus' ); | ||
if (opts.nWords !== _undef && !_.isNumber(opts.nWords)) throw Error('Bad value for nWords'); | ||
/** | ||
* Sentence Module. Creates object with format: | ||
* {orig:String, words:Array<String>, remove:Function} | ||
*/ | ||
var Sentence = function (s) { | ||
var c = clean( s ); | ||
var all = _.words( c, wordDelimiter ); | ||
var words = _(all).chain() | ||
// remove stop words | ||
.filter( function (w) { | ||
return (stopWords.indexOf( w ) === -1) ; | ||
}) | ||
// apply stemmer | ||
.map( function (w) { | ||
return stemmer( w ); | ||
}) | ||
// collect word frequencies | ||
.reduce( function (collect, w) { | ||
collect[w] = collect[w] ? collect[w] + 1 : 1 ; | ||
return collect; | ||
}, {}).value(); | ||
// remove a word from this sentence to reduce redundancy in results | ||
var remove = function (w) { | ||
return delete words[w]; | ||
}; | ||
return { | ||
orig: s, | ||
words: words, | ||
remove: remove | ||
}; | ||
}; | ||
// clean corpus | ||
var s = opts.corpus.split( sentenceDelimiter ); // TODO: keep the sentence ending chars | ||
var sentences = _(s).map( function (s) { | ||
return new Sentence(s); | ||
}); | ||
/** | ||
* Text summarization function. | ||
* @param {Object} opts | ||
* @param {String} opts.corpus - String to summarize. | ||
* @param {String} opts.nWords - Number of words the summary should have. | ||
* @param {String} opts.nSentences - Number of sentences the summary should have. | ||
* @return {Object} output | ||
* @return {Array<String>} output.sentences - The summary sentences in | ||
* order of relevance to the input text. | ||
* @return {String} output.summary - the concatenation of the summary | ||
* sentences for convenience. | ||
*/ | ||
var sum = function (opts){ | ||
// Handle options. | ||
opts = _.extend( {}, defaults, opts ); | ||
opts.corpus = opts.corpus || _undef; | ||
if (opts.corpus === _undef) { | ||
throw Error( 'No input corpus' ); | ||
} | ||
if (opts.nWords !== _undef && !_.isNumber(opts.nWords)) { | ||
throw Error('Bad value for nWords'); | ||
} | ||
// Clean corpus. | ||
var s = opts.corpus.split(sentenceDelimiter); | ||
var sentences = _(s).map( function (s) { | ||
return new Sentence(s); | ||
}); | ||
// splits the sentences into nGrams then applies the same algorithm | ||
if (opts.nWords) { | ||
// Splits the sentences into nGrams then applies the same algorithm. | ||
if (opts.nWords) { | ||
// `opts.nSentences` is ignored, output size is determined by lexem size | ||
opts.nSentences = 1; | ||
// `opts.nSentences` is ignored, output size is determined by lexem size. | ||
opts.nSentences = 1; | ||
var nGrams = _(sentences).reduce( function (collect, s) { | ||
var orig = s.orig; | ||
var partials = _(s.words).reduce( function (memo, freq, w) { | ||
var pos = orig.indexOf(' '); | ||
if (pos === -1) pos = orig.length; | ||
var partial = orig.substr(0, pos); | ||
orig = orig.substr(pos + 1); | ||
if (partial !== '') memo.push(partial); | ||
return memo; | ||
}, []); | ||
if (partials.length <= opts.nWords) { | ||
var newSentence = new Sentence( partials.join(' ')); | ||
collect.push( newSentence ); | ||
return collect; | ||
} | ||
var i = 0, j = 0, n = partials.length - opts.nWords, m=partials.length, tmp; | ||
for (i = 0; i < n; i ++) { | ||
var tmp = '' | ||
for (j = i; j < i+opts.nWords; j ++) { | ||
tmp += partials[j] + ' '; | ||
} | ||
var newSentence = new Sentence( tmp ); | ||
collect.push( newSentence ); | ||
} | ||
return collect; | ||
}, []); | ||
sentences = nGrams; | ||
} | ||
var nGrams = _(sentences).reduce( function (collect, s) { | ||
var orig = s.orig; | ||
var partials = _(s.words).reduce( function (memo, freq, w) { | ||
var pos = orig.indexOf(' '); | ||
if (pos === -1) { | ||
pos = orig.length; | ||
} | ||
var partial = orig.substr(0, pos); | ||
orig = orig.substr(pos + 1); | ||
if (partial !== '') { | ||
memo.push(partial); | ||
} | ||
return memo; | ||
}, []); | ||
if (partials.length <= opts.nWords) { | ||
var newSentence = new Sentence( partials.join(' ')); | ||
collect.push( newSentence ); | ||
return collect; | ||
} | ||
var i = 0, | ||
j = 0, | ||
n = partials.length - opts.nWords, | ||
m = partials.length, | ||
tmp; | ||
for (i = 0; i < n; i ++) { | ||
var tmp = '' | ||
for (j = i; j < i+opts.nWords; j ++) { | ||
tmp += partials[j] + ' '; | ||
} | ||
var newSentence = new Sentence(tmp); | ||
collect.push(newSentence); | ||
} | ||
return collect; | ||
}, []); | ||
sentences = nGrams; | ||
} | ||
// return all sentences that contain a givven word | ||
var containing = function (w) { | ||
return _(sentences).filter( function (s) { | ||
return (s.words[w] !== undefined) ; | ||
}); | ||
}; | ||
// if summary must exclude words in opts.exclude remove sentences that contain those words | ||
if ( _.isArray(opts.exclude) && opts.exclude.length !== 0) { | ||
var excludes = _(opts.exclude).map( function (w) { | ||
return stemmer(clean(w)); | ||
}); | ||
sentences = _(sentences).filter( function (s) { | ||
var words = _(s.words).keys(); | ||
return (_.intersection( words, excludes ).length === 0); | ||
}); | ||
} | ||
/** | ||
* Return all sentences that contain a givven word. | ||
* @param {String} w - word | ||
* @return {Array<Object>} | ||
*/ | ||
var containing = function (w) { | ||
return _(sentences).filter( function (s) { | ||
return (s.words[w] !== undefined) ; | ||
}); | ||
}; | ||
var summary = [] ; | ||
var counter = 0; | ||
// If summary must exclude words in opts.exclude remove sentences | ||
// that contain those words. | ||
if ( _.isArray(opts.exclude) && opts.exclude.length !== 0) { | ||
var excludes = _(opts.exclude).map( function (w) { | ||
return stemmer(clean(w)); | ||
}); | ||
sentences = _(sentences).filter( function (s) { | ||
var words = _(s.words).keys(); | ||
return (_.intersection( words, excludes ).length === 0); | ||
}); | ||
} | ||
// extract sentences in order of their relevance | ||
while (true) { | ||
var N = sentences.length; | ||
var summary = []; | ||
var counter = 0; | ||
// builds a hash of all words with global frequencies | ||
var words = _(sentences).reduce( function (collect,s) { | ||
_(s.words).each( function (count, w) { | ||
collect[w] = collect[w] ? collect[w] + count : count ; | ||
}); | ||
return collect; | ||
}, {}); | ||
// if summary must have the words in opts.emphasise | ||
var emphasise = []; | ||
if ( _.isArray(opts.emphasise) && opts.emphasise.length !== 0) { | ||
emphasise = _(opts.emphasise).map( function (w) { | ||
return stemmer(clean(w)); | ||
}); | ||
} | ||
// Extract sentences in order of their relevance. | ||
while (true) { | ||
var N = sentences.length; | ||
//calculate relevance for each sentence | ||
_(sentences).each( function (s) { | ||
var relevance = _(s.words).reduce( function (memo, freq, w) { | ||
var local = Math.log( 1 + freq ); | ||
var global = Math.log( N / containing(w).length ); | ||
return memo = memo + (local * global); | ||
}, 0); | ||
// if current sentence containes emphasised words, bumb up the relevance | ||
var bump = _.intersection(emphasise, _(s.words).keys()).length; | ||
relevance += bump * 1000; //big enough to push it in front | ||
// Builds a hash of all words with global frequencies. | ||
var words = _(sentences).reduce( function (collect,s) { | ||
_(s.words).each( function (count, w) { | ||
collect[w] = collect[w] ? collect[w] + count : count ; | ||
}); | ||
return collect; | ||
}, {}); | ||
s.relevance = relevance; | ||
}) | ||
// If summary must have the words in opts.emphasise. | ||
var emphasise = []; | ||
if ( _.isArray(opts.emphasise) && opts.emphasise.length !== 0) { | ||
emphasise = _(opts.emphasise).map( function (w) { | ||
return stemmer(clean(w)); | ||
}); | ||
} | ||
// highest relevance sentence | ||
var highest = _(sentences).max( function (s) { | ||
return s.relevance; | ||
}); | ||
// Calculate relevance for each sentence. | ||
_(sentences).each( function (s) { | ||
var relevance = _(s.words).reduce( function (memo, freq, w) { | ||
var local = Math.log(1 + freq); | ||
var global = Math.log(N / containing(w).length); | ||
return memo = memo + (local * global); | ||
}, 0); | ||
// remove words from the remaining sentences to reduce redundancy | ||
sentences = _(sentences).chain() | ||
.without(highest) | ||
.map( function (s) { | ||
_(highest.words).each( function (w) { | ||
s.remove( w ); | ||
}); | ||
return s; | ||
}) | ||
.value(); | ||
// If current sentence containes emphasised words, | ||
// bumb up the relevance. | ||
var bump = _.intersection(emphasise, _(s.words).keys()).length; | ||
relevance += bump * 1000; | ||
summary.push( highest.orig ) ; | ||
counter += 1; | ||
s.relevance = relevance; | ||
}) | ||
var stop = (counter === opts.nSentences || sentences.length === 0); | ||
if (stop) break; | ||
}//~ end while | ||
return { | ||
'summary': summary.join('.'), | ||
'sentences': summary | ||
}; | ||
}; | ||
return sum; | ||
}; | ||
// Highest relevance sentence. | ||
var highest = _(sentences).max( function (s) { | ||
return s.relevance; | ||
}); | ||
// exports the `sum` function in node.js | ||
if (typeof exports !== 'undefined' && typeof module !== 'undefined' && module.exports && typeof require !== 'undefined') { | ||
var stemmer = require( 'porter-stemmer' ).stemmer; | ||
var _ = require( 'underscore' ); | ||
_.str = require( 'underscore.string' ); | ||
_.mixin( _.str.exports() ); | ||
module.exports = wrapper(_, stemmer); | ||
} | ||
// exports `sum` to AMD module, defining dependencies | ||
else if (typeof define === 'function' && define.amd) { | ||
define('sum', [ | ||
'underscore', | ||
'underscore.string', | ||
'porter-stemmer' | ||
], function(_, str, stemmer) { | ||
return wrapper(_, stemmer); | ||
}); | ||
} | ||
// export in browser | ||
else if (typeof this !== 'undefined' && this._ && this.stemmer) { | ||
this._.mixin( this._.str.exports() ); | ||
this.sum = wrapper(this._, this.stemmer); | ||
} | ||
else { | ||
throw Error( 'unsupported js environment detected' ); | ||
} | ||
// Remove words from the remaining sentences to reduce redundancy. | ||
sentences = _(sentences).chain() | ||
.without(highest) | ||
.map( function (s) { | ||
_(highest.words).each( function (w) { | ||
s.remove( w ); | ||
}); | ||
return s; | ||
}) | ||
.value(); | ||
summary.push( highest.orig ) ; | ||
counter += 1; | ||
var stop = (counter === opts.nSentences || sentences.length === 0); | ||
if (stop) break; | ||
} | ||
return { | ||
'summary': summary.join('.'), | ||
'sentences': summary | ||
}; | ||
}; | ||
return sum; | ||
}; | ||
// exports the `sum` function in node.js | ||
if (typeof exports !== 'undefined' | ||
&& typeof module !== 'undefined' | ||
&& module.exports | ||
&& typeof require !== 'undefined') { | ||
var stemmer = require( 'porter-stemmer' ).stemmer; | ||
var _ = require( 'underscore' ); | ||
_.str = require( 'underscore.string' ); | ||
_.mixin( _.str.exports() ); | ||
module.exports = wrapper(_, stemmer); | ||
} | ||
// exports `sum` to AMD module, defining dependencies | ||
else if (typeof define === 'function' && define.amd) { | ||
define('sum', [ | ||
'underscore', | ||
'underscore.string', | ||
'porter-stemmer' | ||
], function(_, str, stemmer) { | ||
return wrapper(_, stemmer); | ||
}); | ||
} | ||
// export in browser | ||
else if (typeof this !== 'undefined' | ||
&& this._ && this.stemmer) { | ||
this._.mixin( this._.str.exports() ); | ||
this.sum = wrapper(this._, this.stemmer); | ||
} | ||
else { | ||
throw Error( 'Unsupported js environment detected' ); | ||
} | ||
}).call(this); |
describe( 'test sum\' params', function () { | ||
it( 'should return one sentence', function () { | ||
var corpus = corpora[1]; | ||
var actual = sum({ 'corpus': corpus.text, 'nSentences': 1 }); | ||
var expected = 1; | ||
expect(actual.sentences.length).toEqual( expected ); | ||
}); | ||
it( 'should return two sentences', function () { | ||
var corpus = corpora[1]; | ||
var actual = sum({ 'corpus': corpus.text, 'nSentences': 3 }); | ||
var expected = 3; | ||
expect(actual.sentences.length).toEqual( expected ); | ||
}); | ||
it( 'should ignore sentences that have the word `bladder` in them', function () { | ||
var corpus = corpora[1]; | ||
var sum1 = sum({ 'corpus': corpus.text, 'nSentences': 1, 'exclude': ['bladder', 'Chubb'] }); | ||
var sum2 = sum({ 'corpus': corpus.text, 'nSentences': 1 }); | ||
expect( sum1.summary ).not.toEqual( sum2.summary ); | ||
}); | ||
it( 'should have the emphasisted word `drug` in the abstract', function () { | ||
var corpus = corpora[1]; | ||
var sum1 = sum({ 'corpus': corpus.text, 'nSentences': 1, 'emphasise': ['Drug'] }); | ||
var actual = _.str.include( sum1.summary, 'Drug' ); | ||
expect( actual ).toBe( true ); | ||
}); | ||
it( 'should return one sentence', function () { | ||
var corpus = corpora[1]; | ||
var actual = sum({ 'corpus': corpus.text, 'nSentences': 1 }); | ||
var expected = 1; | ||
expect(actual.sentences.length).toEqual( expected ); | ||
}); | ||
it( 'should return two sentences', function () { | ||
var corpus = corpora[1]; | ||
var actual = sum({ 'corpus': corpus.text, 'nSentences': 3 }); | ||
var expected = 3; | ||
expect(actual.sentences.length).toEqual( expected ); | ||
}); | ||
it( 'should ignore sentences that have the word `bladder` in them', function () { | ||
var corpus = corpora[1]; | ||
var sum1 = sum({ 'corpus': corpus.text, 'nSentences': 1, 'exclude': ['bladder', 'Chubb'] }); | ||
var sum2 = sum({ 'corpus': corpus.text, 'nSentences': 1 }); | ||
expect( sum1.summary ).not.toEqual( sum2.summary ); | ||
}); | ||
it( 'should have the emphasisted word `drug` in the abstract', function () { | ||
var corpus = corpora[1]; | ||
var sum1 = sum({ 'corpus': corpus.text, 'nSentences': 1, 'emphasise': ['Drug'] }); | ||
var actual = _.str.include( sum1.summary, 'Drug' ); | ||
expect( actual ).toBe( true ); | ||
}); | ||
}); | ||
describe( 'summarize.js basic output test', function () { | ||
corpora.forEach( function (corpus) { | ||
it( 'should calculate the summary', function () { | ||
var actual = sum({ 'corpus': corpus.text, 'nSentences': 3 }); | ||
var expected = 3; | ||
expect(actual.sentences.length).toEqual( expected ); | ||
}); | ||
}); | ||
corpora.forEach( function (corpus) { | ||
it( 'should calculate the summary', function () { | ||
var actual = sum({ 'corpus': corpus.text, 'nSentences': 3 }); | ||
var expected = 3; | ||
expect(actual.sentences.length).toEqual( expected ); | ||
}); | ||
}); | ||
}); | ||
describe( 'test nWords params in action', function () { | ||
corpora.forEach( function (corpus) { | ||
it( 'should calculate the summary', function () { | ||
var actual = sum({ 'corpus': corpus.text, 'nWords': 5 }); | ||
expect(actual.sentences.length).not.toEqual(''); | ||
}); | ||
}); | ||
corpora.forEach( function (corpus) { | ||
it( 'should calculate the summary', function () { | ||
var actual = sum({ 'corpus': corpus.text, 'nWords': 5 }); | ||
expect(actual.sentences.length).not.toEqual(''); | ||
}); | ||
}); | ||
}); | ||
//TODO add tests to validate correctness of the actual output |
@@ -10,31 +10,31 @@ var vows = require( 'vows' ); | ||
.addBatch({ | ||
'when summarizing a text': { | ||
topic: function () { | ||
var s = sum({ | ||
'corpus': corpus, | ||
'nSentences': 3 | ||
}); | ||
return s.summary; | ||
}, | ||
'it should output the abstract containing the most relevant sentences for the meaning of the initial text': function (error, summary) { | ||
assert.ifError( error ); | ||
assert.isString( summary ); | ||
} | ||
} | ||
'when summarizing a text': { | ||
topic: function () { | ||
var s = sum({ | ||
'corpus': corpus, | ||
'nSentences': 3 | ||
}); | ||
return s.summary; | ||
}, | ||
'it should output the abstract containing the most relevant sentences for the meaning of the initial text': function (error, summary) { | ||
assert.ifError( error ); | ||
assert.isString( summary ); | ||
} | ||
} | ||
}) | ||
.addBatch({ | ||
'when summarizing a text': { | ||
topic: function () { | ||
var s = sum({ | ||
'corpus': corpus, | ||
'nWords': 5 | ||
}); | ||
return s.summary; | ||
}, | ||
'it should output the abstract containing the most relevant sentences for the meaning of the initial text': function (error, summary) { | ||
assert.ifError( error ); | ||
assert.isString( summary ); | ||
} | ||
} | ||
'when summarizing a text': { | ||
topic: function () { | ||
var s = sum({ | ||
'corpus': corpus, | ||
'nWords': 5 | ||
}); | ||
return s.summary; | ||
}, | ||
'it should output the abstract containing the most relevant sentences for the meaning of the initial text': function (error, summary) { | ||
assert.ifError( error ); | ||
assert.isString( summary ); | ||
} | ||
} | ||
}) | ||
.export(module); |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
No bug tracker
Maintenance: Package does not have a linked bug tracker in package.json.
Found 1 instance in 1 package
186355
3
29
4297
1
135
2
+ Added underscore@1.7.0 (transitive)
+ Added underscore.string@3.0.3 (transitive)
- Removed vows@0.6.1
- Removed eyes@0.1.8 (transitive)
- Removed underscore@1.3.3 (transitive)
- Removed underscore.string@2.0.0 (transitive)
- Removed vows@0.6.1 (transitive)
Updated porter-stemmer@0.9.1
Updated underscore@1.7.0
Updated underscore.string@3.0.3