auto-tagger - npm Package Compare versions

Comparing version 1.0.1 to 1.0.2

177

autoTagger.js

		@@ -1,12 +0,20 @@
		(function (global, factory) {
		(function(global, factory) {
		typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() :
		typeof define === 'function' && define.amd ? define(factory) :
		global.autoTagger = factory()
		}(this, function () { 'use strict';
		var autoTagger = function AutoTagger() {
		typeof define === 'function' && define.amd ? define(factory) :
		global.autoTagger = factory()
		}(this, function() {
		'use strict';
		var defaultStopWords = {
		'en': ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'aren\'t', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could', 'couldn\'t', 'did', 'didn\'t', 'do', 'does', 'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn\'t', 'has', 'hasn\'t', 'have', 'haven\'t', 'having', 'he', 'he\'d', 'he\'ll', 'he\'s', 'her', 'here', 'here\'s', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'how\'s', 'i', 'i\'d', 'i\'ll', 'i\'m', 'i\'ve', 'if', 'in', 'into', 'is', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'let\'s', 'me', 'more', 'most', 'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours ', 'ourselves', 'out', 'over', 'own', 'same', 'shan\'t', 'she', 'she\'d', 'she\'ll', 'she\'s', 'should', 'shouldn\'t', 'so', 'some', 'such', 'than', 'that', 'that\'s', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'there\'s', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re', 'they\'ve', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d', 'we\'ll', 'we\'re', 'we\'ve', 'were', 'weren\'t', 'what', 'what\'s', 'when', 'when\'s', 'where', 'where\'s', 'which', 'while', 'who', 'who\'s', 'whom', 'why', 'why\'s', 'with', 'won\'t', 'would', 'wouldn\'t', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves'],
		'pt': ['de', 'da', 'já', 'se', 'ao', 'na', 'seja', 'será', 'que', 'último', 'é', 'acerca', 'agora', 'algumas', 'alguns', 'ali', 'ambos', 'antes', 'apontar', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aqui', 'atrás', 'bem', 'bom', 'cada', 'caminho', 'cima', 'com', 'como', 'comprido', 'conhecido', 'corrente', 'das', 'debaixo', 'dentro', 'desde', 'desligado', 'deve', 'devem', 'deverá', 'direita', 'diz', 'dizer', 'dois', 'dos', 'e', 'ela', 'ele', 'eles', 'em', 'enquanto', 'então', 'está', 'estão', 'estado', 'estar', 'estará', 'este', 'estes', 'esteve', 'estive', 'estivemos', 'estiveram', 'eu', 'fará', 'faz', 'fazer', 'fazia', 'fez', 'fim', 'foi', 'fora', 'horas', 'iniciar', 'inicio', 'ir', 'irá', 'ista', 'iste', 'isto', 'ligado', 'maioria', 'maiorias', 'mais', 'mas', 'mesmo', 'meu', 'muito', 'muitos', 'nós', 'não', 'nome', 'nosso', 'novo', 'o', 'onde', 'os', 'ou', 'outro', 'para', 'parte', 'pegar', 'pelo', 'pessoas', 'pode', 'poderá', 'podia', 'por', 'porque', 'povo', 'promeiro', 'quê', 'qual', 'qualquer', 'quando', 'quem', 'quieto', 'são', 'saber', 'sem', 'ser', 'seu', 'somente', 'têm', 'tal', 'também', 'tem', 'tempo', 'tenho', 'tentar', 'tentaram', 'tente', 'tentei', 'teu', 'teve', 'tipo', 'tive', 'todos', 'trabalhar', 'trabalho', 'tu', 'um', 'uma', 'umas', 'uns', 'usa', 'usar', 'valor', 'veja', 'ver', 'verdade', 'verdadeiro', 'você']
		};

		var autoTagger = AutoTagger;
		autoTagger.useStopWords = useStopWords;
		autoTagger.fromText = fromText;

		return autoTagger;
		//-----------------------------------------------------------------------------
		function AutoTagger() {
		var self = this;
		var defaultStopWords = {
		'en': ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'aren\'t', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could', 'couldn\'t', 'did', 'didn\'t', 'do', 'does', 'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn\'t', 'has', 'hasn\'t', 'have', 'haven\'t', 'having', 'he', 'he\'d', 'he\'ll', 'he\'s', 'her', 'here', 'here\'s', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'how\'s', 'i', 'i\'d', 'i\'ll', 'i\'m', 'i\'ve', 'if', 'in', 'into', 'is', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'let\'s', 'me', 'more', 'most', 'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours ', 'ourselves', 'out', 'over', 'own', 'same', 'shan\'t', 'she', 'she\'d', 'she\'ll', 'she\'s', 'should', 'shouldn\'t', 'so', 'some', 'such', 'than', 'that', 'that\'s', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'there\'s', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re', 'they\'ve', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d', 'we\'ll', 'we\'re', 'we\'ve', 'were', 'weren\'t', 'what', 'what\'s', 'when', 'when\'s', 'where', 'where\'s', 'which', 'while', 'who', 'who\'s', 'whom', 'why', 'why\'s', 'with', 'won\'t', 'would', 'wouldn\'t', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves'],
		'pt': ['de', 'da', 'já', 'se', 'ao', 'na', 'seja', 'será', 'que', 'último', 'é', 'acerca', 'agora', 'algumas', 'alguns', 'ali', 'ambos', 'antes', 'apontar', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aqui', 'atrás', 'bem', 'bom', 'cada', 'caminho', 'cima', 'com', 'como', 'comprido', 'conhecido', 'corrente', 'das', 'debaixo', 'dentro', 'desde', 'desligado', 'deve', 'devem', 'deverá', 'direita', 'diz', 'dizer', 'dois', 'dos', 'e', 'ela', 'ele', 'eles', 'em', 'enquanto', 'então', 'está', 'estão', 'estado', 'estar', 'estará', 'este', 'estes', 'esteve', 'estive', 'estivemos', 'estiveram', 'eu', 'fará', 'faz', 'fazer', 'fazia', 'fez', 'fim', 'foi', 'fora', 'horas', 'iniciar', 'inicio', 'ir', 'irá', 'ista', 'iste', 'isto', 'ligado', 'maioria', 'maiorias', 'mais', 'mas', 'mesmo', 'meu', 'muito', 'muitos', 'nós', 'não', 'nome', 'nosso', 'novo', 'o', 'onde', 'os', 'ou', 'outro', 'para', 'parte', 'pegar', 'pelo', 'pessoas', 'pode', 'poderá', 'podia', 'por', 'porque', 'povo', 'promeiro', 'quê', 'qual', 'qualquer', 'quando', 'quem', 'quieto', 'são', 'saber', 'sem', 'ser', 'seu', 'somente', 'têm', 'tal', 'também', 'tem', 'tempo', 'tenho', 'tentar', 'tentaram', 'tente', 'tentei', 'teu', 'teve', 'tipo', 'tive', 'todos', 'trabalhar', 'trabalho', 'tu', 'um', 'uma', 'umas', 'uns', 'usa', 'usar', 'valor', 'veja', 'ver', 'verdade', 'verdadeiro', 'você']
		};
		self._stopWords = [];
		@@ -16,78 +24,85 @@ self.useStopWords = useStopWords;
		return self;
		//------------------------------------------------
		function useStopWords(stopWords) {
		var at = new AutoTagger();
		if (isString(stopWords)) {
		at._stopWords = defaultStopWords[stopWords];
		} else {
		at._stopWords = at._stopWords.concat(stopWords);
		}
		return at;
		};

		function useStopWords(stopWords) {
		var at = this instanceof AutoTagger ? this : new AutoTagger();
		if (isString(stopWords)) {
		at._stopWords = at._stopWords.concat(defaultStopWords[stopWords] \|\| []);
		} else {
		at._stopWords = at._stopWords.concat(stopWords \|\| []);
		}
		return at;
		}

		function fromText(text) {
		var data;
		if (!isString(text)) return;
		data = text.replace(/\s+/g, " ").toLowerCase()
		//.replace(/[^a-zA-Z'\-]+/g, " ")
		.replace(/[`~!@#$%^&*()_\|+\-=?;:'",.<>\{\}\[\]\\\/]/gi, " ");
		data = self._stopWords.reduce(function(text, stop_word) {
		// Build the regex
		var regex = "^\\s" + stop_word + "\\s$"; // Only word
		regex += "\|^\\s*" + stop_word + "\\s+"; // First word
		regex += "\|\\s+" + stop_word + "\\s*$"; // Last word
		regex += "\|\\s+" + stop_word + "\\s+"; // Word somewhere in the middle
		regex = new RegExp(regex, "ig");
		return text.replace(regex, " ");
		}, data);
		data = data.match(/[^\s]+/g);
		var textLength = data.length;
		var gramsFrequency = getGramsFrequency(data);
		var atLeast = 2;
		return gramsFrequency.reduce(function(a, b, i) {
		var frequencyTable = b \|\| {};
		return Object.keys(frequencyTable)
		.filter(function(word) {
		return frequencyTable[word] >= atLeast
		})
		.map(function(word) {
		return {
		word: word,
		count: frequencyTable[word],
		// freq: Math.round(frequencyTable[word] / textLength * 10000) / 100,
		// peso: ((i + 1) / gramsFrequency.length),
		}
		}).concat(a);
		}, [])
		// sort words by abc order
		.sort(function(a, b) {
		return b.count - a.count;
		});
		function fromText(text, atLeast, numWords) {
		var me = this instanceof AutoTagger ? this : new AutoTagger();
		var data;

		if (!isString(text)) return;
		atLeast = atLeast \|\| 2;
		numWords = numWords \|\| 5;

		data = text.replace(/\s+/g, " ").toLowerCase()
		//.replace(/[^a-zA-Z'\-]+/g, " ")
		.replace(/[`~!@#$%^&*()_\|+\-=?;:'",.<>\{\}\[\]\\\/]/gi, " ");
		data = me._stopWords.reduce(function(text, stop_word) {
		// Build the regex
		var regex = "^\\s" + stop_word + "\\s$"; // Only word
		regex += "\|^\\s*" + stop_word + "\\s+"; // First word
		regex += "\|\\s+" + stop_word + "\\s*$"; // Last word
		regex += "\|\\s+" + stop_word + "\\s+"; // Word somewhere in the middle
		regex = new RegExp(regex, "ig");
		return text.replace(regex, " ");
		}, data);
		data = data.match(/[^\s]+/g);

		if (!data) return;

		//var textLength = data.length;
		var gramsFrequency = getGramsFrequency(data, numWords);
		return gramsFrequency.reduce(function(a, b, i) {
		var frequencyTable = b \|\| {};
		return Object.keys(frequencyTable)
		.filter(function(word) {
		return frequencyTable[word] >= atLeast
		})
		.map(function(word) {
		return {
		word: word,
		count: frequencyTable[word],
		// freq: Math.round(frequencyTable[word] / textLength * 10000) / 100,
		// peso: ((i + 1) / gramsFrequency.length),
		}
		}).concat(a);
		}, [])
		// sort words by abc order
		.sort(function(a, b) {
		return b.count - a.count;
		});
		}

		function getGramsFrequency(data, numWords) {
		numWords = numWords \|\| 5;
		var keys = [];
		var i = 0,
		j = 0;
		for (i = 0; i < numWords; i++) {
		keys.push({});
		}

		function getGramsFrequency(data, numWords) {
		numWords = numWords \|\| 5;
		var keys = [];
		var i = 0, j = 0;
		for (i = 0; i < numWords; i++) {
		keys.push({});
		var map = data.reduce(function(p, c, i, arr) {
		p[0][c] = (p[0][c] \|\| 0) + 1;
		for (j = 1; j < numWords; j++) {
		if (i + j < arr.length) {
		c += " " + arr[i + j];
		p[j][c] = (p[j][c] \|\| 0) + 1;
		} else break;
		}
		var map = data.reduce(function(p, c, i, arr) {
		p[0][c] = (p[0][c] \|\| 0) + 1;
		for (j = 1; j < numWords; j++) {
		if (i + j < arr.length) {
		c += " " + arr[i + j];
		p[j][c] = (p[j][c] \|\| 0) + 1;
		} else break;
		}
		return p;
		}, keys);
		return map;
		}
		return p;
		}, keys);
		return map;
		}

		function isString(value) {
		return typeof value === 'string';
		}
		};
		return autoTagger;
		}));
		function isString(value) {
		return typeof value === 'string';
		}
		}));

package.json

		{
		"name": "auto-tagger",
		"version": "1.0.1",
		"version": "1.0.2",
		"description": "JavaScript text auto tagger",
		@@ -5,0 +5,0 @@ "main": "autoTagger.js",

README.md

		# autoTagger.js

		JavaScript text auto tagger
		Simple JavaScript text auto tagger


		# Usage


		Include the javascript in the browser:

		<script type="text/javascript" src="https://cdn.rawgit.com/eberlitz/autoTagger/master/autoTagger.js"></script>

		or use it in Node.js:

		```
		$> npm install auto-tagger
		```

		```
		var autoTagger = require('auto-tagger');
		```

		You can use the folowing methods to extract relevant tags from documents:

		```
		var testText = "This text is from a Wikipedia entry about Bayes' Theorem. Bayesian inference has applications in artificial intelligence and expert systems. Bayesian inference techniques have been a fundamental part of computerized pattern recognition techniques since the late 1950s. There is also an ever growing connection between Bayesian methods and simulation-based Monte Carlo techniques since complex models cannot be processed in closed form by a Bayesian analysis, while the graphical model structure inherent to statistical models, may allow for efficient simulation algorithms like the Gibbs sampling and other Metropolis-Hastings algorithm schemes. Recently Bayesian inference has gained popularity amongst the phylogenetics community for these reasons; applications such as BEAST, MrBayes and P4 allow many demographic and evolutionary parameters to be estimated simultaneously."

		new autoTagger().useStopWords('en').fromText(testText)
		var tags = autoTagger

		// using portuguese stop words
		.useStopWords('pt')

		// using english stop words
		.useStopWords('en')

		// adding aditionals stop words
		.useStopWords(['will'])

		// extract tags from text
		// return tags that have at least 2 ocurrences
		// and look for ocurrences of 4 consecutive words
		.fromText(testText,2,4);

		console.log(tags);
		```

auto-tagger - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics