auto-tagger
Advanced tools
Comparing version 1.0.1 to 1.0.2
@@ -1,12 +0,20 @@ | ||
(function (global, factory) { | ||
(function(global, factory) { | ||
typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() : | ||
typeof define === 'function' && define.amd ? define(factory) : | ||
global.autoTagger = factory() | ||
}(this, function () { 'use strict'; | ||
var autoTagger = function AutoTagger() { | ||
typeof define === 'function' && define.amd ? define(factory) : | ||
global.autoTagger = factory() | ||
}(this, function() { | ||
'use strict'; | ||
// Built-in stop word lists, keyed by language code ('en' = English, 'pt' = Portuguese).
// useStopWords() looks a list up here when it is given a string instead of an array.
// Entries must be lowercase and free of surrounding whitespace: fromText() matches them
// against text that has been lowercased and whitespace-normalized, so an entry such as
// 'ours ' (trailing space) could never match a word in the middle of a sentence.
var defaultStopWords = {
    'en': ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'aren\'t', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could', 'couldn\'t', 'did', 'didn\'t', 'do', 'does', 'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn\'t', 'has', 'hasn\'t', 'have', 'haven\'t', 'having', 'he', 'he\'d', 'he\'ll', 'he\'s', 'her', 'here', 'here\'s', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'how\'s', 'i', 'i\'d', 'i\'ll', 'i\'m', 'i\'ve', 'if', 'in', 'into', 'is', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'let\'s', 'me', 'more', 'most', 'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', 'shan\'t', 'she', 'she\'d', 'she\'ll', 'she\'s', 'should', 'shouldn\'t', 'so', 'some', 'such', 'than', 'that', 'that\'s', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'there\'s', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re', 'they\'ve', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d', 'we\'ll', 'we\'re', 'we\'ve', 'were', 'weren\'t', 'what', 'what\'s', 'when', 'when\'s', 'where', 'where\'s', 'which', 'while', 'who', 'who\'s', 'whom', 'why', 'why\'s', 'with', 'won\'t', 'would', 'wouldn\'t', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves'],
    'pt': ['de', 'da', 'já', 'se', 'ao', 'na', 'seja', 'será', 'que', 'último', 'é', 'acerca', 'agora', 'algumas', 'alguns', 'ali', 'ambos', 'antes', 'apontar', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aqui', 'atrás', 'bem', 'bom', 'cada', 'caminho', 'cima', 'com', 'como', 'comprido', 'conhecido', 'corrente', 'das', 'debaixo', 'dentro', 'desde', 'desligado', 'deve', 'devem', 'deverá', 'direita', 'diz', 'dizer', 'dois', 'dos', 'e', 'ela', 'ele', 'eles', 'em', 'enquanto', 'então', 'está', 'estão', 'estado', 'estar', 'estará', 'este', 'estes', 'esteve', 'estive', 'estivemos', 'estiveram', 'eu', 'fará', 'faz', 'fazer', 'fazia', 'fez', 'fim', 'foi', 'fora', 'horas', 'iniciar', 'inicio', 'ir', 'irá', 'ista', 'iste', 'isto', 'ligado', 'maioria', 'maiorias', 'mais', 'mas', 'mesmo', 'meu', 'muito', 'muitos', 'nós', 'não', 'nome', 'nosso', 'novo', 'o', 'onde', 'os', 'ou', 'outro', 'para', 'parte', 'pegar', 'pelo', 'pessoas', 'pode', 'poderá', 'podia', 'por', 'porque', 'povo', 'primeiro', 'quê', 'qual', 'qualquer', 'quando', 'quem', 'quieto', 'são', 'saber', 'sem', 'ser', 'seu', 'somente', 'têm', 'tal', 'também', 'tem', 'tempo', 'tenho', 'tentar', 'tentaram', 'tente', 'tentei', 'teu', 'teve', 'tipo', 'tive', 'todos', 'trabalhar', 'trabalho', 'tu', 'um', 'uma', 'umas', 'uns', 'usa', 'usar', 'valor', 'veja', 'ver', 'verdade', 'verdadeiro', 'você']
};
var autoTagger = AutoTagger; | ||
autoTagger.useStopWords = useStopWords; | ||
autoTagger.fromText = fromText; | ||
return autoTagger; | ||
//----------------------------------------------------------------------------- | ||
function AutoTagger() { | ||
var self = this; | ||
var defaultStopWords = { | ||
'en': ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'aren\'t', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could', 'couldn\'t', 'did', 'didn\'t', 'do', 'does', 'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn\'t', 'has', 'hasn\'t', 'have', 'haven\'t', 'having', 'he', 'he\'d', 'he\'ll', 'he\'s', 'her', 'here', 'here\'s', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'how\'s', 'i', 'i\'d', 'i\'ll', 'i\'m', 'i\'ve', 'if', 'in', 'into', 'is', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'let\'s', 'me', 'more', 'most', 'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours ', 'ourselves', 'out', 'over', 'own', 'same', 'shan\'t', 'she', 'she\'d', 'she\'ll', 'she\'s', 'should', 'shouldn\'t', 'so', 'some', 'such', 'than', 'that', 'that\'s', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'there\'s', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re', 'they\'ve', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d', 'we\'ll', 'we\'re', 'we\'ve', 'were', 'weren\'t', 'what', 'what\'s', 'when', 'when\'s', 'where', 'where\'s', 'which', 'while', 'who', 'who\'s', 'whom', 'why', 'why\'s', 'with', 'won\'t', 'would', 'wouldn\'t', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves'], | ||
'pt': ['de', 'da', 'já', 'se', 'ao', 'na', 'seja', 'será', 'que', 'último', 'é', 'acerca', 'agora', 'algumas', 'alguns', 'ali', 'ambos', 'antes', 'apontar', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aqui', 'atrás', 'bem', 'bom', 'cada', 'caminho', 'cima', 'com', 'como', 'comprido', 'conhecido', 'corrente', 'das', 'debaixo', 'dentro', 'desde', 'desligado', 'deve', 'devem', 'deverá', 'direita', 'diz', 'dizer', 'dois', 'dos', 'e', 'ela', 'ele', 'eles', 'em', 'enquanto', 'então', 'está', 'estão', 'estado', 'estar', 'estará', 'este', 'estes', 'esteve', 'estive', 'estivemos', 'estiveram', 'eu', 'fará', 'faz', 'fazer', 'fazia', 'fez', 'fim', 'foi', 'fora', 'horas', 'iniciar', 'inicio', 'ir', 'irá', 'ista', 'iste', 'isto', 'ligado', 'maioria', 'maiorias', 'mais', 'mas', 'mesmo', 'meu', 'muito', 'muitos', 'nós', 'não', 'nome', 'nosso', 'novo', 'o', 'onde', 'os', 'ou', 'outro', 'para', 'parte', 'pegar', 'pelo', 'pessoas', 'pode', 'poderá', 'podia', 'por', 'porque', 'povo', 'promeiro', 'quê', 'qual', 'qualquer', 'quando', 'quem', 'quieto', 'são', 'saber', 'sem', 'ser', 'seu', 'somente', 'têm', 'tal', 'também', 'tem', 'tempo', 'tenho', 'tentar', 'tentaram', 'tente', 'tentei', 'teu', 'teve', 'tipo', 'tive', 'todos', 'trabalhar', 'trabalho', 'tu', 'um', 'uma', 'umas', 'uns', 'usa', 'usar', 'valor', 'veja', 'ver', 'verdade', 'verdadeiro', 'você'] | ||
}; | ||
self._stopWords = []; | ||
@@ -16,78 +24,85 @@ self.useStopWords = useStopWords; | ||
return self; | ||
//------------------------------------------------ | ||
/**
 * Adds stop words to a tagger and returns that tagger so calls can be chained.
 *
 * When invoked as a method of an AutoTagger instance the instance itself is
 * extended; when invoked any other way (for example as the static
 * autoTagger.useStopWords) a fresh AutoTagger is created first.
 *
 * @param {string|Array} stopWords Either a key of the built-in lists in
 *     defaultStopWords (e.g. 'en', 'pt') or an array of extra stop words.
 *     An unknown key, or a missing argument, adds nothing.
 * @returns {AutoTagger} The tagger whose stop word list was extended.
 */
function useStopWords(stopWords) {
    var at = this instanceof AutoTagger ? this : new AutoTagger();
    var additions;

    if (isString(stopWords)) {
        // Own-property check: a key such as 'constructor' must not pull an
        // inherited Object.prototype member into the stop word list.
        additions = Object.prototype.hasOwnProperty.call(defaultStopWords, stopWords)
            ? defaultStopWords[stopWords]
            : [];
    } else {
        additions = stopWords || [];
    }

    // Always append, so chained calls accumulate instead of overwriting.
    at._stopWords = at._stopWords.concat(additions);
    return at;
}
/**
 * Extracts candidate tags (frequent words and word sequences) from a text.
 *
 * The text is lowercased, punctuation is replaced by spaces, the tagger's stop
 * words are removed, and the remaining tokens are counted as n-grams of 1 up
 * to numWords consecutive words by getGramsFrequency().
 *
 * Works as an instance method (using that instance's stop words) and as the
 * static autoTagger.fromText (using a fresh AutoTagger).
 *
 * @param {string} text Text to analyse.
 * @param {number} [atLeast=2] Minimum number of occurrences for an n-gram to
 *     be reported.
 * @param {number} [numWords=5] Longest n-gram, in words, to look for.
 * @returns {Array<{word: string, count: number}>|undefined} Matching n-grams
 *     ordered by descending count, or undefined when text is not a string or
 *     contains no tokens after cleaning.
 */
function fromText(text, atLeast, numWords) {
    var me = this instanceof AutoTagger ? this : new AutoTagger();
    var data;
    var gramsFrequency;

    if (!isString(text)) {
        return;
    }
    atLeast = atLeast || 2;
    numWords = numWords || 5;

    // NOTE: apostrophes are stripped here, before stop words are applied, so
    // contraction stop words such as "don't" cannot match the cleaned text.
    data = text.replace(/\s+/g, " ").toLowerCase()
        .replace(/[`~!@#$%^&*()_|+\-=?;:'",.<>\{\}\[\]\\\/]/gi, " ");

    data = me._stopWords.reduce(function(remaining, stopWord) {
        // Escape regex metacharacters so a stop word is always matched
        // literally (an entry like "c++" would otherwise throw).
        var escaped = String(stopWord).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        // Whole-word match. The trailing boundary is a lookahead so the
        // separating space is not consumed and an immediately repeated stop
        // word ("that that") is removed as well.
        var regex = new RegExp("(?:^|\\s)" + escaped + "(?=\\s|$)", "ig");
        return remaining.replace(regex, " ");
    }, data);

    data = data.match(/[^\s]+/g);
    if (!data) {
        return;
    }

    gramsFrequency = getGramsFrequency(data, numWords);

    return gramsFrequency.reduce(function(collected, table) {
        var frequencyTable = table || {};
        var frequent = Object.keys(frequencyTable)
            .filter(function(word) {
                return frequencyTable[word] >= atLeast;
            })
            .map(function(word) {
                return {
                    word: word,
                    count: frequencyTable[word]
                };
            });
        // Prepend, so longer n-grams precede shorter ones with the same count.
        return frequent.concat(collected);
    }, [])
    // Most frequent first.
    .sort(function(a, b) {
        return b.count - a.count;
    });
}
/**
 * Counts every n-gram (run of consecutive tokens) of length 1..numWords.
 *
 * @param {string[]} data Tokens in document order.
 * @param {number} [numWords=5] Longest n-gram, in tokens, to count.
 * @returns {Object[]} An array of numWords frequency tables; the table at
 *     index k maps each (k + 1)-token n-gram, joined with single spaces, to
 *     its number of occurrences. Tables have no prototype, so any token
 *     (including "constructor") is a safe key.
 */
function getGramsFrequency(data, numWords) {
    var tables = [];
    var size;

    numWords = numWords || 5;

    for (size = 0; size < numWords; size++) {
        // Prototype-less objects: a lookup for a token such as "constructor"
        // must yield undefined, not an inherited Object.prototype member.
        tables.push(Object.create(null));
    }

    data.forEach(function(token, index) {
        var gram = token;
        var extra;

        tables[0][gram] = (tables[0][gram] || 0) + 1;

        // Grow the n-gram one token at a time, stopping at the end of the text.
        for (extra = 1; extra < numWords && index + extra < data.length; extra++) {
            gram += " " + data[index + extra];
            tables[extra][gram] = (tables[extra][gram] || 0) + 1;
        }
    });

    return tables;
}
/**
 * Tells whether a value is a primitive string.
 *
 * @param {*} value Value to inspect.
 * @returns {boolean} true only when typeof reports 'string'; String wrapper
 *     objects are reported as false.
 */
function isString(value) {
    var kind = typeof value;
    return kind === 'string';
}
}; | ||
return autoTagger; | ||
})); | ||
/**
 * Tells whether a value is a primitive string.
 *
 * @param {*} value Value to inspect.
 * @returns {boolean} true only when typeof reports 'string'; String wrapper
 *     objects are reported as false.
 */
function isString(value) {
    var kind = typeof value;
    return kind === 'string';
}
})); |
{ | ||
"name": "auto-tagger", | ||
"version": "1.0.1", | ||
"version": "1.0.2", | ||
"description": "JavaScript text auto tagger", | ||
@@ -5,0 +5,0 @@ "main": "autoTagger.js", |
# autoTagger.js | ||
JavaScript text auto tagger | ||
Simple JavaScript text auto tagger | ||
# Usage | ||
Include the javascript in the browser: | ||
<script type="text/javascript" src="https://cdn.rawgit.com/eberlitz/autoTagger/master/autoTagger.js"></script> | ||
or use it in Node.js: | ||
``` | ||
$> npm install auto-tagger | ||
``` | ||
``` | ||
var autoTagger = require('auto-tagger'); | ||
``` | ||
You can use the following methods to extract relevant tags from documents: | ||
``` | ||
var testText = "This text is from a Wikipedia entry about Bayes' Theorem. Bayesian inference has applications in artificial intelligence and expert systems. Bayesian inference techniques have been a fundamental part of computerized pattern recognition techniques since the late 1950s. There is also an ever growing connection between Bayesian methods and simulation-based Monte Carlo techniques since complex models cannot be processed in closed form by a Bayesian analysis, while the graphical model structure inherent to statistical models, may allow for efficient simulation algorithms like the Gibbs sampling and other Metropolis-Hastings algorithm schemes. Recently Bayesian inference has gained popularity amongst the phylogenetics community for these reasons; applications such as BEAST, MrBayes and P4 allow many demographic and evolutionary parameters to be estimated simultaneously." | ||
new autoTagger().useStopWords('en').fromText(testText) | ||
var tags = autoTagger | ||
// using portuguese stop words | ||
.useStopWords('pt') | ||
// using english stop words | ||
.useStopWords('en') | ||
// adding additional stop words | ||
.useStopWords(['will']) | ||
// extract tags from text | ||
// return tags that have at least 2 occurrences | ||
// and look for occurrences of 4 consecutive words | ||
.fromText(testText,2,4); | ||
console.log(tags); | ||
``` |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
10126
105
44