prospectimo
Advanced tools
Comparing version 0.2.0 to 0.3.0
204
index.js
/** | ||
* prospectimo | ||
* v0.2.0 | ||
* v0.3.0 | ||
* | ||
@@ -24,7 +24,8 @@ * Analyse the temporal orientation of a string. | ||
* 'encoding': 'binary', // 'binary' (default), or 'frequency' - type of word encoding to use. | ||
* 'threshold': -0.38, // | ||
* 'bigrams': true, // compare against bigrams in the lexicon? | ||
* 'trigrams': true, // compare against trigrams in the lexicon? | ||
* } | ||
* const text = "A big long string of text..."; | ||
* const orientation = prospectimo(text, opts); | ||
* const str = "A big long string of text..."; | ||
* const orientation = prospectimo(str, opts); | ||
* console.log(orientation) | ||
@@ -34,3 +35,3 @@ * | ||
* @param {Object} opts options | ||
* @return {string|number} temporal orientation or lexical value based on opts | ||
* @return {Object|string} temporal orientation or lexical value based on opts | ||
*/ | ||
@@ -43,16 +44,15 @@ | ||
let tokenizer = root.tokenizer | ||
let lexicon = root.lexicon | ||
let natural = root.natural | ||
let tokenizer = root.tokenizer | ||
if (typeof tokenizer === 'undefined') { | ||
const hasRequire = typeof require !== 'undefined' | ||
if (hasRequire) { | ||
tokenizer = require('happynodetokenizer') | ||
if (typeof lexicon === 'undefined') { | ||
if (typeof require !== 'undefined') { | ||
lexicon = require('./data/lexicon.json') | ||
natural = require('natural') | ||
} else throw new Error('prospectimo required happynodetokenizer and ./data/lexicon.json') | ||
tokenizer = require('happynodetokenizer') | ||
} else throw new Error('prospectimo requires node modules happynodetokenizer and natural, and ./data/lexicon.json') | ||
} | ||
// get multiple indexes helper | ||
// Find how many times an element appears in an array | ||
Array.prototype.indexesOf = function (el) { | ||
@@ -70,14 +70,18 @@ const idxs = [] | ||
/** | ||
* @function getBigrams | ||
* @param {string} str input string | ||
* @return {Array} array of bigram strings | ||
* Get all the n-grams of a string and return as an array | ||
* @function getNGrams | ||
* @param {string} str input string | ||
* @param {number} n arbitrary n-gram number, e.g. 2 = bigrams | ||
* @return {Array} array of ngram strings | ||
*/ | ||
const getBigrams = str => { | ||
const NGrams = natural.NGrams | ||
const bigrams = NGrams.bigrams(str) | ||
const getNGrams = (str, n) => { | ||
// default to bi-grams on null n | ||
if (n == null) n = 2 | ||
if (typeof n !== 'number') n = Number(n) | ||
const ngrams = natural.NGrams.ngrams(str, n) | ||
const len = ngrams.length | ||
const result = [] | ||
const len = bigrams.length | ||
let i = 0 | ||
for (i; i < len; i++) { | ||
result.push(bigrams[i].join(' ')) | ||
result.push(ngrams[i].join(' ')) | ||
} | ||
@@ -88,19 +92,14 @@ return result | ||
/** | ||
* @function getTrigrams | ||
* @param {string} str input string | ||
* @return {Array} array of trigram strings | ||
* Loop through lexicon and match against array | ||
* @function getMatches | ||
* @param {Array} arr token array | ||
* @param {number} threshold min. weight threshold | ||
* @return {Object} object of matches | ||
*/ | ||
const getTrigrams = str => { | ||
const NGrams = natural.NGrams | ||
const trigrams = NGrams.trigrams(str) | ||
const result = [] | ||
const len = trigrams.length | ||
let i = 0 | ||
for (i; i < len; i++) { | ||
result.push(trigrams[i].join(' ')) | ||
} | ||
return result | ||
} | ||
const getMatches = (arr) => { | ||
const getMatches = (arr, threshold) => { | ||
// error prevention | ||
if (arr == null) return null | ||
if (threshold == null) threshold = -999 | ||
if (typeof threshold !== 'number') threshold = Number(threshold) | ||
// loop through categories in lexicon | ||
const matches = {} | ||
@@ -111,23 +110,15 @@ let category | ||
let match = [] | ||
let key | ||
let word | ||
let data = lexicon[category] | ||
for (key in data) { | ||
if (!data.hasOwnProperty(key)) continue | ||
if (arr.indexOf(key) > -1) { | ||
let item | ||
let weight = data[key] | ||
let reps = arr.indexesOf(key).length | ||
if (reps > 1) { | ||
let words = [] | ||
for (let i = 0; i < reps; i++) { | ||
words.push(key) | ||
} | ||
item = [words, weight] | ||
} else { | ||
item = [key, weight] | ||
} | ||
match.push(item) | ||
// loop through words in category | ||
for (word in data) { | ||
if (!data.hasOwnProperty(word)) continue | ||
let weight = data[word] | ||
// if word from input matches word from lexicon ... | ||
if (arr.indexOf(word) > -1 && weight > threshold) { | ||
let count = arr.indexesOf(word).length // number of times the word appears in the input text | ||
match.push([word, count, weight]) | ||
} | ||
matches[category] = match | ||
} | ||
matches[category] = match | ||
} | ||
@@ -137,34 +128,35 @@ return matches | ||
/** | ||
* Calculate the total lexical value of matches | ||
* @function calcLex | ||
* @param {Object} obj matches object | ||
* @param {number} wc wordcount | ||
* @param {number} int intercept value | ||
* @param {string} enc encoding | ||
* @return {number} lexical value | ||
*/ | ||
const calcLex = (obj, wc, int, enc) => { | ||
const counts = [] | ||
const weights = [] | ||
let key | ||
for (key in obj) { | ||
if (!obj.hasOwnProperty(key)) continue | ||
if (Array.isArray(obj[key][0])) { | ||
counts.push(obj[key][0].length) | ||
} else { | ||
counts.push(1) | ||
} | ||
weights.push(obj[key][1]) | ||
} | ||
if (obj == null) return null | ||
let lex = 0 | ||
let i | ||
const len = counts.length | ||
const words = Number(wc) | ||
for (i = 0; i < len; i++) { | ||
let weight = Number(weights[i]) | ||
if (enc === 'frequency') { | ||
let count = Number(counts[i]) | ||
lex += (count / words) * weight | ||
let word | ||
for (word in obj) { | ||
if (!obj.hasOwnProperty(word)) continue | ||
if (enc === 'binary' || enc == null || wc == null) { | ||
// weight + weight + weight etc | ||
lex += Number(obj[word][2]) | ||
} else { | ||
lex += weight | ||
// (frequency / wordcount) * weight | ||
lex += (Number(obj[word][1]) / Number(wc)) * Number(obj[word][2]) | ||
} | ||
} | ||
// add intercept value | ||
lex += int | ||
// return final lexical value | ||
if (int != null) lex += Number(int) | ||
return lex | ||
} | ||
/** | ||
* Converts the lexical values object to an orientation string | ||
* @function getOrientation | ||
* @param {Object} obj lexical values object | ||
* @return {string} 'Past', 'Present', or 'Future' | ||
*/ | ||
const getOrientation = obj => { | ||
@@ -174,27 +166,24 @@ const a = [obj.PAST, obj.PRESENT, obj.FUTURE] | ||
let ori | ||
let orientation = `No temporal orientation detected.` | ||
if (indexOfMaxValue === 0) { | ||
ori = 'past' | ||
orientation = 'Past' | ||
} else if (indexOfMaxValue === 1) { | ||
ori = 'present' | ||
orientation = 'Present' | ||
} else if (indexOfMaxValue === 2) { | ||
ori = 'future' | ||
orientation = 'Future' | ||
} | ||
let str | ||
if (a[indexOfMaxValue] < 0) { | ||
str = `No temporal orientation association detected.` | ||
} else { | ||
str = ori | ||
} | ||
return str | ||
return orientation | ||
} | ||
/** | ||
* Analyse the temporal orientation of a string | ||
* @function prospectimo | ||
* @param {string} str input string | ||
* @param {Object} opts options | ||
* @return {Object|string} temporal orientation or lexical value based on opts | ||
*/ | ||
const prospectimo = (str, opts) => { | ||
// make sure there is input before proceeding | ||
// error prevention | ||
if (str == null) return null | ||
// if str isn't a string, make it into one | ||
if (typeof str !== 'string') str = str.toString() | ||
// trim whitespace and convert to lowercase | ||
str = str.toLowerCase().trim() | ||
// default options | ||
@@ -205,4 +194,5 @@ if (opts == null) { | ||
'encoding': 'binary', | ||
'bigrams': true, // match bigrams? | ||
'trigrams': true // match trigrams? | ||
'threshold': -999, | ||
'bigrams': true, | ||
'trigrams': true | ||
} | ||
@@ -212,20 +202,23 @@ } | ||
opts.encoding = opts.encoding || 'binary' | ||
opts.threshold = opts.threshold || -999 | ||
// convert to lowercase and trim whitespace | ||
str = str.toLowerCase().trim() | ||
// convert our string to tokens | ||
let tokens = tokenizer(str) | ||
// if no tokens return null | ||
if (tokens == null) return { PAST: 0, PRESENT: 0, FUTURE: 0 } | ||
// get wordcount | ||
if (tokens == null) return null | ||
// get wordcount before we add n-grams | ||
const wordcount = tokens.length | ||
// handle bigrams if wanted | ||
// handle bi-grams if wanted | ||
if (opts.bigrams) { | ||
const bigrams = getBigrams(str) | ||
const bigrams = getNGrams(str, 2) | ||
tokens = tokens.concat(bigrams) | ||
} | ||
// handle trigrams if wanted | ||
// handle tri-grams if wanted | ||
if (opts.trigrams) { | ||
const trigrams = getTrigrams(str) | ||
const trigrams = getNGrams(str, 3) | ||
tokens = tokens.concat(trigrams) | ||
} | ||
// get matches from array | ||
const matches = getMatches(tokens) | ||
const matches = getMatches(tokens, opts.threshold) | ||
// calculate lexical usage | ||
@@ -238,7 +231,4 @@ const enc = opts.encoding | ||
// predict and return | ||
if (opts.return === 'lex') { | ||
return lex | ||
} else { | ||
return getOrientation(lex) | ||
} | ||
if (opts.return === 'lex') return lex | ||
return getOrientation(lex) | ||
} | ||
@@ -245,0 +235,0 @@ |
{ | ||
"name": "prospectimo", | ||
"version": "0.2.0", | ||
"version": "0.3.0", | ||
"description": "Analyse the temporal orientation of a string.", | ||
@@ -5,0 +5,0 @@ "main": "index.js", |
# prospectimo | ||
Analyse the temporal orientation of a string. | ||
Get the temporal orientation of a string. | ||
@@ -9,12 +9,45 @@ ## Usage | ||
const opts = { | ||
'return': 'lex', // 'orientation' return string, 'lex' (default) returns object of lexical values | ||
"threshold": -0.2, | ||
"bigrams": true, | ||
"trigrams": true | ||
'return': 'lex', // 'orientation' returns a string, 'lex' (default) returns object of lexical values | ||
'encoding': 'binary', // 'binary' (default), or 'frequency' - type of word encoding to use. | ||
'threshold': -0.98, // a lexical weight threshold between 1.16 (include nothing), and -0.98 (include everything, default) | ||
'bigrams': true, // compare against bigrams in the lexicon? | ||
'trigrams': true, // compare against trigrams in the lexicon? | ||
} | ||
const text = "A big long string of text..."; | ||
let orientation = prospectimo(text, opts); | ||
console.log(orientation) | ||
const str = "A big long string of text..."; | ||
const orientation = prospectimo(str, opts); | ||
console.log(orientation); | ||
``` | ||
## Options | ||
### 'return' | ||
Valid options: 'lex' (default), or 'orientation'. | ||
'lex' returns an object with 'PAST', 'PRESENT' and 'FUTURE' keys, each containing a lexical value for that orientation. | ||
'orientation' returns a string stating either 'Past', 'Present', or 'Future', or the message 'No temporal orientation detected.'.
### 'encoding' | ||
Valid options: 'binary' (default), or 'frequency'. | ||
'binary' calculates the lexical value as simply a sum of weights, i.e. weight[1] + weight[2] + etc... | ||
'frequency' calculates the lexical value as (word frequency / total wordcount) * word weight | ||
Unless you have a specific need for frequency encoding, we recommend you use binary only. | ||
### 'threshold' | ||
The lexicon contains weight values that are very small. You can exclude them using the threshold option. | ||
The smallest value in the lexicon is -0.9772179. Therefore a threshold of -0.98 will include all words in the lexicon. | ||
The largest value in the lexicon is 1.15807005. Therefore a threshold of 1.16 will include no words in the lexicon. | ||
### 'bigrams' and 'trigrams' | ||
The lexicon includes strings that are between one and three words in length. By default we will match against these using bi-grams and tri-grams; however, you may want to disable these when analysing very long strings to save processing time and memory use.
## Acknowledgements | ||
@@ -28,7 +61,7 @@ | ||
Used under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported licence | ||
Used under the [Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported](http://creativecommons.org/licenses/by-nc-sa/3.0/) | ||
# Licence | ||
(C) 2017 P. Hughes | ||
(C) 2017 [P. Hughes](http://www.phugh.es)
[Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported](http://creativecommons.org/licenses/by-nc-sa/3.0/) |
56272
66
226