node-normalizer
Advanced tools
Comparing version 0.0.7 to 0.0.8
@@ -169,2 +169,3 @@ sha_n't shall+not | ||
what've what+have | ||
what's what+is | ||
what_'s what+is | ||
@@ -171,0 +172,0 @@ when_'s when+is |
@@ -1,18 +0,18 @@ | ||
var fs = require('fs'); | ||
var path = require("path"); | ||
var readline = require('readline'); | ||
var stream = require('stream'); | ||
var str = require("string"); | ||
var async = require("async"); | ||
var debug = require('debug')("Normalizer"); | ||
var fs = require('fs'); | ||
var path = require("path"); | ||
var readline = require('readline'); | ||
var stream = require('stream'); | ||
var str = require("string"); | ||
var async = require("async"); | ||
var debug = require('debug')("Normalizer"); | ||
// TODO, fix the paths | ||
var tasks = [ | ||
{'key':'_sys','file':'systemessentials.txt'}, | ||
{'key':'_extra','file':'substitutes.txt'}, | ||
{'key':'_contractions','file':'contractions.txt'}, | ||
{'key':'_interjections','file':'interjections.txt'}, | ||
{'key':'_britsh','file':'british.txt'}, | ||
{'key':'_spellfix','file':'spellfix.txt'}, | ||
{'key':'_texting','file':'texting.txt'} | ||
{'key':'_sys','file':'systemessentials.txt'}, | ||
{'key':'_extra','file':'substitutes.txt'}, | ||
{'key':'_contractions','file':'contractions.txt'}, | ||
{'key':'_interjections','file':'interjections.txt'}, | ||
{'key':'_britsh','file':'british.txt'}, | ||
{'key':'_spellfix','file':'spellfix.txt'}, | ||
{'key':'_texting','file':'texting.txt'} | ||
]; | ||
@@ -23,28 +23,31 @@ | ||
var readSubstitutes = function(file, lineHandle, closeHandle) { | ||
var p = path.join(__dirname, "../data/", file) | ||
var instream = fs.createReadStream(p); | ||
var outstream = new stream; | ||
var rl = readline.createInterface(instream, outstream); | ||
rl.on('line', function(line){ | ||
var nline = str(line).trimLeft(); | ||
// Lets allow comments with '#' | ||
var pos = nline.indexOf('#'); | ||
if (pos == -1) { | ||
var parts = nline.s.split(" "); | ||
if (parts[1] == undefined) { | ||
lineHandle(parts[0], ""); | ||
} else { | ||
lineHandle(parts[0], parts[1]); | ||
} | ||
} else if (pos > 0) { | ||
nline = nline.left(pos); | ||
var parts = nline.s.split(" "); | ||
lineHandle(parts[0], parts[1]); | ||
} | ||
}); | ||
var p = path.join(__dirname, "../data/", file) | ||
var data = fs.readFileSync(p,'utf8').split("\r\n"); | ||
rl.on('close', closeHandle); | ||
for (var i = 0; i < data.length; i++) { | ||
var line = data[i]; | ||
var nline = str(line).trimLeft(); | ||
// Lets allow comments with '#' | ||
var pos = nline.indexOf('#'); | ||
if (pos === -1) { | ||
var parts = nline.s.split(" "); | ||
if (parts[1] == undefined) { | ||
lineHandle(parts[0], ""); | ||
} else { | ||
lineHandle(parts[0], parts[1]); | ||
} | ||
} else if (pos > 0) { | ||
nline = nline.left(pos); | ||
var parts = nline.s.split(" "); | ||
lineHandle(parts[0], parts[1]); | ||
} | ||
} | ||
closeHandle(); | ||
} | ||
@@ -54,60 +57,65 @@ | ||
var itor = function(item, cb) { | ||
debug("Loaded File", item); | ||
var fc = 0; | ||
var itor = function(item, cb2) { | ||
debug("Loaded File", item); | ||
var lineHandle = function(key, replacer) { | ||
var lineHandle = function(key, replacer) { | ||
if (reSet[item.key] === undefined) { | ||
reSet[item.key] = {}; | ||
} | ||
if (reSet[item.key][key] === undefined) { | ||
reSet[item.key][key] = []; | ||
} | ||
// Add RegEx | ||
var startM, endM, lookup = key; | ||
if (key[0] == '<') { | ||
startM = true; | ||
lookup = key.substring(1); | ||
} | ||
if (reSet[item.key] === undefined) { | ||
reSet[item.key] = {}; | ||
} | ||
if (reSet[item.key][key] === undefined) { | ||
reSet[item.key][key] = []; | ||
} | ||
// Add RegEx | ||
var startM, endM, lookup = key; | ||
if (key[0] == '<') { | ||
startM = true; | ||
lookup = key.substring(1); | ||
} | ||
if (key.slice(-1) == '>') { | ||
endM = true; | ||
lookup = lookup.substring(0, lookup.length - 1); | ||
} | ||
if (key.slice(-1) == '>') { | ||
endM = true; | ||
lookup = lookup.substring(0, lookup.length - 1); | ||
} | ||
lookup = lookup.replace(/_/g," "); | ||
var qm = quotemeta(lookup); | ||
lookup = lookup.replace(/_/g," "); | ||
var qm = quotemeta(lookup); | ||
if (startM && endM) { | ||
reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer }); | ||
} else if (startM) { | ||
reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer }); | ||
reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+)", "gi"), r: replacer + "$1"}); | ||
} else if (endM) { | ||
if (startM && endM) { | ||
reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer }); | ||
} else if (startM) { | ||
reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer }); | ||
reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+)", "gi"), r: replacer + "$1"}); | ||
} else if (endM) { | ||
reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "$", "gi"), r: "$1" + replacer }); | ||
if (item.key == "_sys") { | ||
reSet[item.key][key].push({re: new RegExp(qm + "$", "gi"), r: replacer }); | ||
} else { | ||
// reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "(\\W+)", "gi"), r: "$1" + replacer + "$2" }); | ||
} | ||
} else { | ||
reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer }); | ||
reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+)", "gi"), r: replacer + "$1" }); | ||
reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "(\\W+)", "gi"), r: "$1" + replacer + "$2" }); | ||
reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "$", "gi"), r: "$1" + replacer }); | ||
} | ||
} | ||
readSubstitutes(item.file, lineHandle, function() { | ||
cb(null); | ||
}); | ||
} | ||
reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "$", "gi"), r: "$1" + replacer }); | ||
if (item.key == "_sys") { | ||
reSet[item.key][key].push({re: new RegExp(qm + "$", "gi"), r: replacer }); | ||
} else { | ||
// reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "(\\W+)", "gi"), r: "$1" + replacer + "$2" }); | ||
} | ||
} else { | ||
reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer }); | ||
reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+)", "gi"), r: replacer + "$1" }); | ||
reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "(\\W+)", "gi"), r: "$1" + replacer + "$2" }); | ||
reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "$", "gi"), r: "$1" + replacer }); | ||
} | ||
} | ||
readSubstitutes(item.file, lineHandle, function() { | ||
fc++; | ||
if (tasks.length == fc) { | ||
debug("Done Reading Subs"); | ||
cb2('done'); | ||
} | ||
}); | ||
} | ||
async.map(tasks, itor, function(){ | ||
debug("Done Loading files"); | ||
cb(); | ||
}); | ||
async.map(tasks, itor, function(){ | ||
debug("Done Loading files"); | ||
cb(); | ||
}); | ||
}; | ||
@@ -117,41 +125,41 @@ | ||
msg = msg.replace(/\+/g, "<plus>"); | ||
msg = msg.replace(new RegExp("\t", "g"), " "); | ||
msg = msg.replace(/\s+/g, " "); | ||
msg = msg.replace(/\+/g, "<plus>"); | ||
msg = msg.replace(new RegExp("\t", "g"), " "); | ||
msg = msg.replace(/\s+/g, " "); | ||
var fileItor = function(item1, next1) { | ||
var fileItor = function(item1, next1) { | ||
var itemItor = function(item2, next2) { | ||
var reArray = reSet[item1][item2]; | ||
var reItor = function(item3, next3) { | ||
var pm = msg; | ||
msg = msg.replace(item3.re, item3.r); | ||
next3(null); | ||
}; | ||
var itemItor = function(item2, next2) { | ||
var reArray = reSet[item1][item2]; | ||
var reItor = function(item3, next3) { | ||
var pm = msg; | ||
msg = msg.replace(item3.re, item3.r); | ||
next3(null); | ||
}; | ||
async.map(reArray, reItor, function(){ | ||
next2(null); | ||
}); | ||
} | ||
async.map(reArray, reItor, function(){ | ||
next2(null); | ||
}); | ||
} | ||
async.each(Object.keys(reSet[item1]), itemItor, function(){ | ||
next1(null) | ||
}); | ||
} | ||
async.each(Object.keys(reSet[item1]), itemItor, function(){ | ||
next1(null) | ||
}); | ||
} | ||
async.mapSeries(Object.keys(reSet), fileItor, function() { | ||
msg = msg.replace(new RegExp("[\+]{1}", "g"), " "); | ||
msg = msg.replace(new RegExp("<plus>", "g"), "+"); | ||
msg = msg.replace(/\d,\d/g, function(v) { return v.replace(",",""); }); | ||
}); | ||
async.mapSeries(Object.keys(reSet), fileItor, function() { | ||
msg = msg.replace(new RegExp("[\+]{1}", "g"), " "); | ||
msg = msg.replace(new RegExp("<plus>", "g"), "+"); | ||
msg = msg.replace(/\d,\d/g, function(v) { return v.replace(",",""); }); | ||
}); | ||
return msg.trim(); | ||
return msg.trim(); | ||
} | ||
var quotemeta = function (string) { | ||
var unsafe = "\\.+*?[^]$(){}=!<>|:"; | ||
for (var i = 0; i < unsafe.length; i++) { | ||
string = string.replace(new RegExp("\\" + unsafe.charAt(i), "g"), "\\" + unsafe.charAt(i)); | ||
} | ||
return string; | ||
var unsafe = "\\.+*?[^]$(){}=!<>|:"; | ||
for (var i = 0; i < unsafe.length; i++) { | ||
string = string.replace(new RegExp("\\" + unsafe.charAt(i), "g"), "\\" + unsafe.charAt(i)); | ||
} | ||
return string; | ||
}; |
{ | ||
"name": "node-normalizer", | ||
"version": "0.0.7", | ||
"version": "0.0.8", | ||
"description": "Normalize and clean text", | ||
"main": "index.js", | ||
"scripts": { | ||
"test": "mocha test/basic.js -R spec" | ||
"test": "mocha test/basic.js -R spec -t 4000" | ||
}, | ||
@@ -9,0 +9,0 @@ "author": "Rob Ellis", |
@@ -63,4 +63,3 @@ | ||
}); | ||
}); |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
209723
177