Socket
Socket
Sign in | Demo | Install

node-normalizer

Package Overview
Dependencies
Maintainers
1
Versions
21
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

node-normalizer - npm Package Compare versions

Comparing version 0.0.7 to 0.0.8

1

data/contractions.txt

@@ -169,2 +169,3 @@ sha_n't shall+not

what've what+have
what's what+is
what_'s what+is

@@ -171,0 +172,0 @@ when_'s when+is

246

lib/normalizer.js

@@ -1,18 +0,18 @@

var fs = require('fs');
var path = require("path");
var readline = require('readline');
var stream = require('stream');
var str = require("string");
var async = require("async");
var debug = require('debug')("Normalizer");
var fs = require('fs');
var path = require("path");
var readline = require('readline');
var stream = require('stream');
var str = require("string");
var async = require("async");
var debug = require('debug')("Normalizer");
// TODO, fix the paths
// Substitution lists to load. Each entry's `key` names the rule group the
// list's patterns are stored under, and `file` is the list's file name, which
// the reader resolves inside the package's data directory.
// NOTE(review): '_britsh' looks like a misspelling of "british", but it is a
// runtime key, so it is kept unchanged here.
var tasks = [
  {'key':'_sys','file':'systemessentials.txt'},
  {'key':'_extra','file':'substitutes.txt'},
  {'key':'_contractions','file':'contractions.txt'},
  {'key':'_interjections','file':'interjections.txt'},
  {'key':'_britsh','file':'british.txt'},
  {'key':'_spellfix','file':'spellfix.txt'},
  {'key':'_texting','file':'texting.txt'}
];

@@ -23,28 +23,31 @@

/**
 * Reads a substitution list from "../data/<file>" (relative to this module)
 * and reports each entry through `lineHandle`.
 *
 * Line format: "<key> <replacement>", split on single spaces; only the first
 * two fields are used and a missing replacement is reported as "".
 * Leading whitespace is ignored. A '#' starts a comment: a line beginning with
 * '#' is skipped entirely, and anything from a later '#' to the end of the
 * line is dropped before the line is split. Blank lines are skipped.
 *
 * The file is read synchronously, so every `lineHandle` call and the single
 * `closeHandle` call happen before this function returns.
 *
 * @param {string} file - File name inside the data directory.
 * @param {function(string, string)} lineHandle - Called once per entry with
 *   (key, replacement).
 * @param {function()} closeHandle - Called exactly once, with no arguments,
 *   after the last line has been handled.
 */
var readSubstitutes = function(file, lineHandle, closeHandle) {
  var p = path.join(__dirname, "../data/", file);
  // Accept both CRLF and LF endings; splitting on "\r\n" alone would turn an
  // LF-only file into one giant line.
  var data = fs.readFileSync(p, 'utf8').split(/\r?\n/);

  for (var i = 0; i < data.length; i++) {
    var nline = data[i].replace(/^\s+/, "");

    // Lets allow comments with '#'
    var pos = nline.indexOf('#');
    if (pos === 0) {
      continue; // whole line is a comment
    }
    if (pos > 0) {
      nline = nline.slice(0, pos); // strip the trailing comment
    }
    if (nline === "") {
      continue; // nothing to register for a blank line
    }

    var parts = nline.split(" ");
    // Never hand `undefined` to the handler: it would be concatenated into
    // the replacement text as the literal string "undefined".
    lineHandle(parts[0], parts[1] === undefined ? "" : parts[1]);
  }

  closeHandle();
};

@@ -54,60 +57,65 @@

var itor = function(item, cb) {
debug("Loaded File", item);
var fc = 0;
var itor = function(item, cb2) {
debug("Loaded File", item);
var lineHandle = function(key, replacer) {
var lineHandle = function(key, replacer) {
if (reSet[item.key] === undefined) {
reSet[item.key] = {};
}
if (reSet[item.key][key] === undefined) {
reSet[item.key][key] = [];
}
// Add RegEx
var startM, endM, lookup = key;
if (key[0] == '<') {
startM = true;
lookup = key.substring(1);
}
if (reSet[item.key] === undefined) {
reSet[item.key] = {};
}
if (reSet[item.key][key] === undefined) {
reSet[item.key][key] = [];
}
// Add RegEx
var startM, endM, lookup = key;
if (key[0] == '<') {
startM = true;
lookup = key.substring(1);
}
if (key.slice(-1) == '>') {
endM = true;
lookup = lookup.substring(0, lookup.length - 1);
}
if (key.slice(-1) == '>') {
endM = true;
lookup = lookup.substring(0, lookup.length - 1);
}
lookup = lookup.replace(/_/g," ");
var qm = quotemeta(lookup);
lookup = lookup.replace(/_/g," ");
var qm = quotemeta(lookup);
if (startM && endM) {
reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
} else if (startM) {
reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+)", "gi"), r: replacer + "$1"});
} else if (endM) {
if (startM && endM) {
reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
} else if (startM) {
reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+)", "gi"), r: replacer + "$1"});
} else if (endM) {
reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "$", "gi"), r: "$1" + replacer });
if (item.key == "_sys") {
reSet[item.key][key].push({re: new RegExp(qm + "$", "gi"), r: replacer });
} else {
// reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "(\\W+)", "gi"), r: "$1" + replacer + "$2" });
}
} else {
reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+)", "gi"), r: replacer + "$1" });
reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "(\\W+)", "gi"), r: "$1" + replacer + "$2" });
reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "$", "gi"), r: "$1" + replacer });
}
}
readSubstitutes(item.file, lineHandle, function() {
cb(null);
});
}
reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "$", "gi"), r: "$1" + replacer });
if (item.key == "_sys") {
reSet[item.key][key].push({re: new RegExp(qm + "$", "gi"), r: replacer });
} else {
// reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "(\\W+)", "gi"), r: "$1" + replacer + "$2" });
}
} else {
reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+)", "gi"), r: replacer + "$1" });
reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "(\\W+)", "gi"), r: "$1" + replacer + "$2" });
reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "$", "gi"), r: "$1" + replacer });
}
}
readSubstitutes(item.file, lineHandle, function() {
fc++;
if (tasks.length == fc) {
debug("Done Reading Subs");
cb2('done');
}
});
}
async.map(tasks, itor, function(){
debug("Done Loading files");
cb();
});
async.map(tasks, itor, function(){
debug("Done Loading files");
cb();
});
};

@@ -117,41 +125,41 @@

msg = msg.replace(/\+/g, "<plus>");
msg = msg.replace(new RegExp("\t", "g"), " ");
msg = msg.replace(/\s+/g, " ");
msg = msg.replace(/\+/g, "<plus>");
msg = msg.replace(new RegExp("\t", "g"), " ");
msg = msg.replace(/\s+/g, " ");
var fileItor = function(item1, next1) {
var fileItor = function(item1, next1) {
var itemItor = function(item2, next2) {
var reArray = reSet[item1][item2];
var reItor = function(item3, next3) {
var pm = msg;
msg = msg.replace(item3.re, item3.r);
next3(null);
};
var itemItor = function(item2, next2) {
var reArray = reSet[item1][item2];
var reItor = function(item3, next3) {
var pm = msg;
msg = msg.replace(item3.re, item3.r);
next3(null);
};
async.map(reArray, reItor, function(){
next2(null);
});
}
async.map(reArray, reItor, function(){
next2(null);
});
}
async.each(Object.keys(reSet[item1]), itemItor, function(){
next1(null)
});
}
async.each(Object.keys(reSet[item1]), itemItor, function(){
next1(null)
});
}
async.mapSeries(Object.keys(reSet), fileItor, function() {
msg = msg.replace(new RegExp("[\+]{1}", "g"), " ");
msg = msg.replace(new RegExp("<plus>", "g"), "+");
msg = msg.replace(/\d,\d/g, function(v) { return v.replace(",",""); });
});
async.mapSeries(Object.keys(reSet), fileItor, function() {
msg = msg.replace(new RegExp("[\+]{1}", "g"), " ");
msg = msg.replace(new RegExp("<plus>", "g"), "+");
msg = msg.replace(/\d,\d/g, function(v) { return v.replace(",",""); });
});
return msg.trim();
return msg.trim();
}
/**
 * Escapes regular-expression metacharacters so that `string` can be embedded
 * in a RegExp source and matched literally.
 *
 * The escaped set is: \ . + * ? [ ^ ] $ ( ) { } = ! < > | :
 * Each occurrence is prefixed with a single backslash; all other characters
 * are returned unchanged.
 *
 * @param {string} string - Text to escape.
 * @returns {string} The escaped text.
 */
var quotemeta = function (string) {
  // One pass over the input: "$&" re-inserts the matched character after the
  // added backslash, so backslashes introduced here are never escaped again.
  return string.replace(/[\\.+*?\[\^\]$(){}=!<>|:]/g, "\\$&");
};
{
"name": "node-normalizer",
"version": "0.0.7",
"version": "0.0.8",
"description": "Normalize and clean text",
"main": "index.js",
"scripts": {
"test": "mocha test/basic.js -R spec"
"test": "mocha test/basic.js -R spec -t 4000"
},

@@ -9,0 +9,0 @@ "author": "Rob Ellis",

@@ -63,4 +63,3 @@

});
});
Socket | Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc