node-normalizer
Advanced tools
Comparing version 0.0.11 to 0.1.0
@@ -1259,3 +1259,3 @@ a.i. artificial+intelligence | ||
quite | ||
raelly | ||
raelly really | ||
rather_then rather+than | ||
@@ -1262,0 +1262,0 @@ real_actual actual |
@@ -7,8 +7,21 @@ var fs = require('fs'); | ||
var async = require("async"); | ||
var RE2 = require("re2"); | ||
var debug = require('debug')("Normalizer"); | ||
var re1 = new RE2(/\+/g); | ||
var re2 = new RE2(/\t/g); | ||
var re3 = new RE2(/\s+/g); | ||
var re4 = new RE2(/(’|‘)/g); | ||
var re5 = new RE2(/(“|”)/g); | ||
var re6 = new RE2(/(–|—)/g); | ||
var re7 = new RE2(/[^\x00-\x7F]/g); | ||
var re8 = new RE2(/[\+]{1}/g); | ||
var re9 = new RE2(/<plus>/g); | ||
var re10 = new RE2(/\d,\d/g); | ||
var re11 = new RE2(/_/g); | ||
// TODO, fix the paths | ||
var tasks = [ | ||
{'key':'_sys','file':'systemessentials.txt'}, | ||
{'key':'_extra','file':'substitutes.txt'}, | ||
{'key':'_sys','file':'systemessentials.txt'}, | ||
{'key':'_extra','file':'substitutes.txt'}, | ||
{'key':'_contractions','file':'contractions.txt'}, | ||
@@ -25,3 +38,3 @@ {'key':'_interjections','file':'interjections.txt'}, | ||
var p = path.join(__dirname, "../data/", file) | ||
var p = path.join(__dirname, "../data/", file); | ||
var data = fs.readFileSync(p,'utf8').split("\r\n"); | ||
@@ -39,4 +52,4 @@ | ||
if (parts[1] == undefined) { | ||
lineHandle(parts[0], ""); | ||
if (parts[1] === undefined) { | ||
lineHandle(parts[0], ""); | ||
} else { | ||
@@ -47,3 +60,3 @@ | ||
} else if (pos > 0) { | ||
} else if (pos > 0) { | ||
nline = nline.left(pos); | ||
@@ -56,3 +69,3 @@ var parts = nline.s.split(" "); | ||
closeHandle(); | ||
} | ||
}; | ||
@@ -84,7 +97,6 @@ exports.loadData = function(cb){ | ||
endM = true; | ||
lookup = lookup.substring(0, lookup.length - 1); | ||
lookup = lookup.substring(0, lookup.length - 1); | ||
} | ||
lookup = lookup.replace(/_/g," "); | ||
var qm = quotemeta(lookup); | ||
var qm = quotemeta(re11.replace(lookup, " ")); | ||
@@ -110,3 +122,3 @@ if (startM && endM) { | ||
} | ||
} | ||
}; | ||
@@ -120,3 +132,3 @@ readSubstitutes(item.file, lineHandle, function() { | ||
}); | ||
} | ||
}; | ||
@@ -131,17 +143,15 @@ async.map(tasks, itor, function(){ | ||
msg = msg.replace(/\+/g, "<plus>"); | ||
msg = msg.replace(new RegExp("\t", "g"), " "); | ||
msg = msg.replace(/\s+/g, " "); | ||
msg = msg.replace(/(’|‘)/g, "'"); | ||
msg = msg.replace(/(“|”)/g, '"'); | ||
msg = msg.replace(/(–|—)/g, "—"); | ||
msg = msg.replace(/[^\x00-\x7F]/g, ""); | ||
msg = re1.replace(msg, "<plus>"); | ||
msg = re2.replace(msg, " "); | ||
msg = re3.replace(msg, " "); | ||
msg = re4.replace(msg, "'"); | ||
msg = re5.replace(msg, '"'); | ||
msg = re6.replace(msg, "—"); | ||
msg = re7.replace(msg, ""); | ||
var fileItor = function(item1, next1) { | ||
var itemItor = function(item2, next2) { | ||
var reArray = reSet[item1][item2]; | ||
var reItor = function(item3, next3) { | ||
var pm = msg; | ||
// msg = item3.re.replace(msg, item3.r); | ||
msg = msg.replace(item3.re, item3.r); | ||
@@ -154,17 +164,17 @@ next3(null); | ||
}); | ||
} | ||
}; | ||
async.each(Object.keys(reSet[item1]), itemItor, function(){ | ||
next1(null) | ||
next1(null); | ||
}); | ||
} | ||
}; | ||
async.mapSeries(Object.keys(reSet), fileItor, function() { | ||
msg = msg.replace(new RegExp("[\+]{1}", "g"), " "); | ||
msg = msg.replace(new RegExp("<plus>", "g"), "+"); | ||
msg = msg.replace(/\d,\d/g, function(v) { return v.replace(",",""); }); | ||
msg = re8.replace(msg, " "); | ||
msg = re9.replace(msg, "+"); | ||
msg = re10.replace(msg, function(v) { return v.replace(",",""); }); | ||
}); | ||
return msg.trim(); | ||
} | ||
return msg.trim(); | ||
}; | ||
@@ -177,2 +187,2 @@ var quotemeta = function (string) { | ||
return string; | ||
}; | ||
}; |
{ | ||
"name": "node-normalizer", | ||
"version": "0.0.11", | ||
"version": "0.1.0", | ||
"description": "Normalize and clean text", | ||
@@ -12,5 +12,6 @@ "main": "index.js", | ||
"dependencies": { | ||
"string": "~1.8.0", | ||
"async": "~0.7.0", | ||
"debug": "~0.8.0", | ||
"async": "~0.7.0" | ||
"re2": "^1.3.0", | ||
"string": "~1.8.0" | ||
}, | ||
@@ -17,0 +18,0 @@ "devDependencies": { |
@@ -1,3 +0,3 @@ | ||
var mocha = require("mocha"); | ||
var mocha = require("mocha"); | ||
var should = require("should"); | ||
@@ -8,66 +8,66 @@ var norm = require("../index"); | ||
before(function(done){ | ||
norm.loadData(function(){ | ||
done(); | ||
}); | ||
}); | ||
before(function(done){ | ||
norm.loadData(function(){ | ||
done(); | ||
}); | ||
}); | ||
describe('Should clean input', function() { | ||
describe('Should clean input', function() { | ||
it("should replace subsitutes", function() { | ||
norm.clean("Nov 1st I weighed 90 kgs. total").should.eql("November 1st I weighed 90 kilograms total"); | ||
norm.clean("I shared it on FB w/ friends, ie: you").should.eql("I shared it on Facebook with friends, for example : you"); | ||
}); | ||
it("should replace subsitutes", function() { | ||
norm.clean("Nov 1st I weighed 90 kgs. total").should.eql("November 1st I weighed 90 kilograms total"); | ||
norm.clean("I shared it on FB w/ friends, ie: you").should.eql("I shared it on Facebook with friends, for example : you"); | ||
}); | ||
it("should expand contractions", function() { | ||
norm.clean("I'm on the yelow zebra").should.eql("I am on the yellow zebra"); | ||
norm.clean("I'll listen to y'all").should.eql("I will listen to you all"); | ||
norm.clean("do n't make it right").should.eql("do not make it right"); | ||
norm.clean("it's all good").should.eql("it is all good"); | ||
}); | ||
it("should expand contractions", function() { | ||
norm.clean("I'm on the yelow zebra").should.eql("I am on the yellow zebra"); | ||
norm.clean("I'll listen to y'all").should.eql("I will listen to you all"); | ||
norm.clean("do n't make it right").should.eql("do not make it right"); | ||
norm.clean("it's all good").should.eql("it is all good"); | ||
}); | ||
it("should swap british / canadian words", function() { | ||
norm.clean("armour axe coloured gold").should.eql("armor ax colored gold"); | ||
}); | ||
it("should swap british / canadian words", function() { | ||
norm.clean("armour axe coloured gold").should.eql("armor ax colored gold"); | ||
}); | ||
it("should fix spelling", function() { | ||
norm.clean("are we sceduled thrsday for teh restraunt").should.eql("are we scheduled Thursday for the restaurant"); | ||
}); | ||
it("should fix spelling", function() { | ||
norm.clean("are we sceduled thrsday for teh restraunt").should.eql("are we scheduled Thursday for the restaurant"); | ||
}); | ||
it("should expand txt speak", function() { | ||
norm.clean("n").should.eql("~no"); | ||
norm.clean("lol").should.eql("~emolaugh"); | ||
norm.clean("haha").should.eql("~emolaugh"); | ||
norm.clean(":)").should.eql("~emohappy"); | ||
}); | ||
it("should expand txt speak", function() { | ||
norm.clean("n").should.eql("~no"); | ||
norm.clean("lol").should.eql("~emolaugh"); | ||
norm.clean("haha").should.eql("~emolaugh"); | ||
norm.clean(":)").should.eql("~emohappy"); | ||
}); | ||
it("should clean this", function() { | ||
norm.clean("Well , I could not help it, could I").should.eql("I could not help it, could I") | ||
}); | ||
it("should clean this", function() { | ||
norm.clean("Well , I could not help it, could I").should.eql("I could not help it, could I") | ||
}); | ||
it("should not remove +", function() { | ||
norm.clean("3+4=7").should.eql("3+4=7"); | ||
}); | ||
it("should not remove +", function() { | ||
norm.clean("3+4=7").should.eql("3+4=7"); | ||
}); | ||
it("should remove extra spaces", function() { | ||
norm.clean("this is spaced out").should.eql("this is spaced out"); | ||
}); | ||
it("should remove extra spaces", function() { | ||
norm.clean("this is spaced out").should.eql("this is spaced out"); | ||
}); | ||
it("should remove punct", function() { | ||
norm.clean("why do i care?").should.eql("why do I care"); | ||
}); | ||
it("should remove punct", function() { | ||
norm.clean("why do i care?").should.eql("why do I care"); | ||
}); | ||
it("Fix numbers", function() { | ||
norm.clean("how much is 1,000.00").should.eql("how much is 1000.00"); | ||
}); | ||
it("Fix ASCII characters", function() { | ||
norm.clean("What’s up").should.eql("what is up"); | ||
norm.clean("What's up").should.eql("what is up"); | ||
norm.clean("I said “shut up”").should.eql('I said "shut up"'); | ||
norm.clean("œ").should.eql(''); | ||
}); | ||
it("Fix numbers", function() { | ||
norm.clean("how much is 1,000.00").should.eql("how much is 1000.00"); | ||
}); | ||
it("Fix ASCII characters", function() { | ||
norm.clean("What’s up").should.eql("what is up"); | ||
norm.clean("What's up").should.eql("what is up"); | ||
norm.clean("I said “shut up”").should.eql('I said "shut up"'); | ||
norm.clean("œ").should.eql(''); | ||
}); | ||
}); | ||
}); | ||
}); |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
210597
199
4
+ Added re2@^1.3.0
+ Added @isaacs/cliui@8.0.2 (transitive)
+ Added @npmcli/agent@2.2.2 (transitive)
+ Added @npmcli/fs@3.1.1 (transitive)
+ Added @pkgjs/parseargs@0.11.0 (transitive)
+ Added abbrev@2.0.0 (transitive)
+ Added agent-base@7.1.1 (transitive)
+ Added aggregate-error@3.1.0 (transitive)
+ Added ansi-regex@5.0.1, 6.1.0 (transitive)
+ Added ansi-styles@4.3.0, 6.2.1 (transitive)
+ Added balanced-match@1.0.2 (transitive)
+ Added brace-expansion@2.0.1 (transitive)
+ Added cacache@18.0.4 (transitive)
+ Added chownr@2.0.0 (transitive)
+ Added clean-stack@2.2.0 (transitive)
+ Added color-convert@2.0.1 (transitive)
+ Added color-name@1.1.4 (transitive)
+ Added cross-spawn@7.0.3 (transitive)
+ Added debug@4.3.7 (transitive)
+ Added eastasianwidth@0.2.0 (transitive)
+ Added emoji-regex@8.0.0, 9.2.2 (transitive)
+ Added encoding@0.1.13 (transitive)
+ Added env-paths@2.2.1 (transitive)
+ Added err-code@2.0.3 (transitive)
+ Added exponential-backoff@3.1.1 (transitive)
+ Added foreground-child@3.3.0 (transitive)
+ Added fs-minipass@2.1.0, 3.0.3 (transitive)
+ Added glob@10.4.5 (transitive)
+ Added graceful-fs@4.2.11 (transitive)
+ Added http-cache-semantics@4.1.1 (transitive)
+ Added http-proxy-agent@7.0.2 (transitive)
+ Added https-proxy-agent@7.0.5 (transitive)
+ Added iconv-lite@0.6.3 (transitive)
+ Added imurmurhash@0.1.4 (transitive)
+ Added indent-string@4.0.0 (transitive)
+ Added install-artifact-from-github@1.3.5 (transitive)
+ Added ip-address@9.0.5 (transitive)
+ Added is-fullwidth-code-point@3.0.0 (transitive)
+ Added is-lambda@1.0.1 (transitive)
+ Added isexe@2.0.0, 3.1.1 (transitive)
+ Added jackspeak@3.4.3 (transitive)
+ Added jsbn@1.1.0 (transitive)
+ Added lru-cache@10.4.3 (transitive)
+ Added make-fetch-happen@13.0.1 (transitive)
+ Added minimatch@9.0.5 (transitive)
+ Added minipass@3.3.6, 5.0.0, 7.1.2 (transitive)
+ Added minipass-collect@2.0.1 (transitive)
+ Added minipass-fetch@3.0.5 (transitive)
+ Added minipass-flush@1.0.5 (transitive)
+ Added minipass-pipeline@1.2.4 (transitive)
+ Added minipass-sized@1.0.3 (transitive)
+ Added minizlib@2.1.2 (transitive)
+ Added mkdirp@1.0.4 (transitive)
+ Added ms@2.1.3 (transitive)
+ Added nan@2.22.0 (transitive)
+ Added negotiator@0.6.3 (transitive)
+ Added node-gyp@10.2.0 (transitive)
+ Added nopt@7.2.1 (transitive)
+ Added p-map@4.0.0 (transitive)
+ Added package-json-from-dist@1.0.1 (transitive)
+ Added path-key@3.1.1 (transitive)
+ Added path-scurry@1.11.1 (transitive)
+ Added proc-log@4.2.0 (transitive)
+ Added promise-retry@2.0.1 (transitive)
+ Added re2@1.21.4 (transitive)
+ Added retry@0.12.0 (transitive)
+ Added safer-buffer@2.1.2 (transitive)
+ Added semver@7.6.3 (transitive)
+ Added shebang-command@2.0.0 (transitive)
+ Added shebang-regex@3.0.0 (transitive)
+ Added signal-exit@4.1.0 (transitive)
+ Added smart-buffer@4.2.0 (transitive)
+ Added socks@2.8.3 (transitive)
+ Added socks-proxy-agent@8.0.4 (transitive)
+ Added sprintf-js@1.1.3 (transitive)
+ Added ssri@10.0.6 (transitive)
+ Added string-width@4.2.3, 5.1.2 (transitive)
+ Added strip-ansi@6.0.1, 7.1.0 (transitive)
+ Added tar@6.2.1 (transitive)
+ Added unique-filename@3.0.0 (transitive)
+ Added unique-slug@4.0.0 (transitive)
+ Added which@2.0.2, 4.0.0 (transitive)
+ Added wrap-ansi@7.0.0, 8.1.0 (transitive)
+ Added yallist@4.0.0 (transitive)