Socket
Socket
Sign in | Demo | Install

node-normalizer

Package Overview
Dependencies
Maintainers
2
Versions
21
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

node-normalizer - npm Package Compare versions

Comparing version 0.2.0 to 1.0.0-alpha1

.babelrc

256

lib/normalizer.js

@@ -1,161 +0,155 @@

var fs = require('fs');
var path = require("path");
var readline = require('readline');
var stream = require('stream');
var str = require("string");
var async = require("async");
var RE2 = require("re2");
var debug = require('debug')("Normalizer");
'use strict';
var re1 = new RE2(/\+/g);
var re2 = new RE2(/\t/g);
var re3 = new RE2(/\s+/g);
var re4 = new RE2(/(’|‘)/g);
var re5 = new RE2(/(“|”)/g);
var re6 = new RE2(/(–|—)/g);
var re7 = new RE2(/[^\x00-\x7F]/g);
var re8 = new RE2(/[\+]{1}/g);
var re9 = new RE2(/<plus>/g);
var re10 = new RE2(/\d,\d/g);
var re11 = new RE2(/_/g);
Object.defineProperty(exports, "__esModule", {
value: true
});
// TODO, fix the paths
var tasks = [
{'key':'_sys','file':'systemessentials.txt'},
{'key':'_extra','file':'substitutes.txt'},
{'key':'_contractions','file':'contractions.txt'},
{'key':'_interjections','file':'interjections.txt'},
{'key':'_britsh','file':'british.txt'},
{'key':'_spellfix','file':'spellfix.txt'},
{'key':'_texting','file':'texting.txt'}
];
var _fs = require('fs');
var reSet = {};
var _fs2 = _interopRequireDefault(_fs);
var readSubstitutes = function(file, lineHandle, closeHandle) {
var _path = require('path');
var p = path.join(__dirname, "../data/", file);
var data = fs.readFileSync(p,'utf8').split("\r\n");
var _path2 = _interopRequireDefault(_path);
for (var i = 0; i < data.length; i++) {
var line = data[i];
var nline = str(line).trimLeft();
var _string = require('string');
// Lets allow comments with '#'
var pos = nline.indexOf('#');
var _string2 = _interopRequireDefault(_string);
if (pos === -1) {
var parts = nline.s.split(" ");
var _re = require('re2');
if (parts[1] === undefined) {
lineHandle(parts[0], "");
} else {
var _re2 = _interopRequireDefault(_re);
lineHandle(parts[0], parts[1]);
}
var _debug = require('debug');
} else if (pos > 0) {
nline = nline.left(pos);
var parts = nline.s.split(" ");
lineHandle(parts[0], parts[1]);
}
var _debug2 = _interopRequireDefault(_debug);
function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { default: obj }; }
var debug = (0, _debug2.default)('Normalizer');
var re1 = new _re2.default(/\+/g);
var re2 = new _re2.default(/\t/g);
var re3 = new _re2.default(/\s+/g);
var re4 = new _re2.default(/(’|‘)/g);
var re5 = new _re2.default(/(“|”)/g);
var re6 = new _re2.default(/(–|—)/g);
var re7 = new _re2.default(/[^\x00-\x7F]/g);
var re8 = new _re2.default(/[\+]{1}/g);
var re9 = new _re2.default(/<plus>/g);
var re10 = new _re2.default(/\d,\d/g);
var re11 = new _re2.default(/_/g);
// TODO, fix the paths
var tasks = [{ key: '_sys', file: 'systemessentials.txt' }, { key: '_extra', file: 'substitutes.txt' }, { key: '_contractions', file: 'contractions.txt' }, { key: '_interjections', file: 'interjections.txt' }, { key: '_british', file: 'british.txt' }, { key: '_spellfix', file: 'spellfix.txt' }, { key: '_texting', file: 'texting.txt' }];
var reSet = {};
var quotemeta = function quotemeta(string) {
var unsafe = '\\.+*?[^]$(){}=!<>|:';
for (var i = 0; i < unsafe.length; i++) {
string = string.replace(new RegExp('\\' + unsafe.charAt(i), 'g'), '\\' + unsafe.charAt(i));
}
closeHandle();
return string;
};
exports.loadData = function(cb){
var lineHandle = function lineHandle(task, key, replacer) {
if (reSet[task.key] === undefined) {
reSet[task.key] = {};
}
var fc = 0;
var itor = function(item, cb2) {
debug("Loaded File", item);
if (reSet[task.key][key] === undefined) {
reSet[task.key][key] = [];
}
var lineHandle = function(key, replacer) {
// Add RegEx
var startM = false;
var endM = false;
var lookup = key;
if (reSet[item.key] === undefined) {
reSet[item.key] = {};
}
if (key[0] === '<') {
startM = true;
lookup = key.substring(1);
}
if (reSet[item.key][key] === undefined) {
reSet[item.key][key] = [];
if (key.slice(-1) === '>') {
endM = true;
lookup = lookup.substring(0, lookup.length - 1);
}
// Add RegEx
var startM, endM, lookup = key;
if (key[0] == '<') {
startM = true;
lookup = key.substring(1);
}
var qm = quotemeta(re11.replace(lookup, ' '));
if (key.slice(-1) == '>') {
endM = true;
lookup = lookup.substring(0, lookup.length - 1);
}
if (startM && endM) {
reSet[task.key][key].push({ re: new RegExp('^' + qm + '$', 'gi'), r: replacer });
} else if (startM) {
reSet[task.key][key].push({ re: new RegExp('^' + qm + '(\\W+|$)', 'gi'), r: replacer + '$1' });
} else if (endM) {
reSet[task.key][key].push({ re: new RegExp('(\\W+|^)' + qm + '$', 'gi'), r: '$1' + replacer });
if (task.key === '_sys') {
reSet[task.key][key].push({ re: new RegExp(qm + '$', 'gi'), r: replacer });
} else {
// reSet[task.key][key].push({ re: new RegExp(`(\\W+)${qm}(\\W+)`, 'gi'), r: `$1${replacer}$2` });
}
} else {
reSet[task.key][key].push({ re: new RegExp('(\\W+|^)' + qm + '(\\W+|$)', 'gi'), r: '$1' + replacer + '$2' });
}
};
var qm = quotemeta(re11.replace(lookup, " "));
var doTask = function doTask(task) {
debug('Loading file: ', task);
if (startM && endM) {
reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
} else if (startM) {
reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+|$)", "gi"), r: replacer + "$1"});
} else if (endM) {
reSet[item.key][key].push({re: new RegExp("(\\W+|^)" + qm + "$", "gi"), r: "$1" + replacer });
if (item.key == "_sys") {
reSet[item.key][key].push({re: new RegExp(qm + "$", "gi"), r: replacer });
} else {
// reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "(\\W+)", "gi"), r: "$1" + replacer + "$2" });
}
} else {
reSet[item.key][key].push({re: new RegExp("(\\W+|^)" + qm + "(\\W+|$)", "gi"), r: "$1" + replacer + "$2" });
}
}
};
var dir = _path2.default.join(__dirname, '../data/');
var data = _fs2.default.readFileSync(dir + task.file, 'utf8').split('\r\n');
readSubstitutes(item.file, lineHandle, function() {
fc++;
if (tasks.length == fc) {
debug("Done Reading Subs");
cb2('done');
for (var i = 0; i < data.length; i++) {
var line = data[i];
var nline = (0, _string2.default)(line).trimLeft();
// Let's allow comments with '#'
var pos = nline.indexOf('#');
if (pos === -1) {
var parts = nline.s.split(' ');
if (parts[1] === undefined) {
lineHandle(task, parts[0], '');
} else {
lineHandle(task, parts[0], parts[1]);
}
});
};
async.map(tasks, itor, function(){
debug("Done Loading files");
cb();
});
} else if (pos > 0) {
nline = nline.left(pos);
var _parts = nline.s.split(' ');
lineHandle(task, _parts[0], _parts[1]);
}
}
};
exports.clean = function(msg){
tasks.forEach(doTask);
msg = re1.replace(msg, "<plus>");
msg = re2.replace(msg, " ");
msg = re3.replace(msg, " ");
debug('Done loading files');
var clean = function clean(msg) {
msg = re1.replace(msg, '<plus>');
msg = re2.replace(msg, ' ');
msg = re3.replace(msg, ' ');
msg = re4.replace(msg, "'");
msg = re5.replace(msg, '"');
msg = re6.replace(msg, "—");
msg = re7.replace(msg, "");
msg = re6.replace(msg, '—');
msg = re7.replace(msg, '');
var fileItor = function(item1, next1) {
var itemItor = function(item2, next2) {
var reArray = reSet[item1][item2];
var reItor = function(item3, next3) {
// msg = item3.re.replace(msg, item3.r);
msg = msg.replace(item3.re, item3.r);
next3(null);
};
var replacer = function replacer(regex) {
msg = msg.replace(regex.re, regex.r);
};
async.map(reArray, reItor, function(){
next2(null);
});
};
async.each(Object.keys(reSet[item1]), itemItor, function(){
next1(null);
Object.keys(reSet).forEach(function (taskKey) {
Object.keys(reSet[taskKey]).forEach(function (key) {
var reArray = reSet[taskKey][key];
reArray.forEach(replacer);
});
};
});
async.mapSeries(Object.keys(reSet), fileItor, function() {
msg = re8.replace(msg, " ");
msg = re9.replace(msg, "+");
msg = re10.replace(msg, function(v) { return v.replace(",",""); });
msg = re8.replace(msg, ' ');
msg = re9.replace(msg, '+');
msg = re10.replace(msg, function (v) {
return v.replace(',', '');
});

@@ -166,8 +160,2 @@

/**
 * Backslash-escape regular-expression metacharacters in a string so the
 * result can be embedded in a RegExp source and match the text literally.
 *
 * The escaped set is: \ . + * ? [ ^ ] $ ( ) { } = ! < > | :
 *
 * @param {string} string - text to escape.
 * @returns {string} the text with every character of the set prefixed by a backslash.
 */
var quotemeta = function (string) {
  // One pass with a character class: each input character is examined once,
  // so a backslash added in front of a match is never escaped again.
  return string.replace(/[\\.+*?[\]^$(){}=!<>|:]/g, '\\$&');
};
exports.default = { clean: clean };
{
"name": "node-normalizer",
"version": "0.2.0",
"version": "1.0.0-alpha1",
"description": "Normalize and clean text",
"main": "index.js",
"main": "lib/index.js",
"repository": {
"type": "git",
"url": "https://github.com/superscriptjs/normalizer"
},
"scripts": {
"test": "mocha test/basic.js -R spec -t 4000"
"build": "babel src --presets babel-preset-es2015 --out-dir lib",
"prepublish": "npm run build",
"test": "mocha --compilers js:babel-register test -R spec -t 4000",
"test-travis": "./node_modules/istanbul/lib/cli.js cover ./node_modules/mocha/bin/_mocha -- --compilers js:babel-register -R spec test -s 1700 -t 4000"
},

@@ -12,11 +19,20 @@ "author": "Rob Ellis",

"dependencies": {
"async": "~0.7.0",
"debug": "~0.8.0",
"re2": "^1.3.0",
"string": "~1.8.0"
"debug": "^2.2.0",
"re2": "^1.3.3",
"string": "^3.3.3"
},
"devDependencies": {
"mocha": "~1.18.2",
"should": "~3.3.1"
"babel-cli": "^6.16.0",
"babel-preset-es2015": "^6.16.0",
"babel-register": "^6.18.0",
"coveralls": "^2.11.14",
"eslint": "^3.8.1",
"eslint-config-airbnb": "^12.0.0",
"eslint-plugin-import": "^1.16.0",
"eslint-plugin-jsx-a11y": "^2.2.3",
"eslint-plugin-react": "^6.4.1",
"istanbul": "^0.4.5",
"mocha": "^3.1.2",
"should": "^11.1.1"
}
}

@@ -7,4 +7,12 @@ # Normalize, clean and fix text

The order in which the processing happes is important.
A single method `clean(message)` is exposed.
```
import normalizer from 'node-normalizer';
// const normalizer = require('node-normalizer').default;
const cleanedMessage = normalizer.clean('my message');
```
The order in which the processing happens is important.
* <xxx means sentence start then xxx

@@ -11,0 +19,0 @@ * 1. spelling corrections for common spelling errors

@@ -1,98 +0,89 @@

var mocha = require("mocha");
var should = require("should");
var norm = require("../index");
import mocha from 'mocha';
import should from 'should';
import norm from '../src/normalizer';
describe('Normalizer', function(){
var startTime;
before(function(done){
norm.loadData(function(){
startTime = new Date();
done();
});
describe('Normalizer', () => {
let startTime;
before((done) => {
startTime = new Date();
done();
});
after(function(done) {
console.log('Test duration: ' + (new Date() - startTime) + 'ms');
done();
})
describe('Should clean input', function() {
it("should replace subsitutes", function() {
norm.clean("Nov 1st I weighed 90 kgs. total").should.eql("November 1st I weighed 90 kilograms total");
norm.clean("I shared it on FB w/ friends, ie: you").should.eql("I shared it on Facebook with friends, for example : you");
describe('Should clean input', () => {
it('should replace subsitutes', () => {
norm.clean('Nov 1st I weighed 90 kgs. total').should.eql('November 1st I weighed 90 kilograms total');
norm.clean('I shared it on FB w/ friends, ie: you').should.eql('I shared it on Facebook with friends, for example : you');
});
it("should expand contractions", function() {
norm.clean("I'm on the yelow zebra").should.eql("I am on the yellow zebra");
norm.clean("I'll listen to y'all").should.eql("I will listen to you all");
norm.clean("do n't make it right").should.eql("do not make it right");
norm.clean("it's all good").should.eql("it is all good");
it('should expand contractions', () => {
norm.clean("I'm on the yelow zebra").should.eql('I am on the yellow zebra');
norm.clean("I'll listen to y'all").should.eql('I will listen to you all');
norm.clean("do n't make it right").should.eql('do not make it right');
norm.clean("it's all good").should.eql('it is all good');
});
it("should swap british / canadian words", function() {
norm.clean("armour axe coloured gold").should.eql("armor ax colored gold");
it('should swap british / canadian words', () => {
norm.clean('armour axe coloured gold').should.eql('armor ax colored gold');
});
it("should fix spelling", function() {
norm.clean("are we sceduled thrsday for teh restraunt").should.eql("are we scheduled Thursday for the restaurant");
it('should fix spelling', () => {
norm.clean('are we sceduled thrsday for teh restraunt').should.eql('are we scheduled Thursday for the restaurant');
});
it("should expand txt speak", function() {
norm.clean("n").should.eql("~no");
norm.clean("lol").should.eql("~emolaugh");
norm.clean("haha").should.eql("~emolaugh");
norm.clean(":)").should.eql("~emohappy");
it('should expand txt speak', () => {
norm.clean('n').should.eql('~no');
norm.clean('lol').should.eql('~emolaugh');
norm.clean('haha').should.eql('~emolaugh');
norm.clean(':)').should.eql('~emohappy');
});
it("should clean this", function() {
norm.clean("Well , I could not help it, could I").should.eql("I could not help it, could I")
it('should clean this', () => {
norm.clean('Well , I could not help it, could I').should.eql('I could not help it, could I');
});
it("should not remove +", function() {
norm.clean("3+4=7").should.eql("3+4=7");
it('should not remove +', () => {
norm.clean('3+4=7').should.eql('3+4=7');
});
it("should remove extra spaces", function() {
norm.clean("this is spaced out").should.eql("this is spaced out");
it('should remove extra spaces', () => {
norm.clean('this is spaced out').should.eql('this is spaced out');
});
it("should remove punct", function() {
norm.clean("why do i care?").should.eql("why do I care");
it('should remove punct', () => {
norm.clean('why do i care?').should.eql('why do I care');
});
it("Fix numbers", function() {
norm.clean("how much is 1,000.00").should.eql("how much is 1000.00");
it('Fix numbers', () => {
norm.clean('how much is 1,000.00').should.eql('how much is 1000.00');
});
it("Spell Fix 2 word combo", function() {
norm.clean("hwo do you").should.eql("how do you");
norm.clean("hwo is you").should.eql("who is you");
it('Spell Fix 2 word combo', () => {
norm.clean('hwo do you').should.eql('how do you');
norm.clean('hwo is you').should.eql('who is you');
});
it("Fix ASCII characters", function() {
norm.clean("What’s up").should.eql("what is up");
norm.clean("What's up").should.eql("what is up");
norm.clean("I said “shut up”").should.eql('I said "shut up"');
norm.clean("œ").should.eql('');
it('Fix ASCII characters', () => {
norm.clean('What’s up').should.eql('what is up');
norm.clean("What's up").should.eql('what is up');
norm.clean('I said “shut up”').should.eql('I said "shut up"');
norm.clean('œ').should.eql('');
});
});
describe('Matching', function() {
describe('Matching', () => {
// <it_is>
describe('<xxx>', function() {
it('should match start and end', function() {
norm.clean('it is').should.eql("~yes");
describe('<xxx>', () => {
it('should match start and end', () => {
norm.clean('it is').should.eql('~yes');
});
it('should not match start', function() {
norm.clean('it is abc').should.eql("it is abc");
it('should not match start', () => {
norm.clean('it is abc').should.eql('it is abc');
});
it('should not match end', function() {
it('should not match end', () => {
norm.clean('abc it is').should.eql('abc it is');
});
it('should not match middle', function() {
it('should not match middle', () => {
norm.clean('abc it is abc').should.eql('abc it is abc');

@@ -103,17 +94,17 @@ });

// <ew
describe('<xxx', function() {
it('should match start and end', function() {
norm.clean('ew').should.eql("~emodisgust");
describe('<xxx', () => {
it('should match start and end', () => {
norm.clean('ew').should.eql('~emodisgust');
});
it('should match start', function() {
norm.clean('ew abc').should.eql("~emodisgust abc");
it('should match start', () => {
norm.clean('ew abc').should.eql('~emodisgust abc');
});
it('should not match end', function() {
norm.clean('abc ew').should.eql("abc ew");
it('should not match end', () => {
norm.clean('abc ew').should.eql('abc ew');
});
it('should not match middle', function() {
norm.clean('abc ew abc').should.eql("abc ew abc");
it('should not match middle', () => {
norm.clean('abc ew abc').should.eql('abc ew abc');
});

@@ -123,17 +114,17 @@ });

// have_to_go>
describe('xxx>', function() {
it('should match start and end', function() {
norm.clean('have to go').should.eql("~emogoodbye");
describe('xxx>', () => {
it('should match start and end', () => {
norm.clean('have to go').should.eql('~emogoodbye');
});
it('should not match start', function() {
norm.clean('have to go abc').should.eql("have to go abc");
it('should not match start', () => {
norm.clean('have to go abc').should.eql('have to go abc');
});
it('should match end', function() {
norm.clean('abc have to go').should.eql("abc ~emogoodbye")
it('should match end', () => {
norm.clean('abc have to go').should.eql('abc ~emogoodbye');
});
it('should not match middle', function() {
norm.clean('abc have to go abc').should.eql("abc have to go abc")
it('should not match middle', () => {
norm.clean('abc have to go abc').should.eql('abc have to go abc');
});

@@ -143,22 +134,25 @@ });

// okay
describe('xxx', function() {
it('should match start and end', function() {
norm.clean('okay').should.eql("~yes");
describe('xxx', () => {
it('should match start and end', () => {
norm.clean('okay').should.eql('~yes');
});
it('should match start', function() {
norm.clean('okay abc').should.eql("~yes abc");
it('should match start', () => {
norm.clean('okay abc').should.eql('~yes abc');
});
it('should match end', function() {
norm.clean('abc okay').should.eql("abc ~yes");
it('should match end', () => {
norm.clean('abc okay').should.eql('abc ~yes');
});
it('should match middle', function() {
norm.clean('abc okay abc').should.eql("abc ~yes abc");
it('should match middle', () => {
norm.clean('abc okay abc').should.eql('abc ~yes abc');
});
})
});
});
after((done) => {
console.log(`Test duration: ${new Date() - startTime}ms`);
done();
});
});

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc