@@ -1,161 +0,155 @@
		var fs = require('fs');
		var path = require("path");
		var readline = require('readline');
		var stream = require('stream');
		var str = require("string");
		var async = require("async");
		var RE2 = require("re2");
		var debug = require('debug')("Normalizer");
		'use strict';

		var re1 = new RE2(/\+/g);
		var re2 = new RE2(/\t/g);
		var re3 = new RE2(/\s+/g);
		var re4 = new RE2(/(’\|‘)/g);
		var re5 = new RE2(/(“\|”)/g);
		var re6 = new RE2(/(–\|—)/g);
		var re7 = new RE2(/[^\x00-\x7F]/g);
		var re8 = new RE2(/[\+]{1}/g);
		var re9 = new RE2(/<plus>/g);
		var re10 = new RE2(/\d,\d/g);
		var re11 = new RE2(/_/g);
		Object.defineProperty(exports, "__esModule", {
		value: true
		});

		// TODO, fix the paths
		var tasks = [
		{'key':'_sys','file':'systemessentials.txt'},
		{'key':'_extra','file':'substitutes.txt'},
		{'key':'_contractions','file':'contractions.txt'},
		{'key':'_interjections','file':'interjections.txt'},
		{'key':'_britsh','file':'british.txt'},
		{'key':'_spellfix','file':'spellfix.txt'},
		{'key':'_texting','file':'texting.txt'}
		];
		var _fs = require('fs');

		var reSet = {};
		var _fs2 = _interopRequireDefault(_fs);

		var readSubstitutes = function(file, lineHandle, closeHandle) {
		var _path = require('path');

		var p = path.join(__dirname, "../data/", file);
		var data = fs.readFileSync(p,'utf8').split("\r\n");
		var _path2 = _interopRequireDefault(_path);

		for (var i = 0; i < data.length; i++) {
		var line = data[i];
		var nline = str(line).trimLeft();
		var _string = require('string');

		// Lets allow comments with '#'
		var pos = nline.indexOf('#');
		var _string2 = _interopRequireDefault(_string);

		if (pos === -1) {
		var parts = nline.s.split(" ");
		var _re = require('re2');

		if (parts[1] === undefined) {
		lineHandle(parts[0], "");
		} else {
		var _re2 = _interopRequireDefault(_re);

		lineHandle(parts[0], parts[1]);
		}
		var _debug = require('debug');

		} else if (pos > 0) {
		nline = nline.left(pos);
		var parts = nline.s.split(" ");
		lineHandle(parts[0], parts[1]);
		}
		var _debug2 = _interopRequireDefault(_debug);

		/**
		 * Normalizes a required module so callers can always read `.default`.
		 *
		 * @param {*} obj - The value returned by `require`.
		 * @returns {*} `obj` itself when it is truthy and has a truthy
		 *   `__esModule` property; otherwise a new object whose `default`
		 *   property is `obj`.
		 */
		function _interopRequireDefault(obj) {
			if (obj && obj.__esModule) {
				return obj;
			}
			return { default: obj };
		}

		var debug = (0, _debug2.default)('Normalizer');

		var re1 = new _re2.default(/\+/g);
		var re2 = new _re2.default(/\t/g);
		var re3 = new _re2.default(/\s+/g);
		var re4 = new _re2.default(/(’\|‘)/g);
		var re5 = new _re2.default(/(“\|”)/g);
		var re6 = new _re2.default(/(–\|—)/g);
		var re7 = new _re2.default(/[^\x00-\x7F]/g);
		var re8 = new _re2.default(/[\+]{1}/g);
		var re9 = new _re2.default(/<plus>/g);
		var re10 = new _re2.default(/\d,\d/g);
		var re11 = new _re2.default(/_/g);

		// TODO, fix the paths
		var tasks = [{ key: '_sys', file: 'systemessentials.txt' }, { key: '_extra', file: 'substitutes.txt' }, { key: '_contractions', file: 'contractions.txt' }, { key: '_interjections', file: 'interjections.txt' }, { key: '_british', file: 'british.txt' }, { key: '_spellfix', file: 'spellfix.txt' }, { key: '_texting', file: 'texting.txt' }];

		var reSet = {};

		/**
		 * Backslash-escapes every occurrence in `string` of each character
		 * listed in `unsafe`, so the result can be embedded in a RegExp source.
		 *
		 * @param {string} string - Text to escape.
		 * @returns {string} The text with each listed character prefixed by `\`.
		 */
		var quotemeta = function quotemeta(string) {
		// Backslash is listed first, so the backslashes inserted for the
		// later characters are never escaped a second time.
		var unsafe = '\\.+*?[^]$(){}=!<>\|:';
		for (var i = 0; i < unsafe.length; i++) {
		// One global replace per listed character: match it literally and
		// put a backslash in front of it.
		string = string.replace(new RegExp('\\' + unsafe.charAt(i), 'g'), '\\' + unsafe.charAt(i));
		}

		// NOTE(review): closeHandle is not declared inside this function, so it
		// must resolve from an enclosing scope — confirm this call belongs here.
		closeHandle();
		return string;
		};

		exports.loadData = function(cb){
		var lineHandle = function lineHandle(task, key, replacer) {
		if (reSet[task.key] === undefined) {
		reSet[task.key] = {};
		}

		var fc = 0;
		var itor = function(item, cb2) {
		debug("Loaded File", item);
		if (reSet[task.key][key] === undefined) {
		reSet[task.key][key] = [];
		}

		var lineHandle = function(key, replacer) {
		// Add RegEx
		var startM = false;
		var endM = false;
		var lookup = key;

		if (reSet[item.key] === undefined) {
		reSet[item.key] = {};
		}
		if (key[0] === '<') {
		startM = true;
		lookup = key.substring(1);
		}

		if (reSet[item.key][key] === undefined) {
		reSet[item.key][key] = [];
		if (key.slice(-1) === '>') {
		endM = true;
		lookup = lookup.substring(0, lookup.length - 1);
		}

		// Add RegEx
		var startM, endM, lookup = key;
		if (key[0] == '<') {
		startM = true;
		lookup = key.substring(1);
		}
		var qm = quotemeta(re11.replace(lookup, ' '));

		if (key.slice(-1) == '>') {
		endM = true;
		lookup = lookup.substring(0, lookup.length - 1);
		}
		if (startM && endM) {
		reSet[task.key][key].push({ re: new RegExp('^' + qm + '$', 'gi'), r: replacer });
		} else if (startM) {
		reSet[task.key][key].push({ re: new RegExp('^' + qm + '(\\W+\|$)', 'gi'), r: replacer + '$1' });
		} else if (endM) {
		reSet[task.key][key].push({ re: new RegExp('(\\W+\|^)' + qm + '$', 'gi'), r: '$1' + replacer });
		if (task.key === '_sys') {
		reSet[task.key][key].push({ re: new RegExp(qm + '$', 'gi'), r: replacer });
		} else {
		// reSet[task.key][key].push({ re: new RegExp(`(\\W+)${qm}(\\W+)`, 'gi'), r: `$1${replacer}$2` });
		}
		} else {
		reSet[task.key][key].push({ re: new RegExp('(\\W+\|^)' + qm + '(\\W+\|$)', 'gi'), r: '$1' + replacer + '$2' });
		}
		};

		var qm = quotemeta(re11.replace(lookup, " "));
		var doTask = function doTask(task) {
		debug('Loading file: ', task);

		if (startM && endM) {
		reSet[item.key][key].push({re: new RegExp("^" + qm + "$", "gi"), r: replacer });
		} else if (startM) {
		reSet[item.key][key].push({re: new RegExp("^" + qm + "(\\W+\|$)", "gi"), r: replacer + "$1"});
		} else if (endM) {
		reSet[item.key][key].push({re: new RegExp("(\\W+\|^)" + qm + "$", "gi"), r: "$1" + replacer });
		if (item.key == "_sys") {
		reSet[item.key][key].push({re: new RegExp(qm + "$", "gi"), r: replacer });
		} else {
		// reSet[item.key][key].push({re: new RegExp("(\\W+)" + qm + "(\\W+)", "gi"), r: "$1" + replacer + "$2" });
		}
		} else {
		reSet[item.key][key].push({re: new RegExp("(\\W+\|^)" + qm + "(\\W+\|$)", "gi"), r: "$1" + replacer + "$2" });
		}
		}
		};
		var dir = _path2.default.join(__dirname, '../data/');
		var data = _fs2.default.readFileSync(dir + task.file, 'utf8').split('\r\n');

		readSubstitutes(item.file, lineHandle, function() {
		fc++;
		if (tasks.length == fc) {
		debug("Done Reading Subs");
		cb2('done');
		for (var i = 0; i < data.length; i++) {
		var line = data[i];
		var nline = (0, _string2.default)(line).trimLeft();

		// Let's allow comments with '#'
		var pos = nline.indexOf('#');

		if (pos === -1) {
		var parts = nline.s.split(' ');

		if (parts[1] === undefined) {
		lineHandle(task, parts[0], '');
		} else {
		lineHandle(task, parts[0], parts[1]);
		}
		});
		};

		async.map(tasks, itor, function(){
		debug("Done Loading files");
		cb();
		});
		} else if (pos > 0) {
		nline = nline.left(pos);
		var _parts = nline.s.split(' ');
		lineHandle(task, _parts[0], _parts[1]);
		}
		}
		};

		exports.clean = function(msg){
		tasks.forEach(doTask);

		msg = re1.replace(msg, "<plus>");
		msg = re2.replace(msg, " ");
		msg = re3.replace(msg, " ");
		debug('Done loading files');

		var clean = function clean(msg) {
		msg = re1.replace(msg, '<plus>');
		msg = re2.replace(msg, ' ');
		msg = re3.replace(msg, ' ');
		msg = re4.replace(msg, "'");
		msg = re5.replace(msg, '"');
		msg = re6.replace(msg, "—");
		msg = re7.replace(msg, "");
		msg = re6.replace(msg, '—');
		msg = re7.replace(msg, '');

		var fileItor = function(item1, next1) {
		var itemItor = function(item2, next2) {
		var reArray = reSet[item1][item2];
		var reItor = function(item3, next3) {
		// msg = item3.re.replace(msg, item3.r);
		msg = msg.replace(item3.re, item3.r);
		next3(null);
		};
		var replacer = function replacer(regex) {
		msg = msg.replace(regex.re, regex.r);
		};

		async.map(reArray, reItor, function(){
		next2(null);
		});
		};

		async.each(Object.keys(reSet[item1]), itemItor, function(){
		next1(null);
		Object.keys(reSet).forEach(function (taskKey) {
		Object.keys(reSet[taskKey]).forEach(function (key) {
		var reArray = reSet[taskKey][key];
		reArray.forEach(replacer);
		});
		};
		});

		async.mapSeries(Object.keys(reSet), fileItor, function() {
		msg = re8.replace(msg, " ");
		msg = re9.replace(msg, "+");
		msg = re10.replace(msg, function(v) { return v.replace(",",""); });
		msg = re8.replace(msg, ' ');
		msg = re9.replace(msg, '+');
		msg = re10.replace(msg, function (v) {
		return v.replace(',', '');
		});
		@@ -166,8 +160,2 @@

		/**
		 * Escapes regular-expression metacharacters in a piece of text.
		 *
		 * Each character listed in `unsafe` is prefixed with a backslash wherever
		 * it occurs. Backslash is handled first, so the backslashes added for the
		 * remaining characters are not escaped again.
		 *
		 * @param {string} string - Text to escape.
		 * @returns {string} The escaped text, usable inside a RegExp source.
		 */
		var quotemeta = function (string) {
			var unsafe = "\\.+*?[^]$(){}=!<>\|:";
			return unsafe.split("").reduce(function (escaped, ch) {
				return escaped.replace(new RegExp("\\" + ch, "g"), "\\" + ch);
			}, string);
		};
		exports.default = { clean: clean };

package.json

		{
		"name": "node-normalizer",
		"version": "0.2.0",
		"version": "1.0.0-alpha1",
		"description": "Normalize and clean text",
		"main": "index.js",
		"main": "lib/index.js",
		"repository": {
		"type": "git",
		"url": "https://github.com/superscriptjs/normalizer"
		},
		"scripts": {
		"test": "mocha test/basic.js -R spec -t 4000"
		"build": "babel src --presets babel-preset-es2015 --out-dir lib",
		"prepublish": "npm run build",
		"test": "mocha --compilers js:babel-register test -R spec -t 4000",
		"test-travis": "./node_modules/istanbul/lib/cli.js cover ./node_modules/mocha/bin/_mocha -- --compilers js:babel-register -R spec test -s 1700 -t 4000"
		},
		@@ -12,11 +19,20 @@ "author": "Rob Ellis",
		"dependencies": {
		"async": "~0.7.0",
		"debug": "~0.8.0",
		"re2": "^1.3.0",
		"string": "~1.8.0"
		"debug": "^2.2.0",
		"re2": "^1.3.3",
		"string": "^3.3.3"
		},
		"devDependencies": {
		"mocha": "~1.18.2",
		"should": "~3.3.1"
		"babel-cli": "^6.16.0",
		"babel-preset-es2015": "^6.16.0",
		"babel-register": "^6.18.0",
		"coveralls": "^2.11.14",
		"eslint": "^3.8.1",
		"eslint-config-airbnb": "^12.0.0",
		"eslint-plugin-import": "^1.16.0",
		"eslint-plugin-jsx-a11y": "^2.2.3",
		"eslint-plugin-react": "^6.4.1",
		"istanbul": "^0.4.5",
		"mocha": "^3.1.2",
		"should": "^11.1.1"
		}
		}

readme.md

		@@ -7,4 +7,12 @@ # Normalize, clean and fix text

		The order in which the processing happes is important.
		A single method `clean(message)` is exposed.

		```
		import normalizer from 'node-normalizer';
		// const normalizer = require('node-normalizer').default;
		const cleanedMessage = normalizer.clean('my message');
		```

		The order in which the processing happens is important.

		* <xxx means sentence start then xxx
		@@ -11,0 +19,0 @@ * 1. spelling corrections for common spelling errors

174

test/basic.js

		@@ -1,98 +0,89 @@
		var mocha = require("mocha");
		var should = require("should");
		var norm = require("../index");
		import mocha from 'mocha';
		import should from 'should';
		import norm from '../src/normalizer';

		describe('Normalizer', function(){
		var startTime;

		before(function(done){
		norm.loadData(function(){
		startTime = new Date();
		done();
		});
		describe('Normalizer', () => {
		let startTime;
		before((done) => {
		startTime = new Date();
		done();
		});

		after(function(done) {
		console.log('Test duration: ' + (new Date() - startTime) + 'ms');
		done();
		})

		describe('Should clean input', function() {

		it("should replace subsitutes", function() {
		norm.clean("Nov 1st I weighed 90 kgs. total").should.eql("November 1st I weighed 90 kilograms total");
		norm.clean("I shared it on FB w/ friends, ie: you").should.eql("I shared it on Facebook with friends, for example : you");
		describe('Should clean input', () => {
		it('should replace subsitutes', () => {
		norm.clean('Nov 1st I weighed 90 kgs. total').should.eql('November 1st I weighed 90 kilograms total');
		norm.clean('I shared it on FB w/ friends, ie: you').should.eql('I shared it on Facebook with friends, for example : you');
		});

		it("should expand contractions", function() {
		norm.clean("I'm on the yelow zebra").should.eql("I am on the yellow zebra");
		norm.clean("I'll listen to y'all").should.eql("I will listen to you all");
		norm.clean("do n't make it right").should.eql("do not make it right");
		norm.clean("it's all good").should.eql("it is all good");
		it('should expand contractions', () => {
		norm.clean("I'm on the yelow zebra").should.eql('I am on the yellow zebra');
		norm.clean("I'll listen to y'all").should.eql('I will listen to you all');
		norm.clean("do n't make it right").should.eql('do not make it right');
		norm.clean("it's all good").should.eql('it is all good');
		});

		it("should swap british / canadian words", function() {
		norm.clean("armour axe coloured gold").should.eql("armor ax colored gold");
		it('should swap british / canadian words', () => {
		norm.clean('armour axe coloured gold').should.eql('armor ax colored gold');
		});

		it("should fix spelling", function() {
		norm.clean("are we sceduled thrsday for teh restraunt").should.eql("are we scheduled Thursday for the restaurant");
		it('should fix spelling', () => {
		norm.clean('are we sceduled thrsday for teh restraunt').should.eql('are we scheduled Thursday for the restaurant');
		});

		it("should expand txt speak", function() {
		norm.clean("n").should.eql("~no");
		norm.clean("lol").should.eql("~emolaugh");
		norm.clean("haha").should.eql("~emolaugh");
		norm.clean(":)").should.eql("~emohappy");
		it('should expand txt speak', () => {
		norm.clean('n').should.eql('~no');
		norm.clean('lol').should.eql('~emolaugh');
		norm.clean('haha').should.eql('~emolaugh');
		norm.clean(':)').should.eql('~emohappy');
		});

		it("should clean this", function() {
		norm.clean("Well , I could not help it, could I").should.eql("I could not help it, could I")
		it('should clean this', () => {
		norm.clean('Well , I could not help it, could I').should.eql('I could not help it, could I');
		});

		it("should not remove +", function() {
		norm.clean("3+4=7").should.eql("3+4=7");
		it('should not remove +', () => {
		norm.clean('3+4=7').should.eql('3+4=7');
		});

		it("should remove extra spaces", function() {
		norm.clean("this is spaced out").should.eql("this is spaced out");
		it('should remove extra spaces', () => {
		norm.clean('this is spaced out').should.eql('this is spaced out');
		});

		it("should remove punct", function() {
		norm.clean("why do i care?").should.eql("why do I care");
		it('should remove punct', () => {
		norm.clean('why do i care?').should.eql('why do I care');
		});

		it("Fix numbers", function() {
		norm.clean("how much is 1,000.00").should.eql("how much is 1000.00");
		it('Fix numbers', () => {
		norm.clean('how much is 1,000.00').should.eql('how much is 1000.00');
		});

		it("Spell Fix 2 word combo", function() {
		norm.clean("hwo do you").should.eql("how do you");
		norm.clean("hwo is you").should.eql("who is you");
		it('Spell Fix 2 word combo', () => {
		norm.clean('hwo do you').should.eql('how do you');
		norm.clean('hwo is you').should.eql('who is you');
		});

		it("Fix ASCII characters", function() {
		norm.clean("What’s up").should.eql("what is up");
		norm.clean("What's up").should.eql("what is up");
		norm.clean("I said “shut up”").should.eql('I said "shut up"');
		norm.clean("œ").should.eql('');
		it('Fix ASCII characters', () => {
		norm.clean('What’s up').should.eql('what is up');
		norm.clean("What's up").should.eql('what is up');
		norm.clean('I said “shut up”').should.eql('I said "shut up"');
		norm.clean('œ').should.eql('');
		});
		});

		describe('Matching', function() {
		describe('Matching', () => {
		// <it_is>
		describe('<xxx>', function() {
		it('should match start and end', function() {
		norm.clean('it is').should.eql("~yes");
		describe('<xxx>', () => {
		it('should match start and end', () => {
		norm.clean('it is').should.eql('~yes');
		});

		it('should not match start', function() {
		norm.clean('it is abc').should.eql("it is abc");
		it('should not match start', () => {
		norm.clean('it is abc').should.eql('it is abc');
		});

		it('should not match end', function() {
		it('should not match end', () => {
		norm.clean('abc it is').should.eql('abc it is');
		});

		it('should not match middle', function() {
		it('should not match middle', () => {
		norm.clean('abc it is abc').should.eql('abc it is abc');
		@@ -103,17 +94,17 @@ });
		// <ew
		describe('<xxx', function() {
		it('should match start and end', function() {
		norm.clean('ew').should.eql("~emodisgust");
		describe('<xxx', () => {
		it('should match start and end', () => {
		norm.clean('ew').should.eql('~emodisgust');
		});

		it('should match start', function() {
		norm.clean('ew abc').should.eql("~emodisgust abc");
		it('should match start', () => {
		norm.clean('ew abc').should.eql('~emodisgust abc');
		});

		it('should not match end', function() {
		norm.clean('abc ew').should.eql("abc ew");
		it('should not match end', () => {
		norm.clean('abc ew').should.eql('abc ew');
		});

		it('should not match middle', function() {
		norm.clean('abc ew abc').should.eql("abc ew abc");
		it('should not match middle', () => {
		norm.clean('abc ew abc').should.eql('abc ew abc');
		});
		@@ -123,17 +114,17 @@ });
		// have_to_go>
		describe('xxx>', function() {
		it('should match start and end', function() {
		norm.clean('have to go').should.eql("~emogoodbye");
		describe('xxx>', () => {
		it('should match start and end', () => {
		norm.clean('have to go').should.eql('~emogoodbye');
		});

		it('should not match start', function() {
		norm.clean('have to go abc').should.eql("have to go abc");
		it('should not match start', () => {
		norm.clean('have to go abc').should.eql('have to go abc');
		});

		it('should match end', function() {
		norm.clean('abc have to go').should.eql("abc ~emogoodbye")
		it('should match end', () => {
		norm.clean('abc have to go').should.eql('abc ~emogoodbye');
		});

		it('should not match middle', function() {
		norm.clean('abc have to go abc').should.eql("abc have to go abc")
		it('should not match middle', () => {
		norm.clean('abc have to go abc').should.eql('abc have to go abc');
		});
		@@ -143,22 +134,25 @@ });
		// okay
		describe('xxx', function() {
		it('should match start and end', function() {
		norm.clean('okay').should.eql("~yes");
		describe('xxx', () => {
		it('should match start and end', () => {
		norm.clean('okay').should.eql('~yes');
		});

		it('should match start', function() {
		norm.clean('okay abc').should.eql("~yes abc");
		it('should match start', () => {
		norm.clean('okay abc').should.eql('~yes abc');
		});

		it('should match end', function() {
		norm.clean('abc okay').should.eql("abc ~yes");
		it('should match end', () => {
		norm.clean('abc okay').should.eql('abc ~yes');
		});

		it('should match middle', function() {
		norm.clean('abc okay abc').should.eql("abc ~yes abc");
		it('should match middle', () => {
		norm.clean('abc okay abc').should.eql('abc ~yes abc');
		});
		})
		});
		});


		after((done) => {
		console.log(`Test duration: ${new Date() - startTime}ms`);
		done();
		});
		});

index.js

.npmignore

Sorry, the diff of this file is not supported yet

node-normalizer - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics

Dependency changes