Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

neek

Package Overview
Dependencies
Maintainers
1
Versions
10
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

neek - npm Package Compare versions

Comparing version 0.2.2 to 1.0.0

.codeclimate.yml

109

lib/neek.js

@@ -1,72 +0,69 @@

var NeekReader = require('./reader'),
StringStream = require('./stream');
var fs = require('fs');
module.exports = function(){
var Transformer = require('./transform');
var StringStream = require('./stream');
var _this = this;
module.exports = {
this.setInput = function(input) {
this.input = input;
return this;
};
unique: function unique(input, output, callback) {
this.setOutput = function(output) {
this.output = output;
return this;
};
if (!isReadStream(input)) {
throw new Error('No input stream specified!');
}
this.unique = function(algorithm, callback) {
if (!_this.input) {
throw new Error("No input stream specified!");
}
if (!isWriteStream(output)) {
throw new Error('No output stream specified!');
}
if (!_this.output) {
throw new Error("No output format specified!");
}
if (!algorithm) {
algorithm = null;
callback = new Function();
}
if (typeof input === 'string') {
input = fs.createReadStream(input);
}
if (typeof algorithm == "function") {
callback = algorithm;
algorithm = null;
}
var stream = null;
var reader = new NeekReader(),
stream = null;
if (output !== 'string') {
if (typeof output === 'string') {
stream = fs.createWriteStream(output);
} else {
stream = output;
}
} else {
stream = new StringStream();
}
reader.algorithm = algorithm;
var transformer = new Transformer();
reader.isTTY = Boolean(_this.output.isTTY);
input
.pipe(transformer)
.pipe(stream);
_this.input.pipe(reader);
if(_this.output != "string"){
reader.pipe(_this.output);
stream.once('finish', function (){
if (callback) {
if (output !== 'string') {
transformer.unpipe(output);
} else {
reader.pipe((stream = new StringStream(), stream));
transformer.unpipe(stream);
}
callback({
output: output !== 'string' ? null : stream.get(),
total: transformer.count,
unique: transformer.set.size
});
}
});
reader.on('end', function () {
var total = reader.count,
unique = reader.set.count();
}
reader.reset();
};
if (callback) {
if (_this.output != "string") {
reader.unpipe(_this.output);
} else {
reader.unpipe(stream);
}
callback({
output: _this.output != "string" ? null : stream.get(),
total: total,
unique: unique
});
}
});
}
};
/**
 * Checks whether `rs` is acceptable as an input source.
 *
 * Delegates to isStream, passing process.stdin as the system stream and
 * '_read' as the property name to look for.
 *
 * @param {*} rs - the candidate input value.
 * @returns {*} whatever isStream returns for these arguments.
 */
function isReadStream(rs){
  var accepted = isStream(rs, process.stdin, '_read');
  return accepted;
}
/**
 * Checks whether `ws` is acceptable as an output target.
 *
 * Delegates to isStream, passing process.stdout as the system stream and
 * '_write' as the property name to look for.
 *
 * @param {*} ws - the candidate output value.
 * @returns {*} whatever isStream returns for these arguments.
 */
function isWriteStream(ws){
  var accepted = isStream(ws, process.stdout, '_write');
  return accepted;
}
/**
 * Shared check used by isReadStream and isWriteStream.
 *
 * A truthy `st` is accepted when it is the given system stream, when its
 * `prop` property is a function, or when it is a string.
 *
 * @param {*} st - the value to inspect.
 * @param {*} sys - the system stream that is accepted by identity.
 * @param {string} prop - name of the property that must be a function.
 * @returns {*} `st` itself when it is falsy, otherwise true or false.
 */
function isStream(st, sys, prop){
  // A falsy value is handed back unchanged rather than coerced to false.
  if (!st) {
    return st;
  }
  if (st === sys) {
    return true;
  }
  if (typeof st[prop] === 'function') {
    return true;
  }
  return typeof st === 'string';
}

@@ -1,10 +0,10 @@

var Stream = require('readable-stream'),
util = require('util');
var Stream = require('readable-stream');
var util = require('util');
// Init function
function StringStream(init) {
// Init via super
Stream.super_.call(this);
// Init the data
this._data = init || '';
// Init via super
Stream.super_.call(this);
// Init the data
this._data = init || '';
}

@@ -17,3 +17,4 @@

StringStream.prototype.end = function(){
this.emit('end');
this.emit('end');
this.emit('finish');
};

@@ -23,3 +24,3 @@

StringStream.prototype.get = function(){
return this._data;
return this._data;
};

@@ -29,3 +30,3 @@

StringStream.prototype.write = function(data){
this._data += data;
this._data += data;
};

@@ -32,0 +33,0 @@

{
"name":"neek",
"version":"0.2.2",
"description":"A simple implementation of *nix uniq in NodeJS to take advantage of streaming",
"author":{
"name":"Isaac Whitfield",
"email":"iwhitfield@appcelerator.com"
},
"repository":{
"type":"git",
"url":"http://github.com/iwhitfield/neek.git"
},
"bugs":{
"email":"iwhitfield@appcelerator.com",
"url":"http://github.com/iwhitfield/neek/issues"
},
"license":"MIT",
"readmeFilename":"README.md",
"preferGlobal":true,
"keywords":[
"duplicate",
"uniq",
"unique"
],
"scripts": {
"coverage":"rm -f coverage.html && mocha -r blanket -R html-cov test > coverage.html",
"test":"node test.js spec; node test.js | ./node_modules/coveralls/bin/coveralls.js"
},
"bin":{
"neek":"bin/neek"
},
"dependencies":{
"hashes":"*",
"minimist":"*",
"readable-stream":"~1.0.0"
},
"devDependencies":{
"blanket":"*",
"coveralls": "*",
"mocha":"*",
"mocha-lcov-reporter":"0.0.1"
},
"config": {
"blanket":{
"pattern":[
"lib/neek.js",
"lib/reader.js",
"lib/stream.js"
]
}
}
"name": "neek",
"version": "1.0.0",
"description": "A simple implementation of *nix uniq in NodeJS to take advantage of streaming",
"author": {
"name": "Isaac Whitfield",
"email": "iwhitfield@appcelerator.com"
},
"repository": {
"type": "git",
"url": "http://github.com/zackehh/neek.git"
},
"bugs": {
"email": "iwhitfield@appcelerator.com",
"url": "http://github.com/zackehh/neek/issues"
},
"license": "MIT",
"readmeFilename": "README.md",
"preferGlobal": true,
"keywords": [
"duplicate",
"uniq",
"unique"
],
"scripts": {
"coverage": "grunt coverage",
"lint": "grunt lint",
"test": "grunt"
},
"bin": {
"neek": "bin/neek"
},
"dependencies": {
"farmhash": "1.1.0",
"hashes": "0.1.3",
"minimist": "1.2.0",
"readable-stream": "2.0.2"
},
"devDependencies": {
"grunt": "0.4.5",
"grunt-cli": "0.1.13",
"grunt-codeclimate-reporter": "1.1.2",
"grunt-contrib-clean": "0.6.0",
"grunt-contrib-jshint": "0.11.3",
"grunt-mkdir": "0.1.2",
"grunt-mocha-istanbul": "3.0.1",
"grunt-mocha-test": "0.12.7",
"istanbul": "0.3.22",
"jshint": "2.8.0",
"mocha": "2.3.3",
"should": "7.1.0"
}
}

@@ -1,2 +0,2 @@

Neek [![Build Status](https://travis-ci.org/iwhitfield/neek.svg?branch=master)](https://travis-ci.org/iwhitfield/neek) [![Coverage Status](https://coveralls.io/repos/iwhitfield/neek/badge.png)](https://coveralls.io/r/iwhitfield/neek)
Neek [![Build Status](https://travis-ci.org/zackehh/neek.svg?branch=master)](https://travis-ci.org/zackehh/neek) [![Code Climate](https://codeclimate.com/github/zackehh/neek/badges/gpa.svg)](https://codeclimate.com/github/zackehh/neek) [![Test Coverage](https://codeclimate.com/github/zackehh/neek/badges/coverage.svg)](https://codeclimate.com/github/zackehh/neek)
====

@@ -8,4 +8,8 @@

This module is built on each commit with TravisCI on Node 0.8.x, 0.10.x and 0.11.x. It will *not* work on Node 0.6.x unfortunately. In order to maintain support throughout these versions, the [Hashes](https://npmjs.org/package/hashes "Hashes") library is used. There are more efficient alternatives (perhaps gaining a second per 100,000 records), however they have native components and are unstable on 0.11.x at the moment. At some point in future, I'll revisit this and implement a better HashSet - perhaps when 0.12.x is live.
The current version of `Neek` is designed using several features of `ES6`; namely the `Set` interface. If this is not available, it will fall back to a library interface which is not as fast (but it's still pretty good). As such, best performance occurs when on `Node >= v4.0.0` and all numbers in this README will refer to this version.
Neek is built on Travis against the latest version of Node, Node 0.12.x and Node 0.10.x. Older versions of Node are not supported, however Neek was functional on 0.8.x with version `0.2.2` (admittedly with slower performance).
Build results are sent over to [Code Climate](https://codeclimate.com/github/zackehh/neek) for analysis.
### Setup ###

@@ -16,3 +20,3 @@

```
$ sudo npm install -g neek
$ npm install -g neek
```

@@ -31,5 +35,5 @@

```
$ neek --input dup_file.txt > output.txt
$ neek --input dup_file.txt -o output.txt
$ cat dup_file.txt | neek > output.txt
$ cat dup_file.txt | neek -o output.txt
```

@@ -40,3 +44,2 @@

```
-a, --algorithm the cipher algorithm to use (default to SHA1)
-i, --input an input file to process

@@ -49,15 +52,23 @@ -o, --output a file to output to

Please note that `input`/`output` accept either a String path or a Stream.
```
var Neek = require('neek');
var neek = require('neek');
new Neek()
.setInput(fs.createReadStream('./test/resources/lines_with_dups.txt'))
.setOutput(fs.createWriteStream('./test/resources/output_without_dups.txt'))
.unique('md5', function(result){
var readable = './test/resources/lines_with_dups.txt';
var writable = './test/resources/output_without_dups.txt';
});
neek.unique(readable, writable, function(result){
console.log(result);
});
```
You can use `setInput()` and `setOutput()` to define your streams. You then call `unique()` to actually remove the duplicate data. `setOutput()` can take a parameter "string", which will pass the output to the callback as described below. `unique()` can take an optional algorithm param (defaulting to SHA1), and a callback function which is passed a result object.
### unique(input, output[, callback])
The unique method is the only method currently available on the `neek` module. You pass in your two Streams and an optional callback.
The `output` parameter can take the value 'string', which will pass the output to the callback in `result.output`, rather than piping it to a stream. The callback to `unique` is optional, but be careful when omitting it in case you're depending on the Stream being written.
If you pass a String type to either `input` or `output` (when output `!== 'string'`) it will be wrapped up in a read/write stream, with the assumption that it is a file path.
The result object passed to the callback contains three fields: `output`, `total` and `unique`. These fields translate to the following:

@@ -73,3 +84,3 @@

On a test set of a 293MB file containing 576,905 total lines with 322,392 unique lines, below is a comparison of the performance of Unix tool `uniq` and `neek`. This is assuming that your data is sorted.
On a test set of a 527MB file containing 1,071,367 total lines with 443,917 unique lines, below is a comparison of the performance of Unix tools `uniq` and `sort`, and then `neek`. `uniq` is assuming that your data is sorted.

@@ -79,42 +90,38 @@ **Uniq**

```
$ time uniq test-set.txt
$ time uniq test-set.txt > deduplicated.txt
# output
real 0m33.951s
user 0m27.086s
sys 0m2.161s
real 0m38.922s
user 0m37.647s
sys 0m1.105s
```
**Neek**
In the unfortunate case that your data isn't sorted, you would have to use `sort`, however Neek behaves the same regardless of order.
**Sort**
```
$ time bin/neek --input test-set.txt
$ time sort -u test-set.txt > deduplicated.txt
# output
real 0m16.354s
user 0m13.733s
sys 0m2.217s
real 2m16.459s
user 2m13.757s
sys 0m2.186s
```
In the unfortunate case that your data isn't sorted, you would have to use `sort`, however Neek behaves the same regardless of order.
Now let's look at Neek!
**Sort**
**Neek**
```
$ time sort -u test-set.txt
$ time bin/neek --input test-set.txt -o deduplicated.txt
# output
real 1m39.203s
user 1m32.484s
sys 0m1.518s
real 0m9.581s
user 0m8.615s
sys 0m1.588s
```
As you can see, Neek is roughly 45% faster to run than Uniq and almost 85% faster to run than Sort, meaning it's invaluable for larger files.
As you can see, Neek is ~4.1x (around 400%) faster to run than `uniq` and ~14.2x (around 1400%) faster to run than `sort`, meaning it's invaluable for larger files. Aside from being far faster, Neek uses efficient pipes, which is far better for memory usage. Tools like `sort` will buffer the entire file into memory, making it a bad choice for large files.
### Redirection ###
One important thing to note here is that a shell redirection is slightly faster than using the `--output` flag. In the processing of the above file, the `--output` flag took an extra 9 seconds due to the overheads inside Node.
On versions **prior** to Node v4.x one important thing to note is that a shell redirection is slightly faster than using the `--output` flag. In the processing of the above file, the `--output` flag took an extra 9 seconds due to the overheads inside Node.

@@ -124,15 +131,15 @@ Where possible, I would recommend simply using a shell redirection. If you do use a redirection, make sure to pass `-q`. Here is a comparison:

```
$ time bin/neek --input test-set.txt -q > output.txt
$ time bin/neek --input test-set.txt -q > deduplicated.txt
real 0m16.354s
user 0m13.733s
sys 0m2.217s
real 0m19.928s
user 0m16.596s
sys 0m3.653s
$ time bin/neek --input test-set.txt --output output.txt
$ time bin/neek --input test-set.txt --output deduplicated.txt
Processing complete: 576905 -> 322392
real 0m30.536s
user 0m22.242s
sys 0m10.883s
```
real 0m25.744s
user 0m14.974s
sys 0m6.657s
```
On Node v4.x and later, this is **not** an issue (in fact the situation is almost reversed; shell redirection is far slower).

@@ -1,139 +0,147 @@

var assert = require('assert'),
exec = require('child_process').exec,
fs = require('fs'),
Neek = require('../index'),
path = require('path');
var fs = require('fs');
var should = require('should');
var spawn = require('child_process').spawn;
var without_dups;
var neek = require('../index');
describe('Neek', function(){
var dup_file = './test/resources/lines_with_dups.txt';
before(function(start){
without_dups = fs.readFileSync('./test/resources/lines_without_dups.txt').toString();
start();
});
describe('Neek', function (){
describe('filters out duplicates', function(){
var without_dups;
it('using the MD5 algorithm', function(next){
before(function (start){
fs.readFile('./test/resources/lines_without_dups.txt', function (err, file){
if(err){
return start(err);
}
without_dups = file.toString();
start();
});
});
new Neek()
.setInput(fs.createReadStream('./test/resources/lines_with_dups.txt'))
.setOutput("string")
.unique('md5', function(result){
assert.strictEqual(result.output, without_dups);
next();
});
it('filters out duplicates', function (next){
neek.unique(fs.createReadStream(dup_file), 'string', function (result){
should(result).be.ok;
should(result.output).be.ok;
should(result.output).eql(without_dups);
});
next();
});
});
it('using the SHA-1 algorithm', function(next){
it('handles with String output', function(next){
neek.unique(dup_file, 'tmp/first_output_without_dups.txt', function (result){
should(result).be.ok;
should(result.total).be.ok;
should(result.unique).be.ok;
should(result.total).eql(13);
should(result.unique).eql(8);
new Neek()
.setInput(fs.createReadStream('./test/resources/lines_with_dups.txt'))
.setOutput("string")
.unique(function(result){
assert.strictEqual(result.output, without_dups);
next();
});
next();
});
});
});
it('handles with Stream output', function(next){
var readable = fs.createReadStream(dup_file);
var writable = fs.createWriteStream('tmp/second_output_without_dups.txt');
it('using the SHA-256 algorithm', function(next){
neek.unique(readable, writable, function (result){
should(result).be.ok;
should(result.total).be.ok;
should(result.unique).be.ok;
should(result.total).eql(13);
should(result.unique).eql(8);
new Neek()
.setInput(fs.createReadStream('./test/resources/lines_with_dups.txt'))
.setOutput("string")
.unique('sha256', function(result){
assert.strictEqual(result.output, without_dups);
next();
});
next();
});
});
});
it('functions without a callback', function (start){
var path = 'tmp/third_output_without_dups.txt';
it('using the SHA-512 algorithm', function(next){
neek.unique(fs.createReadStream(dup_file), fs.createWriteStream(path));
new Neek()
.setInput(fs.createReadStream('./test/resources/lines_with_dups.txt'))
.setOutput("string")
.unique('sha512', function(result){
assert.strictEqual(result.output, without_dups);
next();
});
setTimeout(function (){
});
fs.readFile(path, function (err, content){
should(err).not.be.ok;
should(content).be.ok;
should(content.toString()).eql(without_dups);
start();
});
});
}, 250);
});
describe('handles unexpected situations', function(){
it('falls back to using a library if non-ES6 is available', function(next){
var stored_set = global.Set;
it('by throwing errors if an input stream type is missing', function(next){
delete global.Set;
try {
new Neek()
.setOutput("string")
.unique('sha512', function (result) {
assert.strictEqual(result.output, without_dups);
next(new Error('No error thrown!'));
});
} catch(e) {
assert(e.message == "No input stream specified!");
next();
}
neek.unique(fs.createReadStream(dup_file), 'string', function (result){
should(result).be.ok;
should(result.total).be.ok;
should(result.unique).be.ok;
should(result.total).eql(13);
should(result.unique).eql(8);
});
global.Set = stored_set;
it('by throwing errors if an output stream type is missing', function(next){
next();
});
});
try {
new Neek()
.setInput(fs.createReadStream('./test/resources/lines_with_dups.txt'))
.unique('sha512', function (result) {
assert.strictEqual(result.output, without_dups);
next(new Error('No error thrown!'));
});
} catch(e) {
assert(e.message == "No output format specified!");
next();
}
it('is usable via command line', function (next){
var child = spawn('./bin/neek', ['--input', dup_file]);
var data = '';
});
child.stdout.on('data', function (d){
data += d.toString();
});
it('by handling cases with no arguments', function(next){
new Neek()
.setInput(fs.createReadStream('./test/resources/lines_with_dups.txt'))
.setOutput(fs.createWriteStream('./test/resources/output_without_dups.txt'))
.unique();
child.stderr.on('data', function (data) {
should(data).not.be.ok;
});
next();
});
child.stdout.on('end', function (){
should(data).eql(without_dups);
next();
});
});
describe('command line interface', function(){
it('throws an error if an input stream type is missing', function (next){
it('should write out to a stream', function(next){
setTimeout(function(){
var testFile = fs.readFileSync('./test/resources/output_without_dups.txt').toString();
assert.equal(testFile, without_dups);
next();
}, 200);
});
try {
neek.unique(function (result){
should(result).be.ok;
should(result.output).be.ok;
should(result.output).eql(without_dups);
it('should be usable via command line', function(next){
next(new Error('No error thrown!'));
});
} catch(e) {
should(e.message).eql('No input stream specified!');
next();
}
var cmd = path.join(__dirname, '../bin/neek'),
resource = path.join(__dirname, './resources/');
});
exec(cmd + ' --input ' + resource + 'lines_with_dups.txt', function(err, stdout, stderr){
assert(!err, err);
assert(!stderr);
assert.equal(stdout, without_dups + "\nProcessing complete: 13 -> 8\n");
next();
});
it('throws an error if an output stream type is missing', function (next){
});
try {
neek.unique(fs.createReadStream(dup_file), function (result){
should(result).be.ok;
should(result.output).be.ok;
should(result.output).eql(without_dups);
});
next(new Error('No error thrown!'));
});
} catch(e) {
should(e.message).eql('No output stream specified!');
next();
}
});
});
});

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc