krawler - npm Package Compare versions

Comparing version 0.3.0 to 0.3.1


lib/index.js

@@ -56,7 +56,16 @@ var async = require('async'),

  * @param {Array|string} urls
+ * @param {Object=} queueOptions
  * @returns {Krawler}
  */
-Krawler.prototype.queue = function(urls) {
+Krawler.prototype.queue = function(urls, queueOptions) {
   var self = this;
 
+  if(queueOptions === undefined) {
+    queueOptions = {};
+  }
+
+  _.extend({
+    customCallback: false
+  }, queueOptions);
+
   if(urls === undefined || !urls.length) {

@@ -83,12 +92,21 @@ if (!_.isObject(urls)) {

   async.eachLimit(urls, self.options_.maxConnections, function(url, callback) {
+    var queueCallback;
+    if(queueOptions.customCallback) {
+      queueCallback = callback;
+    }
 
-    self.fetchUrl(_.isObject(url) ? url.url : url)
+    var promise = self
+      .fetchUrl(_.isObject(url) ? url.url : url)
       .then(function(resolved) {
-        self.emit('data', resolved.data, url, resolved.response);
+        self.emit('data', resolved.data, url, resolved.response, queueCallback);
       }, function(err) {
-        self.emit('error', err, url);
-      })
-      .fin(function() {
-        callback(); // call callback no mather what
-      });
+        self.emit('error', err, url, queueCallback);
+      });
+
+    if(!queueOptions.customCallback) {
+      promise.fin(function() {
+        callback(); // call callback no mather what
+      });
+    }
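
Taken together, these two hunks add an optional second argument to `queue()`. A minimal sketch of the call-site effect, assuming the published `krawler` package name and purely illustrative URLs and option values (the full documented example appears in the README hunk further down):

```javascript
// Sketch only, not part of the diff. With no second argument, queue() behaves as
// in 0.3.0: the internal async callback fires automatically once each URL settles.
// Passing { customCallback: true } hands that callback to your listeners instead.
var Krawler = require('krawler');

var crawler = new Krawler({ maxConnections: 5 });

crawler
    .queue(['http://example.com/a', 'http://example.com/b'], { customCallback: true })
    .on('data', function(data, url, response, callback) {
        // the queue does not advance past this URL until callback() is invoked
        callback();
    })
    .on('error', function(err, url, callback) {
        callback();
    })
    .on('end', function() {
        // all queued URLs have been processed
    });
```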

@@ -162,63 +180,66 @@ }, function(err) {

       deferred.reject(err);
-    } else if(response.statusCode !== 200) {
-      deferred.reject('Wrong response code: ' + response.statusCode);
-    } else {
-      try {
-        var data = body;
-
-        if(self.options_.forceUTF8) {
-          data = self.convertToUTF8(data);
-        }
-
-        switch (self.options_.parser) {
-          case 'cheerio':
-            try {
-              deferred.resolve({
-                data: cheerio.load(data),
-                response: response
-              });
-            } catch (e) {
-              deferred.reject(e);
-            }
-            break;
-
-          case 'json':
-            try {
-              deferred.resolve({
-                data: JSON.parse(data),
-                response: response
-              });
-            } catch (e) {
-              deferred.reject(e);
-            }
-            break;
-
-          case 'xml':
-            parseString(data, function (err, xml) {
-              if(err) {
-                deferred.reject(err);
-              } else {
-                deferred.resolve({
-                  data: xml,
-                  response: response
-                });
-              }
-            });
-            break;
-
-          default :
-            deferred.resolve({
-              data: data,
-              response: response
-            });
-            break;
-        }
-
-      } catch (e) {
-        deferred.reject(e);
-      }
-    }
+      return
+    }
+
+    if(response.statusCode >= 400 && response.statusCode < 600) {
+      deferred.reject('Wrong response code: ' + response.statusCode);
+      return;
+    }
+
+    try {
+      var data = body;
+
+      if(self.options_.forceUTF8) {
+        data = self.convertToUTF8(data);
+      }
+
+      switch (self.options_.parser) {
+        case 'cheerio':
+          try {
+            deferred.resolve({
+              data: cheerio.load(data),
+              response: response
+            });
+          } catch (e) {
+            deferred.reject(e);
+          }
+          break;
+
+        case 'json':
+          try {
+            deferred.resolve({
+              data: JSON.parse(data),
+              response: response
+            });
+          } catch (e) {
+            deferred.reject(e);
+          }
+          break;
+
+        case 'xml':
+          parseString(data, function (err, xml) {
+            if(err) {
+              deferred.reject(err);
+            } else {
+              deferred.resolve({
+                data: xml,
+                response: response
+              });
+            }
+          });
+          break;
+
+        default :
+          deferred.resolve({
+            data: data,
+            response: response
+          });
+          break;
+      }
+    } catch (e) {
+      deferred.reject(e);
+    }
   });
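
This larger hunk flattens the nested `else` blocks in the fetch handler: an early `return` after a request error, rejection for any 4xx/5xx status code (previously anything other than 200 was rejected), and then the parser `switch`. A minimal sketch of how that surfaces to callers, assuming an illustrative JSON endpoint URL:

```javascript
// Sketch only, not part of the diff. The parser option decides what the 'data'
// listener receives; 4xx/5xx responses and parse failures arrive as 'error'.
var Krawler = require('krawler');

var crawler = new Krawler({
    parser: 'json',   // also 'cheerio', 'xml'; any other value yields the raw body
    forceUTF8: true
});

crawler
    .queue('http://example.com/items.json')
    .on('data', function(json, url, response) {
        // body was parsed with JSON.parse and the status code was below 400
    })
    .on('error', function(err, url) {
        // request error, a 4xx/5xx status code, or a parser exception
    });
```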

@@ -225,0 +246,0 @@

package.json

 {
   "name": "krawler",
-  "version": "0.3.0",
+  "version": "0.3.1",
   "description": "Fast and lightweight web crawler with built-in cheerio, xml and json parser.",

@@ -5,0 +5,0 @@ "keywords": [

README.md

@@ -71,2 +71,3 @@ # node Krawler [![Build Status](https://travis-ci.org/ondrs/node-krawler.png?branch=master)](https://travis-ci.org/ondrs/node-krawler)

krawler
    .queue(urls)
    .on('data', function(json, url, response) {

@@ -83,2 +84,33 @@ // do something with json...

## Queue options

After Krawler emits the 'data' event, it automatically continues to the next URL. It does not care whether the result has been processed or not.
If you would like full control over result handling, you can turn on the custom callback option.
You can then control the program flow by invoking your callback. Don't forget to call it in every case, otherwise the queue will get stuck.

```javascript
var queueOptions = {
    customCallback: true
};

krawler
    .queue(urls, queueOptions)
    .on('data', function($, url, response, callback) {
        // expensive operation
        downloadAllInternet
            .then(function() {
                // ...
            })
            .fin(callback); // always call the callback
    })
    .on('error', function(err, url, callback) {
        // there has been an 'error' on 'url'
        callback();
    })
    .on('end', function() {
        // all URLs have been fetched
    });
```
## Objects Example

@@ -85,0 +117,0 @@

@@ -95,2 +95,39 @@ var Krawler = require(__dirname + '/../lib/index');

it('should fetch several HTML pages in queue with custom queue callback', function(done) {
    var urls = [],
        fetched = [],
        queueOptions = {
            customCallback: true
        },
        counter = 0;

    crawler = new Krawler;

    for(var i = 0; i < 3; ++i) {
        urls.push('https://www.google.cz/?q=' + i);
    }

    crawler
        .queue(urls, queueOptions)
        .on('data', function(data, url, response, callback) {
            fetched.push(url);
            setTimeout(function() {
                ++counter;
                callback();
            }, 3000);
        })
        .on('error', function(err, url, callback) {
            done(err);
        })
        .on('end', function() {
            expect(urls.length).to.be.equal(fetched.length);
            expect(urls.length).to.be.equal(counter);
            done();
        });
});
it('should fetch single HTML page in queue', function(done) {

@@ -97,0 +134,0 @@
