Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

krawler

Package Overview
Dependencies
Maintainers
1
Versions
9
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

krawler - npm Package Compare versions

Comparing version 0.3.0 to 0.3.1

127

lib/index.js

@@ -56,7 +56,16 @@ var async = require('async'),

* @param {Array|string} urls
* @param {Object=} queueOptions
* @returns {Krawler}
*/
Krawler.prototype.queue = function(urls) {
Krawler.prototype.queue = function(urls, queueOptions) {
var self = this;
if(queueOptions === undefined) {
queueOptions = {};
}
_.extend({
customCallback: false
}, queueOptions);
if(urls === undefined || !urls.length) {

@@ -83,12 +92,21 @@ if (!_.isObject(urls)) {

async.eachLimit(urls, self.options_.maxConnections, function(url, callback) {
var queueCallback;
self.fetchUrl(_.isObject(url) ? url.url : url)
if(queueOptions.customCallback) {
queueCallback = callback;
}
var promise = self
.fetchUrl(_.isObject(url) ? url.url : url)
.then(function(resolved) {
self.emit('data', resolved.data, url, resolved.response);
self.emit('data', resolved.data, url, resolved.response, queueCallback);
}, function(err) {
self.emit('error', err, url);
})
.fin(function() {
self.emit('error', err, url, queueCallback);
});
if(!queueOptions.customCallback) {
promise.fin(function() {
callback(); // call callback no mather what
});
}

@@ -162,63 +180,66 @@ }, function(err) {

deferred.reject(err);
} else if(response.statusCode !== 200) {
return
}
if(response.statusCode >= 400 && response.statusCode < 600) {
deferred.reject('Wrong response code: ' + response.statusCode);
} else {
return;
}
try {
try {
var data = body;
var data = body;
if(self.options_.forceUTF8) {
data = self.convertToUTF8(data);
}
if(self.options_.forceUTF8) {
data = self.convertToUTF8(data);
}
switch (self.options_.parser) {
case 'cheerio':
try {
deferred.resolve({
data: cheerio.load(data),
response: response
});
} catch (e) {
deferred.reject(e);
}
break;
case 'json':
try {
deferred.resolve({
data: JSON.parse(data),
response: response
});
} catch (e) {
deferred.reject(e);
}
break;
case 'xml':
parseString(data, function (err, xml) {
if(err) {
deferred.reject(err);
} else {
deferred.resolve({
data: xml,
response: response
});
}
switch (self.options_.parser) {
case 'cheerio':
try {
deferred.resolve({
data: cheerio.load(data),
response: response
});
break;
} catch (e) {
deferred.reject(e);
}
break;
default :
case 'json':
try {
deferred.resolve({
data: data,
data: JSON.parse(data),
response: response
});
break;
}
} catch (e) {
deferred.reject(e);
}
break;
} catch (e) {
deferred.reject(e);
case 'xml':
parseString(data, function (err, xml) {
if(err) {
deferred.reject(err);
} else {
deferred.resolve({
data: xml,
response: response
});
}
});
break;
default :
deferred.resolve({
data: data,
response: response
});
break;
}
} catch (e) {
deferred.reject(e);
}
});

@@ -225,0 +246,0 @@

{
"name": "krawler",
"version": "0.3.0",
"version": "0.3.1",
"description": "Fast and lightweight web crawler with built-in cheerio, xml and json parser.",

@@ -5,0 +5,0 @@ "keywords": [

@@ -71,2 +71,3 @@ # node Krawler [![Build Status](https://travis-ci.org/ondrs/node-krawler.png?branch=master)](https://travis-ci.org/ondrs/node-krawler)

krawler
.queue(urls)
.on('data', function(json, url, response) {

@@ -83,2 +84,33 @@ // do something with json...

## Queue options
After Krawler emits the 'data' event, it automatically continues to the next URL. It does not care whether the result has been processed or not.
If you would like to have full control over the result handling, you can turn on the custom callback option.
Then you can control the program flow by invoking your callback. Don't forget to call it in every case, otherwise the queue will get stuck.
```js
var queueOptions = {
customCallback: true
};
krawler
.queue(urls, queueOptions)
.on('data', function($, url, response, callback) {
// expensive operation
downloadAllInternet
.then(function() {
// ...
})
.fin(callback); // always call the callback
})
.on('error', function(err, url, callback) {
// there has been an 'error' on 'url'
callback();
})
.on('end', function() {
// all URLs have been fetched
});
```
## Objects Example

@@ -85,0 +117,0 @@

@@ -95,2 +95,39 @@ var Krawler = require(__dirname + '/../lib/index');

// Queues three URLs with `customCallback: true`, so every 'data' listener
// receives a callback as its fourth argument and must invoke it itself.
// On 'end', each URL must have been both received and acknowledged.
it('should fetch several HTML pages in queue with custom queue callback', function(done) {
  var urls = [],
      fetched = [],
      queueOptions = {
        customCallback: true
      },
      counter = 0, // comma, not semicolon: keeps `crawler` inside this `var` list
      crawler = new Krawler;

  for(var i = 0; i < 3; ++i) {
    urls.push('https://www.google.cz/?q=' + i);
  }

  crawler
    .queue(urls, queueOptions)
    .on('data', function(data, url, response, callback) {
      fetched.push(url);

      // Acknowledge asynchronously to simulate slow processing of the result.
      setTimeout(function() {
        ++counter;
        callback();
      }, 3000);
    })
    .on('error', function(err, url, callback) {
      // Fail the test on the first error reported for any URL.
      done(err);
    })
    .on('end', function() {
      expect(urls.length).to.be.equal(fetched.length);
      expect(urls.length).to.be.equal(counter);
      done();
    });
});
it('should fetch single HTML page in queue', function(done) {

@@ -97,0 +134,0 @@

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc