Comparing version 0.3.0 to 0.3.1
127
lib/index.js
@@ -56,7 +56,16 @@ var async = require('async'), | ||
* @param {Array|string} urls | ||
* @param {Object=} queueOptions | ||
* @returns {Krawler} | ||
*/ | ||
Krawler.prototype.queue = function(urls) { | ||
Krawler.prototype.queue = function(urls, queueOptions) { | ||
var self = this; | ||
if(queueOptions === undefined) { | ||
queueOptions = {}; | ||
} | ||
_.extend({ | ||
customCallback: false | ||
}, queueOptions); | ||
if(urls === undefined || !urls.length) { | ||
@@ -83,12 +92,21 @@ if (!_.isObject(urls)) { | ||
async.eachLimit(urls, self.options_.maxConnections, function(url, callback) { | ||
var queueCallback; | ||
self.fetchUrl(_.isObject(url) ? url.url : url) | ||
if(queueOptions.customCallback) { | ||
queueCallback = callback; | ||
} | ||
var promise = self | ||
.fetchUrl(_.isObject(url) ? url.url : url) | ||
.then(function(resolved) { | ||
self.emit('data', resolved.data, url, resolved.response); | ||
self.emit('data', resolved.data, url, resolved.response, queueCallback); | ||
}, function(err) { | ||
self.emit('error', err, url); | ||
}) | ||
.fin(function() { | ||
self.emit('error', err, url, queueCallback); | ||
}); | ||
if(!queueOptions.customCallback) { | ||
promise.fin(function() { | ||
callback(); // call callback no mather what | ||
}); | ||
} | ||
@@ -162,63 +180,66 @@ }, function(err) { | ||
deferred.reject(err); | ||
} else if(response.statusCode !== 200) { | ||
return | ||
} | ||
if(response.statusCode >= 400 && response.statusCode < 600) { | ||
deferred.reject('Wrong response code: ' + response.statusCode); | ||
} else { | ||
return; | ||
} | ||
try { | ||
try { | ||
var data = body; | ||
var data = body; | ||
if(self.options_.forceUTF8) { | ||
data = self.convertToUTF8(data); | ||
} | ||
if(self.options_.forceUTF8) { | ||
data = self.convertToUTF8(data); | ||
} | ||
switch (self.options_.parser) { | ||
case 'cheerio': | ||
try { | ||
deferred.resolve({ | ||
data: cheerio.load(data), | ||
response: response | ||
}); | ||
} catch (e) { | ||
deferred.reject(e); | ||
} | ||
break; | ||
case 'json': | ||
try { | ||
deferred.resolve({ | ||
data: JSON.parse(data), | ||
response: response | ||
}); | ||
} catch (e) { | ||
deferred.reject(e); | ||
} | ||
break; | ||
case 'xml': | ||
parseString(data, function (err, xml) { | ||
if(err) { | ||
deferred.reject(err); | ||
} else { | ||
deferred.resolve({ | ||
data: xml, | ||
response: response | ||
}); | ||
} | ||
switch (self.options_.parser) { | ||
case 'cheerio': | ||
try { | ||
deferred.resolve({ | ||
data: cheerio.load(data), | ||
response: response | ||
}); | ||
break; | ||
} catch (e) { | ||
deferred.reject(e); | ||
} | ||
break; | ||
default : | ||
case 'json': | ||
try { | ||
deferred.resolve({ | ||
data: data, | ||
data: JSON.parse(data), | ||
response: response | ||
}); | ||
break; | ||
} | ||
} catch (e) { | ||
deferred.reject(e); | ||
} | ||
break; | ||
} catch (e) { | ||
deferred.reject(e); | ||
case 'xml': | ||
parseString(data, function (err, xml) { | ||
if(err) { | ||
deferred.reject(err); | ||
} else { | ||
deferred.resolve({ | ||
data: xml, | ||
response: response | ||
}); | ||
} | ||
}); | ||
break; | ||
default : | ||
deferred.resolve({ | ||
data: data, | ||
response: response | ||
}); | ||
break; | ||
} | ||
} catch (e) { | ||
deferred.reject(e); | ||
} | ||
}); | ||
@@ -225,0 +246,0 @@ |
{ | ||
"name": "krawler", | ||
"version": "0.3.0", | ||
"version": "0.3.1", | ||
"description": "Fast and lightweight web crawler with built-in cheerio, xml and json parser.", | ||
@@ -5,0 +5,0 @@ "keywords": [ |
@@ -71,2 +71,3 @@ # node Krawler [![Build Status](https://travis-ci.org/ondrs/node-krawler.png?branch=master)](https://travis-ci.org/ondrs/node-krawler) | ||
krawler | ||
.queue(urls) | ||
.on('data', function(json, url, response) { | ||
@@ -83,2 +84,33 @@ // do something with json... | ||
## Queue options | ||
After Krawler emits the 'data' event, it automatically continues to the next URL. It does not care whether the result has been processed or not. | |
If you would like to have full control over result handling, you can turn on the custom callback option. | |
Then you control the program flow by invoking the callback yourself. Don't forget to call it in every case, otherwise the queue will get stuck. | |
var queueOptions = { | ||
customCallback: true | ||
}; | ||
krawler | ||
.queue(urls, queueOptions) | ||
.on('data', function($, url, response, callback) { | ||
// expensive operation | ||
downloadAllInternet | ||
.then(function() { | ||
// ... | ||
}) | ||
.fin(callback); // always call the callback | ||
}) | ||
.on('error', function(err, url, callback) { | ||
// there has been an 'error' on 'url' | ||
callback(); | ||
}) | ||
.on('end', function() { | ||
// all URLs have been fetched | |
}); | ||
``` | ||
## Objects Example | ||
@@ -85,0 +117,0 @@ |
@@ -95,2 +95,39 @@ var Krawler = require(__dirname + '/../lib/index'); | ||
// Exercises the `customCallback` queue option: each 'data' handler receives a
// callback and invokes it itself after a delay; 'end' must fire only after
// every URL has been fetched and every callback has been called.
it('should fetch several HTML pages in queue with custom queue callback', function(done) {
  // `crawler` is declared in the same `var` list so it stays local to this
  // test instead of leaking as an implicit global.
  var urls = [],
      fetched = [],
      queueOptions = {
        customCallback: true
      },
      counter = 0,
      crawler = new Krawler();

  for (var i = 0; i < 3; ++i) {
    urls.push('https://www.google.cz/?q=' + i);
  }

  crawler
    .queue(urls, queueOptions)
    .on('data', function(data, url, response, callback) {
      fetched.push(url);

      // Simulate slow result processing before releasing the queue slot.
      setTimeout(function() {
        ++counter;
        callback();
      }, 3000);
    })
    .on('error', function(err, url, callback) {
      // Any fetch error fails the test immediately.
      done(err);
    })
    .on('end', function() {
      // Every URL must have produced a 'data' event and a completed callback.
      expect(urls.length).to.be.equal(fetched.length);
      expect(urls.length).to.be.equal(counter);
      done();
    });
});
it('should fetch single HTML page in queue', function(done) { | ||
@@ -97,0 +134,0 @@ |
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
63954
405
169