node Krawler
Fast and lightweight promise/event-based web krawler with built-in cheerio, XML and JSON parsers.
And of course ... the best :)
How to install
npm install krawler
Basic example
var Krawler = require('krawler')
var urls = [
'http://ondraplsek.cz'
];
var krawler = new Krawler;
krawler
.queue(urls)
.on('data', function($, url, response) {
})
.on('error', function(err, url) {
})
.on('end', function() {
});
Options
Krawler provides the following options:
var krawler = new Krawler({
maxConnections: 10,
parser: 'cheerio',
forceUTF8: false,
});
mikeal/request is used for fetching web pages, so any option supported by that package can also be passed to Krawler's constructor.
Advanced Example
var urls = [
'https://graph.facebook.com/nodejs',
'https://graph.facebook.com/facebook',
'https://graph.facebook.com/cocacola',
'https://graph.facebook.com/google',
'https://graph.facebook.com/microsoft',
];
var krawler = new Krawler({
maxConnections: 5,
parser: 'json',
forceUTF8: true
});
krawler
.on('data', function(json, url, response) {
})
.on('error', function(err, url) {
})
.on('end', function() {
});
Objects Example
Instead of plain strings or an array of strings, one can also pass an object or
an array of objects, each of which holds the URL in a property named 'url'. This enables you to access the other properties of the URL object later in the process. Example:
var Krawler = require('krawler')
var urls = [
{ name: 'SomeSite', url: 'http://ondraplsek.cz' }
];
var krawler = new Krawler;
krawler
.queue(urls)
.on('data', function($, url, response) {
})
.on('error', function(err, url) {
})
.on('end', function() {
});
Promises
If your program flow is based on promises you can easily attach Krawler to your promise chain.
The fetchUrl() method returns a Q.promise. When the promise is fulfilled, the callback function is called with a result object.
The result object has two properties:
- data - parsed/raw content of the web page, based on the parser setting
- response - response object from mikeal/request
var krawler = new Krawler;
findUrl()
.then(function(url) {
return krawler.fetchUrl(url);
})
.then(function(result) {
return processData(result.data);
})