crawler-ninja
Comparing version 0.1.12 to 0.1.13
index.js
@@ -7,4 +7,2 @@ var events = require('events'); | ||
var log = require("crawler-ninja-logger").Logger; | ||
var Map = require("collections/fast-map"); | ||
var Set = require("collections/fast-set"); | ||
var requester = require("./lib/queue-requester"); | ||
@@ -16,4 +14,2 @@ var URI = require('./lib/uri.js'); | ||
var domainBlackList = require("./default-lists/domain-black-list.js").list(); | ||
@@ -109,3 +105,3 @@ var suffixBlackList = require("./default-lists/suffix-black-list.js").list(); | ||
this.httpRequester = new requester.Requester(this.config); | ||
requester.init(this.config.maxConnections, this.config.onDrain); | ||
@@ -134,3 +130,3 @@ events.EventEmitter.call(this); | ||
function(error){ | ||
if (self.httpRequester.idle()) { | ||
if (requester.idle()) { | ||
self.config.onDrain(); | ||
@@ -158,3 +154,3 @@ } | ||
store.getStore().addStartUrl(options, function(error) { | ||
self.httpRequester.queue(addDefaultOptions({uri:options, url:options}, self.config)); | ||
requester.queue(addDefaultOptions({uri:options, url:options}, self.config)); | ||
}); | ||
@@ -170,3 +166,3 @@ | ||
function(error){ | ||
if (self.httpRequester.idle()) { | ||
if (requester.idle()) { | ||
self.config.onDrain(); | ||
@@ -180,3 +176,3 @@ } | ||
store.getStore().addStartUrl(_.has(options, "url") ? options.url : options.uri, function(error) { | ||
self.httpRequester.queue(addDefaultOptions(options, self.config)); | ||
requester.queue(addDefaultOptions(options, self.config)); | ||
}); | ||
@@ -318,3 +314,6 @@ } | ||
async.apply(self.applyRedirect.bind(self), result), | ||
], callback); | ||
], function(error) { | ||
result = null; | ||
callback(error); | ||
}); | ||
@@ -333,3 +332,3 @@ } | ||
this.pm.crawlRedirect(from, to, result.statusCode, function(){ | ||
self.httpRequester.queue(self.buildNewOptions(result,to)); | ||
requester.queue(self.buildNewOptions(result,to)); | ||
callback(); | ||
@@ -558,3 +557,3 @@ }); | ||
if (toCrawl && (result.depthLimit == -1 || currentDepth <= result.depthLimit)) { | ||
self.httpRequester.queue(self.buildNewOptions(result,linkUri)); | ||
requester.queue(self.buildNewOptions(result,linkUri)); | ||
callback(); | ||
@@ -561,0 +560,0 @@ } |
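The index.js hunks above replace the per-crawler `new requester.Requester(this.config)` instance with a module-level requester that is initialised once and then driven through plain functions. A minimal sketch of the new calling pattern as it can be read from the diff; the Crawler wrapper below is illustrative, and only requester.init / requester.queue / requester.idle and the maxConnections / onDrain option names come from the hunks:

// Sketch of the 0.1.13 call pattern inferred from the index.js hunks above.
// The requester is now a shared module rather than a per-crawler instance.
var requester = require("./lib/queue-requester");

function Crawler(config) {
  this.config = config;
  // 0.1.12: this.httpRequester = new requester.Requester(this.config);
  // 0.1.13: initialise the shared queue once with the relevant options.
  requester.init(config.maxConnections, config.onDrain);
}

Crawler.prototype.queue = function (options) {
  // URLs are now pushed through the module-level queue() function.
  requester.queue(options);
};

Crawler.prototype.isIdle = function () {
  // idle() reports whether the shared async queue still has pending work.
  return requester.idle();
};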
@@ -33,4 +33,2 @@ /** | ||
var requestArgs = ['uri','url','qs','method','headers','body','form','json','multipart','followRedirect', | ||
@@ -42,8 +40,7 @@ 'followAllRedirects', 'maxRedirects','encoding','pool','timeout','proxy','auth','oauth','strictSSL', | ||
var start = new Date(); | ||
var req = request(_.pick.apply(this,[options].concat(requestArgs)), function(error,response) { | ||
request(_.pick.apply(this,[options].concat(requestArgs)), function(error,response) { | ||
var end = new Date() - start; | ||
if (error) { | ||
return onContent(error, endCallback, options); | ||
return endCallback({code: error.code}, options); | ||
} | ||
@@ -68,3 +65,3 @@ | ||
if (error) { | ||
result = null; | ||
return onContent(error, endCallback, options); | ||
@@ -87,6 +84,2 @@ } | ||
if (error) { | ||
return endCallback({code: error.code}, options); | ||
} | ||
// Sorry for this hack but that's solve some issue with Cheerio | ||
@@ -93,0 +86,0 @@ if (!result.body) { |
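The hunk above (apparently lib/http-request.js, the helper the queue requester loads as ./http-request.js) keeps the pattern of whitelisting which crawl options are forwarded to the underlying request call via underscore's _.pick, and on failure now hands back only {code: error.code} instead of routing the error through onContent. A hedged sketch of that option-filtering idea; the requestArgs list below is abridged and execRequest is an illustrative name:

// Sketch: forward only whitelisted option names to request(), as the diff
// does with _.pick.apply(this, [options].concat(requestArgs)).
var request = require("request");
var _ = require("underscore");

// Abridged version of the requestArgs whitelist shown in the hunk.
var requestArgs = ["uri", "url", "method", "headers", "timeout", "proxy", "auth"];

function execRequest(options, endCallback) {
  request(_.pick.apply(this, [options].concat(requestArgs)), function (error, response) {
    if (error) {
      // 0.1.13 reports a lightweight {code: ...} object rather than the full
      // Error, so the caller can release the original objects sooner.
      return endCallback({ code: error.code }, options);
    }
    endCallback(null, response);
  });
}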
@@ -7,3 +7,3 @@ var Map = require("collections/fast-map"); | ||
this.plugins = new Map(); | ||
this.pluginCounter = 0; | ||
} | ||
@@ -10,0 +10,0 @@ |
@@ -1,411 +0,408 @@ | ||
var async = require('async'); | ||
var _ = require('underscore'); | ||
var URI = require("./uri.js"); | ||
var request = require("./http-request.js"); | ||
var log = require("crawler-ninja-logger").Logger; | ||
var store = require("../lib/store/store.js"); | ||
/** | ||
* The Request Queue | ||
* | ||
* its main job is to make the http requests & analyze the responses | ||
* its main job is make the http requests & analyze the responses | ||
* It is used an internal queue to limit the number of workers | ||
* | ||
* @param the options to use to make the requests. | ||
* | ||
*/ | ||
var Requester = function(options) { | ||
var async = require('async'); | ||
var _ = require('underscore'); | ||
var log = require("crawler-ninja-logger").Logger; | ||
var URI = require("./uri.js"); | ||
var request = require("./http-request.js"); | ||
var store = require("../lib/store/store.js"); | ||
this.options = options; | ||
this.initQueue(); | ||
} | ||
/** | ||
* Add a new url to crawl in the queue. | ||
* Check the desired options and add it to a request queue | ||
* | ||
* @param the options used to crawl the url | ||
* | ||
*/ | ||
Requester.prototype.queue = function (options) { | ||
(function () { | ||
var self = this; | ||
// Up to you to use uri or url. | ||
if (options.uri) { | ||
options.url = options.uri; | ||
} | ||
else { | ||
options.uri = options.url; | ||
} | ||
var requestQueue = {}; | ||
// if skipDuplicates, don't crawl twice the same uri | ||
if (this.options.skipDuplicates) { | ||
store.getStore().isInCrawlHistory(options.uri, function(error, isInCrawlHistory) { | ||
if (isInCrawlHistory) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Don't crawl this url - Option skipDuplicates=true & the url has already been crawled" }); | ||
} | ||
else { | ||
store.getStore().addInHistory(options.uri,function(error) { | ||
self.q.push(options); | ||
log.info({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Add in the request queue"}); | ||
}); | ||
} | ||
/** | ||
* Init the Queue Requester | ||
* | ||
* | ||
* @param The number of task/connection that the request queu can start in parallel | ||
* @param the callback executes when all task (url to cralw) are completed | ||
* | ||
*/ | ||
}); | ||
} | ||
else { | ||
log.info({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Add in the request queue"}); | ||
this.q.push(options); | ||
} | ||
function init (maxConnections, onDrain) { | ||
createQueue(maxConnections, onDrain); | ||
} | ||
/** | ||
* Add a new url to crawl in the queue. | ||
* Check the desired options and add it to a request queue | ||
* | ||
* @param the options used to crawl the url | ||
* | ||
*/ | ||
function queue(options) { | ||
} | ||
// Up to you to use uri or url. | ||
if (options.uri) { | ||
options.url = options.uri; | ||
} | ||
else { | ||
options.uri = options.url; | ||
} | ||
// if skipDuplicates, don't crawl twice the same uri | ||
if (options.skipDuplicates) { | ||
store.getStore().isInCrawlHistory(options.uri, function(error, isInCrawlHistory) { | ||
if (isInCrawlHistory) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Don't crawl this url - Option skipDuplicates=true & the url has already been crawled" }); | ||
} | ||
else { | ||
store.getStore().addInHistory(options.uri,function(error) { | ||
/** | ||
* @return false if there are items waiting or being processed in the queue, or true if not. | ||
* | ||
*/ | ||
Requester.prototype.idle = function() { | ||
return this.q.idle(); | ||
} | ||
requestQueue.push(options); | ||
log.info({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Add in the request queue"}); | ||
/***************************************************************************************** | ||
* | ||
* PRIVATES FUNCTIONS | ||
* | ||
******************************************************************************************/ | ||
}); | ||
} | ||
/** | ||
* Init the queue | ||
* | ||
*/ | ||
Requester.prototype.initQueue = function () { | ||
}); | ||
} | ||
else { | ||
log.info({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Add in the request queue"}); | ||
requestQueue.push(options); | ||
} | ||
var self = this; | ||
this.q = async.queue( | ||
function (options, callback) { | ||
log.debug({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Start Crawling"}); | ||
// If the domain is in the blacklist => don't crawl the url | ||
if (options.domainBlackList.indexOf(URI.domainName(options.url)) > 0) { | ||
log.error({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Domain of the url is in the blacklist"}); | ||
options.onCrawl({code:"DOMAINBLACKLIST"}, options, function(error){ | ||
callback(error); | ||
}); | ||
return; | ||
} | ||
} | ||
// Check if there are some errors for the host & make the appropriate crawl in function of that | ||
store.getStore().getHostErrors(options.url, function(error, errorInfo) { | ||
log.debug({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Check if errors already exist"}); | ||
if (error) { | ||
self.onStoreError(error, options); | ||
return callback(); | ||
} | ||
/** | ||
* @return false if there are some URL waiting to be crawled or being processed in the queue, or true if not. | ||
* | ||
*/ | ||
idle = function() { | ||
return requestQueue.idle(); | ||
} | ||
if (errorInfo) { | ||
self.crawlWithErrors(options, errorInfo, callback); | ||
} | ||
else { | ||
self.crawl(options, callback); | ||
} | ||
}); | ||
}, | ||
self.options.maxConnections); | ||
/***************************************************************************************** | ||
* | ||
* PRIVATES FUNCTIONS | ||
* | ||
******************************************************************************************/ | ||
this.q.drain = function() { | ||
/** | ||
* Create the Request Queue | ||
* | ||
*/ | ||
function createQueue (maxConnections, onDrain) { | ||
requestQueue = async.queue(onUrlToCrawl,maxConnections); | ||
requestQueue.drain = onDrain; | ||
if (self.options.onDrain) { | ||
self.options.onDrain(); | ||
} | ||
} | ||
} | ||
/** | ||
* Stop the crawl if the crawl persistence store provides some errors | ||
* | ||
* | ||
* @param the error provided by the persistence store | ||
* @param the crawl options | ||
* | ||
*/ | ||
Requester.prototype.onStoreError = function(error, options, callback) { | ||
log.error({"url" : options.url , "step" : "queue-resquester.onStoreError", "message" : "Error from the crawl persistence service (crawl canceled for this url) : " + error.code}); | ||
options.onCrawl({code:"STOPCRAWL"}, options, function(error){ | ||
callback(error); | ||
}); | ||
} | ||
function onUrlToCrawl(options, callback) { | ||
/** | ||
* Crawl one url with optionnaly a delay (rate limit) | ||
* | ||
* | ||
* @param the crawl options | ||
* @param the callback used to inform the queue that request is finished | ||
*/ | ||
Requester.prototype.crawl = function(options, callback) { | ||
log.debug({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Start Crawling"}); | ||
// If the domain is in the blacklist => don't crawl the url | ||
if (options.domainBlackList.indexOf(URI.domainName(options.url)) > 0) { | ||
var self = this; | ||
log.error({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Domain of the url is in the blacklist"}); | ||
options.onCrawl({code:"DOMAINBLACKLIST"}, options, function(error){ | ||
process.nextTick(function() {callback(error)}); | ||
}); | ||
return; | ||
} | ||
if (options.rateLimits != 0) { | ||
// Check if there are some errors for the host & make the appropriate crawl in function of that | ||
store.getStore().getHostErrors(options.url, function(error, errorInfo) { | ||
log.debug({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Check if errors already exist"}); | ||
log.error({"url" : options.url, "step" : "queue-resquester.crawl", "message" : "Request with option on ratelimit = " + options.rateLimits}); | ||
setTimeout(function() { | ||
if (error) { | ||
onStoreError(error, options); | ||
return callback(); | ||
} | ||
self.execHttp(options, callback); | ||
if (errorInfo) { | ||
crawlWithErrors(options, errorInfo, callback); | ||
} | ||
else { | ||
crawl(options, callback); | ||
} | ||
}); | ||
}, options.rateLimits); | ||
} | ||
else { | ||
self.execHttp(options, callback); | ||
} | ||
} | ||
/** | ||
* Stop the crawl if the crawl persistence store provides some errors | ||
* | ||
* | ||
* @param the error provided by the persistence store | ||
* @param the crawl options | ||
* | ||
*/ | ||
function onStoreError(error, options, callback) { | ||
log.error({"url" : options.url , "step" : "queue-resquester.onStoreError", "message" : "Error from the crawl persistence service (crawl canceled for this url) : " + error.code}); | ||
options.onCrawl({code:"STOPCRAWL"}, options, function(error){ | ||
process.nextTick(function() {callback(error)}); | ||
}); | ||
} | ||
} | ||
/** | ||
* Crawl an url for a host which has already provided some errors (timout, connection refused, ... ) | ||
* | ||
* | ||
* @param the crawl options | ||
* @param the info on errors | ||
* @param the callback used to inform the queue that request is finished | ||
*/ | ||
Requester.prototype.crawlWithErrors = function(options, errorInfo, callback) { | ||
/** | ||
* Crawl one url with optionnaly a delay (rate limit) | ||
* | ||
* | ||
* @param the crawl options | ||
* @param the callback used to inform the queue that request is finished | ||
*/ | ||
function crawl(options, callback) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Crawl with errors", "options" : "errorInfo"}); | ||
var self = this; | ||
if (options.rateLimits != 0) { | ||
if (errorInfo.stopCrawlOnThisDomain) { | ||
log.error({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Too many errors on the domain - Stop to crawl its URLS"}); | ||
log.error({"url" : options.url, "step" : "queue-resquester.crawl", "message" : "Request with option on ratelimit = " + options.rateLimits}); | ||
setTimeout(function() { | ||
execHttp(options, callback); | ||
options.onCrawl({code:"STOPCRAWL"}, options, function(error) { | ||
callback(error); | ||
}); | ||
}, options.rateLimits); | ||
} | ||
else { | ||
execHttp(options, callback); | ||
} | ||
return; | ||
} | ||
/** | ||
* Crawl an url for a host which has already provided some errors (timout, connection refused, ... ) | ||
* | ||
* | ||
* @param the crawl options | ||
* @param the info on errors | ||
* @param the callback used to inform the queue that request is finished | ||
*/ | ||
function crawlWithErrors(options, errorInfo, callback) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Crawl with errors", "options" : "errorInfo"}); | ||
if (errorInfo.forceRateLimits) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Too many errors on the domain - Force request with rate limit" }); | ||
setTimeout(function() { | ||
self.execHttp(options, callback); | ||
if (errorInfo.stopCrawlOnThisDomain) { | ||
log.error({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Too many errors on the domain - Stop to crawl its URLS"}); | ||
}, options.errorRates[errorInfo.currentRateLimitIndex]); | ||
options.onCrawl({code:"STOPCRAWL"}, options, function(error) { | ||
process.nextTick(function() {callback(error)}); | ||
}); | ||
return; | ||
} | ||
return; | ||
} | ||
//Case of a retry due to a previous http error on the same request | ||
if (options.maxRetries < options.retries) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Retry Request - maxRetries =" + options.maxRetries + " - retries : " + options.retries}); | ||
setTimeout(function() { | ||
self.execHttp(options, callback); | ||
if (errorInfo.forceRateLimits) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Too many errors on the domain - Force request with rate limit" }); | ||
setTimeout(function() { | ||
execHttp(options, callback); | ||
}, options.retryTimeout); | ||
return; | ||
}, options.errorRates[errorInfo.currentRateLimitIndex]); | ||
} | ||
return; | ||
} | ||
log.error({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Invalid Error option - last crawl of the url", "options" : "errorInfo" }); | ||
this.crawl(options, callback); | ||
//Case of a retry due to a previous http error on the same request | ||
if (options.maxRetries < options.retries) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Retry Request - maxRetries =" + options.maxRetries + " - retries : " + options.retries}); | ||
setTimeout(function() { | ||
execHttp(options, callback); | ||
} | ||
}, options.retryTimeout); | ||
return; | ||
/** | ||
* Execute an http request | ||
* | ||
* @param The options to used for the request | ||
* @param callback executed when the request is finished | ||
* | ||
*/ | ||
Requester.prototype.execHttp = function (options, callback) { | ||
var self = this; | ||
if (this.options.proxyList) { | ||
options.proxy = this.options.proxyList.getProxy().getUrl(); | ||
} | ||
log.debug({"url" : options.url, "step" : "queue-requester.execHttp", "message" : "Execute the request"}); | ||
request(options, function(error, result) { | ||
log.debug({"url" : options.url, "step" : "queue-requester.execHttp", "message" : "Execute the request done"}); | ||
if (error) { | ||
self.onRequestError(error, options, result, callback); | ||
} | ||
else { | ||
options.onCrawl(null, result, function(error){ | ||
callback(error); | ||
}); | ||
} | ||
}); | ||
log.error({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Invalid Error option - last crawl of the url", "options" : "errorInfo" }); | ||
crawl(options, callback); | ||
} | ||
} | ||
/** | ||
* Execute an http request | ||
* | ||
* @param The options to used for the request | ||
* @param callback executed when the request is finished | ||
* | ||
*/ | ||
function execHttp(options, callback) { | ||
/** | ||
* Callback used when a Http request generates an error | ||
* | ||
* | ||
* @param The Http error | ||
* @param the crawl options | ||
* @param the HTTP response | ||
* @param callback() | ||
*/ | ||
Requester.prototype.onRequestError = function (error, options, result, callback) { | ||
if (options.proxyList) { | ||
options.proxy = options.proxyList.getProxy().getUrl(); | ||
} | ||
log.debug({"url" : options.url, "step" : "queue-requester.execHttp", "message" : "Execute the request"}); | ||
request(options, function(error, result) { | ||
log.debug({"url" : options.url, "step" : "queue-requester.execHttp", "message" : "Execute the request done"}); | ||
if (error) { | ||
onRequestError(error, options, result, callback); | ||
} | ||
else { | ||
options.onCrawl(null, result, function(error){ | ||
process.nextTick(function() {callback(error)}); | ||
}); | ||
} | ||
//var self = this; | ||
}); | ||
// if the error is a timeout : | ||
// 1. Check the crawl rate and if necessary decrease it for slower skipDuplicates | ||
// 2. Save the error info for the associated host. | ||
// 3. recrawl the url if the maximum of retries is not yet reaches | ||
if (error.code == 'ETIMEDOUT' || error.code == 'ESOCKETTIMEDOUT') { | ||
log.error({"url" : options.url, "step" : "queue-requester.onRequestError", "message" : "Timeout"}); | ||
//var execOnError = async.compose(self.recrawlUrl, self.saveErrorInfo, self.checkCrawlRate); | ||
var execOnError = async.compose(this.recrawlUrl.bind(this), this.saveErrorInfo.bind(this), this.checkCrawlRate.bind(this)); | ||
execOnError({options : options, result : result, error : error,crawler : this}, function(err, params){ | ||
callback(); | ||
}); | ||
return; | ||
} | ||
} | ||
/** | ||
* Callback used when a Http request generates an error | ||
* | ||
* | ||
* @param The Http error | ||
* @param the crawl options | ||
* @param the HTTP response | ||
* @param callback() | ||
*/ | ||
function onRequestError(error, options, result, callback) { | ||
// if it is a connection error, recrawl the url if the maximum of retries is not yet reaches | ||
if (error.code == 'ECONNRESET' || error.code == 'ECONNREFUSED' ) { | ||
log.error({"url" : options.url, "step" : "queue-requester.onRequestError", "message" : "connection refused"}); | ||
this.recrawlUrl({options : options, result : result, error : error, crawler : this}, function(error,params){ | ||
callback(); | ||
// if the error is a timeout : | ||
// 1. Check the crawl rate and if necessary decrease it for slower skipDuplicates | ||
// 2. Save the error info for the associated host. | ||
// 3. recrawl the url if the maximum of retries is not yet reaches | ||
if (error.code == 'ETIMEDOUT' || error.code == 'ESOCKETTIMEDOUT') { | ||
log.error({"url" : options.url, "step" : "queue-requester.onRequestError", "message" : "Timeout"}); | ||
var execOnError = async.compose(recrawlUrl, saveErrorInfo, checkCrawlRate); | ||
execOnError({options : options, result : result, error : error}, function(err, params){ | ||
process.nextTick(function() {callback()}); | ||
}); | ||
return; | ||
} | ||
// For the other kind of errors, just inform the crawler | ||
options.onCrawl(error, result, function(error) { | ||
callback(error); | ||
}); | ||
} | ||
// if it is a connection error, recrawl the url if the maximum of retries is not yet reaches | ||
if (error.code == 'ECONNRESET' || error.code == 'ECONNREFUSED' ) { | ||
log.error({"url" : options.url, "step" : "queue-requester.onRequestError", "message" : "connection refused"}); | ||
recrawlUrl({options : options, result : result, error : error}, function(error,params){ | ||
process.nextTick(function() {callback()}); | ||
}); | ||
return; | ||
} | ||
}; | ||
// For the other kind of errors, just inform the crawler | ||
options.onCrawl(error, result, function(error) { | ||
process.nextTick(function() {callback(error)}); | ||
}); | ||
/** | ||
* In the case of a timeout error, this method is call in order to check | ||
* if it is not necessary to decrease the crawl rate | ||
* | ||
* | ||
* @param the crawl params (options, result, errors, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
Requester.prototype.checkCrawlRate = function (params, callback) { | ||
}; | ||
var self = this; | ||
/** | ||
* In the case of a timeout error, this method is call in order to check | ||
* if it is not necessary to decrease the crawl rate | ||
* | ||
* | ||
* @param the crawl params (options, result, errors, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
function checkCrawlRate(params, callback) { | ||
store.getStore().getHostErrors(params.options.url, function(error, errorInfo) { | ||
store.getStore().getHostErrors(params.options.url, function(error, errorInfo) { | ||
if (error) { | ||
self.onStoreError(error, params.options); | ||
return callback(error); | ||
} | ||
if (error) { | ||
onStoreError(error, params.options); | ||
return callback(error); | ||
} | ||
params.errorInfo = errorInfo; | ||
params.errorInfo = errorInfo; | ||
if (! errorInfo) { | ||
params.errorInfo = { numberOfErrors : 0, currentRateLimitIndex : -1, forceRateLimits : false}; | ||
} | ||
params.errorInfo.numberOfErrors++; | ||
if (! errorInfo) { | ||
params.errorInfo = { numberOfErrors : 0, currentRateLimitIndex : -1, forceRateLimits : false}; | ||
} | ||
params.errorInfo.numberOfErrors++; | ||
if (params.options.maxErrors != -1 && params.errorInfo.numberOfErrors == params.options.maxErrors ) { | ||
self.decreaseCrawlRate(params, callback); | ||
} | ||
else { | ||
log.info({"url" : params.options.url, "step" : "queue-requester.checkCrawlRate", "message" : "Don't decrease rate (number of errors < max number of errors)"}); | ||
callback(null,params); | ||
} | ||
}); | ||
if (params.options.maxErrors != -1 && params.errorInfo.numberOfErrors == params.options.maxErrors ) { | ||
decreaseCrawlRate(params, callback); | ||
} | ||
else { | ||
log.info({"url" : params.options.url, "step" : "queue-requester.checkCrawlRate", "message" : "Don't decrease rate (number of errors < max number of errors)"}); | ||
callback(null,params); | ||
} | ||
}); | ||
} | ||
} | ||
/** | ||
* In the case of a timeout error and if there are too many errors, | ||
* this method will decrease the crawl rate or stop the crawl for the | ||
* associated domain | ||
* | ||
* @param the crawl params (options, result, errors, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
Requester.prototype.decreaseCrawlRate = function(params, callback) { | ||
/** | ||
* In the case of a timeout error and if there are too many errors, | ||
* this method will decrease the crawl rate or stop the crawl for the | ||
* associated domain | ||
* | ||
* @param the crawl params (options, result, errors, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
function decreaseCrawlRate(params, callback) { | ||
params.errorInfo.currentRateLimitIndex++; | ||
// If there is still an available rate limit | ||
if (params.options.errorRates.length > 0 && params.errorInfo.currentRateLimitIndex < params.options.errorRates.length) { | ||
params.errorInfo.numberOfErrors = 0; | ||
params.errorInfo.forceRateLimits = true; | ||
log.warn({"url" : params.options.url, "step" : "queue-requester.decreaseCrawlRate", "message" : "Too many errors, set rateLimits to " + params.options.errorRates[params.errorInfo.currentRateLimitIndex]}); | ||
params.errorInfo.currentRateLimitIndex++; | ||
// If there is still an available rate limit | ||
if (params.options.errorRates.length > 0 && params.errorInfo.currentRateLimitIndex < params.options.errorRates.length) { | ||
params.errorInfo.numberOfErrors = 0; | ||
params.errorInfo.forceRateLimits = true; | ||
log.warn({"url" : params.options.url, "step" : "queue-requester.decreaseCrawlRate", "message" : "Too many errors, set rateLimits to " + params.options.errorRates[params.errorInfo.currentRateLimitIndex]}); | ||
params.options.maxRetries = params.options.retries; | ||
params.options.maxRetries = params.options.retries; | ||
} | ||
// we stop to crawl on this domain if all rate limits have been used | ||
else { | ||
log.error({"url" : params.options.url, "step" : "queue-requester.decreaseCrawlRate", "message" : "Stop crawl domain - all crawl rates done"}); | ||
params.errorInfo.stopCrawlOnThisDomain = true; | ||
} | ||
} | ||
// we stop to crawl on this domain if all rate limits have been used | ||
else { | ||
log.error({"url" : params.options.url, "step" : "queue-requester.decreaseCrawlRate", "message" : "Stop crawl domain - all crawl rates done"}); | ||
params.errorInfo.stopCrawlOnThisDomain = true; | ||
} | ||
callback(null, params); | ||
} | ||
callback(null, params); | ||
} | ||
/** | ||
* Save the error info into the crawl persistence store | ||
* | ||
* | ||
* @param the crawl params (options, result, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
Requester.prototype.saveErrorInfo = function(params, callback) { | ||
/** | ||
* Save the error info into the crawl persistence store | ||
* | ||
* | ||
* @param the crawl params (options, result, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
function saveErrorInfo(params, callback) { | ||
store.getStore().setHostErrors(params.options.url, params.errorInfo, function(error) { | ||
callback(null, params); | ||
}); | ||
store.getStore().setHostErrors(params.options.url, params.errorInfo, function(error) { | ||
callback(null, params); | ||
}); | ||
} | ||
} | ||
/** | ||
* Recrawl an url if the maximum of retries is no yet fetch | ||
* | ||
* | ||
* @param the crawl params (options, result, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
Requester.prototype.recrawlUrl = function(params, callback) { | ||
/** | ||
* Recrawl an url if the maximum of retries is no yet fetch | ||
* | ||
* | ||
* @param the crawl params (options, result, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
function recrawlUrl(params, callback) { | ||
if (params.result.maxRetries > 1) { | ||
log.warn({"url" : params.options.url, "step" : "queue-requester.recrawlUrl", "message" : "Recrawl"}); | ||
params.result.maxRetries--; | ||
if (params.result.maxRetries > 1) { | ||
log.warn({"url" : params.options.url, "step" : "queue-requester.recrawlUrl", "message" : "Recrawl"}); | ||
params.result.maxRetries--; | ||
//TODO : async this code | ||
store.getStore().removeFromHistory(params.result.url); | ||
params.crawler.queue(params.result); | ||
//TODO : async this code | ||
store.getStore().removeFromHistory(params.result.url); | ||
queue(params.result); | ||
callback(null, params); | ||
} | ||
else { | ||
log.warn({"url" : params.result.url, "step" : "queue-requester.recrawlUrl", "message" : "Don't recrawl - end of retries"}); | ||
params.options.onCrawl(params.error, params.result, function(error){ | ||
callback(error, params); | ||
}); | ||
callback(null, params); | ||
} | ||
else { | ||
log.warn({"url" : params.result.url, "step" : "queue-requester.recrawlUrl", "message" : "Don't recrawl - end of retries"}); | ||
params.options.onCrawl(params.error, params.result, function(error){ | ||
callback(error, params); | ||
}); | ||
} | ||
} | ||
} | ||
module.exports.init = init; | ||
module.exports.queue = queue; | ||
module.exports.idle = idle; | ||
module.exports.Requester = Requester; | ||
}()); |
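The large hunk above rewrites the queue requester from a Requester prototype into an IIFE that keeps the async queue as module state and exports only init, queue and idle. A minimal usage sketch, assuming the option names visible in the hunk (onCrawl, skipDuplicates, the drain callback); the URL and connection count are placeholders:

// Hypothetical caller of the new module-level API (module.exports.init/queue/idle).
var requester = require("./lib/queue-requester");

// Create the internal async.queue with a concurrency of 10 and a drain callback.
requester.init(10, function onDrain() {
  console.log("All queued URLs have been crawled");
});

// Push a URL; onCrawl is invoked with (error, result, done) when the request ends.
requester.queue({
  url: "http://example.com/",
  skipDuplicates: true,
  onCrawl: function (error, result, done) { done(error); }
});

// idle() is true when nothing is waiting or being processed in the queue.
console.log("Queue idle?", requester.idle());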
{ | ||
"name": "crawler-ninja", | ||
"version": "0.1.12", | ||
"version": "0.1.13", | ||
"description": "A web crawler made for the SEO based on plugins. Please wait or contribute ... still in beta", | ||
@@ -11,3 +11,2 @@ "main": "index.js", | ||
"dependencies": { | ||
"crawler-ninja-logger" : "*", | ||
"URIjs": "*", | ||
@@ -17,3 +16,5 @@ "async": " *", | ||
"collections": "*", | ||
"crawler-ninja-logger": "*", | ||
"crypto": "*", | ||
"heapdump": "^0.3.7", | ||
"request": "*", | ||
@@ -20,0 +21,0 @@ "riak-js": "*", |
@@ -474,1 +474,4 @@ Crawler Ninja | ||
- Add an empty plugin sample. See the js file : /plugins/empty-plugin.js | ||
0.1.13 | ||
- Experiments for better memory management
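Two patterns recur throughout the hunks that back this changelog entry: large result objects are explicitly nulled once they have been processed (see the index.js waterfall callback), and queue callbacks are deferred with process.nextTick so the current stack can unwind. A minimal sketch of the combination, assuming nothing beyond those two ideas (doWork and processResult are illustrative names, not part of the package):

// Illustrative only: drop the reference to a large result and defer the callback.
function doWork(result, done) {
  // ... analyse the crawled page ...
  done(null);
}

function processResult(result, callback) {
  doWork(result, function (error) {
    result = null;                   // release the (potentially large) result object
    process.nextTick(function () {   // defer so the current call stack can unwind
      callback(error);
    });
  });
}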
var assert = require("assert"); | ||
var crawler = require("../index.js"); | ||
var crawler = require("../index.js"); | ||
var cs = require("../plugins/console-plugin.js"); | ||
var testSite = require("./website-2/start.js").site; | ||
var heapdump = require('heapdump'); | ||
var proxyList = null; | ||
describe('Proxies', function() { | ||
describe('Memory leaks', function() { | ||
it.only('should crawl without memory leaks', function(done) { | ||
this.timeout(3000000); | ||
var c = new crawler.Crawler({skipDuplicates: false}); | ||
it.skip('should crawl without memory leaks', function(done) { | ||
this.timeout(3000000); | ||
setInterval(function(){ | ||
console.log(">>>>> Dump !"); | ||
heapdump.writeSnapshot('./dump/dump' + Date.now() + '.heapsnapshot'); | ||
}, 120000); | ||
var options = { | ||
skipDuplicates: true, | ||
scripts : false, | ||
links : false, | ||
image : false, | ||
maxConnections : 100 | ||
} | ||
var c = new crawler.Crawler(options); | ||
var consolePlugin = new cs.Plugin(); | ||
@@ -22,12 +37,6 @@ c.registerPlugin(consolePlugin); | ||
/* | ||
c.on("error", function(error, result) { | ||
assert(_.find(result.proxyList.getProxies(), function(p){ return p.getUrl()=== result.proxy; })); | ||
c.queue({url : "http://www.rtbf.be/", externalDomains: false}); | ||
}); | ||
*/ | ||
c.queue({url : "http://localhost:9999/internal-links.html"}); | ||
}); | ||
}); |
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
+ Added heapdump@^0.3.7
+ Added heapdump@0.3.15 (transitive)