crawler-ninja
Comparing version 0.1.12 to 0.1.13
index.js
@@ -7,4 +7,2 @@ var events = require('events'); | ||
var log = require("crawler-ninja-logger").Logger; | ||
var Map = require("collections/fast-map"); | ||
var Set = require("collections/fast-set"); | ||
var requester = require("./lib/queue-requester"); | ||
@@ -16,4 +14,2 @@ var URI = require('./lib/uri.js'); | ||
var domainBlackList = require("./default-lists/domain-black-list.js").list(); | ||
@@ -109,3 +105,3 @@ var suffixBlackList = require("./default-lists/suffix-black-list.js").list(); | ||
this.httpRequester = new requester.Requester(this.config); | ||
requester.init(this.config.maxConnections, this.config.onDrain); | ||
@@ -134,3 +130,3 @@ events.EventEmitter.call(this); | ||
function(error){ | ||
if (self.httpRequester.idle()) { | ||
if (requester.idle()) { | ||
self.config.onDrain(); | ||
@@ -158,3 +154,3 @@ } | ||
store.getStore().addStartUrl(options, function(error) { | ||
self.httpRequester.queue(addDefaultOptions({uri:options, url:options}, self.config)); | ||
requester.queue(addDefaultOptions({uri:options, url:options}, self.config)); | ||
}); | ||
@@ -170,3 +166,3 @@ | ||
function(error){ | ||
if (self.httpRequester.idle()) { | ||
if (requester.idle()) { | ||
self.config.onDrain(); | ||
@@ -180,3 +176,3 @@ } | ||
store.getStore().addStartUrl(_.has(options, "url") ? options.url : options.uri, function(error) { | ||
self.httpRequester.queue(addDefaultOptions(options, self.config)); | ||
requester.queue(addDefaultOptions(options, self.config)); | ||
}); | ||
@@ -318,3 +314,6 @@ } | ||
async.apply(self.applyRedirect.bind(self), result), | ||
], callback); | ||
], function(error) { | ||
result = null; | ||
callback(error); | ||
}); | ||
@@ -333,3 +332,3 @@ } | ||
this.pm.crawlRedirect(from, to, result.statusCode, function(){ | ||
self.httpRequester.queue(self.buildNewOptions(result,to)); | ||
requester.queue(self.buildNewOptions(result,to)); | ||
callback(); | ||
@@ -558,3 +557,3 @@ }); | ||
if (toCrawl && (result.depthLimit == -1 || currentDepth <= result.depthLimit)) { | ||
self.httpRequester.queue(self.buildNewOptions(result,linkUri)); | ||
requester.queue(self.buildNewOptions(result,linkUri)); | ||
callback(); | ||
@@ -561,0 +560,0 @@ } |
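The index.js hunks above replace the per-crawler `new requester.Requester(this.config)` instance with a module-level requester that is initialised once and then driven through plain functions. A minimal sketch of the new calling pattern as it can be read from the diff; the Crawler wrapper below is illustrative, and only requester.init / requester.queue / requester.idle and the maxConnections / onDrain option names come from the hunks:

// Sketch of the 0.1.13 call pattern inferred from the index.js hunks above.
// The requester is now a shared module rather than a per-crawler instance.
var requester = require("./lib/queue-requester");

function Crawler(config) {
  this.config = config;
  // 0.1.12: this.httpRequester = new requester.Requester(this.config);
  // 0.1.13: initialise the shared queue once with the relevant options.
  requester.init(config.maxConnections, config.onDrain);
}

Crawler.prototype.queue = function (options) {
  // URLs are now pushed through the module-level queue() function.
  requester.queue(options);
};

Crawler.prototype.isIdle = function () {
  // idle() reports whether the shared async queue still has pending work.
  return requester.idle();
};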
@@ -33,4 +33,2 @@ /** | ||
var requestArgs = ['uri','url','qs','method','headers','body','form','json','multipart','followRedirect', | ||
@@ -42,8 +40,7 @@ 'followAllRedirects', 'maxRedirects','encoding','pool','timeout','proxy','auth','oauth','strictSSL', | ||
var start = new Date(); | ||
var req = request(_.pick.apply(this,[options].concat(requestArgs)), function(error,response) { | ||
request(_.pick.apply(this,[options].concat(requestArgs)), function(error,response) { | ||
var end = new Date() - start; | ||
if (error) { | ||
return onContent(error, endCallback, options); | ||
return endCallback({code: error.code}, options); | ||
} | ||
@@ -68,3 +65,3 @@ | ||
if (error) { | ||
result = null; | ||
return onContent(error, endCallback, options); | ||
@@ -87,6 +84,2 @@ } | ||
if (error) { | ||
return endCallback({code: error.code}, options); | ||
} | ||
// Sorry for this hack but that's solve some issue with Cheerio | ||
@@ -93,0 +86,0 @@ if (!result.body) { |
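The hunk above (apparently lib/http-request.js, the helper the queue requester loads as ./http-request.js) keeps the pattern of whitelisting which crawl options are forwarded to the underlying request call via underscore's _.pick, and on failure now hands back only {code: error.code} instead of routing the error through onContent. A hedged sketch of that option-filtering idea; the requestArgs list below is abridged and execRequest is an illustrative name:

// Sketch: forward only whitelisted option names to request(), as the diff
// does with _.pick.apply(this, [options].concat(requestArgs)).
var request = require("request");
var _ = require("underscore");

// Abridged version of the requestArgs whitelist shown in the hunk.
var requestArgs = ["uri", "url", "method", "headers", "timeout", "proxy", "auth"];

function execRequest(options, endCallback) {
  request(_.pick.apply(this, [options].concat(requestArgs)), function (error, response) {
    if (error) {
      // 0.1.13 reports a lightweight {code: ...} object rather than the full
      // Error, so the caller can release the original objects sooner.
      return endCallback({ code: error.code }, options);
    }
    endCallback(null, response);
  });
}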
@@ -7,3 +7,3 @@ var Map = require("collections/fast-map"); | ||
this.plugins = new Map(); | ||
this.pluginCounter = 0; | ||
} | ||
@@ -10,0 +10,0 @@ |
@@ -1,411 +0,408 @@ | ||
var async = require('async'); | ||
var _ = require('underscore'); | ||
var URI = require("./uri.js"); | ||
var request = require("./http-request.js"); | ||
var log = require("crawler-ninja-logger").Logger; | ||
var store = require("../lib/store/store.js"); | ||
/** | ||
* The Request Queue | ||
* | ||
* its main job is to make the http requests & analyze the responses | ||
* its main job is make the http requests & analyze the responses | ||
* It is used an internal queue to limit the number of workers | ||
* | ||
* @param the options to use to make the requests. | ||
* | ||
*/ | ||
var Requester = function(options) { | ||
var async = require('async'); | ||
var _ = require('underscore'); | ||
var log = require("crawler-ninja-logger").Logger; | ||
var URI = require("./uri.js"); | ||
var request = require("./http-request.js"); | ||
var store = require("../lib/store/store.js"); | ||
this.options = options; | ||
this.initQueue(); | ||
} | ||
/** | ||
* Add a new url to crawl in the queue. | ||
* Check the desired options and add it to a request queue | ||
* | ||
* @param the options used to crawl the url | ||
* | ||
*/ | ||
Requester.prototype.queue = function (options) { | ||
(function () { | ||
var self = this; | ||
// Up to you to use uri or url. | ||
if (options.uri) { | ||
options.url = options.uri; | ||
} | ||
else { | ||
options.uri = options.url; | ||
} | ||
var requestQueue = {}; | ||
// if skipDuplicates, don't crawl twice the same uri | ||
if (this.options.skipDuplicates) { | ||
store.getStore().isInCrawlHistory(options.uri, function(error, isInCrawlHistory) { | ||
if (isInCrawlHistory) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Don't crawl this url - Option skipDuplicates=true & the url has already been crawled" }); | ||
} | ||
else { | ||
store.getStore().addInHistory(options.uri,function(error) { | ||
self.q.push(options); | ||
log.info({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Add in the request queue"}); | ||
}); | ||
} | ||
/** | ||
* Init the Queue Requester | ||
* | ||
* | ||
* @param The number of task/connection that the request queu can start in parallel | ||
* @param the callback executes when all task (url to cralw) are completed | ||
* | ||
*/ | ||
}); | ||
} | ||
else { | ||
log.info({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Add in the request queue"}); | ||
this.q.push(options); | ||
} | ||
function init (maxConnections, onDrain) { | ||
createQueue(maxConnections, onDrain); | ||
} | ||
/** | ||
* Add a new url to crawl in the queue. | ||
* Check the desired options and add it to a request queue | ||
* | ||
* @param the options used to crawl the url | ||
* | ||
*/ | ||
function queue(options) { | ||
} | ||
// Up to you to use uri or url. | ||
if (options.uri) { | ||
options.url = options.uri; | ||
} | ||
else { | ||
options.uri = options.url; | ||
} | ||
// if skipDuplicates, don't crawl twice the same uri | ||
if (options.skipDuplicates) { | ||
store.getStore().isInCrawlHistory(options.uri, function(error, isInCrawlHistory) { | ||
if (isInCrawlHistory) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Don't crawl this url - Option skipDuplicates=true & the url has already been crawled" }); | ||
} | ||
else { | ||
store.getStore().addInHistory(options.uri,function(error) { | ||
/** | ||
* @return false if there are items waiting or being processed in the queue, or true if not. | ||
* | ||
*/ | ||
Requester.prototype.idle = function() { | ||
return this.q.idle(); | ||
} | ||
requestQueue.push(options); | ||
log.info({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Add in the request queue"}); | ||
/***************************************************************************************** | ||
* | ||
* PRIVATES FUNCTIONS | ||
* | ||
******************************************************************************************/ | ||
}); | ||
} | ||
/** | ||
* Init the queue | ||
* | ||
*/ | ||
Requester.prototype.initQueue = function () { | ||
}); | ||
} | ||
else { | ||
log.info({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Add in the request queue"}); | ||
requestQueue.push(options); | ||
} | ||
var self = this; | ||
this.q = async.queue( | ||
function (options, callback) { | ||
log.debug({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Start Crawling"}); | ||
// If the domain is in the blacklist => don't crawl the url | ||
if (options.domainBlackList.indexOf(URI.domainName(options.url)) > 0) { | ||
log.error({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Domain of the url is in the blacklist"}); | ||
options.onCrawl({code:"DOMAINBLACKLIST"}, options, function(error){ | ||
callback(error); | ||
}); | ||
return; | ||
} | ||
} | ||
// Check if there are some errors for the host & make the appropriate crawl in function of that | ||
store.getStore().getHostErrors(options.url, function(error, errorInfo) { | ||
log.debug({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Check if errors already exist"}); | ||
if (error) { | ||
self.onStoreError(error, options); | ||
return callback(); | ||
} | ||
/** | ||
* @return false if there are some URL waiting to be crawled or being processed in the queue, or true if not. | ||
* | ||
*/ | ||
idle = function() { | ||
return requestQueue.idle(); | ||
} | ||
if (errorInfo) { | ||
self.crawlWithErrors(options, errorInfo, callback); | ||
} | ||
else { | ||
self.crawl(options, callback); | ||
} | ||
}); | ||
}, | ||
self.options.maxConnections); | ||
/***************************************************************************************** | ||
* | ||
* PRIVATES FUNCTIONS | ||
* | ||
******************************************************************************************/ | ||
this.q.drain = function() { | ||
/** | ||
* Create the Request Queue | ||
* | ||
*/ | ||
function createQueue (maxConnections, onDrain) { | ||
requestQueue = async.queue(onUrlToCrawl,maxConnections); | ||
requestQueue.drain = onDrain; | ||
if (self.options.onDrain) { | ||
self.options.onDrain(); | ||
} | ||
} | ||
} | ||
/** | ||
* Stop the crawl if the crawl persistence store provides some errors | ||
* | ||
* | ||
* @param the error provided by the persistence store | ||
* @param the crawl options | ||
* | ||
*/ | ||
Requester.prototype.onStoreError = function(error, options, callback) { | ||
log.error({"url" : options.url , "step" : "queue-resquester.onStoreError", "message" : "Error from the crawl persistence service (crawl canceled for this url) : " + error.code}); | ||
options.onCrawl({code:"STOPCRAWL"}, options, function(error){ | ||
callback(error); | ||
}); | ||
} | ||
function onUrlToCrawl(options, callback) { | ||
/** | ||
* Crawl one url with optionnaly a delay (rate limit) | ||
* | ||
* | ||
* @param the crawl options | ||
* @param the callback used to inform the queue that request is finished | ||
*/ | ||
Requester.prototype.crawl = function(options, callback) { | ||
log.debug({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Start Crawling"}); | ||
// If the domain is in the blacklist => don't crawl the url | ||
if (options.domainBlackList.indexOf(URI.domainName(options.url)) > 0) { | ||
var self = this; | ||
log.error({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Domain of the url is in the blacklist"}); | ||
options.onCrawl({code:"DOMAINBLACKLIST"}, options, function(error){ | ||
process.nextTick(function() {callback(error)}); | ||
}); | ||
return; | ||
} | ||
if (options.rateLimits != 0) { | ||
// Check if there are some errors for the host & make the appropriate crawl in function of that | ||
store.getStore().getHostErrors(options.url, function(error, errorInfo) { | ||
log.debug({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Check if errors already exist"}); | ||
log.error({"url" : options.url, "step" : "queue-resquester.crawl", "message" : "Request with option on ratelimit = " + options.rateLimits}); | ||
setTimeout(function() { | ||
if (error) { | ||
onStoreError(error, options); | ||
return callback(); | ||
} | ||
self.execHttp(options, callback); | ||
if (errorInfo) { | ||
crawlWithErrors(options, errorInfo, callback); | ||
} | ||
else { | ||
crawl(options, callback); | ||
} | ||
}); | ||
}, options.rateLimits); | ||
} | ||
else { | ||
self.execHttp(options, callback); | ||
} | ||
} | ||
/** | ||
* Stop the crawl if the crawl persistence store provides some errors | ||
* | ||
* | ||
* @param the error provided by the persistence store | ||
* @param the crawl options | ||
* | ||
*/ | ||
function onStoreError(error, options, callback) { | ||
log.error({"url" : options.url , "step" : "queue-resquester.onStoreError", "message" : "Error from the crawl persistence service (crawl canceled for this url) : " + error.code}); | ||
options.onCrawl({code:"STOPCRAWL"}, options, function(error){ | ||
process.nextTick(function() {callback(error)}); | ||
}); | ||
} | ||
} | ||
/** | ||
* Crawl an url for a host which has already provided some errors (timout, connection refused, ... ) | ||
* | ||
* | ||
* @param the crawl options | ||
* @param the info on errors | ||
* @param the callback used to inform the queue that request is finished | ||
*/ | ||
Requester.prototype.crawlWithErrors = function(options, errorInfo, callback) { | ||
/** | ||
* Crawl one url with optionnaly a delay (rate limit) | ||
* | ||
* | ||
* @param the crawl options | ||
* @param the callback used to inform the queue that request is finished | ||
*/ | ||
function crawl(options, callback) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Crawl with errors", "options" : "errorInfo"}); | ||
var self = this; | ||
if (options.rateLimits != 0) { | ||
if (errorInfo.stopCrawlOnThisDomain) { | ||
log.error({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Too many errors on the domain - Stop to crawl its URLS"}); | ||
log.error({"url" : options.url, "step" : "queue-resquester.crawl", "message" : "Request with option on ratelimit = " + options.rateLimits}); | ||
setTimeout(function() { | ||
execHttp(options, callback); | ||
options.onCrawl({code:"STOPCRAWL"}, options, function(error) { | ||
callback(error); | ||
}); | ||
}, options.rateLimits); | ||
} | ||
else { | ||
execHttp(options, callback); | ||
} | ||
return; | ||
} | ||
/** | ||
* Crawl an url for a host which has already provided some errors (timout, connection refused, ... ) | ||
* | ||
* | ||
* @param the crawl options | ||
* @param the info on errors | ||
* @param the callback used to inform the queue that request is finished | ||
*/ | ||
function crawlWithErrors(options, errorInfo, callback) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Crawl with errors", "options" : "errorInfo"}); | ||
if (errorInfo.forceRateLimits) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Too many errors on the domain - Force request with rate limit" }); | ||
setTimeout(function() { | ||
self.execHttp(options, callback); | ||
if (errorInfo.stopCrawlOnThisDomain) { | ||
log.error({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Too many errors on the domain - Stop to crawl its URLS"}); | ||
}, options.errorRates[errorInfo.currentRateLimitIndex]); | ||
options.onCrawl({code:"STOPCRAWL"}, options, function(error) { | ||
process.nextTick(function() {callback(error)}); | ||
}); | ||
return; | ||
} | ||
return; | ||
} | ||
//Case of a retry due to a previous http error on the same request | ||
if (options.maxRetries < options.retries) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Retry Request - maxRetries =" + options.maxRetries + " - retries : " + options.retries}); | ||
setTimeout(function() { | ||
self.execHttp(options, callback); | ||
if (errorInfo.forceRateLimits) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Too many errors on the domain - Force request with rate limit" }); | ||
setTimeout(function() { | ||
execHttp(options, callback); | ||
}, options.retryTimeout); | ||
return; | ||
}, options.errorRates[errorInfo.currentRateLimitIndex]); | ||
} | ||
return; | ||
} | ||
log.error({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Invalid Error option - last crawl of the url", "options" : "errorInfo" }); | ||
this.crawl(options, callback); | ||
//Case of a retry due to a previous http error on the same request | ||
if (options.maxRetries < options.retries) { | ||
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Retry Request - maxRetries =" + options.maxRetries + " - retries : " + options.retries}); | ||
setTimeout(function() { | ||
execHttp(options, callback); | ||
} | ||
}, options.retryTimeout); | ||
return; | ||
/** | ||
* Execute an http request | ||
* | ||
* @param The options to used for the request | ||
* @param callback executed when the request is finished | ||
* | ||
*/ | ||
Requester.prototype.execHttp = function (options, callback) { | ||
var self = this; | ||
if (this.options.proxyList) { | ||
options.proxy = this.options.proxyList.getProxy().getUrl(); | ||
} | ||
log.debug({"url" : options.url, "step" : "queue-requester.execHttp", "message" : "Execute the request"}); | ||
request(options, function(error, result) { | ||
log.debug({"url" : options.url, "step" : "queue-requester.execHttp", "message" : "Execute the request done"}); | ||
if (error) { | ||
self.onRequestError(error, options, result, callback); | ||
} | ||
else { | ||
options.onCrawl(null, result, function(error){ | ||
callback(error); | ||
}); | ||
} | ||
}); | ||
log.error({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Invalid Error option - last crawl of the url", "options" : "errorInfo" }); | ||
crawl(options, callback); | ||
} | ||
} | ||
/** | ||
* Execute an http request | ||
* | ||
* @param The options to used for the request | ||
* @param callback executed when the request is finished | ||
* | ||
*/ | ||
function execHttp(options, callback) { | ||
/** | ||
* Callback used when a Http request generates an error | ||
* | ||
* | ||
* @param The Http error | ||
* @param the crawl options | ||
* @param the HTTP response | ||
* @param callback() | ||
*/ | ||
Requester.prototype.onRequestError = function (error, options, result, callback) { | ||
if (options.proxyList) { | ||
options.proxy = options.proxyList.getProxy().getUrl(); | ||
} | ||
log.debug({"url" : options.url, "step" : "queue-requester.execHttp", "message" : "Execute the request"}); | ||
request(options, function(error, result) { | ||
log.debug({"url" : options.url, "step" : "queue-requester.execHttp", "message" : "Execute the request done"}); | ||
if (error) { | ||
onRequestError(error, options, result, callback); | ||
} | ||
else { | ||
options.onCrawl(null, result, function(error){ | ||
process.nextTick(function() {callback(error)}); | ||
}); | ||
} | ||
//var self = this; | ||
}); | ||
// if the error is a timeout : | ||
// 1. Check the crawl rate and if necessary decrease it for slower skipDuplicates | ||
// 2. Save the error info for the associated host. | ||
// 3. recrawl the url if the maximum of retries is not yet reaches | ||
if (error.code == 'ETIMEDOUT' || error.code == 'ESOCKETTIMEDOUT') { | ||
log.error({"url" : options.url, "step" : "queue-requester.onRequestError", "message" : "Timeout"}); | ||
//var execOnError = async.compose(self.recrawlUrl, self.saveErrorInfo, self.checkCrawlRate); | ||
var execOnError = async.compose(this.recrawlUrl.bind(this), this.saveErrorInfo.bind(this), this.checkCrawlRate.bind(this)); | ||
execOnError({options : options, result : result, error : error,crawler : this}, function(err, params){ | ||
callback(); | ||
}); | ||
return; | ||
} | ||
} | ||
/** | ||
* Callback used when a Http request generates an error | ||
* | ||
* | ||
* @param The Http error | ||
* @param the crawl options | ||
* @param the HTTP response | ||
* @param callback() | ||
*/ | ||
function onRequestError(error, options, result, callback) { | ||
// if it is a connection error, recrawl the url if the maximum of retries is not yet reaches | ||
if (error.code == 'ECONNRESET' || error.code == 'ECONNREFUSED' ) { | ||
log.error({"url" : options.url, "step" : "queue-requester.onRequestError", "message" : "connection refused"}); | ||
this.recrawlUrl({options : options, result : result, error : error, crawler : this}, function(error,params){ | ||
callback(); | ||
// if the error is a timeout : | ||
// 1. Check the crawl rate and if necessary decrease it for slower skipDuplicates | ||
// 2. Save the error info for the associated host. | ||
// 3. recrawl the url if the maximum of retries is not yet reaches | ||
if (error.code == 'ETIMEDOUT' || error.code == 'ESOCKETTIMEDOUT') { | ||
log.error({"url" : options.url, "step" : "queue-requester.onRequestError", "message" : "Timeout"}); | ||
var execOnError = async.compose(recrawlUrl, saveErrorInfo, checkCrawlRate); | ||
execOnError({options : options, result : result, error : error}, function(err, params){ | ||
process.nextTick(function() {callback()}); | ||
}); | ||
return; | ||
} | ||
// For the other kind of errors, just inform the crawler | ||
options.onCrawl(error, result, function(error) { | ||
callback(error); | ||
}); | ||
} | ||
// if it is a connection error, recrawl the url if the maximum of retries is not yet reaches | ||
if (error.code == 'ECONNRESET' || error.code == 'ECONNREFUSED' ) { | ||
log.error({"url" : options.url, "step" : "queue-requester.onRequestError", "message" : "connection refused"}); | ||
recrawlUrl({options : options, result : result, error : error}, function(error,params){ | ||
process.nextTick(function() {callback()}); | ||
}); | ||
return; | ||
} | ||
}; | ||
// For the other kind of errors, just inform the crawler | ||
options.onCrawl(error, result, function(error) { | ||
process.nextTick(function() {callback(error)}); | ||
}); | ||
/** | ||
* In the case of a timeout error, this method is call in order to check | ||
* if it is not necessary to decrease the crawl rate | ||
* | ||
* | ||
* @param the crawl params (options, result, errors, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
Requester.prototype.checkCrawlRate = function (params, callback) { | ||
}; | ||
var self = this; | ||
/** | ||
* In the case of a timeout error, this method is call in order to check | ||
* if it is not necessary to decrease the crawl rate | ||
* | ||
* | ||
* @param the crawl params (options, result, errors, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
function checkCrawlRate(params, callback) { | ||
store.getStore().getHostErrors(params.options.url, function(error, errorInfo) { | ||
store.getStore().getHostErrors(params.options.url, function(error, errorInfo) { | ||
if (error) { | ||
self.onStoreError(error, params.options); | ||
return callback(error); | ||
} | ||
if (error) { | ||
onStoreError(error, params.options); | ||
return callback(error); | ||
} | ||
params.errorInfo = errorInfo; | ||
params.errorInfo = errorInfo; | ||
if (! errorInfo) { | ||
params.errorInfo = { numberOfErrors : 0, currentRateLimitIndex : -1, forceRateLimits : false}; | ||
} | ||
params.errorInfo.numberOfErrors++; | ||
if (! errorInfo) { | ||
params.errorInfo = { numberOfErrors : 0, currentRateLimitIndex : -1, forceRateLimits : false}; | ||
} | ||
params.errorInfo.numberOfErrors++; | ||
if (params.options.maxErrors != -1 && params.errorInfo.numberOfErrors == params.options.maxErrors ) { | ||
self.decreaseCrawlRate(params, callback); | ||
} | ||
else { | ||
log.info({"url" : params.options.url, "step" : "queue-requester.checkCrawlRate", "message" : "Don't decrease rate (number of errors < max number of errors)"}); | ||
callback(null,params); | ||
} | ||
}); | ||
if (params.options.maxErrors != -1 && params.errorInfo.numberOfErrors == params.options.maxErrors ) { | ||
decreaseCrawlRate(params, callback); | ||
} | ||
else { | ||
log.info({"url" : params.options.url, "step" : "queue-requester.checkCrawlRate", "message" : "Don't decrease rate (number of errors < max number of errors)"}); | ||
callback(null,params); | ||
} | ||
}); | ||
} | ||
} | ||
/** | ||
* In the case of a timeout error and if there are too many errors, | ||
* this method will decrease the crawl rate or stop the crawl for the | ||
* associated domain | ||
* | ||
* @param the crawl params (options, result, errors, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
Requester.prototype.decreaseCrawlRate = function(params, callback) { | ||
/** | ||
* In the case of a timeout error and if there are too many errors, | ||
* this method will decrease the crawl rate or stop the crawl for the | ||
* associated domain | ||
* | ||
* @param the crawl params (options, result, errors, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
function decreaseCrawlRate(params, callback) { | ||
params.errorInfo.currentRateLimitIndex++; | ||
// If there is still an available rate limit | ||
if (params.options.errorRates.length > 0 && params.errorInfo.currentRateLimitIndex < params.options.errorRates.length) { | ||
params.errorInfo.numberOfErrors = 0; | ||
params.errorInfo.forceRateLimits = true; | ||
log.warn({"url" : params.options.url, "step" : "queue-requester.decreaseCrawlRate", "message" : "Too many errors, set rateLimits to " + params.options.errorRates[params.errorInfo.currentRateLimitIndex]}); | ||
params.errorInfo.currentRateLimitIndex++; | ||
// If there is still an available rate limit | ||
if (params.options.errorRates.length > 0 && params.errorInfo.currentRateLimitIndex < params.options.errorRates.length) { | ||
params.errorInfo.numberOfErrors = 0; | ||
params.errorInfo.forceRateLimits = true; | ||
log.warn({"url" : params.options.url, "step" : "queue-requester.decreaseCrawlRate", "message" : "Too many errors, set rateLimits to " + params.options.errorRates[params.errorInfo.currentRateLimitIndex]}); | ||
params.options.maxRetries = params.options.retries; | ||
params.options.maxRetries = params.options.retries; | ||
} | ||
// we stop to crawl on this domain if all rate limits have been used | ||
else { | ||
log.error({"url" : params.options.url, "step" : "queue-requester.decreaseCrawlRate", "message" : "Stop crawl domain - all crawl rates done"}); | ||
params.errorInfo.stopCrawlOnThisDomain = true; | ||
} | ||
} | ||
// we stop to crawl on this domain if all rate limits have been used | ||
else { | ||
log.error({"url" : params.options.url, "step" : "queue-requester.decreaseCrawlRate", "message" : "Stop crawl domain - all crawl rates done"}); | ||
params.errorInfo.stopCrawlOnThisDomain = true; | ||
} | ||
callback(null, params); | ||
} | ||
callback(null, params); | ||
} | ||
/** | ||
* Save the error info into the crawl persistence store | ||
* | ||
* | ||
* @param the crawl params (options, result, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
Requester.prototype.saveErrorInfo = function(params, callback) { | ||
/** | ||
* Save the error info into the crawl persistence store | ||
* | ||
* | ||
* @param the crawl params (options, result, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
function saveErrorInfo(params, callback) { | ||
store.getStore().setHostErrors(params.options.url, params.errorInfo, function(error) { | ||
callback(null, params); | ||
}); | ||
store.getStore().setHostErrors(params.options.url, params.errorInfo, function(error) { | ||
callback(null, params); | ||
}); | ||
} | ||
} | ||
/** | ||
* Recrawl an url if the maximum of retries is no yet fetch | ||
* | ||
* | ||
* @param the crawl params (options, result, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
Requester.prototype.recrawlUrl = function(params, callback) { | ||
/** | ||
* Recrawl an url if the maximum of retries is no yet fetch | ||
* | ||
* | ||
* @param the crawl params (options, result, errorInfo) | ||
* @param callback(error, params) | ||
*/ | ||
function recrawlUrl(params, callback) { | ||
if (params.result.maxRetries > 1) { | ||
log.warn({"url" : params.options.url, "step" : "queue-requester.recrawlUrl", "message" : "Recrawl"}); | ||
params.result.maxRetries--; | ||
if (params.result.maxRetries > 1) { | ||
log.warn({"url" : params.options.url, "step" : "queue-requester.recrawlUrl", "message" : "Recrawl"}); | ||
params.result.maxRetries--; | ||
//TODO : async this code | ||
store.getStore().removeFromHistory(params.result.url); | ||
params.crawler.queue(params.result); | ||
//TODO : async this code | ||
store.getStore().removeFromHistory(params.result.url); | ||
queue(params.result); | ||
callback(null, params); | ||
} | ||
else { | ||
log.warn({"url" : params.result.url, "step" : "queue-requester.recrawlUrl", "message" : "Don't recrawl - end of retries"}); | ||
params.options.onCrawl(params.error, params.result, function(error){ | ||
callback(error, params); | ||
}); | ||
callback(null, params); | ||
} | ||
else { | ||
log.warn({"url" : params.result.url, "step" : "queue-requester.recrawlUrl", "message" : "Don't recrawl - end of retries"}); | ||
params.options.onCrawl(params.error, params.result, function(error){ | ||
callback(error, params); | ||
}); | ||
} | ||
} | ||
} | ||
module.exports.init = init; | ||
module.exports.queue = queue; | ||
module.exports.idle = idle; | ||
module.exports.Requester = Requester; | ||
}()); |
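The large hunk above rewrites the queue requester from a Requester prototype into an IIFE that keeps the async queue as module state and exports only init, queue and idle. A minimal usage sketch, assuming the option names visible in the hunk (onCrawl, skipDuplicates, the drain callback); the URL and connection count are placeholders:

// Hypothetical caller of the new module-level API (module.exports.init/queue/idle).
var requester = require("./lib/queue-requester");

// Create the internal async.queue with a concurrency of 10 and a drain callback.
requester.init(10, function onDrain() {
  console.log("All queued URLs have been crawled");
});

// Push a URL; onCrawl is invoked with (error, result, done) when the request ends.
requester.queue({
  url: "http://example.com/",
  skipDuplicates: true,
  onCrawl: function (error, result, done) { done(error); }
});

// idle() is true when nothing is waiting or being processed in the queue.
console.log("Queue idle?", requester.idle());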
{ | ||
"name": "crawler-ninja", | ||
"version": "0.1.12", | ||
"version": "0.1.13", | ||
"description": "A web crawler made for the SEO based on plugins. Please wait or contribute ... still in beta", | ||
@@ -11,3 +11,2 @@ "main": "index.js", | ||
"dependencies": { | ||
"crawler-ninja-logger" : "*", | ||
"URIjs": "*", | ||
@@ -17,3 +16,5 @@ "async": " *", | ||
"collections": "*", | ||
"crawler-ninja-logger": "*", | ||
"crypto": "*", | ||
"heapdump": "^0.3.7", | ||
"request": "*", | ||
@@ -20,0 +21,0 @@ "riak-js": "*", |
@@ -474,1 +474,4 @@ Crawler Ninja | ||
- Add an empty plugin sample. See the js file : /plugins/empty-plugin.js | ||
0.1.13 | ||
- Experiments for better memory management
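Two patterns recur throughout the hunks that back this changelog entry: large result objects are explicitly nulled once they have been processed (see the index.js waterfall callback), and queue callbacks are deferred with process.nextTick so the current stack can unwind. A minimal sketch of the combination, assuming nothing beyond those two ideas (doWork and processResult are illustrative names, not part of the package):

// Illustrative only: drop the reference to a large result and defer the callback.
function doWork(result, done) {
  // ... analyse the crawled page ...
  done(null);
}

function processResult(result, callback) {
  doWork(result, function (error) {
    result = null;                   // release the (potentially large) result object
    process.nextTick(function () {   // defer so the current call stack can unwind
      callback(error);
    });
  });
}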
var assert = require("assert"); | ||
var crawler = require("../index.js"); | ||
var crawler = require("../index.js"); | ||
var cs = require("../plugins/console-plugin.js"); | ||
var testSite = require("./website-2/start.js").site; | ||
var heapdump = require('heapdump'); | ||
var proxyList = null; | ||
describe('Proxies', function() { | ||
describe('Memory leaks', function() { | ||
it.only('should crawl without memory leaks', function(done) { | ||
this.timeout(3000000); | ||
var c = new crawler.Crawler({skipDuplicates: false}); | ||
it.skip('should crawl without memory leaks', function(done) { | ||
this.timeout(3000000); | ||
setInterval(function(){ | ||
console.log(">>>>> Dump !"); | ||
heapdump.writeSnapshot('./dump/dump' + Date.now() + '.heapsnapshot'); | ||
}, 120000); | ||
var options = { | ||
skipDuplicates: true, | ||
scripts : false, | ||
links : false, | ||
image : false, | ||
maxConnections : 100 | ||
} | ||
var c = new crawler.Crawler(options); | ||
var consolePlugin = new cs.Plugin(); | ||
@@ -22,12 +37,6 @@ c.registerPlugin(consolePlugin); | ||
/* | ||
c.on("error", function(error, result) { | ||
assert(_.find(result.proxyList.getProxies(), function(p){ return p.getUrl()=== result.proxy; })); | ||
c.queue({url : "http://www.rtbf.be/", externalDomains: false}); | ||
}); | ||
*/ | ||
c.queue({url : "http://localhost:9999/internal-links.html"}); | ||
}); | ||
}); |
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
+ Added heapdump@^0.3.7
+ Added heapdump@0.3.15 (transitive)