crawler-ninja - npm Package Compare versions

Comparing version 0.1.12 to 0.1.13


index.js

@@ -7,4 +7,2 @@ var events = require('events');

var log = require("crawler-ninja-logger").Logger;
var Map = require("collections/fast-map");
var Set = require("collections/fast-set");
var requester = require("./lib/queue-requester");

@@ -16,4 +14,2 @@ var URI = require('./lib/uri.js');

var domainBlackList = require("./default-lists/domain-black-list.js").list();

@@ -109,3 +105,3 @@ var suffixBlackList = require("./default-lists/suffix-black-list.js").list();

this.httpRequester = new requester.Requester(this.config);
requester.init(this.config.maxConnections, this.config.onDrain);

@@ -134,3 +130,3 @@ events.EventEmitter.call(this);

function(error){
if (self.httpRequester.idle()) {
if (requester.idle()) {
self.config.onDrain();

@@ -158,3 +154,3 @@ }

store.getStore().addStartUrl(options, function(error) {
self.httpRequester.queue(addDefaultOptions({uri:options, url:options}, self.config));
requester.queue(addDefaultOptions({uri:options, url:options}, self.config));
});

@@ -170,3 +166,3 @@

function(error){
if (self.httpRequester.idle()) {
if (requester.idle()) {
self.config.onDrain();

@@ -180,3 +176,3 @@ }

store.getStore().addStartUrl(_.has(options, "url") ? options.url : options.uri, function(error) {
self.httpRequester.queue(addDefaultOptions(options, self.config));
requester.queue(addDefaultOptions(options, self.config));
});

@@ -318,3 +314,6 @@ }

async.apply(self.applyRedirect.bind(self), result),
], callback);
], function(error) {
result = null;
callback(error);
});

@@ -333,3 +332,3 @@ }

this.pm.crawlRedirect(from, to, result.statusCode, function(){
self.httpRequester.queue(self.buildNewOptions(result,to));
requester.queue(self.buildNewOptions(result,to));
callback();

@@ -558,3 +557,3 @@ });

if (toCrawl && (result.depthLimit == -1 || currentDepth <= result.depthLimit)) {
self.httpRequester.queue(self.buildNewOptions(result,linkUri));
requester.queue(self.buildNewOptions(result,linkUri));
callback();

@@ -561,0 +560,0 @@ }
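
These index.js hunks carry the core of the 0.1.13 change: instead of creating a private requester per crawler with new requester.Requester(this.config), the crawler calls requester.init(this.config.maxConnections, this.config.onDrain) once and then goes through the module-level requester.queue() and requester.idle(). The redirect pipeline also sets result = null before invoking its callback, so the response object is not kept alive by the closure. A minimal sketch of the new calling pattern, using only names visible in the hunks (config, url and addDefaultOptions stand for the crawler configuration, a start URL and the crawler's own helper; error handling is omitted):

var requester = require("./lib/queue-requester");

// One shared request queue for the whole process, sized from the crawler config
requester.init(config.maxConnections, config.onDrain);

// Enqueue a start URL with the crawler defaults merged in
requester.queue(addDefaultOptions({uri: url, url: url}, config));

// The drain check becomes a module-level call instead of this.httpRequester.idle()
if (requester.idle()) {
  config.onDrain();
}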

@@ -33,4 +33,2 @@ /**

var requestArgs = ['uri','url','qs','method','headers','body','form','json','multipart','followRedirect',

@@ -42,8 +40,7 @@ 'followAllRedirects', 'maxRedirects','encoding','pool','timeout','proxy','auth','oauth','strictSSL',

var start = new Date();
var req = request(_.pick.apply(this,[options].concat(requestArgs)), function(error,response) {
request(_.pick.apply(this,[options].concat(requestArgs)), function(error,response) {
var end = new Date() - start;
if (error) {
return onContent(error, endCallback, options);
return endCallback({code: error.code}, options);
}

@@ -68,3 +65,3 @@

if (error) {
result = null;
return onContent(error, endCallback, options);

@@ -87,6 +84,2 @@ }

if (error) {
return endCallback({code: error.code}, options);
}
// Sorry for this hack but that's solve some issue with Cheerio

@@ -93,0 +86,0 @@ if (!result.body) {
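
These hunks touch the HTTP helper (lib/http-request.js, the module required as ./lib/http-request.js by the queue requester). The returned request object is no longer assigned to a local req variable, and a transport error now calls endCallback({code: error.code}, options) directly instead of going through onContent, so neither the request object nor the full error and response pair stays referenced after the callback fires. A short sketch of the pattern, reusing the requestArgs whitelist shown above (the function name execute is illustrative; timing and body handling are omitted):

var request = require('request');
var _ = require('underscore');

function execute(options, endCallback) {
  // 0.1.13: the request object is not kept in a local variable,
  // so nothing outside this closure holds it after completion
  request(_.pick.apply(this, [options].concat(requestArgs)), function (error, response) {
    if (error) {
      // Only the error code is propagated, not the whole error/response pair
      return endCallback({code: error.code}, options);
    }
    // ... response and body handling continue as in the hunks above
  });
}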

@@ -7,3 +7,3 @@ var Map = require("collections/fast-map");

this.plugins = new Map();
this.pluginCounter = 0;
}

@@ -10,0 +10,0 @@

@@ -1,411 +0,408 @@

var async = require('async');
var _ = require('underscore');
var URI = require("./uri.js");
var request = require("./http-request.js");
var log = require("crawler-ninja-logger").Logger;
var store = require("../lib/store/store.js");
/**
* The Request Queue
*
* its main job is to make the http requests & analyze the responses
* its main job is make the http requests & analyze the responses
* It is used an internal queue to limit the number of workers
*
* @param the options to use to make the requests.
*
*/
var Requester = function(options) {
var async = require('async');
var _ = require('underscore');
var log = require("crawler-ninja-logger").Logger;
var URI = require("./uri.js");
var request = require("./http-request.js");
var store = require("../lib/store/store.js");
this.options = options;
this.initQueue();
}
/**
* Add a new url to crawl in the queue.
* Check the desired options and add it to a request queue
*
* @param the options used to crawl the url
*
*/
Requester.prototype.queue = function (options) {
(function () {
var self = this;
// Up to you to use uri or url.
if (options.uri) {
options.url = options.uri;
}
else {
options.uri = options.url;
}
var requestQueue = {};
// if skipDuplicates, don't crawl twice the same uri
if (this.options.skipDuplicates) {
store.getStore().isInCrawlHistory(options.uri, function(error, isInCrawlHistory) {
if (isInCrawlHistory) {
log.warn({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Don't crawl this url - Option skipDuplicates=true & the url has already been crawled" });
}
else {
store.getStore().addInHistory(options.uri,function(error) {
self.q.push(options);
log.info({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Add in the request queue"});
});
}
/**
* Init the Queue Requester
*
*
* @param The number of task/connection that the request queu can start in parallel
* @param the callback executes when all task (url to cralw) are completed
*
*/
});
}
else {
log.info({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Add in the request queue"});
this.q.push(options);
}
function init (maxConnections, onDrain) {
createQueue(maxConnections, onDrain);
}
/**
* Add a new url to crawl in the queue.
* Check the desired options and add it to a request queue
*
* @param the options used to crawl the url
*
*/
function queue(options) {
}
// Up to you to use uri or url.
if (options.uri) {
options.url = options.uri;
}
else {
options.uri = options.url;
}
// if skipDuplicates, don't crawl twice the same uri
if (options.skipDuplicates) {
store.getStore().isInCrawlHistory(options.uri, function(error, isInCrawlHistory) {
if (isInCrawlHistory) {
log.warn({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Don't crawl this url - Option skipDuplicates=true & the url has already been crawled" });
}
else {
store.getStore().addInHistory(options.uri,function(error) {
/**
* @return false if there are items waiting or being processed in the queue, or true if not.
*
*/
Requester.prototype.idle = function() {
return this.q.idle();
}
requestQueue.push(options);
log.info({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Add in the request queue"});
/*****************************************************************************************
*
* PRIVATES FUNCTIONS
*
******************************************************************************************/
});
}
/**
* Init the queue
*
*/
Requester.prototype.initQueue = function () {
});
}
else {
log.info({"url" : options.url, "step" : "queue-resquester.queue", "message" : "Add in the request queue"});
requestQueue.push(options);
}
var self = this;
this.q = async.queue(
function (options, callback) {
log.debug({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Start Crawling"});
// If the domain is in the blacklist => don't crawl the url
if (options.domainBlackList.indexOf(URI.domainName(options.url)) > 0) {
log.error({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Domain of the url is in the blacklist"});
options.onCrawl({code:"DOMAINBLACKLIST"}, options, function(error){
callback(error);
});
return;
}
}
// Check if there are some errors for the host & make the appropriate crawl in function of that
store.getStore().getHostErrors(options.url, function(error, errorInfo) {
log.debug({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Check if errors already exist"});
if (error) {
self.onStoreError(error, options);
return callback();
}
/**
* @return false if there are some URL waiting to be crawled or being processed in the queue, or true if not.
*
*/
idle = function() {
return requestQueue.idle();
}
if (errorInfo) {
self.crawlWithErrors(options, errorInfo, callback);
}
else {
self.crawl(options, callback);
}
});
},
self.options.maxConnections);
/*****************************************************************************************
*
* PRIVATES FUNCTIONS
*
******************************************************************************************/
this.q.drain = function() {
/**
* Create the Request Queue
*
*/
function createQueue (maxConnections, onDrain) {
requestQueue = async.queue(onUrlToCrawl,maxConnections);
requestQueue.drain = onDrain;
if (self.options.onDrain) {
self.options.onDrain();
}
}
}
/**
* Stop the crawl if the crawl persistence store provides some errors
*
*
* @param the error provided by the persistence store
* @param the crawl options
*
*/
Requester.prototype.onStoreError = function(error, options, callback) {
log.error({"url" : options.url , "step" : "queue-resquester.onStoreError", "message" : "Error from the crawl persistence service (crawl canceled for this url) : " + error.code});
options.onCrawl({code:"STOPCRAWL"}, options, function(error){
callback(error);
});
}
function onUrlToCrawl(options, callback) {
/**
* Crawl one url with optionnaly a delay (rate limit)
*
*
* @param the crawl options
* @param the callback used to inform the queue that request is finished
*/
Requester.prototype.crawl = function(options, callback) {
log.debug({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Start Crawling"});
// If the domain is in the blacklist => don't crawl the url
if (options.domainBlackList.indexOf(URI.domainName(options.url)) > 0) {
var self = this;
log.error({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Domain of the url is in the blacklist"});
options.onCrawl({code:"DOMAINBLACKLIST"}, options, function(error){
process.nextTick(function() {callback(error)});
});
return;
}
if (options.rateLimits != 0) {
// Check if there are some errors for the host & make the appropriate crawl in function of that
store.getStore().getHostErrors(options.url, function(error, errorInfo) {
log.debug({"url" : options.url, "step" : "queue-resquester.execQueueTask", "message" : "Check if errors already exist"});
log.error({"url" : options.url, "step" : "queue-resquester.crawl", "message" : "Request with option on ratelimit = " + options.rateLimits});
setTimeout(function() {
if (error) {
onStoreError(error, options);
return callback();
}
self.execHttp(options, callback);
if (errorInfo) {
crawlWithErrors(options, errorInfo, callback);
}
else {
crawl(options, callback);
}
});
}, options.rateLimits);
}
else {
self.execHttp(options, callback);
}
}
/**
* Stop the crawl if the crawl persistence store provides some errors
*
*
* @param the error provided by the persistence store
* @param the crawl options
*
*/
function onStoreError(error, options, callback) {
log.error({"url" : options.url , "step" : "queue-resquester.onStoreError", "message" : "Error from the crawl persistence service (crawl canceled for this url) : " + error.code});
options.onCrawl({code:"STOPCRAWL"}, options, function(error){
process.nextTick(function() {callback(error)});
});
}
}
/**
* Crawl an url for a host which has already provided some errors (timout, connection refused, ... )
*
*
* @param the crawl options
* @param the info on errors
* @param the callback used to inform the queue that request is finished
*/
Requester.prototype.crawlWithErrors = function(options, errorInfo, callback) {
/**
* Crawl one url with optionnaly a delay (rate limit)
*
*
* @param the crawl options
* @param the callback used to inform the queue that request is finished
*/
function crawl(options, callback) {
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Crawl with errors", "options" : "errorInfo"});
var self = this;
if (options.rateLimits != 0) {
if (errorInfo.stopCrawlOnThisDomain) {
log.error({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Too many errors on the domain - Stop to crawl its URLS"});
log.error({"url" : options.url, "step" : "queue-resquester.crawl", "message" : "Request with option on ratelimit = " + options.rateLimits});
setTimeout(function() {
execHttp(options, callback);
options.onCrawl({code:"STOPCRAWL"}, options, function(error) {
callback(error);
});
}, options.rateLimits);
}
else {
execHttp(options, callback);
}
return;
}
/**
* Crawl an url for a host which has already provided some errors (timout, connection refused, ... )
*
*
* @param the crawl options
* @param the info on errors
* @param the callback used to inform the queue that request is finished
*/
function crawlWithErrors(options, errorInfo, callback) {
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Crawl with errors", "options" : "errorInfo"});
if (errorInfo.forceRateLimits) {
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Too many errors on the domain - Force request with rate limit" });
setTimeout(function() {
self.execHttp(options, callback);
if (errorInfo.stopCrawlOnThisDomain) {
log.error({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Too many errors on the domain - Stop to crawl its URLS"});
}, options.errorRates[errorInfo.currentRateLimitIndex]);
options.onCrawl({code:"STOPCRAWL"}, options, function(error) {
process.nextTick(function() {callback(error)});
});
return;
}
return;
}
//Case of a retry due to a previous http error on the same request
if (options.maxRetries < options.retries) {
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Retry Request - maxRetries =" + options.maxRetries + " - retries : " + options.retries});
setTimeout(function() {
self.execHttp(options, callback);
if (errorInfo.forceRateLimits) {
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Too many errors on the domain - Force request with rate limit" });
setTimeout(function() {
execHttp(options, callback);
}, options.retryTimeout);
return;
}, options.errorRates[errorInfo.currentRateLimitIndex]);
}
return;
}
log.error({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Invalid Error option - last crawl of the url", "options" : "errorInfo" });
this.crawl(options, callback);
//Case of a retry due to a previous http error on the same request
if (options.maxRetries < options.retries) {
log.warn({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Retry Request - maxRetries =" + options.maxRetries + " - retries : " + options.retries});
setTimeout(function() {
execHttp(options, callback);
}
}, options.retryTimeout);
return;
/**
* Execute an http request
*
* @param The options to used for the request
* @param callback executed when the request is finished
*
*/
Requester.prototype.execHttp = function (options, callback) {
var self = this;
if (this.options.proxyList) {
options.proxy = this.options.proxyList.getProxy().getUrl();
}
log.debug({"url" : options.url, "step" : "queue-requester.execHttp", "message" : "Execute the request"});
request(options, function(error, result) {
log.debug({"url" : options.url, "step" : "queue-requester.execHttp", "message" : "Execute the request done"});
if (error) {
self.onRequestError(error, options, result, callback);
}
else {
options.onCrawl(null, result, function(error){
callback(error);
});
}
});
log.error({"url" : options.url, "step" : "queue-resquester.crawlWithErrors", "message" : "Invalid Error option - last crawl of the url", "options" : "errorInfo" });
crawl(options, callback);
}
}
/**
* Execute an http request
*
* @param The options to used for the request
* @param callback executed when the request is finished
*
*/
function execHttp(options, callback) {
/**
* Callback used when a Http request generates an error
*
*
* @param The Http error
* @param the crawl options
* @param the HTTP response
* @param callback()
*/
Requester.prototype.onRequestError = function (error, options, result, callback) {
if (options.proxyList) {
options.proxy = options.proxyList.getProxy().getUrl();
}
log.debug({"url" : options.url, "step" : "queue-requester.execHttp", "message" : "Execute the request"});
request(options, function(error, result) {
log.debug({"url" : options.url, "step" : "queue-requester.execHttp", "message" : "Execute the request done"});
if (error) {
onRequestError(error, options, result, callback);
}
else {
options.onCrawl(null, result, function(error){
process.nextTick(function() {callback(error)});
});
}
//var self = this;
});
// if the error is a timeout :
// 1. Check the crawl rate and if necessary decrease it for slower skipDuplicates
// 2. Save the error info for the associated host.
// 3. recrawl the url if the maximum of retries is not yet reaches
if (error.code == 'ETIMEDOUT' || error.code == 'ESOCKETTIMEDOUT') {
log.error({"url" : options.url, "step" : "queue-requester.onRequestError", "message" : "Timeout"});
//var execOnError = async.compose(self.recrawlUrl, self.saveErrorInfo, self.checkCrawlRate);
var execOnError = async.compose(this.recrawlUrl.bind(this), this.saveErrorInfo.bind(this), this.checkCrawlRate.bind(this));
execOnError({options : options, result : result, error : error,crawler : this}, function(err, params){
callback();
});
return;
}
}
/**
* Callback used when a Http request generates an error
*
*
* @param The Http error
* @param the crawl options
* @param the HTTP response
* @param callback()
*/
function onRequestError(error, options, result, callback) {
// if it is a connection error, recrawl the url if the maximum of retries is not yet reaches
if (error.code == 'ECONNRESET' || error.code == 'ECONNREFUSED' ) {
log.error({"url" : options.url, "step" : "queue-requester.onRequestError", "message" : "connection refused"});
this.recrawlUrl({options : options, result : result, error : error, crawler : this}, function(error,params){
callback();
// if the error is a timeout :
// 1. Check the crawl rate and if necessary decrease it for slower skipDuplicates
// 2. Save the error info for the associated host.
// 3. recrawl the url if the maximum of retries is not yet reaches
if (error.code == 'ETIMEDOUT' || error.code == 'ESOCKETTIMEDOUT') {
log.error({"url" : options.url, "step" : "queue-requester.onRequestError", "message" : "Timeout"});
var execOnError = async.compose(recrawlUrl, saveErrorInfo, checkCrawlRate);
execOnError({options : options, result : result, error : error}, function(err, params){
process.nextTick(function() {callback()});
});
return;
}
// For the other kind of errors, just inform the crawler
options.onCrawl(error, result, function(error) {
callback(error);
});
}
// if it is a connection error, recrawl the url if the maximum of retries is not yet reaches
if (error.code == 'ECONNRESET' || error.code == 'ECONNREFUSED' ) {
log.error({"url" : options.url, "step" : "queue-requester.onRequestError", "message" : "connection refused"});
recrawlUrl({options : options, result : result, error : error}, function(error,params){
process.nextTick(function() {callback()});
});
return;
}
};
// For the other kind of errors, just inform the crawler
options.onCrawl(error, result, function(error) {
process.nextTick(function() {callback(error)});
});
/**
* In the case of a timeout error, this method is call in order to check
* if it is not necessary to decrease the crawl rate
*
*
* @param the crawl params (options, result, errors, errorInfo)
* @param callback(error, params)
*/
Requester.prototype.checkCrawlRate = function (params, callback) {
};
var self = this;
/**
* In the case of a timeout error, this method is call in order to check
* if it is not necessary to decrease the crawl rate
*
*
* @param the crawl params (options, result, errors, errorInfo)
* @param callback(error, params)
*/
function checkCrawlRate(params, callback) {
store.getStore().getHostErrors(params.options.url, function(error, errorInfo) {
store.getStore().getHostErrors(params.options.url, function(error, errorInfo) {
if (error) {
self.onStoreError(error, params.options);
return callback(error);
}
if (error) {
onStoreError(error, params.options);
return callback(error);
}
params.errorInfo = errorInfo;
params.errorInfo = errorInfo;
if (! errorInfo) {
params.errorInfo = { numberOfErrors : 0, currentRateLimitIndex : -1, forceRateLimits : false};
}
params.errorInfo.numberOfErrors++;
if (! errorInfo) {
params.errorInfo = { numberOfErrors : 0, currentRateLimitIndex : -1, forceRateLimits : false};
}
params.errorInfo.numberOfErrors++;
if (params.options.maxErrors != -1 && params.errorInfo.numberOfErrors == params.options.maxErrors ) {
self.decreaseCrawlRate(params, callback);
}
else {
log.info({"url" : params.options.url, "step" : "queue-requester.checkCrawlRate", "message" : "Don't decrease rate (number of errors < max number of errors)"});
callback(null,params);
}
});
if (params.options.maxErrors != -1 && params.errorInfo.numberOfErrors == params.options.maxErrors ) {
decreaseCrawlRate(params, callback);
}
else {
log.info({"url" : params.options.url, "step" : "queue-requester.checkCrawlRate", "message" : "Don't decrease rate (number of errors < max number of errors)"});
callback(null,params);
}
});
}
}
/**
* In the case of a timeout error and if there are too many errors,
* this method will decrease the crawl rate or stop the crawl for the
* associated domain
*
* @param the crawl params (options, result, errors, errorInfo)
* @param callback(error, params)
*/
Requester.prototype.decreaseCrawlRate = function(params, callback) {
/**
* In the case of a timeout error and if there are too many errors,
* this method will decrease the crawl rate or stop the crawl for the
* associated domain
*
* @param the crawl params (options, result, errors, errorInfo)
* @param callback(error, params)
*/
function decreaseCrawlRate(params, callback) {
params.errorInfo.currentRateLimitIndex++;
// If there is still an available rate limit
if (params.options.errorRates.length > 0 && params.errorInfo.currentRateLimitIndex < params.options.errorRates.length) {
params.errorInfo.numberOfErrors = 0;
params.errorInfo.forceRateLimits = true;
log.warn({"url" : params.options.url, "step" : "queue-requester.decreaseCrawlRate", "message" : "Too many errors, set rateLimits to " + params.options.errorRates[params.errorInfo.currentRateLimitIndex]});
params.errorInfo.currentRateLimitIndex++;
// If there is still an available rate limit
if (params.options.errorRates.length > 0 && params.errorInfo.currentRateLimitIndex < params.options.errorRates.length) {
params.errorInfo.numberOfErrors = 0;
params.errorInfo.forceRateLimits = true;
log.warn({"url" : params.options.url, "step" : "queue-requester.decreaseCrawlRate", "message" : "Too many errors, set rateLimits to " + params.options.errorRates[params.errorInfo.currentRateLimitIndex]});
params.options.maxRetries = params.options.retries;
params.options.maxRetries = params.options.retries;
}
// we stop to crawl on this domain if all rate limits have been used
else {
log.error({"url" : params.options.url, "step" : "queue-requester.decreaseCrawlRate", "message" : "Stop crawl domain - all crawl rates done"});
params.errorInfo.stopCrawlOnThisDomain = true;
}
}
// we stop to crawl on this domain if all rate limits have been used
else {
log.error({"url" : params.options.url, "step" : "queue-requester.decreaseCrawlRate", "message" : "Stop crawl domain - all crawl rates done"});
params.errorInfo.stopCrawlOnThisDomain = true;
}
callback(null, params);
}
callback(null, params);
}
/**
* Save the error info into the crawl persistence store
*
*
* @param the crawl params (options, result, errorInfo)
* @param callback(error, params)
*/
Requester.prototype.saveErrorInfo = function(params, callback) {
/**
* Save the error info into the crawl persistence store
*
*
* @param the crawl params (options, result, errorInfo)
* @param callback(error, params)
*/
function saveErrorInfo(params, callback) {
store.getStore().setHostErrors(params.options.url, params.errorInfo, function(error) {
callback(null, params);
});
store.getStore().setHostErrors(params.options.url, params.errorInfo, function(error) {
callback(null, params);
});
}
}
/**
* Recrawl an url if the maximum of retries is no yet fetch
*
*
* @param the crawl params (options, result, errorInfo)
* @param callback(error, params)
*/
Requester.prototype.recrawlUrl = function(params, callback) {
/**
* Recrawl an url if the maximum of retries is no yet fetch
*
*
* @param the crawl params (options, result, errorInfo)
* @param callback(error, params)
*/
function recrawlUrl(params, callback) {
if (params.result.maxRetries > 1) {
log.warn({"url" : params.options.url, "step" : "queue-requester.recrawlUrl", "message" : "Recrawl"});
params.result.maxRetries--;
if (params.result.maxRetries > 1) {
log.warn({"url" : params.options.url, "step" : "queue-requester.recrawlUrl", "message" : "Recrawl"});
params.result.maxRetries--;
//TODO : async this code
store.getStore().removeFromHistory(params.result.url);
params.crawler.queue(params.result);
//TODO : async this code
store.getStore().removeFromHistory(params.result.url);
queue(params.result);
callback(null, params);
}
else {
log.warn({"url" : params.result.url, "step" : "queue-requester.recrawlUrl", "message" : "Don't recrawl - end of retries"});
params.options.onCrawl(params.error, params.result, function(error){
callback(error, params);
});
callback(null, params);
}
else {
log.warn({"url" : params.result.url, "step" : "queue-requester.recrawlUrl", "message" : "Don't recrawl - end of retries"});
params.options.onCrawl(params.error, params.result, function(error){
callback(error, params);
});
}
}
}
module.exports.init = init;
module.exports.queue = queue;
module.exports.idle = idle;
module.exports.Requester = Requester;
}());
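
Everything from the @@ -1,411 +0,408 @@ hunk above is the rewrite of lib/queue-requester.js: the prototype-based Requester that each crawler used to instantiate is replaced by a single async.queue wrapped in an IIFE, and the module now exposes three functions, init(maxConnections, onDrain), queue(options) and idle(). The crawl pipeline keeps the same steps (domain blacklist check, host-error lookup, rate limiting, retries), but callbacks are deferred with process.nextTick and intermediate objects are released earlier. A condensed sketch of the exported surface, assuming the store, logger and http-request modules shown above; onUrlToCrawl is abbreviated to the bare callback plumbing:

var async = require('async');

var requestQueue = {};

// init() must run before queue(); index.js calls it from the Crawler constructor
function init(maxConnections, onDrain) {
  requestQueue = async.queue(onUrlToCrawl, maxConnections);
  requestQueue.drain = onDrain;
}

// Push a URL to crawl; the real implementation first checks skipDuplicates
// against the persistence store
function queue(options) {
  requestQueue.push(options);
}

// true when no URL is waiting or being processed
function idle() {
  return requestQueue.idle();
}

function onUrlToCrawl(options, callback) {
  // blacklist check, host-error handling and the HTTP request live here;
  // see the crawl/execHttp functions in the hunk above
  options.onCrawl(null, options, function (error) {
    process.nextTick(function () { callback(error); });
  });
}

module.exports.init = init;
module.exports.queue = queue;
module.exports.idle = idle;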
{
"name": "crawler-ninja",
"version": "0.1.12",
"version": "0.1.13",
"description": "A web crawler made for the SEO based on plugins. Please wait or contribute ... still in beta",

@@ -11,3 +11,2 @@ "main": "index.js",

"dependencies": {
"crawler-ninja-logger" : "*",
"URIjs": "*",

@@ -17,3 +16,5 @@ "async": " *",

"collections": "*",
"crawler-ninja-logger": "*",
"crypto": "*",
"heapdump": "^0.3.7",
"request": "*",

@@ -20,0 +21,0 @@ "riak-js": "*",
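
The package.json diff bumps the version to 0.1.13 and adds heapdump (^0.3.7) to the dependency list; it is the module the memory test below uses to write V8 heap snapshots. A minimal usage sketch (the dump path is illustrative):

var heapdump = require('heapdump');

// Writes a .heapsnapshot file that can be opened in the Memory tab of Chrome DevTools
heapdump.writeSnapshot('./dump/dump' + Date.now() + '.heapsnapshot', function (err, filename) {
  if (err) console.error(err);
  else console.log('Heap snapshot written to ' + filename);
});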

@@ -474,1 +474,4 @@ Crawler Ninja

- Add an empty plugin sample. See the js file : /plugins/empty-plugin.js
0.1.13
- Experiments for a better memory management

var assert = require("assert");
var crawler = require("../index.js");
var crawler = require("../index.js");
var cs = require("../plugins/console-plugin.js");
var testSite = require("./website-2/start.js").site;
var heapdump = require('heapdump');
var proxyList = null;
describe('Proxies', function() {
describe('Memory leaks', function() {
it.only('should crawl without memory leaks', function(done) {
this.timeout(3000000);
var c = new crawler.Crawler({skipDuplicates: false});
it.skip('should crawl without memory leaks', function(done) {
this.timeout(3000000);
setInterval(function(){
console.log(">>>>> Dump !");
heapdump.writeSnapshot('./dump/dump' + Date.now() + '.heapsnapshot');
}, 120000);
var options = {
skipDuplicates: true,
scripts : false,
links : false,
image : false,
maxConnections : 100
}
var c = new crawler.Crawler(options);
var consolePlugin = new cs.Plugin();

@@ -22,12 +37,6 @@ c.registerPlugin(consolePlugin);

/*
c.on("error", function(error, result) {
assert(_.find(result.proxyList.getProxies(), function(p){ return p.getUrl()=== result.proxy; }));
c.queue({url : "http://www.rtbf.be/", externalDomains: false});
});
*/
c.queue({url : "http://localhost:9999/internal-links.html"});
});
});
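
In 0.1.13 the new memory test is switched to it.skip, so it does not run with the regular suite; when enabled it crawls the bundled test site and writes a heap snapshot every 120 seconds so that successive dumps can be compared for growth. A standalone sketch of the same check outside mocha, using only modules the test already requires (it assumes the local site from ./website-2/start.js is listening on http://localhost:9999; completion handling is omitted):

var crawler = require("../index.js");
var cs = require("../plugins/console-plugin.js");
var heapdump = require('heapdump');

var c = new crawler.Crawler({
  skipDuplicates: true,
  scripts: false,
  links: false,
  image: false,
  maxConnections: 100
});
c.registerPlugin(new cs.Plugin());

// Dump the heap every two minutes while the crawl runs
setInterval(function () {
  heapdump.writeSnapshot('./dump/dump' + Date.now() + '.heapsnapshot');
}, 120000);

c.queue({url: "http://localhost:9999/internal-links.html"});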

