crawler-ninja
Comparing version 0.2.4 to 0.2.5
index.js
@@ -17,2 +17,3 @@ var timers = require('timers');
var DEFAULT_NUMBER_OF_CONNECTIONS = 5;
var DEFAULT_JAR = true;
var DEFAULT_DEPTH_LIMIT = -1; // no limit
@@ -35,4 +36,2 @@ var DEFAULT_TIME_OUT = 20000;
// The Http request doesn't follow redirect
// in order to catch/log/manage them in some plugins
var DEFAULT_FOLLOW_301 = false;
@@ -42,3 +41,2 @@
var DEFAULT_USER_AGENT = "NinjaBot";
var DEFAULT_CACHE = false;
var DEFAULT_REFERER = false;
@@ -54,3 +52,3 @@
// assign the default updateDepth method used to calculate the crawl depth
// assign the default updateDepth function used to calculate the crawl depth
var updateDepthFn = updateDepth;
@@ -222,2 +220,3 @@
//TODO : review this code with async
/*
http.resolveRedirection(options, function(error, targetUrl){
@@ -233,2 +232,11 @@ store.getStore().addStartUrls([targetUrl, options.url], function(error) {
});
*/
store.getStore().addStartUrls([ options.url], function(error) {
requestQueue.queue(options, function(error){
log.debug({"url" : options.url, "step" : "addInQueue", "message" : "Url correctly added in the queue"});
if (requestQueue.idle()){
endCallback();
}
});
});
@@ -320,3 +328,4 @@
storeModuleName : DEFAULT_STORE_MODULE,
queueModuleName : DEFAULT_QUEUE_MODULE
queueModuleName : DEFAULT_QUEUE_MODULE,
jar : DEFAULT_JAR
@@ -390,3 +399,3 @@ };
var from = result.url;
var to = URI.linkToURI(from, result.headers.location);
var to = URI.linkToURI(from, result.responseHeaders.location);
@@ -739,3 +748,3 @@ // Send the redirect info to the plugins &
var check = result.canCrawl(parentUrl, link, anchor, isDoFollow);
log.debug({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "method options.canCrawl has been called and return "} + check);
log.debug({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "function options.canCrawl has been called and return "} + check);
return callback(null, {toCrawl : check, isExternal : ! startFrom.link.isStartFromDomain});
@@ -742,0 +751,0 @@
@@ -8,24 +8,3 @@
(function () {
/**
* Resolve an HTTP redirection
*
* @param the url to check/resolve
* @param callback (error, the resolved url)
*/
function resolveRedirection(options, callback) {
var opts = _.clone(options);
opts.followRedirect = true;
request(options, function(error, response) {
if (error) {
return callback(null, options.url);
}
callback(null, response.url);
});
}
/**
@@ -38,29 +17,9 @@ * Execute the HTTP call, check error, build a response
if (! options.headers) {
options.headers = {};
}
var reqOptions = initRequestOptions(options);
if (options.userAgent) {
//console.log("User Agent :" + options.userAgent);
options.headers['User-Agent'] = options.userAgent;
}
if (typeof options.encoding === 'undefined') {
options.headers['Accept-Encoding'] = 'gzip';
options.encoding = null;
}
if (options.referer) {
options.headers.Referer = options.referer;
}
// For HTTPS requests
// Some old servers do not support recent TSL version
//options.secureOptions = require('constants').SSL_OP_NO_TLSv1_2;
//options.rejectUnauthorized = false;
var start = new Date();
//console.log("ops", ops);
//console.log("options get", options);
request.get(options, function(error,response) {
request.get(reqOptions, function(error,response) {
@@ -104,3 +63,44 @@ var end = new Date() - start;
/**
*
* Create the options for the http request based on the crawler options
* Passing the complete crawler options to request doesn't work
*
* @param The crawler options
*/
function initRequestOptions(options) {
var opts = _.clone(options);
if (! opts.headers) {
opts.headers = {};
}
if (opts.userAgent) {
//console.log("User Agent :" + options.userAgent);
opts.headers['User-Agent'] = opts.userAgent;
}
if (typeof opts.encoding === 'undefined') {
opts.headers['Accept-Encoding'] = 'gzip';
opts.encoding = null;
}
if (opts.referer) {
opts.headers.Referer = opts.referer;
}
opts.method = 'get';
// For HTTPS requests
// Some old servers do not support recent TSL version
//options.secureOptions = require('constants').SSL_OP_NO_TLSv1_2;
//options.rejectUnauthorized = false;
var requestArgs = ['uri','url','method','headers','followRedirect', 'followAllRedirects', 'maxRedirects', 'encoding',
'pool','timeout','proxy', 'referer', 'strictSSL', 'secureOptions', 'rejectUnauthorized','jar' ];
opts = _.pick.apply(this,[opts].concat(requestArgs));
return opts;
}
/**
@@ -125,7 +125,18 @@ *
// Sorry for this hack but that's solve some issues with Cheerio
// This hack solves some issues with Cheerio
// TODO : Check if it is still necessary
result.body = result.body.toString();
// Add the options used for the previous request in the result
// By this way, the next request will use the same options
// TODO : How to avoid this bad manipulation ?
result = _.extend(result, _.omit(options, _.keys(result)));
// The header of the response can be used later by the crawler
result.responseHeaders = result.headers;
// Remove headers. We will rebuild it for the following request in
// the function initRequestOptions
result.headers = null;
options = null;
@@ -136,3 +147,2 @@
module.exports.resolveRedirection = resolveRedirection;
module.exports.get = get;
@@ -139,0 +149,0 @@
{
"name": "crawler-ninja",
"version": "0.2.4",
"version": "0.2.5",
"description": "A web crawler made for the SEO based on plugins. Please wait or contribute ... still in beta",
@@ -5,0 +5,0 @@ "main": "index.js",
@@ -97,6 +97,6 @@ var crypto = require('crypto');
resourceInfo.size = result.body.length;
resourceInfo.contentType = result.headers["content-type"];
resourceInfo.contentType = result.responseHeaders["content-type"];
// last modified & other header attributes
resourceInfo.headers = result.headers;
resourceInfo.responseHeaders = result.responseHeaders;
@@ -152,10 +152,10 @@ // if HTML
resourceInfo.size = result.body.length;
resourceInfo.contentType = result.headers["content-type"];
resourceInfo.contentType = result.responseHeaders["content-type"];
// last modified & other header attributes
resourceInfo.headers = result.headers;
resourceInfo.responseHeaders = result.responseHeaders;
this.resources.set(result.url, resourceInfo);
addToListMap(this.outLinks, result.url, {page: result.headers["location"], anchor : 'Redirect', isDoFollow : true});
addToListMap(this.outLinks, result.url, {page: result.responseHeaders["location"], anchor : 'Redirect', isDoFollow : true});
};
@@ -172,3 +172,3 @@
resourceInfo.headers = result.headers;
resourceInfo.responseHeaders = result.responseHeaders;
@@ -175,0 +175,0 @@ this.resources.set(result.url, resourceInfo);
@@ -47,3 +47,3 @@ /**
var contentType = result.headers[CONTENT_TYPE_HEADER];
var contentType = result.responseHeaders[CONTENT_TYPE_HEADER];
if (contentType) {
@@ -50,0 +50,0 @@ this.addContentType(contentType);
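The plugin hunks above all switch from `result.headers` to `result.responseHeaders`. As a rough, hedged sketch (not code from the package), a custom plugin written against 0.2.5 would read the new field like this; the `crawl(result, $, callback)` hook and the `registerPlugin()` / `init(options, endCallback)` signatures are assumptions based on the bundled plugins and the README, and may need adjusting.

```javascript
var crawler = require("crawler-ninja");

// Hypothetical plugin that records the content type of each crawled url.
function HeaderLoggerPlugin() {
    // map url -> content type, similar to the audit plugin's resources map
    this.contentTypes = {};
}

HeaderLoggerPlugin.prototype.crawl = function(result, $, callback) {
    // Since 0.2.5 the HTTP response headers live in result.responseHeaders
    // (result.headers is cleared so it can be rebuilt for the next request).
    this.contentTypes[result.url] = result.responseHeaders["content-type"];
    callback();
};

var plugin = new HeaderLoggerPlugin();
crawler.init({}, function() {
    console.log("Crawl finished :", plugin.contentTypes);
});
crawler.registerPlugin(plugin);
crawler.queue({url : "http://www.mysite.com/"});
```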
@@ -158,3 +158,3 @@ Crawler Ninja
You can pass change/overide the default crawl options by using the init method.
You can change/override the default crawl options by using the init function.
@@ -169,2 +169,3 @@ ```javascript
- maxConnections : the number of connections used to crawl, default is 5.
- jar : if true, remember cookies for subsequent requests, default is true.
- rateLimits : number of milliseconds to wait between requests, default = 0.
@@ -187,7 +188,7 @@ - externalDomains : if true crawl external domains. This option can crawl a lot of different linked domains, default = false.
- suffixBlackList : the list of url suffixes to avoid crawling (an array of String). The default list is in the file : /default-lists/domain-black-list.js
- method : HTTP method used for crawling, default : GET.
You can also use the [mikeal's request options](https://github.com/mikeal/request#requestoptions-callback) and will be directly passed to the request() method.
You can also use [mikeal's request options](https://github.com/mikeal/request#requestoptions-callback); they will be passed directly to the request() function.
You can pass these options to the init() function if you want them to be global or as
@@ -358,3 +359,3 @@ items in the queue() calls if you want them to be specific to that item (overwriting global options).
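As a hedged illustration of the global vs per-item distinction described above, a setup could look roughly like this; the `init(options, endCallback)` signature is an assumption based on the README and on the `endCallback` used in index.js.

```javascript
var crawler = require("crawler-ninja");

// Global options, applied to every request.
var globalOptions = {
    maxConnections : 5,      // number of parallel connections
    jar            : true,   // keep cookies between requests (new default in 0.2.5)
    rateLimits     : 200,    // wait 200 ms between two requests
    userAgent      : "NinjaBot"
};

crawler.init(globalOptions, function() {
    console.log("Crawl finished");
});

// Options passed to queue() override the global ones for that url only.
crawler.queue({url : "http://www.mysite.com/", referer : "http://www.example.com/"});
```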
With the default crawl options, it is possible to get errors like timeouts on some https sites. This happens with sites that do not support TLS 1.2+.
You can check the HTTPS infos for you site with : https://www.ssllabs.com/ssltest/
You can check the HTTPS info and the TLS compliance level for your site on : https://www.ssllabs.com/ssltest/
@@ -369,5 +370,16 @@ In order to crawl those sites, you have to add the following parameters in the crawl options :
```
We will try to integrate this kind of exception in the crawler code for an upcoming release.
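For illustration, the updated functional test at the end of this diff uses exactly this kind of configuration for a site that only supports older TLS versions:

```javascript
// Crawl options for an https site that does not support TLS 1.2,
// mirroring the updated functional test in this release.
var options = {
    userAgent          : 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
    externalHosts      : true,
    secureOptions      : require('constants').SSL_OP_NO_TLSv1_2,
    rejectUnauthorized : false
};
```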
### Starting the crawl with a redirect on a different subdomain
If you start a crawl on http://www.mysite.com and this url redirects to http://mysite.com, the crawl stops immediately with the default options.
Indeed, the default options don't crawl other hosts/subdomains of the same domain. You can use the option externalHosts to avoid this situation.
```javascript
var options = {
externalHosts : true
};
```
The Crawl Store
@@ -497,3 +509,3 @@ ---------------
- Add in the crawl json param "isExternal". This way, a plugin can check if the link is external or not.
- Add a new option "retry404" : some sites provide inconsistent response for some urls (status 404 instead of 200). In such case, it should be nice to retry (this issuee needs to be analyzed in more detail).
- Add a new option "retry400" : some sites provide inconsistent responses for some urls (status 404 instead of 200). In such cases, it would be nice to retry (this issue needs to be analyzed in more detail).
@@ -525,1 +537,7 @@ 0.1.20
- Now, it is possible to use all http request params in the crawler options.
0.2.5
- Fix regression when crawling specific websites.
- Review how to build the options for making http requests.
- Add the response headers to the result of a request.
- Better support for HTTP redirects (300+).
@@ -76,3 +76,3 @@ var assert = require("assert");
it.skip('Should crawl a https site with a old config', function(done) {
this.timeout(3000000);
this.timeout(40000);
var end = function(){
@@ -84,4 +84,7 @@ //assert(audit.resources.toArray().length === 0);
var options = {
userAgent : 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
externalHosts : true,
secureOptions : require('constants').SSL_OP_NO_TLSv1_2,
rejectUnauthorized : false
};
@@ -93,4 +96,5 @@
crawler.queue({url : "https://www.notaire.be"}); //https://www.notaire.be/
crawler.queue({url : "https://www.notaire.be/" });
});
@@ -97,0 +101,0 @@