crawler-ninja
Comparing version 0.2.4 to 0.2.5
index.js
@@ -17,2 +17,3 @@ var timers = require('timers');
var DEFAULT_NUMBER_OF_CONNECTIONS = 5;
var DEFAULT_JAR = true;
var DEFAULT_DEPTH_LIMIT = -1; // no limit
@@ -35,4 +36,2 @@ var DEFAULT_TIME_OUT = 20000;
// The Http request doesn't follow redirect
// in order to catch/log/manage them in some plugins
var DEFAULT_FOLLOW_301 = false;
@@ -42,3 +41,2 @@
var DEFAULT_USER_AGENT = "NinjaBot";
var DEFAULT_CACHE = false;
var DEFAULT_REFERER = false;
@@ -54,3 +52,3 @@
// assign the default updateDepth method used to calculate the crawl depth
// assign the default updateDepth function used to calculate the crawl depth
var updateDepthFn = updateDepth;
@@ -222,2 +220,3 @@
//TODO : review this code with async
/*
http.resolveRedirection(options, function(error, targetUrl){
@@ -233,2 +232,11 @@ store.getStore().addStartUrls([targetUrl, options.url], function(error) {
});
*/
store.getStore().addStartUrls([ options.url], function(error) {
requestQueue.queue(options, function(error){
log.debug({"url" : options.url, "step" : "addInQueue", "message" : "Url correctly added in the queue"});
if (requestQueue.idle()){
endCallback();
}
});
});
@@ -320,3 +328,4 @@
storeModuleName : DEFAULT_STORE_MODULE,
queueModuleName : DEFAULT_QUEUE_MODULE
queueModuleName : DEFAULT_QUEUE_MODULE,
jar : DEFAULT_JAR
@@ -390,3 +399,3 @@ };
var from = result.url;
var to = URI.linkToURI(from, result.headers.location);
var to = URI.linkToURI(from, result.responseHeaders.location);
@@ -739,3 +748,3 @@ // Send the redirect info to the plugins &
var check = result.canCrawl(parentUrl, link, anchor, isDoFollow);
log.debug({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "method options.canCrawl has been called and return "} + check);
log.debug({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "function options.canCrawl has been called and return "} + check);
return callback(null, {toCrawl : check, isExternal : ! startFrom.link.isStartFromDomain});
@@ -742,0 +751,0 @@
@@ -8,24 +8,3 @@
(function () {
/**
* Resolve an HTTP redirection
*
* @param the url to check/resolve
* @param callback (error, the resolved url)
*/
function resolveRedirection(options, callback) {
var opts = _.clone(options);
opts.followRedirect = true;
request(options, function(error, response) {
if (error) {
return callback(null, options.url);
}
callback(null, response.url);
});
}
/**
@@ -38,29 +17,9 @@ * Execute the HTTP call, check error, build a response
if (! options.headers) {
options.headers = {};
}
var reqOptions = initRequestOptions(options);
if (options.userAgent) {
//console.log("User Agent :" + options.userAgent);
options.headers['User-Agent'] = options.userAgent;
}
if (typeof options.encoding === 'undefined') {
options.headers['Accept-Encoding'] = 'gzip';
options.encoding = null;
}
if (options.referer) {
options.headers.Referer = options.referer;
}
// For HTTPS requests
// Some old servers do not support recent TSL version
//options.secureOptions = require('constants').SSL_OP_NO_TLSv1_2;
//options.rejectUnauthorized = false;
var start = new Date();
//console.log("ops", ops);
//console.log("options get", options);
request.get(options, function(error,response) {
request.get(reqOptions, function(error,response) {
@@ -104,3 +63,44 @@ var end = new Date() - start;
/**
*
* Create the options for the http request based on the crawler options
* Passing the complete crawler options to request doesn't work
*
* @param The crawler options
*/
function initRequestOptions(options) {
var opts = _.clone(options);
if (! opts.headers) {
opts.headers = {};
}
if (opts.userAgent) {
//console.log("User Agent :" + options.userAgent);
opts.headers['User-Agent'] = opts.userAgent;
}
if (typeof opts.encoding === 'undefined') {
opts.headers['Accept-Encoding'] = 'gzip';
opts.encoding = null;
}
if (opts.referer) {
opts.headers.Referer = opts.referer;
}
opts.method = 'get';
// For HTTPS requests
// Some old servers do not support recent TSL version
//options.secureOptions = require('constants').SSL_OP_NO_TLSv1_2;
//options.rejectUnauthorized = false;
var requestArgs = ['uri','url','method','headers','followRedirect', 'followAllRedirects', 'maxRedirects', 'encoding',
'pool','timeout','proxy', 'referer', 'strictSSL', 'secureOptions', 'rejectUnauthorized','jar' ];
opts = _.pick.apply(this,[opts].concat(requestArgs));
return opts;
}
/**
@@ -125,7 +125,18 @@ *
// Sorry for this hack but that's solve some issues with Cheerio
// This hack solves some issues with Cheerio
// TODO : Check if it is still necessary
result.body = result.body.toString();
// Add the options used for the previous request in the result
// By this way, the next request will use the same options
// TODO : How to avoid this bad manipulation ?
result = _.extend(result, _.omit(options, _.keys(result)));
// The header of the response can be used later by the crawler
result.responseHeaders = result.headers;
// Remove headers. We will rebuild it for the following request in
// the function initRequestOptions
result.headers = null;
options = null;
@@ -136,3 +147,2 @@
module.exports.resolveRedirection = resolveRedirection;
module.exports.get = get;
@@ -139,0 +149,0 @@
{
"name": "crawler-ninja",
"version": "0.2.4",
"version": "0.2.5",
"description": "A web crawler made for the SEO based on plugins. Please wait or contribute ... still in beta",
@@ -5,0 +5,0 @@ "main": "index.js",
@@ -97,6 +97,6 @@ var crypto = require('crypto');
resourceInfo.size = result.body.length;
resourceInfo.contentType = result.headers["content-type"];
resourceInfo.contentType = result.responseHeaders["content-type"];
// last modified & other header attributes
resourceInfo.headers = result.headers;
resourceInfo.responseHeaders = result.responseHeaders;
@@ -152,10 +152,10 @@ // if HTML
resourceInfo.size = result.body.length;
resourceInfo.contentType = result.headers["content-type"];
resourceInfo.contentType = result.responseHeaders["content-type"];
// last modified & other header attributes
resourceInfo.headers = result.headers;
resourceInfo.responseHeaders = result.responseHeaders;
this.resources.set(result.url, resourceInfo);
addToListMap(this.outLinks, result.url, {page: result.headers["location"], anchor : 'Redirect', isDoFollow : true});
addToListMap(this.outLinks, result.url, {page: result.responseHeaders["location"], anchor : 'Redirect', isDoFollow : true});
};
@@ -172,3 +172,3 @@
resourceInfo.headers = result.headers;
resourceInfo.responseHeaders = result.responseHeaders;
@@ -175,0 +175,0 @@ this.resources.set(result.url, resourceInfo);
@@ -47,3 +47,3 @@ /**
var contentType = result.headers[CONTENT_TYPE_HEADER];
var contentType = result.responseHeaders[CONTENT_TYPE_HEADER];
if (contentType) {
@@ -50,0 +50,0 @@ this.addContentType(contentType);
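The plugin hunks above all switch from `result.headers` to `result.responseHeaders`. As a rough, hedged sketch (not code from the package), a custom plugin written against 0.2.5 would read the new field like this; the `crawl(result, $, callback)` hook and the `registerPlugin()` / `init(options, endCallback)` signatures are assumptions based on the bundled plugins and the README, and may need adjusting.

```javascript
var crawler = require("crawler-ninja");

// Hypothetical plugin that records the content type of each crawled url.
function HeaderLoggerPlugin() {
    // map url -> content type, similar to the audit plugin's resources map
    this.contentTypes = {};
}

HeaderLoggerPlugin.prototype.crawl = function(result, $, callback) {
    // Since 0.2.5 the HTTP response headers live in result.responseHeaders
    // (result.headers is cleared so it can be rebuilt for the next request).
    this.contentTypes[result.url] = result.responseHeaders["content-type"];
    callback();
};

var plugin = new HeaderLoggerPlugin();
crawler.init({}, function() {
    console.log("Crawl finished :", plugin.contentTypes);
});
crawler.registerPlugin(plugin);
crawler.queue({url : "http://www.mysite.com/"});
```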
@@ -158,3 +158,3 @@ Crawler Ninja
You can pass change/overide the default crawl options by using the init method.
You can change/override the default crawl options by using the init function.
@@ -169,2 +169,3 @@ ```javascript
- maxConnections : the number of connections used to crawl, default is 5.
- jar : if true, remember cookies for subsequent requests, default is true.
- rateLimits : number of milliseconds to wait between requests, default = 0.
@@ -187,7 +188,7 @@ - externalDomains : if true crawl external domains. This option can crawl a lot of different linked domains, default = false.
- suffixBlackList : the list of url suffixes to avoid crawling (an array of String). The default list is in the file : /default-lists/domain-black-list.js
- method : HTTP method used for crawling, default : GET.
You can also use the [mikeal's request options](https://github.com/mikeal/request#requestoptions-callback) and will be directly passed to the request() method.
You can also use [mikeal's request options](https://github.com/mikeal/request#requestoptions-callback); they will be passed directly to the request() function.
You can pass these options to the init() function if you want them to be global or as
@@ -358,3 +359,3 @@ items in the queue() calls if you want them to be specific to that item (overwriting global options).
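As a hedged illustration of the global vs per-item distinction described above, a setup could look roughly like this; the `init(options, endCallback)` signature is an assumption based on the README and on the `endCallback` used in index.js.

```javascript
var crawler = require("crawler-ninja");

// Global options, applied to every request.
var globalOptions = {
    maxConnections : 5,      // number of parallel connections
    jar            : true,   // keep cookies between requests (new default in 0.2.5)
    rateLimits     : 200,    // wait 200 ms between two requests
    userAgent      : "NinjaBot"
};

crawler.init(globalOptions, function() {
    console.log("Crawl finished");
});

// Options passed to queue() override the global ones for that url only.
crawler.queue({url : "http://www.mysite.com/", referer : "http://www.example.com/"});
```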
With the default crawl options, it is possible to get errors like timeouts on some https sites. This happens with sites that do not support TLS 1.2+.
You can check the HTTPS infos for you site with : https://www.ssllabs.com/ssltest/
You can check the HTTPS info and the TLS compliance level for your site on : https://www.ssllabs.com/ssltest/
@@ -369,5 +370,16 @@ In order to crawl those sites, you have to add the following parameters in the crawl options :
```
We will try to integrate this kind of exception in the crawler code for an upcoming release.
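For illustration, the updated functional test at the end of this diff uses exactly this kind of configuration for a site that only supports older TLS versions:

```javascript
// Crawl options for an https site that does not support TLS 1.2,
// mirroring the updated functional test in this release.
var options = {
    userAgent          : 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
    externalHosts      : true,
    secureOptions      : require('constants').SSL_OP_NO_TLSv1_2,
    rejectUnauthorized : false
};
```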
### Starting the crawl with a redirect on a different subdomain
If you start a crawl on http://www.mysite.com and this url redirects to http://mysite.com, the crawl stops immediately with the default options.
Indeed, the default options don't crawl other hosts/subdomains of the same domain. You can use the option externalHosts to avoid this situation.
```javascript
var options = {
externalHosts : true
};
```
The Crawl Store
@@ -497,3 +509,3 @@ ---------------
- Add in the crawl json param "isExternal". This way, a plugin can check if the link is external or not.
- Add a new option "retry404" : some sites provide inconsistent response for some urls (status 404 instead of 200). In such case, it should be nice to retry (this issuee needs to be analyzed in more detail).
- Add a new option "retry400" : some sites provide inconsistent responses for some urls (status 404 instead of 200). In such cases, it would be nice to retry (this issue needs to be analyzed in more detail).
@@ -525,1 +537,7 @@ 0.1.20
- Now, it is possible to use all http request params in the crawler options.
0.2.5
- Fix regression when crawling specific websites.
- Review how to build the options for making http requests.
- Add the response headers to the result of a request.
- Better support for HTTP redirects (300+).
@@ -76,3 +76,3 @@ var assert = require("assert");
it.skip('Should crawl a https site with a old config', function(done) {
this.timeout(3000000);
this.timeout(40000);
var end = function(){
@@ -84,4 +84,7 @@ //assert(audit.resources.toArray().length === 0);
var options = {
userAgent : 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
externalHosts : true,
secureOptions : require('constants').SSL_OP_NO_TLSv1_2,
rejectUnauthorized : false
};
@@ -93,4 +96,5 @@
crawler.queue({url : "https://www.notaire.be"}); //https://www.notaire.be/
crawler.queue({url : "https://www.notaire.be/" });
});
@@ -97,0 +101,0 @@