supercrawler
Comparing version 1.3.3 to 1.4.0
@@ -45,2 +45,3 @@ var Crawler,
   this._outstandingRequests = 0;
+  this._robotsIgnoreServerError = opts.robotsIgnoreServerError || false;
 };
@@ -270,6 +271,7 @@
       });
-    }).catch(error.RobotsNotAllowedError, function () {
+    }).catch(error.RobotsNotAllowedError, function (err) {
       return new Url({
         url: url,
-        errorCode: "ROBOTS_NOT_ALLOWED"
+        errorCode: "ROBOTS_NOT_ALLOWED",
+        errorMessage: err.message
       });
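With this change the Url record carries the human-readable reason alongside the machine-readable code. A minimal sketch of what that record now holds, using the documented `supercrawler.Url` class; whether a matching `getErrorMessage()` accessor exists is not confirmed by this diff, so the sketch sticks to `getErrorCode()`:

```js
var supercrawler = require("supercrawler");

// Mirror of the object built in the catch handler above; the
// message text here is illustrative, not a real library string.
var url = new supercrawler.Url({
  url: "https://example.com/private",
  errorCode: "ROBOTS_NOT_ALLOWED",
  errorMessage: "Disallowed by robots.txt"
});

console.log(url.getErrorCode()); // "ROBOTS_NOT_ALLOWED"
```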
@@ -433,2 +435,3 @@ }).catch(error.HttpError, function (err) {
     robotsTxt,
+    ignoreServerError,
     self = this;
@@ -439,2 +442,3 @@
   robotsTxt = this._robotsCache.get(robotsUrl);
+  ignoreServerError = this._robotsIgnoreServerError;
@@ -457,8 +461,16 @@ if (typeof robotsTxt !== "undefined") {
-      // if robots returns a 404 or 410, we assume there are no restrictions.
-      if (robotsStatusCode === 404 || robotsStatusCode === 410) {
-        return Promise.resolve({
-          statusCode: 200,
-          body: ""
-        });
+      // if robots returns a dismissable status code, we assume
+      // there are no restrictions.
+      switch (robotsStatusCode) {
+        case 404:
+        case 410:
+        case 500:
+          if (robotsStatusCode === 500 && !ignoreServerError) {
+            break;
+          }
+          return Promise.resolve({
+            statusCode: 200,
+            body: ""
+          });
       }
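Distilled, the new control flow implements a simple predicate: 404 and 410 always mean "no restrictions", and 500 means the same only when the caller opted in. A hypothetical standalone version of that same logic:

```js
// Hypothetical helper equivalent to the switch above.
function isDismissableRobotsStatus(statusCode, ignoreServerError) {
  if (statusCode === 404 || statusCode === 410) {
    return true; // robots.txt missing or gone: crawl everything
  }
  // A server error only counts as "no restrictions" when the
  // robotsIgnoreServerError flag was set.
  return statusCode === 500 && ignoreServerError;
}
```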
 {
   "name": "supercrawler",
   "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-  "version": "1.3.3",
+  "version": "1.4.0",
   "homepage": "https://github.com/brendonboshell/supercrawler",
   "author": "Brendon Boshell <brendonboshell@gmail.com>",
@@ -132,2 +132,3 @@ # Node.js Web Crawler
 | robotsCacheTime | Number of milliseconds that robots.txt should be cached for. Defaults to 3600000 (1 hour). |
+| robotsIgnoreServerError | Indicates whether a `500` status code response for robots.txt should be ignored. Defaults to `false`. |
 | userAgent | User agent to use for requests. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)` |
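Putting the table rows above together, a sketch of a crawler configured with the new option (values illustrative; `robotsCacheTime` and `userAgent` are simply shown at their documented defaults):

```js
var supercrawler = require("supercrawler");

var crawler = new supercrawler.Crawler({
  // Cache robots.txt for one hour (the documented default).
  robotsCacheTime: 3600000,
  // New in 1.4.0: treat a 500 from robots.txt as "allow all"
  // instead of the default "deny all".
  robotsIgnoreServerError: true,
  userAgent: "Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)"
});
```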
@@ -355,2 +356,10 @@ | request | Object of options to be passed to [request](https://github.com/request/request). Note that request does not support an asynchronous (and distributed) cookie jar. |
+### 1.4.0
+* [Added] Added the `robotsIgnoreServerError` option to accept a robots.txt 500 error code as "allow all" rather than "deny all" (default), thanks [cbess](https://github.com/cbess).
 ### 1.3.3
 * [Fix] Updated dependencies, thanks [cbess](https://github.com/cbess).
 ### 1.3.1
@@ -553,2 +553,33 @@ var proxyquire = require('proxyquire'),
+
+  it("crawls all pages if robots.txt is 500 (robotsIgnoreServerError flag)", function (done) {
+    var crawler = new Crawler({
+      interval: 10,
+      robotsIgnoreServerError: true
+    });
+
+    crawler.start();
+    robotsStatusCode = 500;
+
+    setTimeout(function () {
+      crawler.stop();
+      expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(1);
+      done();
+    }, 200);
+  });
+
+  it("does not crawl pages if robots.txt is 500", function (done) {
+    var crawler = new Crawler({
+      interval: 10
+    });
+
+    crawler.start();
+    robotsStatusCode = 500;
+
+    setTimeout(function () {
+      crawler.stop();
+      expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(0);
+      done();
+    }, 200);
+  });
+
   it("excludes all pages if robots.txt could not be crawled", function (done) {
     var crawler = new Crawler({
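For orientation, the tests above flip a suite-level `robotsStatusCode` variable that the file's request stub (wired up with proxyquire, per the hunk header) evidently serves back for robots.txt fetches. The stub itself is outside this diff, so the following is only a sketch of that pattern, with hypothetical module paths and response bodies:

```js
var proxyquire = require("proxyquire");

var robotsStatusCode = 200; // each test mutates this, as above

// Hypothetical wiring: swap the request module for a fake that
// answers robots.txt fetches with the configured status code.
var Crawler = proxyquire("../lib/Crawler", {
  request: function (opts, callback) {
    if (/\/robots\.txt$/.test(opts.url)) {
      callback(null, { statusCode: robotsStatusCode }, "");
      return;
    }
    // Everything else succeeds with a trivial page.
    callback(null, { statusCode: 200 }, "<html></html>");
  }
});
```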