supercrawler
Comparing version 0.16.0 to 0.16.1
@@ -414,4 +414,4 @@ var Crawler,
-    // if robots returns a 404, we assume there are no restrictions.
-    if (robotsStatusCode === 404) {
+    // if robots returns a 404 or 410, we assume there are no restrictions.
+    if (robotsStatusCode === 404 || robotsStatusCode === 410) {
       return Promise.resolve({
@@ -418,0 +418,0 @@ statusCode: 200,
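Read in context, the patched branch makes the crawler treat a 410 Gone for robots.txt exactly like a 404: the file is taken to be absent, so no crawling restrictions apply and the fetch resolves as if an empty robots.txt had been served. A minimal standalone sketch of that idea, assuming the fetch result carries `statusCode` and `body` (a hypothetical helper, not the library's actual function):

```js
// Hypothetical sketch: normalise a robots.txt fetch so that a missing file
// (404 Not Found or 410 Gone) behaves like an empty, permissive robots.txt.
function normaliseRobotsResponse(response) {
  if (response.statusCode === 404 || response.statusCode === 410) {
    // Assumption: an empty body imposes no rules, so every URL may be crawled.
    return Promise.resolve({ statusCode: 200, body: "" });
  }

  // Any other status (including 5xx) is passed through for the caller to handle.
  return Promise.resolve(response);
}
```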
 {
   "name": "supercrawler",
   "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-  "version": "0.16.0",
+  "version": "0.16.1",
   "homepage": "https://github.com/brendonboshell/supercrawler",
@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",
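The manifest's description sums up the package's behaviour: custom content handlers, automatic robots.txt compliance, and rate/concurrency limiting. For context, a short usage sketch based on the project's documented API; the option names and the `htmlLinkParser` helper are as documented upstream, so verify them against the version you install:

```js
var supercrawler = require("supercrawler");

// Crawl politely: one request per second, at most five requests in flight.
var crawler = new supercrawler.Crawler({
  interval: 1000,
  concurrentRequestsLimit: 5
});

// Discover links in HTML pages, restricted to a single hostname.
crawler.addHandler("text/html", supercrawler.handlers.htmlLinkParser({
  hostnames: ["example.com"]
}));

// Custom handler: log every page that gets processed.
crawler.addHandler("text/html", function (context) {
  console.log("Crawled", context.url);
});

// Seed the queue and start; robots.txt is fetched and obeyed automatically,
// which is the code path the 404/410 change above affects.
crawler.getUrlList()
  .insertIfNotExists(new supercrawler.Url("https://example.com/"))
  .then(function () {
    crawler.start();
  });
```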
@@ -350,2 +350,6 @@ # Node.js Web Crawler
+### 0.16.1
+
+* [Fixed] Treats 410 the same as 404 for robots.txt requests.
+
 ### 0.16.0
@@ -352,0 +356,0 @@
@@ -450,2 +450,17 @@ var proxyquire = require('proxyquire'),
+  it("crawls all pages if robots.txt is 410", function (done) {
+    var crawler = new Crawler({
+      interval: 10
+    });
+
+    crawler.start();
+    robotsStatusCode = 410;
+
+    setTimeout(function () {
+      crawler.stop();
+      expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(1);
+      done();
+    }, 200);
+  });
+
   it("excludes all pages if robots.txt could not be crawled", function (done) {
@@ -452,0 +467,0 @@ var crawler = new Crawler({
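The new test relies on the suite's existing proxyquire setup, where the stubbed HTTP layer reads a shared `robotsStatusCode` variable so each test can dictate how the robots.txt request resolves. A generic sketch of that stubbing pattern (the module paths and request shape here are hypothetical, not the actual fixture):

```js
var proxyquire = require("proxyquire");

// Shared state that individual tests mutate, e.g. robotsStatusCode = 410;
var robotsStatusCode = 200;

// Hypothetical paths: load the crawler with its HTTP dependency replaced, so
// requests for /robots.txt resolve with whatever status the test configured.
var Crawler = proxyquire("../lib/Crawler", {
  "./request": function (url) {
    if (/\/robots\.txt$/.test(url)) {
      return Promise.resolve({ statusCode: robotsStatusCode, body: "" });
    }

    return Promise.resolve({ statusCode: 200, body: "<html></html>" });
  }
});
```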
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package