
supercrawler

Dependencies: 179 · Maintainers: 1 · Versions: 45
Comparing version 1.3.3 to 1.4.0


lib/Crawler.js

@@ -45,2 +45,3 @@ var Crawler,

this._outstandingRequests = 0;
+ this._robotsIgnoreServerError = opts.robotsIgnoreServerError || false;
};

@@ -270,6 +271,7 @@

});
- }).catch(error.RobotsNotAllowedError, function () {
+ }).catch(error.RobotsNotAllowedError, function (err) {
return new Url({
url: url,
- errorCode: "ROBOTS_NOT_ALLOWED"
+ errorCode: "ROBOTS_NOT_ALLOWED",
+ errorMessage: err.message
});

@@ -433,2 +435,3 @@ }).catch(error.HttpError, function (err) {

robotsTxt,
+ ignoreServerError,
self = this;

@@ -439,2 +442,3 @@

robotsTxt = this._robotsCache.get(robotsUrl);
+ ignoreServerError = this._robotsIgnoreServerError;

@@ -457,8 +461,16 @@ if (typeof robotsTxt !== "undefined") {

- // if robots returns a 404 or 410, we assume there are no restrictions.
- if (robotsStatusCode === 404 || robotsStatusCode === 410) {
- return Promise.resolve({
- statusCode: 200,
- body: ""
- });
+ // if robots returns a dismissable status code, we assume
+ // there are no restrictions.
+ switch (robotsStatusCode) {
+ case 404:
+ case 410:
+ case 500:
+ if (robotsStatusCode === 500 && !ignoreServerError) {
+ break;
+ }
+ return Promise.resolve({
+ statusCode: 200,
+ body: ""
+ });
+ }

@@ -465,0 +477,0 @@
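The net effect of these Crawler.js changes, shown as a standalone sketch (the function below is illustrative only and is not part of supercrawler's API):

// Illustrative sketch of the 1.4.0 robots.txt fallback logic; not a library export.
// statusCode is the HTTP status received for robots.txt; ignoreServerError is the
// value of the new robotsIgnoreServerError option.
function robotsFallbackResponse(statusCode, ignoreServerError) {
  // 404 and 410 still mean "no robots.txt, so no restrictions".
  if (statusCode === 404 || statusCode === 410) {
    return { statusCode: 200, body: "" };
  }
  // 500 is now treated the same way, but only when robotsIgnoreServerError is set.
  if (statusCode === 500 && ignoreServerError) {
    return { statusCode: 200, body: "" };
  }
  // Otherwise the error propagates as before, and the URL is recorded with
  // errorCode "ROBOTS_NOT_ALLOWED" (now carrying the error's message as errorMessage).
  return null;
}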

package.json

{
"name": "supercrawler",
"description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
"version": "1.3.3",
"version": "1.4.0",
"homepage": "https://github.com/brendonboshell/supercrawler",

@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",
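The description above mentions custom content handlers alongside robots.txt, rate and concurrency limits. A brief usage sketch of that workflow, following the handler-registration pattern from the project's README (the hostname and seed URL are placeholders):

var supercrawler = require("supercrawler");

var crawler = new supercrawler.Crawler({
  interval: 1000,
  concurrentRequestsLimit: 5
});

// Custom handler: extract links from HTML pages on the given hostname and
// queue them for crawling.
crawler.addHandler("text/html", supercrawler.handlers.htmlLinkParser({
  hostnames: ["example.com"]
}));

// Seed the URL list and start; robots.txt, rate limits and concurrency limits
// are enforced by the crawler itself.
crawler.getUrlList()
  .insertIfNotExists(new supercrawler.Url("https://example.com/"))
  .then(function () {
    crawler.start();
  });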

README.md

@@ -132,2 +132,3 @@ # Node.js Web Crawler

| robotsCacheTime | Number of milliseconds that robots.txt should be cached for. Defaults to 3600000 (1 hour). |
+ | robotsIgnoreServerError | Indicates if `500` status code response for robots.txt should be ignored. Defaults to `false`. |
| userAgent | User agent to use for requests. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)` |
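A minimal sketch of the new `robotsIgnoreServerError` option in use, based on the constructor options documented in the table above (the interval value is arbitrary):

var supercrawler = require("supercrawler");

// Treat a 500 response for robots.txt as "allow all" rather than the default "deny all".
var crawler = new supercrawler.Crawler({
  interval: 1000,
  robotsIgnoreServerError: true
});

crawler.start();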

@@ -355,2 +356,10 @@ | request | Object of options to be passed to [request](https://github.com/request/request). Note that request does not support an asynchronous (and distributed) cookie jar. |

+ ### 1.4.0
+ * [Added] Added the `robotsIgnoreServerError` option to accept a robots.txt 500 error code as "allow all" rather than "deny all" (default), thanks [cbess](https://github.com/cbess).
### 1.3.3
* [Fix] Updated dependencies, thanks [cbess](https://github.com/cbess).
### 1.3.1

@@ -357,0 +366,0 @@

@@ -553,2 +553,33 @@ var proxyquire = require('proxyquire'),

it("crawls all pages if robots.txt is 500 (robotsIgnoreServerError flag)", function (done) {
var crawler = new Crawler({
interval: 10,
robotsIgnoreServerError: true
});
crawler.start();
robotsStatusCode = 500;
setTimeout(function () {
crawler.stop();
expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(1);
done();
}, 200);
});
it("does not crawl pages if robots.txt is 500", function (done) {
var crawler = new Crawler({
interval: 10
});
crawler.start();
robotsStatusCode = 500;
setTimeout(function () {
crawler.stop();
expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(0);
done();
}, 200);
});
it("excludes all pages if robots.txt could not be crawled", function (done) {

@@ -555,0 +586,0 @@ var crawler = new Crawler({
