supercrawler
Comparing version 1.3.3 to 1.4.0
@@ -45,2 +45,3 @@ var Crawler,
   this._outstandingRequests = 0;
+  this._robotsIgnoreServerError = opts.robotsIgnoreServerError || false;
 };
@@ -270,6 +271,7 @@
       });
-    }).catch(error.RobotsNotAllowedError, function () {
+    }).catch(error.RobotsNotAllowedError, function (err) {
       return new Url({
         url: url,
-        errorCode: "ROBOTS_NOT_ALLOWED"
+        errorCode: "ROBOTS_NOT_ALLOWED",
+        errorMessage: err.message
       });
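With this change the Url record carries the human-readable reason alongside the machine-readable code. A minimal sketch of what that record now holds, using the documented `supercrawler.Url` class; whether a matching `getErrorMessage()` accessor exists is not confirmed by this diff, so the sketch sticks to `getErrorCode()`:

```js
var supercrawler = require("supercrawler");

// Mirror of the object built in the catch handler above; the
// message text here is illustrative, not a real library string.
var url = new supercrawler.Url({
  url: "https://example.com/private",
  errorCode: "ROBOTS_NOT_ALLOWED",
  errorMessage: "Disallowed by robots.txt"
});

console.log(url.getErrorCode()); // "ROBOTS_NOT_ALLOWED"
```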
@@ -433,2 +435,3 @@ }).catch(error.HttpError, function (err) {
     robotsTxt,
+    ignoreServerError,
     self = this;
@@ -439,2 +442,3 @@
   robotsTxt = this._robotsCache.get(robotsUrl);
+  ignoreServerError = this._robotsIgnoreServerError;
@@ -457,8 +461,16 @@ if (typeof robotsTxt !== "undefined") {
-      // if robots returns a 404 or 410, we assume there are no restrictions.
-      if (robotsStatusCode === 404 || robotsStatusCode === 410) {
-        return Promise.resolve({
-          statusCode: 200,
-          body: ""
-        });
+      // if robots returns a dismissable status code, we assume
+      // there are no restrictions.
+      switch (robotsStatusCode) {
+        case 404:
+        case 410:
+        case 500:
+          if (robotsStatusCode === 500 && !ignoreServerError) {
+            break;
+          }
+          return Promise.resolve({
+            statusCode: 200,
+            body: ""
+          });
       }
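Distilled, the new control flow implements a simple predicate: 404 and 410 always mean "no restrictions", and 500 means the same only when the caller opted in. A hypothetical standalone version of that same logic:

```js
// Hypothetical helper equivalent to the switch above.
function isDismissableRobotsStatus(statusCode, ignoreServerError) {
  if (statusCode === 404 || statusCode === 410) {
    return true; // robots.txt missing or gone: crawl everything
  }
  // A server error only counts as "no restrictions" when the
  // robotsIgnoreServerError flag was set.
  return statusCode === 500 && ignoreServerError;
}
```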
 {
   "name": "supercrawler",
   "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-  "version": "1.3.3",
+  "version": "1.4.0",
   "homepage": "https://github.com/brendonboshell/supercrawler",
   "author": "Brendon Boshell <brendonboshell@gmail.com>",
@@ -132,2 +132,3 @@ # Node.js Web Crawler
 | robotsCacheTime | Number of milliseconds that robots.txt should be cached for. Defaults to 3600000 (1 hour). |
+| robotsIgnoreServerError | Indicates whether a `500` status code response for robots.txt should be ignored. Defaults to `false`. |
 | userAgent | User agent to use for requests. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)` |
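Putting the table rows above together, a sketch of a crawler configured with the new option (values illustrative; `robotsCacheTime` and `userAgent` are simply shown at their documented defaults):

```js
var supercrawler = require("supercrawler");

var crawler = new supercrawler.Crawler({
  // Cache robots.txt for one hour (the documented default).
  robotsCacheTime: 3600000,
  // New in 1.4.0: treat a 500 from robots.txt as "allow all"
  // instead of the default "deny all".
  robotsIgnoreServerError: true,
  userAgent: "Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)"
});
```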
@@ -355,2 +356,10 @@ | request | Object of options to be passed to [request](https://github.com/request/request). Note that request does not support an asynchronous (and distributed) cookie jar. |
+### 1.4.0
+* [Added] Added the `robotsIgnoreServerError` option to accept a robots.txt 500 error code as "allow all" rather than "deny all" (default), thanks [cbess](https://github.com/cbess).
 ### 1.3.3
 * [Fix] Updated dependencies, thanks [cbess](https://github.com/cbess).
 ### 1.3.1
@@ -553,2 +553,33 @@ var proxyquire = require('proxyquire'),
+
+  it("crawls all pages if robots.txt is 500 (robotsIgnoreServerError flag)", function (done) {
+    var crawler = new Crawler({
+      interval: 10,
+      robotsIgnoreServerError: true
+    });
+
+    crawler.start();
+    robotsStatusCode = 500;
+
+    setTimeout(function () {
+      crawler.stop();
+      expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(1);
+      done();
+    }, 200);
+  });
+
+  it("does not crawl pages if robots.txt is 500", function (done) {
+    var crawler = new Crawler({
+      interval: 10
+    });
+
+    crawler.start();
+    robotsStatusCode = 500;
+
+    setTimeout(function () {
+      crawler.stop();
+      expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(0);
+      done();
+    }, 200);
+  });
+
   it("excludes all pages if robots.txt could not be crawled", function (done) {
     var crawler = new Crawler({
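For orientation, the tests above flip a suite-level `robotsStatusCode` variable that the file's request stub (wired up with proxyquire, per the hunk header) evidently serves back for robots.txt fetches. The stub itself is outside this diff, so the following is only a sketch of that pattern, with hypothetical module paths and response bodies:

```js
var proxyquire = require("proxyquire");

var robotsStatusCode = 200; // each test mutates this, as above

// Hypothetical wiring: swap the request module for a fake that
// answers robots.txt fetches with the configured status code.
var Crawler = proxyquire("../lib/Crawler", {
  request: function (opts, callback) {
    if (/\/robots\.txt$/.test(opts.url)) {
      callback(null, { statusCode: robotsStatusCode }, "");
      return;
    }
    // Everything else succeeds with a trivial page.
    callback(null, { statusCode: 200 }, "<html></html>");
  }
});
```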