supercrawler
Comparing version 0.2.2 to 0.2.3
@@ -341,5 +341,12 @@ var Crawler,
-  // robots.txt doesn't exist in the cache, so we have to hit the
-  // server to get it.
-  return this._downloadUrl(robotsUrl).catch(error.HttpError, function (err) {
+  // We want to add /robots.txt to the crawl queue. This is because we may
+  // parse the robots.txt file with a content handler, in order to extract
+  // its Sitemap: directives. (And then we'll crawl those sitemaps too!)
+  return this.getUrlList().insertIfNotExists(new Url({
+    url: robotsUrl
+  })).then(function () {
+    // robots.txt doesn't exist in the cache, so we have to hit the
+    // server to get it.
+    return self._downloadUrl(robotsUrl);
+  }).catch(error.HttpError, function (err) {
     var robotsStatusCode = err.statusCode;
@@ -346,0 +353,0 @@
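The hunk above queues /robots.txt like any other URL before it is fetched, so the file later flows through the registered content handlers (which is what lets robotsParser pick out its Sitemap: directives). For readers skimming the diff, here is the same control flow as a standalone sketch. It is illustrative only: the function name, its parameters, the fallback return value and the Bluebird dependency are assumptions (HttpError is assumed to subclass Error, as Bluebird's typed catch requires); only the interfaces it calls — insertIfNotExists, a Url constructor taking { url: ... }, and a download step that rejects with HttpError — come from the hunk itself.

    // Standalone sketch of the new flow; names below are illustrative.
    var Promise = require("bluebird");

    function fetchRobotsTxt(urlList, Url, download, HttpError, robotsUrl) {
      // Queue /robots.txt itself so that content handlers (such as
      // robotsParser) later get a chance to extract its Sitemap: directives.
      return Promise.resolve(urlList.insertIfNotExists(new Url({
        url: robotsUrl
      }))).then(function () {
        // Not cached: hit the server for the robots.txt body.
        return download(robotsUrl);
      }).catch(HttpError, function (err) {
        // A missing or forbidden robots.txt is not fatal; hand the status
        // code back so the caller can decide what to do next.
        return { statusCode: err.statusCode, body: null };
      });
    }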
 var Crawler = require("./Crawler"),
   Url = require("./Url"),
   DbUrlList = require("./DbUrlList"),
-  htmlLinkParser = require("./handlers/htmlLinkParser");
+  htmlLinkParser = require("./handlers/htmlLinkParser"),
+  robotsParser = require("./handlers/robotsParser");
@@ -11,4 +12,5 @@ module.exports = {
   handlers: {
-    htmlLinkParser: htmlLinkParser
+    htmlLinkParser: htmlLinkParser,
+    robotsParser: robotsParser
   }
 };
 {
   "name": "supercrawler",
   "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-  "version": "0.2.2",
+  "version": "0.2.3",
   "homepage": "https://github.com/brendonboshell/supercrawler",
@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",
@@ -9,2 +9,15 @@ # Supercrawler - Node.js Web Crawler
+## Features
+
+* **Link Detection**. Supercrawler will parse crawled HTML documents, identify
+  links and add them to the queue.
+* **Robots Parsing**. Supercrawler will request robots.txt and check the rules
+  before crawling. It will also identify any sitemaps.
+* **Sitemaps Parsing**. Supercrawler will read links from XML sitemap files,
+  and add links to the queue.
+* **Concurrency Limiting**. Supercrawler limits the number of requests sent out
+  at any one time.
+* **Rate limiting**. Supercrawler will add a delay between requests to avoid
+  bombarding servers.
+
 ## Step 1. Create a New Crawler
@@ -181,12 +194,18 @@
-## Features
-
-* Pluggable priority queues. Supercrawler ships with a simple first-in,
-  first-out style queue. But you can easily plug your own queue in, allowing
-  you to retry failed crawls, prioritize specific pages or save crawl data
-  in a database, for example.
-* Concurrency limiting. You can set a maximum number of requests that can
-  execute at the same time.
-* Rate limiting. You can set a rate limit to prevent crawling too quickly.
-* Robots adherence. Supercrawler automatically downloads, checks and caches
-  the results of robots.txt exclusions.
+## handlers.robotsParser
+
+A function that returns a handler which parses a robots.txt file. Robots.txt
+files are automatically crawled, and sent through the same content handler
+routines as any other file. This handler will look for any `Sitemap: ` directives,
+and add those XML sitemaps to the crawl.
+
+It will ignore any files that are not `/robots.txt`.
+
+If you want to extract the URLs from those XML sitemaps, you will also need
+to add a sitemap parser.
+
+Example usage:
+
+    var rp = supercrawler.handlers.robotsParser();
+    crawler.addHandler("text/plain", supercrawler.handlers.robotsParser());
@@ -276,2 +276,18 @@ var proxyquire = require('proxyquire'),
+  it("adds the robots.txt file itself to the crawl queue", function (done) {
+    var crawler = new Crawler({
+      interval: 10
+    });
+
+    crawler.start();
+
+    setTimeout(function () {
+      crawler.stop();
+      sinon.assert.calledWith(insertIfNotExistsSpy, sinon.match({
+        _url: "https://example.com/robots.txt"
+      }));
+      done();
+    }, 200);
+  });
+
   it("requests a page that is not excluded by robots.txt", function (done) {
@@ -278,0 +294,0 @@ var crawler = new Crawler({
@@ -7,2 +7,3 @@ var proxyquire = require('proxyquire'),
     htmlLinkParserMock,
+    robotsParserMock,
     index;
@@ -14,2 +15,3 @@
   htmlLinkParserMock = function () {};
+  robotsParserMock = function () {};
@@ -20,3 +22,4 @@ index = proxyquire("../lib/index", {
     "./DbUrlList": DbUrlListMock,
-    "./handlers/htmlLinkParser": htmlLinkParserMock
+    "./handlers/htmlLinkParser": htmlLinkParserMock,
+    "./handlers/robotsParser": robotsParserMock
   });
@@ -40,2 +43,6 @@
   });
+
+  it("exposes robotsParser", function () {
+    expect(index.handlers.robotsParser).to.equal(robotsParserMock);
+  });
 });
License Policy Violation: this package is not allowed per your license policy. Review the package's license to ensure compliance. Found 1 instance in 1 package.