supercrawler
Comparing version 0.2.3 to 0.3.0
@@ -255,3 +255,3 @@ var Crawler,
 return Promise.try(function () {
-return handlerFun(body, url);
+return handlerFun(body, url, contentType);
 }).then(function (subArr) {
@@ -279,3 +279,4 @@ if (!(subArr instanceof Array)) {
 "User-Agent": this.getUserAgent()
-}
+},
+encoding: null
 }).catch(function (err) {
@@ -368,6 +369,8 @@ err = new error.RequestError("A request error occured. " + err.message);
 }).then(function (response) {
-var body;
+var body,
+  robotsTxt;
 body = response.body;
-self._robotsCache.set(robotsUrl, body);
+robotsTxt = body.toString();
+self._robotsCache.set(robotsUrl, robotsTxt);
@@ -374,0 +377,0 @@ return robotsTxt;
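A note on the encoding change above: with `encoding: null`, the request module (which the tests below stub as a request(opts, cb) spy) returns the response body as a raw Buffer rather than a decoded string, which is why the robots.txt body is now decoded with toString() before it is cached. A minimal standalone sketch of that behaviour, assuming the request module and a placeholder URL, not the library's own code:

var request = require("request");

// encoding: null makes request return the body as a Buffer, not a string.
request({
  url: "https://example.com/robots.txt",
  encoding: null
}, function (err, response, body) {
  if (err) {
    return console.error(err);
  }
  var robotsTxt = body.toString(); // decode the Buffer before parsing or caching
  console.log(robotsTxt);
});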
@@ -9,6 +9,6 @@ var cheerio = require("cheerio"),
-return function (body, url) {
+return function (buf, url) {
 var $;
-$ = cheerio.load(body);
+$ = cheerio.load(buf);
@@ -15,0 +15,0 @@ return $("a[href]").map(function () {
@@ -10,3 +10,3 @@ var robotsParser = require("robots-parser"),
 module.exports = function () {
-return function (body, url) {
+return function (buf, url) {
 var robots,
@@ -22,3 +22,3 @@ urlObj;
-robots = robotsParser(url, body);
+robots = robotsParser(url, buf.toString());
@@ -25,0 +25,0 @@ return robots.getSitemaps().map(function (sitemapHref) {
@@ -5,3 +5,4 @@ var Crawler = require("./Crawler"),
 htmlLinkParser = require("./handlers/htmlLinkParser"),
-robotsParser = require("./handlers/robotsParser");
+robotsParser = require("./handlers/robotsParser"),
+sitemapsParser = require("./handlers/sitemapsParser");
@@ -14,4 +15,5 @@ module.exports = {
 htmlLinkParser: htmlLinkParser,
-robotsParser: robotsParser
+robotsParser: robotsParser,
+sitemapsParser: sitemapsParser
 }
 };
 {
 "name": "supercrawler",
 "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-"version": "0.2.3",
+"version": "0.3.0",
 "homepage": "https://github.com/brendonboshell/supercrawler",
@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",
@@ -45,3 +45,3 @@ # Supercrawler - Node.js Web Crawler
 }));
-crawler.addHandler("text/html", function (body, url) {
+crawler.addHandler("text/html", function (buf, url) {
 console.log("Got page", url);
@@ -211,1 +211,14 @@ });
 crawler.addHandler("text/plain", supercrawler.handlers.robotsParser());
+# handlers.sitemapsParser
+A function that returns a handler which parses an XML sitemaps file. It will
+pick up any URLs matching `sitemapindex > sitemap > loc, urlset > url > loc`.
+It will also handle a gzipped file, since that is part of the sitemaps
+specification.
+Example usage:
+var sp = supercrawler.handlers.sitemapsParser();
+crawler.addHandler(sp);
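Taken together with the Crawler change above, handlers in 0.3.0 receive the raw body as a Buffer and the content type as a third argument. A minimal handler written against the new signature might look like this (an illustrative sketch; the variable names are assumptions, not the README's wording):

crawler.addHandler("text/html", function (buf, url, contentType) {
  // buf is a Buffer in 0.3.0, so decode it before treating it as text.
  var body = buf.toString();
  console.log("Got", contentType, "from", url, "-", body.length, "characters");
});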
@@ -39,3 +39,4 @@ var proxyquire = require('proxyquire'),
 pageBody,
-robotsStatusCode;
+robotsStatusCode,
+robotsTxt;
@@ -47,2 +48,6 @@ beforeEach(function () {
 robotsStatusCode = 200;
+robotsTxt = ["User-agent: *",
+  "Allow: /",
+  "Disallow: /index17.html"
+].join("\n");
@@ -65,3 +70,3 @@ requestSpy = sinon.spy(function (opts, cb) {
 statusCode: pageStatusCode,
-body: pageBody
+body: new Buffer(pageBody)
 });
@@ -80,6 +85,3 @@ }, 1);
 statusCode: robotsStatusCode,
-body: ["User-agent: *",
-  "Allow: /",
-  "Disallow: /index17.html"
-].join("\n")
+body: new Buffer(robotsTxt)
 });
@@ -310,2 +312,20 @@ }, 1);
+it("skips page excluded by robots.txt, even if robots.txt not in cache", function (done) {
+  var crawler = new Crawler({
+    interval: 10
+  });
+  robotsTxt = ["User-agent: *",
+    "Allow: /",
+    "Disallow: /index1.html"
+  ].join("\n");
+  crawler.start();
+  setTimeout(function () {
+    crawler.stop();
+    expect(numCrawlsOfUrl("https://example.com/index1.html")).to.equal(0);
+    done();
+  }, 200);
+});
 it("skips a page that is excluded by robots.txt", function (done) {
@@ -433,3 +453,3 @@ var crawler = new Crawler({
 sinon.assert.calledWith(handler,
-sinon.match("<html><body>test</body></html>"),
+sinon.match(new Buffer("<html><body>test</body></html>")),
 "https://example.com/index1.html");
@@ -452,3 +472,3 @@ done();
 sinon.assert.calledWith(handler,
-sinon.match("<html><body>test</body></html>"),
+sinon.match(new Buffer("<html><body>test</body></html>")),
 "https://example.com/index1.html");
@@ -487,3 +507,3 @@ done();
 crawler.stop();
-expect(handler.calledWith(sinon.match("<html><body>test</body></html>"),
+expect(handler.calledWith(sinon.match(new Buffer("<html><body>test</body></html>")),
 "https://example.com/index1.html")).to.equal(true);
@@ -505,3 +525,3 @@ done();
 crawler.stop();
-expect(handler.calledWith(sinon.match("<html><body>test</body></html>"),
+expect(handler.calledWith(sinon.match(new Buffer("<html><body>test</body></html>")),
 "https://example.com/index1.html")).to.equal(true);
@@ -529,2 +549,21 @@ done();
+it("passes the content type as the third argument", function (done) {
+  var crawler = new Crawler({
+    interval: 10
+  });
+  crawler.addHandler(handler);
+  crawler.start();
+  pageContentType = "text/plain";
+  setTimeout(function () {
+    crawler.stop();
+    sinon.assert.calledWith(handler,
+      sinon.match(new Buffer("<html><body>test</body></html>")),
+      "https://example.com/index1.html",
+      "text/plain");
+    done();
+  }, 15);
+});
 it("adds URL to the queue", function (done) {
@@ -531,0 +570,0 @@ var crawler = new Crawler({
@@ -14,3 +14,3 @@ var htmlLinkParser = require("../../lib/handlers/htmlLinkParser"),
-return html;
+return new Buffer(html);
 };
@@ -17,0 +17,0 @@
@@ -20,3 +20,3 @@ var robotsParser = require("../../lib/handlers/robotsParser"),
 it("can extract extract a absolute path sitemap", function () {
-expect(rb(robotsTxt, "http://example.com/robots.txt")).to.deep.equal([
+expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([
 "http://subdomain.example.com/sitemap_index_1.xml"
@@ -29,3 +29,3 @@ ]);
-expect(rb(robotsTxt, "http://example.com/robots.txt")).to.deep.equal([
+expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([
 "http://subdomain.example.com/sitemap_index_1.xml",
@@ -38,8 +38,8 @@ "http://example.com/sitemap_index.xml"
 robotsTxt = "";
-expect(rb(robotsTxt, "http://example.com/robots.txt")).to.deep.equal([]);
+expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([]);
 });
 it("returns empty when the URL path is not /robots.txt", function () {
-expect(rb(robotsTxt, "http://example.com/Iamnotarobots.txt")).to.deep.equal([]);
+expect(rb(new Buffer(robotsTxt), "http://example.com/Iamnotarobots.txt")).to.deep.equal([]);
 });
 });
@@ -8,2 +8,3 @@ var proxyquire = require('proxyquire'),
 robotsParserMock,
+sitemapsParserMock,
 index;
@@ -16,2 +17,3 @@
 robotsParserMock = function () {};
+sitemapsParserMock = function () {};
@@ -23,3 +25,4 @@ index = proxyquire("../lib/index", {
 "./handlers/htmlLinkParser": htmlLinkParserMock,
-"./handlers/robotsParser": robotsParserMock
+"./handlers/robotsParser": robotsParserMock,
+"./handlers/sitemapsParser": sitemapsParserMock
 });
@@ -47,2 +50,6 @@
 });
+it("exposes sitemapsParser", function () {
+  expect(index.handlers.sitemapsParser).to.equal(sitemapsParserMock);
+});
 });