supercrawler - npm Package Compare versions

10

lib/Crawler.js

		@@ -276,4 +276,12 @@ var Crawler,
		Crawler.prototype._fireHandlers = function (contentType, body, url) {
		var ctx;

		contentType = contentType.replace(/;.*$/g, "");

		ctx = {
		body: body,
		url: url,
		contentType: contentType
		};

		return Promise.reduce(this._handlers, function (arr, handlerObj) {
		@@ -295,3 +303,3 @@ var handlerContentType = handlerObj.contentType,
		return Promise.try(function () {
		return handlerFun(body, url, contentType);
		return handlerFun(ctx);
		}).then(function (subArr) {
		@@ -298,0 +306,0 @@ if (!(subArr instanceof Array)) {

7

lib/handlers/htmlLinkParser.js

		@@ -9,6 +9,7 @@ var cheerio = require("cheerio"),

		return function (buf, url) {
		return function (context) {
		var $;

		$ = cheerio.load(buf);
		$ = context.$ || cheerio.load(context.body);
		context.$ = $;

		@@ -25,3 +26,3 @@ return $("a[href], link[href][rel=alternate]").map(function () {
		targetHref = $this.attr("href");
		absoluteTargetUrl = urlMod.resolve(url, targetHref);
		absoluteTargetUrl = urlMod.resolve(context.url, targetHref);
		urlObj = urlMod.parse(absoluteTargetUrl);
		@@ -28,0 +29,0 @@ protocol = urlObj.protocol;

10

lib/handlers/robotsParser.js

		@@ -20,7 +20,7 @@ var robotsParser = require("robots-parser"),

		return function (buf, url) {
		return function (context) {
		var robots,
		urlObj;

		urlObj = urlMod.parse(url);
		urlObj = urlMod.parse(context.url);

		@@ -32,10 +32,10 @@ // skip if this is not actually a robots.txt file.

		robots = robotsParser(url, buf.toString());
		robots = robotsParser(context.url, context.body.toString());

		return robots.getSitemaps().map(function (sitemapHref) {
		return urlMod.resolve(url, sitemapHref);
		return urlMod.resolve(context.url, sitemapHref);
		}).filter(function (sitemapUrl) {
		return opts.urlFilter(sitemapUrl, url);
		return opts.urlFilter(sitemapUrl, context.url);
		});
		};
		};

15

lib/handlers/sitemapsParser.js

		@@ -34,11 +34,11 @@ var cheerio = require("cheerio"),

		return function (buf, url, contentType) {
		return function (context) {
		var xmlBufProm;

		// If sitemap has come in compressed state, we must uncompress it!
		if (contentType === "application/x-gzip" ||
		contentType === "application/gzip") {
		xmlBufProm = Promise.promisify(zlib.gunzip)(buf);
		if (context.contentType === "application/x-gzip" ||
		context.contentType === "application/gzip") {
		xmlBufProm = Promise.promisify(zlib.gunzip)(context.body);
		} else {
		xmlBufProm = Promise.resolve(buf);
		xmlBufProm = Promise.resolve(context.body);
		}
		@@ -51,5 +51,4 @@

		var $ = cheerio.load(xmlBuf, {
		xmlMode: true
		});
		var $ = context.$ || cheerio.load(xmlBuf);
		context.$ = $;

		@@ -56,0 +55,0 @@ // We map over the array rather than using Cheerio's map, because it is

2

package.json

		{
		"name": "supercrawler",
		"description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
		"version": "0.11.0",
		"version": "0.12.0",
		"homepage": "https://github.com/brendonboshell/supercrawler",
		@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",

13

README.md

		@@ -100,5 +100,5 @@ # Node.js Web Crawler
		// Custom content handler for HTML pages.
		crawler.addHandler("text/html", function (buf, url) {
		var sizeKb = Buffer.byteLength(buf) / 1024;
		logger.info("Processed", url, "Size=", sizeKb, "KB");
		crawler.addHandler("text/html", function (context) {
		var sizeKb = Buffer.byteLength(context.body) / 1024;
		logger.info("Processed", context.url, "Size=", sizeKb, "KB");
		});
		@@ -313,2 +313,9 @@ ```

		### 0.12.0

		* [Change] Content handlers are no longer called with (body, url, contentType);
		they are now called with a single `context` argument that carries `body`, `url`
		and `contentType`. This allows you to pass information forward from one handler
		to the next. For example, you might cache the result of `cheerio` parsing on the
		context so that the document is not re-parsed by every content handler.

		### 0.11.0
		@@ -315,0 +322,0 @@

49

test/Crawler.spec.js

		@@ -539,5 +539,7 @@ var proxyquire = require('proxyquire'),
		crawler.stop();
		sinon.assert.calledWith(handler,
		sinon.match(new Buffer("<html><body>test</body></html>")),
		"https://example.com/index1.html");
		sinon.assert.calledWith(handler, sinon.match({
		body: sinon.match(new Buffer("<html><body>test</body></html>")),
		url: "https://example.com/index1.html",
		contentType: "text/plain"
		}));
		done();
		@@ -558,5 +560,7 @@ }, 15);
		crawler.stop();
		sinon.assert.calledWith(handler,
		sinon.match(new Buffer("<html><body>test</body></html>")),
		"https://example.com/index1.html");
		sinon.assert.calledWith(handler, sinon.match({
		body: sinon.match(new Buffer("<html><body>test</body></html>")),
		url: "https://example.com/index1.html",
		contentType: "text/html"
		}));
		done();
		@@ -577,4 +581,6 @@ }, 15);
		crawler.stop();
		expect(handler.calledWith(sinon.match("<html><body>test</body></html>"),
		"https://example.com/index1.html")).to.equal(false);
		expect(handler.calledWith(sinon.match({
		body: sinon.match("<html><body>test</body></html>"),
		url: "https://example.com/index1.html"
		}))).to.equal(false);
		done();
		@@ -595,4 +601,6 @@ }, 15);
		crawler.stop();
		expect(handler.calledWith(sinon.match(new Buffer("<html><body>test</body></html>")),
		"https://example.com/index1.html")).to.equal(true);
		expect(handler.calledWith(sinon.match({
		body: sinon.match(new Buffer("<html><body>test</body></html>")),
		url: "https://example.com/index1.html"
		}))).to.equal(true);
		done();
		@@ -613,4 +621,6 @@ }, 15);
		crawler.stop();
		expect(handler.calledWith(sinon.match(new Buffer("<html><body>test</body></html>")),
		"https://example.com/index1.html")).to.equal(true);
		expect(handler.calledWith(sinon.match({
		body: sinon.match(new Buffer("<html><body>test</body></html>")),
		url: "https://example.com/index1.html"
		}))).to.equal(true);
		done();
		@@ -631,4 +641,6 @@ }, 15);
		crawler.stop();
		expect(handler.calledWith(sinon.match("<html><body>test</body></html>"),
		"https://example.com/index1.html")).to.equal(false);
		expect(handler.calledWith(sinon.match({
		body: sinon.match("<html><body>test</body></html>"),
		url: "https://example.com/index1.html"
		}))).to.equal(false);
		done();
		@@ -649,6 +661,7 @@ }, 15);
		crawler.stop();
		sinon.assert.calledWith(handler,
		sinon.match(new Buffer("<html><body>test</body></html>")),
		"https://example.com/index1.html",
		"text/plain");
		sinon.assert.calledWith(handler, sinon.match({
		body: sinon.match(new Buffer("<html><body>test</body></html>")),
		url: "https://example.com/index1.html",
		contentType: "text/plain"
		}));
		done();
		@@ -655,0 +668,0 @@ }, 100);

40

test/handlers/htmlLinkParser.spec.js

		@@ -24,3 +24,6 @@ var htmlLinkParser = require("../../lib/handlers/htmlLinkParser"),

		expect(hlp(html, "https://example2.com/index")).to.deep.equal([
		expect(hlp({
		body: html,
		url: "https://example2.com/index"
		})).to.deep.equal([
		"https://example.com/test"
		@@ -36,3 +39,6 @@ ]);

		expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([
		expect(hlp({
		body: html,
		url: "https://example.com/my/page.html"
		})).to.deep.equal([
		"https://example.com/my/page2.html"
		@@ -48,3 +54,6 @@ ]);

		expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([
		expect(hlp({
		body: html,
		url: "https://example.com/my/page.html"
		})).to.deep.equal([
		"https://example.com/page2.html"
		@@ -64,3 +73,6 @@ ]);

		expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([
		expect(hlp({
		body: html,
		url: "https://example.com/my/page.html"
		})).to.deep.equal([
		"https://example.com/page2.html",
		@@ -80,3 +92,6 @@ "https://example.com/my/page3.html",

		expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([]);
		expect(hlp({
		body: html,
		url: "https://example.com/my/page.html"
		})).to.deep.equal([]);
		});
		@@ -97,3 +112,6 @@

		expect(hlp(html, "https://example.com/my/page.html")).to.deep.equal([
		expect(hlp({
		body: html,
		url: "https://example.com/my/page.html"
		})).to.deep.equal([
		"https://example.com/page2.html",
		@@ -111,3 +129,6 @@ "https://example.com/my/page3.html",

		expect(hlp(html, "http://example.com")).to.deep.equal([
		expect(hlp({
		body: html,
		url: "http://example.com"
		})).to.deep.equal([
		"http://example.com/index-es/"
		@@ -123,4 +144,7 @@ ]);

		expect(hlp(html, "http://example.com")).to.deep.equal([]);
		expect(hlp({
		body: html,
		url: "http://example.com"
		})).to.deep.equal([]);
		});
		});

25

test/handlers/robotsParser.spec.js

		@@ -20,3 +20,6 @@ var robotsParser = require("../../lib/handlers/robotsParser"),
		it("can extract extract a absolute path sitemap", function () {
		expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([
		expect(rb({
		body: new Buffer(robotsTxt),
		url: "http://example.com/robots.txt"
		})).to.deep.equal([
		"http://subdomain.example.com/sitemap_index_1.xml"
		@@ -29,3 +32,6 @@ ]);

		expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([
		expect(rb({
		body: new Buffer(robotsTxt),
		url: "http://example.com/robots.txt"
		})).to.deep.equal([
		"http://subdomain.example.com/sitemap_index_1.xml",
		@@ -43,3 +49,6 @@ "http://example.com/sitemap_index.xml"

		expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([
		expect(rb({
		body: new Buffer(robotsTxt),
		url: "http://example.com/robots.txt"
		})).to.deep.equal([
		"http://subdomain.example.com/sitemap_index_1.xml"
		@@ -51,8 +60,14 @@ ]);
		robotsTxt = "";
		expect(rb(new Buffer(robotsTxt), "http://example.com/robots.txt")).to.deep.equal([]);
		expect(rb({
		body: new Buffer(robotsTxt),
		url: "http://example.com/robots.txt"
		})).to.deep.equal([]);
		});

		it("returns empty when the URL path is not /robots.txt", function () {
		expect(rb(new Buffer(robotsTxt), "http://example.com/Iamnotarobots.txt")).to.deep.equal([]);
		expect(rb({
		body: new Buffer(robotsTxt),
		url: "http://example.com/Iamnotarobots.txt"
		})).to.deep.equal([]);
		});
		});

37

test/handlers/sitemapsParser.spec.js

		@@ -47,3 +47,6 @@ var sitemapsParser = require("../../lib/handlers/sitemapsParser"),
		it("discovers another sitemap", function (done) {
		sp(new Buffer(sitemapindex), "http://example.com/sitemap_index.xml").then(function (urls) {
		sp({
		body: new Buffer(sitemapindex),
		url: "http://example.com/sitemap_index.xml"
		}).then(function (urls) {
		expect(urls).to.deep.equal([
		@@ -58,3 +61,6 @@ "http://example.com/sitemap.xml.gz"
		sitemapindex = "<html><body><h1>I'm not a sitemap</h1></body></html>";
		sp(new Buffer(sitemapindex), "http://example.com/sitemap_index.xml").then(function (urls) {
		sp({
		body: new Buffer(sitemapindex),
		url: "http://example.com/sitemap_index.xml"
		}).then(function (urls) {
		expect(urls).to.deep.equal([]);
		@@ -66,3 +72,6 @@ done();
		it("discovers a urlset", function (done) {
		sp(new Buffer(urlset), "http://example.com/sitemap_index.xml").then(function (urls) {
		sp({
		body: new Buffer(urlset),
		url: "http://example.com/sitemap_index.xml"
		}).then(function (urls) {
		expect(urls).to.deep.equal([
		@@ -76,3 +85,6 @@ "https://example.com/home.html"
		it("discovers an alternate link", function (done) {
		sp(new Buffer(urlsetWithAlternate), "http://example.com/sitemap_index.xml").then(function (urls) {
		sp({
		body: new Buffer(urlsetWithAlternate),
		url: "http://example.com/sitemap_index.xml"
		}).then(function (urls) {
		expect(urls).to.deep.equal([
		@@ -93,3 +105,6 @@ "https://example.com/home.html",

		sp(new Buffer(urlsetWithAlternate), "http://example.com/sitemap_index.xml").then(function (urls) {
		sp({
		body: new Buffer(urlsetWithAlternate),
		url: "http://example.com/sitemap_index.xml"
		}).then(function (urls) {
		expect(urls).to.deep.equal([
		@@ -104,3 +119,7 @@ "https://example.com/home.html"
		Promise.promisify(zlib.gzip)(new Buffer(urlset)).then(function (buf) {
		return sp(buf, "http://example.com/sitemap_index.xml", "application/x-gzip");
		return sp({
		body: buf,
		url: "http://example.com/sitemap_index.xml",
		contentType: "application/x-gzip"
		});
		}).then(function (urls) {
		@@ -116,3 +135,7 @@ expect(urls).to.deep.equal([
		Promise.promisify(zlib.gzip)(new Buffer(urlset)).then(function (buf) {
		return sp(buf, "http://example.com/sitemap_index.xml", "application/gzip");
		return sp({
		body: buf,
		url: "http://example.com/sitemap_index.xml",
		contentType: "application/gzip"
		});
		}).then(function (urls) {
		@@ -119,0 +142,0 @@ expect(urls).to.deep.equal([

supercrawler - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics