supercrawler - npm Package Compare versions

Comparing version 0.3.3 to 0.4.0


lib/Crawler.js

@@ -189,10 +189,20 @@ var Crawler,
     return this._downloadAndCheckRobots(url).then(function () {
-      return self._downloadUrl(url);
+      return self._downloadUrl(url, false);
     }).then(function (_response) {
-      var contentType;
+      var contentType,
+        statusCode,
+        location;

       response = _response;
       contentType = response.headers["content-type"] || mime.lookup(url);
+      statusCode = response.statusCode;
+      location = response.headers.location;

-      return self._fireHandlers(contentType, response.body, url);
+      // If this is a redirect, we follow the location header.
+      // Otherwise, we get the discovered URLs from the content handlers.
+      if (statusCode >= 300 && statusCode < 400) {
+        return [urlMod.resolve(url, location)];
+      } else {
+        return self._fireHandlers(contentType, response.body, url);
+      }
     }).then(function (links) {

@@ -268,8 +278,12 @@ return Promise.map(links, function (link) {
 /**
- * Download a particular URL.
+ * Download a particular URL. Generally speaking, we do not want to follow
+ * redirects, because we just add the destination URLs to the queue and crawl
+ * them later. But, when requesting /robots.txt, we do follow the redirects.
+ * This is an edge case.
  *
- * @param {string} url URL to fetch.
- * @return {Promise} Promise of result.
+ * @param {string} url URL to fetch.
+ * @param {Boolean} followRedirect True if redirect should be followed.
+ * @return {Promise} Promise of result.
  */
-Crawler.prototype._downloadUrl = function (url) {
+Crawler.prototype._downloadUrl = function (url, followRedirect) {
   return request({

@@ -281,3 +295,4 @@ url: url,
     },
-    encoding: null
+    encoding: null,
+    followRedirect: Boolean(followRedirect)
   }).catch(function (err) {

@@ -353,3 +368,3 @@ err = new error.RequestError("A request error occured. " + err.message);
       // server to get it.
-      return self._downloadUrl(robotsUrl);
+      return self._downloadUrl(robotsUrl, true);
     }).catch(error.HttpError, function (err) {

@@ -356,0 +371,0 @@ var robotsStatusCode = err.statusCode;
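
Read together, these hunks change how the crawler treats redirects: `_downloadUrl` now passes `followRedirect: Boolean(followRedirect)` to `request`, ordinary page fetches pass `false`, only the robots.txt fetch passes `true`, and a 3xx status short-circuits the content handlers by returning the resolved `Location` target as the only discovered link. A condensed sketch of that response branch, pulled out of the Crawler prototype for illustration (the helper name and signature below are not part of the package):

var urlMod = require("url");

// Illustrative helper only; it mirrors the branch added to lib/Crawler.js above.
function discoveredUrls(requestUrl, response, fireHandlers) {
  var statusCode = response.statusCode;
  var location = response.headers.location;
  var contentType = response.headers["content-type"];

  // Redirect: treat the Location header as a discovered link. Relative
  // values are resolved against the requested URL, so a 301 from
  // https://example.com/index.html with "Location: /destination2.html"
  // yields "https://example.com/destination2.html", which is then queued.
  if (statusCode >= 300 && statusCode < 400) {
    return [urlMod.resolve(requestUrl, location)];
  }

  // Not a redirect: let the registered content handlers extract links from the body.
  return fireHandlers(contentType, response.body, requestUrl);
}

Because the destination is queued via `insertIfNotExists` like any other discovered URL, a redirect chain is now crawled one hop per crawl tick rather than being followed inside a single request.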

package.json

 {
   "name": "supercrawler",
   "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-  "version": "0.3.3",
+  "version": "0.4.0",
   "homepage": "https://github.com/brendonboshell/supercrawler",

@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",

@@ -282,1 +282,52 @@ # Node.js Web Crawler
 crawler.addHandler(supercrawler.handlers.sitemapsParser());
+
+## Changelog
+
+### 0.4.0
+
+* [Changed] Supercrawler no longer follows redirects on crawled URLs. Supercrawler will now add a redirected URL to the queue as a separate entry. We still follow redirects for the `/robots.txt` that is used for checking rules; but not for `/robots.txt` added to the queue.
+
+### 0.3.3
+
+* [Fix] `DbUrlList` to mark a URL as taken, and ensure it never returns a URL that is being crawled in another concurrent request. This has required a new field called `holdDate` on the `url` table
+
+### 0.3.2
+
+* [Fix] Time-based unit tests made more reliable.
+
+### 0.3.1
+
+* [Added] Support for Travis CI.
+
+### 0.3.0
+
+* [Added] Content type passed as third argument to all content type handlers.
+* [Added] Sitemaps parser to extract sitemap URLs and urlset URLs.
+* [Changed] Content handlers receive Buffers rather than strings for the first argument.
+* [Fix] Robots.txt checking to work for the first crawled URL. There was a bug that caused robots.txt to be ignored if it wasn't in the cache.
+
+### 0.2.3
+
+* [Added] A robots.txt parser that identifies `Sitemap:` directives.
+
+### 0.2.2
+
+* [Fixed] Support for URLs up to 10,000 characters long. This required a new `urlHash` SHA1 field on the `url` table, to support the unique index.
+
+### 0.2.1
+
+* [Added] Extensive documentation.
+
+### 0.2.0
+
+* [Added] Status code is updated in the queue for successfully crawled pages (HTTP code < 400).
+* [Added] A new error type `error.RequestError` for all errors that occur when requesting a page.
+* [Added] `DbUrlList` queue object that stores URLs in a SQL database. Includes exponential backoff retry logic.
+* [Changed] Interface to `DbUrlList` and `FifoUrlList` is now via methods `insertIfNotExists`, `upsert` and `getNextUrl`. Previously, it was just `insert` (which also updated) and `upsert`, but we need a way to differentiate between discovered URLs which should not update the crawl state.
+
+### 0.1.0
+
+* [Added] `Crawler` object, supporting rate limiting, concurrent requests limiting, robots.txt caching.
+* [Added] `FifoUrlList` object, a first-in, first-out in-memory list of URLs to be crawled.
+* [Added] `Url` object, representing a URL in the crawl queue.
+* [Added] `htmlLinkParser`, a function to extract links from crawled HTML documents.
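
For readers landing on this compare view without context, here is a minimal usage sketch assembled from the names the changelog mentions (`Crawler`, `Url`, `FifoUrlList`, `handlers.htmlLinkParser`, `handlers.sitemapsParser`, `insertIfNotExists`). Only `interval`, `addHandler`, `start` and `stop` appear in this diff; the handler option shape and the `getUrlList()` accessor below are assumptions:

var supercrawler = require("supercrawler");

// The crawler keeps its queue in an in-memory FifoUrlList by default and
// waits `interval` milliseconds between requests (the tests below use interval: 10).
var crawler = new supercrawler.Crawler({
  interval: 1000
});

// Content handlers parse crawled pages and return further URLs to queue.
crawler.addHandler(supercrawler.handlers.htmlLinkParser({
  hostnames: ["example.com"]   // assumed option shape; not shown in this diff
}));
crawler.addHandler(supercrawler.handlers.sitemapsParser());

// Seed the queue. insertIfNotExists is documented in the 0.2.0 changelog entry;
// the getUrlList() accessor is an assumption, not shown in this diff.
crawler.getUrlList().insertIfNotExists(new supercrawler.Url("https://example.com/"));

crawler.start();
// ...
crawler.stop();

Under 0.4.0, any redirects encountered during the crawl appear in this same queue as separate entries instead of being followed in place.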

@@ -37,2 +37,3 @@ var proxyquire = require('proxyquire'),
     pageContentType,
+    pageLocationHeader,
     pageStatusCode,

@@ -62,2 +63,6 @@ pageBody,
+    if (pageLocationHeader) {
+      headers.location = pageLocationHeader;
+    }
+
     if (pageStatusCode === 0) {

@@ -108,3 +113,3 @@ return cb(new Error("Some request error"));
-  var numCrawlsOfUrl = function (url) {
+  var numCrawlsOfUrl = function (url, followRedirect) {
     var numCalls = 0;

@@ -119,3 +124,4 @@ var n = 0;
         url: url,
-        forever: true
+        forever: true,
+        followRedirect: followRedirect
       }))) {

@@ -132,3 +138,3 @@ numCalls++;
   var numRobotsCalls = function () {
-    return numCrawlsOfUrl("https://example.com/robots.txt");
+    return numCrawlsOfUrl("https://example.com/robots.txt", true);
   };

@@ -299,2 +305,36 @@
+  it("will add destination URL to queue when redirected", function (done) {
+    var crawler = new Crawler({ interval: 10 });
+
+    crawler.start();
+    pageStatusCode = 301;
+    pageLocationHeader = "https://example.com/destination.html";
+
+    setTimeout(function () {
+      crawler.stop();
+      sinon.assert.calledWith(insertIfNotExistsSpy, sinon.match({
+        _url: "https://example.com/destination.html"
+      }));
+      done();
+    }, 200);
+  });
+
+  it("will add relative destination URL to queue when redirected", function (done) {
+    var crawler = new Crawler({ interval: 10 });
+
+    crawler.start();
+    pageStatusCode = 301;
+    pageLocationHeader = "/destination2.html";
+
+    setTimeout(function () {
+      crawler.stop();
+      sinon.assert.calledWith(insertIfNotExistsSpy, sinon.match({
+        _url: "https://example.com/destination2.html"
+      }));
+      done();
+    }, 200);
+  });
+
   it("requests a page that is not excluded by robots.txt", function (done) {

@@ -309,3 +349,3 @@ var crawler = new Crawler({
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index18.html")).to.equal(1);
+      expect(numCrawlsOfUrl("https://example.com/index18.html", false)).to.equal(1);
       done();

@@ -328,3 +368,3 @@ }, 200);
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index1.html")).to.equal(0);
+      expect(numCrawlsOfUrl("https://example.com/index1.html", false)).to.equal(0);
       done();

@@ -343,3 +383,3 @@ }, 200);
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index17.html")).to.equal(0);
+      expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(0);
       done();

@@ -359,3 +399,3 @@ }, 200);
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index17.html")).to.equal(1);
+      expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(1);
       done();

@@ -375,3 +415,3 @@ }, 200);
       crawler.stop();
-      expect(numCrawlsOfUrl("https://example.com/index5.html")).to.equal(0);
+      expect(numCrawlsOfUrl("https://example.com/index5.html", false)).to.equal(0);
       done();

@@ -378,0 +418,0 @@ }, 200);
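
The test hunks above only show fragments of the updated `numCrawlsOfUrl` helper, so here is one way to read them: a reconstruction under the assumption that the stubbed `request` function is wrapped in a sinon spy (named `requestSpy` here for illustration) and that each recorded call is matched on its options object, now including the expected `followRedirect` flag:

// Reconstruction for illustration; names outside the shown fragments are assumptions.
var numCrawlsOfUrl = function (url, followRedirect) {
  var numCalls = 0;
  var n = 0;
  var call;

  // Walk every recorded call to the stubbed request function and count those
  // whose options match the URL and the expected followRedirect value.
  while ((call = requestSpy.getCall(n)) !== null) {
    if (sinon.match({
      url: url,
      forever: true,
      followRedirect: followRedirect
    }).test(call.args[0])) {
      numCalls++;
    }

    n++;
  }

  return numCalls;
};

With `followRedirect` part of the match, `numRobotsCalls()` can assert that only the robots.txt request was made with redirects enabled, while every page fetch is asserted with `false`.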
