supercrawler
Comparing version 1.7.2 to 2.0.0
@@ -85,3 +85,7 @@ var Crawler,
  */
-Crawler.prototype.getUserAgent = function () {
+Crawler.prototype.getUserAgent = function (url) {
+  if (typeof this._userAgent === 'function') {
+    return this._userAgent(url);
+  }
   return this._userAgent;
@@ -313,3 +317,3 @@ };
 }).then(function (url) {
-  self.emit("crawledurl", url.getUrl(), url.getErrorCode(), url.getStatusCode());
+  self.emit("crawledurl", url.getUrl(), url.getErrorCode(), url.getStatusCode(), url.getErrorMessage());
@@ -386,3 +390,3 @@ return url;
   headers: {
-    "User-Agent": this.getUserAgent()
+    "User-Agent": this.getUserAgent(url)
   },
@@ -428,3 +432,3 @@ encoding: null,
 robots = robotsParser(self._getRobotsUrl(url), robotsTxt);
-isAllowed = robots.isAllowed(url, self.getUserAgent());
+isAllowed = robots.isAllowed(url, self.getUserAgent(url));
@@ -431,0 +435,0 @@ if (!isAllowed) {
@@ -67,3 +67,3 @@ var cheerio = require("cheerio"),
   return match ? match.data : null;
-}).filter(nullFilter);
+}).filter(nullFilter).filter(opts.urlFilter);
@@ -70,0 +70,0 @@ urlUrls = $("urlset > url > loc").get().map(function (el) {
 {
   "name": "supercrawler",
   "description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.",
-  "version": "1.7.2",
+  "version": "2.0.0",
   "homepage": "https://github.com/brendonboshell/supercrawler",
@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>",
@@ -137,3 +137,3 @@ # Node.js Web Crawler
 | robotsIgnoreServerError | Indicates if `500` status code response for robots.txt should be ignored. Defaults to `false`. |
-| userAgent | User agent to use for requests. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)` |
+| userAgent | User agent to use for requests. This can be either a string or a function that takes the URL being crawled. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)`. |
 | request | Object of options to be passed to [request](https://github.com/request/request). Note that request does not support an asynchronous (and distributed) cookie jar. |
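The function form of `userAgent` lets the user agent vary by URL. A minimal usage sketch, assuming a standard `require("supercrawler")` setup (the bot names and the URL test are illustrative):

```js
var supercrawler = require("supercrawler");

// userAgent may be a plain string, or a function of the URL being
// crawled; per the diff above it is consulted both for request
// headers and for robots.txt permission checks.
var crawler = new supercrawler.Crawler({
  interval: 1000,
  userAgent: function (url) {
    if (url.indexOf("/sitemap") !== -1) {
      return "mybot-sitemaps/1.0"; // illustrative bot name
    }
    return "mybot/1.0";
  }
});
```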
@@ -168,3 +168,3 @@
 | crawlurl(url) | Fires when crawling starts with a new URL. |
-| crawledurl(url, errorCode, statusCode) | Fires when crawling of a URL is complete. `errorCode` is `null` if no error occurred. `statusCode` is set if and only if the request was successful. |
+| crawledurl(url, errorCode, statusCode, errorMessage) | Fires when crawling of a URL is complete. `errorCode` is `null` if no error occurred. `statusCode` is set if and only if the request was successful. `errorMessage` is `null` if no error occurred. |
 | urllistempty | Fires when the URL list is (intermittently) empty. |
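With the fourth argument, a listener can report why a crawl failed without digging into the URL record. A short sketch of such a listener, assuming the `crawler` instance above (the logging is illustrative):

```js
crawler.on("crawledurl", function (url, errorCode, statusCode, errorMessage) {
  if (errorCode !== null) {
    // errorMessage carries the underlying error's message; null on success.
    console.error("Failed %s: %s (%s)", url, errorCode, errorMessage);
  } else {
    console.log("Crawled %s, status %d", url, statusCode);
  }
});
```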
@@ -361,3 +361,3 @@ | urllistcomplete | Fires when the URL list is permanently empty, barring URLs added by external sources. This only makes sense when running Supercrawler in non-distributed fashion. |
 | --- | --- |
-| urlFilter | Function that takes a URL and returns `true` if it should be included. |
+| urlFilter | Function that takes a URL (including sitemap entries) and returns `true` if it should be included. |
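Because the filter now also runs over `<sitemap>` entries in a sitemap index, whole sub-sitemaps can be pruned before they are queued. A minimal sketch, assuming the `crawler` instance above (the filter predicate is illustrative):

```js
// Drop the German sub-sitemap and keep everything else.
crawler.addHandler(supercrawler.handlers.sitemapsParser({
  urlFilter: function (url) {
    return url.indexOf("sitemap-de") === -1;
  }
}));
```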
@@ -373,2 +373,8 @@ Example usage:
+### 2.0.0
+* [Added] Error message argument to the `crawledurl` event, thanks [hjr3](https://github.com/hjr3).
+* [Changed] `sitemapsParser` to apply `urlFilter` to sitemap entries, thanks [hjr3](https://github.com/hjr3).
+* [Added] Support for passing the `userAgent` option to `Crawler` as a function, thanks [hjr3](https://github.com/hjr3).
 ### 1.7.2
@@ -375,0 +381,0 @@
@@ -201,2 +201,18 @@ var proxyquire = require('proxyquire'),
   });
+  it("will accept a function as a user agent", function () {
+    expect(new Crawler({
+      userAgent: () => "mybot/1.1"
+    }).getUserAgent()).to.equal("mybot/1.1");
+    expect(new Crawler({
+      userAgent: (url) => {
+        if (url === 'http://www.example.com/some/random/page') {
+          return 'url specific user agent';
+        }
+        return "mybot/1.1";
+      }
+    }).getUserAgent()).to.equal("mybot/1.1");
+  });
 });
@@ -1021,3 +1037,3 @@
   crawler.stop();
-  sinon.assert.calledWith(spy, "https://example.com/index1.html", "OTHER_ERROR", null);
+  sinon.assert.calledWith(spy, "https://example.com/index1.html", "OTHER_ERROR", null, "abitrary error");
   done();
@@ -1024,0 +1040,0 @@ }, 200);
@@ -22,2 +22,6 @@ var sitemapsParser = require("../../lib/handlers/sitemapsParser"),
   "</sitemap>",
+  "<sitemap>",
+  "<loc>http://example.com/sitemap-de.xml.gz</loc>",
+  "<lastmod>2015-07-17T18:16:02.754-07:00</lastmod>",
+  "</sitemap>",
   "</sitemapindex>"
@@ -53,3 +57,4 @@ ].join("\n");
   expect(urls).to.deep.equal([
-    "http://example.com/sitemap.xml.gz"
+    "http://example.com/sitemap.xml.gz",
+    "http://example.com/sitemap-de.xml.gz"
   ]);
@@ -114,2 +119,20 @@ done();
+  it("can apply a filter to sitemap URLs discovered within sitemap", function (done) {
+    var sp = new sitemapsParser({
+      urlFilter: function (url) {
+        return url.indexOf("de") === -1;
+      }
+    });
+    sp({
+      body: new Buffer(sitemapindex),
+      url: "http://example.com/sitemap_index.xml"
+    }).then(function (urls) {
+      expect(urls).to.deep.equal([
+        "http://example.com/sitemap.xml.gz"
+      ]);
+      done();
+    });
+  });
   it("supports a .gz sitemap file", function (done) {
@@ -116,0 +139,0 @@ Promise.promisify(zlib.gzip)(new Buffer(urlset)).then(function (buf) {