supercrawler
Advanced tools
Comparing version 1.5.0 to 1.6.0
@@ -30,2 +30,9 @@ var DbUrlList, | ||
// Some options defaults | ||
if (opts.db.table === undefined) { | ||
opts.db.table = "url"; | ||
} | ||
this._recrawlInMs = opts.recrawlInMs || YEAR_MS; | ||
opts.db.sequelizeOpts.logging = false; | ||
@@ -35,3 +42,3 @@ | ||
opts.db.sequelizeOpts); | ||
this._urlTable = this._db.define('url', { | ||
this._urlTable = this._db.define(opts.db.table, { | ||
urlHash: { | ||
@@ -226,3 +233,3 @@ type: Sequelize.STRING(40), | ||
// again. | ||
nextRetryDate = new Date(new Date().getTime() + YEAR_MS); | ||
nextRetryDate = new Date(new Date().getTime() + self._recrawlInMs); | ||
} | ||
@@ -229,0 +236,0 @@ } else { |
@@ -9,2 +9,8 @@ var cheerio = require("cheerio"), | ||
if (!opts.urlFilter) { | ||
opts.urlFilter = function () { | ||
return true; | ||
}; | ||
} | ||
return function (context) { | ||
@@ -49,4 +55,6 @@ var $; | ||
}); | ||
}).get(); | ||
}).get().filter(function (url) { | ||
return opts.urlFilter(url, context.url); | ||
}); | ||
}; | ||
}; |
{ | ||
"name": "supercrawler", | ||
"description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.", | ||
"version": "1.5.0", | ||
"version": "1.6.0", | ||
"homepage": "https://github.com/brendonboshell/supercrawler", | ||
@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>", |
@@ -182,2 +182,4 @@ # Node.js Web Crawler | ||
| opts.db.sequelizeOpts | Options to pass to sequelize. | | ||
| opts.db.table | Name of the table used to store the URL queue. Default = 'url' | | ||
| opts.recrawlInMs | Number of milliseconds after which a crawled URL becomes eligible for recrawling. Default = 31536000000 (1 year) | | ||
@@ -304,2 +306,3 @@ Example usage: | ||
| hostnames | Array of hostnames that are allowed to be crawled. | | ||
| urlFilter(url, pageUrl) | Function that receives each discovered URL and the URL of the page it was found on; return `true` to include the URL in the results. | | ||
@@ -314,2 +317,10 @@ Example usage: | ||
```js | ||
var hlp = supercrawler.handlers.htmlLinkParser({ | ||
urlFilter: function (url) { | ||
return url.indexOf("page1") === -1; | ||
} | ||
}); | ||
``` | ||
## handlers.robotsParser | ||
@@ -359,2 +370,8 @@ | ||
### 1.6.0 | ||
* [Added] Added `opts.db.table` option to `DbUrlList` ([adversinc](https://github.com/adversinc)). | ||
* [Added] Added the `opts.recrawlInMs` option to `DbUrlList` ([adversinc](https://github.com/adversinc)). | ||
* [Added] Added the `urlFilter` option to `htmlLinkParser` ([adversinc](https://github.com/adversinc)). | ||
### 1.5.0 | ||
@@ -361,0 +378,0 @@ |
@@ -142,2 +142,23 @@ var htmlLinkParser = require("../../lib/handlers/htmlLinkParser"), | ||
}); | ||
it ("can apply a filter to the URLs discovered", function () { | ||
var hlp = htmlLinkParser({ | ||
urlFilter: function (url) { | ||
return url.indexOf("page1") === -1; | ||
} | ||
}), | ||
html; | ||
html = makeHtmlWithLinks([ | ||
"page1.html", | ||
"page2.html" | ||
]); | ||
expect(hlp({ | ||
body: html, | ||
url: "http://example.com" | ||
})).to.deep.equal([ | ||
"http://example.com/page2.html" | ||
]); | ||
}); | ||
}); |
129141
3142
562