js-crawler
Advanced tools
Comparing version 0.3.16 to 0.3.17
@@ -86,5 +86,8 @@ var request = require('request'); | ||
this.maxRequestsPerSecond = DEFAULT_MAX_REQUESTS_PER_SECOND; | ||
this.shouldCrawl = function() { | ||
this.shouldCrawl = function(url) { | ||
return true; | ||
}; | ||
this.shouldCrawlLinksFrom = function(url) { | ||
return true; | ||
}; | ||
//Urls that are queued for crawling, for some of them HTTP requests may not yet have been issued | ||
@@ -106,2 +109,3 @@ this._currentUrlsToCrawl = []; | ||
this.shouldCrawl = (options && options.shouldCrawl) || this.shouldCrawl; | ||
this.shouldCrawlLinksFrom = (options && options.shouldCrawlLinksFrom) || this.shouldCrawlLinksFrom; | ||
this.onSuccess = _.noop; | ||
@@ -242,3 +246,3 @@ this.onFailure = _.noop; | ||
self.crawledUrls.push(lastUrlInRedirectChain); | ||
if (depth > 1 && isTextContent) { | ||
if (self.shouldCrawlLinksFrom(lastUrlInRedirectChain) && depth > 1 && isTextContent) { | ||
self._crawlUrls(self._getAllUrls(lastUrlInRedirectChain, body), lastUrlInRedirectChain, depth - 1); | ||
@@ -245,0 +249,0 @@ } |
@@ -200,2 +200,5 @@ var Crawler = require('../crawler'); | ||
//TODO: Test for the correct referer value in a chain of visited pages | ||
//TODO: Test for the shouldCrawlLinksFrom function | ||
//TODO: Test for shouldCrawl | ||
//TODO: Redirect with another HTTP code? 301? | ||
@@ -202,0 +205,0 @@ //TODO: Binary content, links are not analyzed in binary content, binary content itself is not returned (as it can be too large)(?) |
{ | ||
"name": "js-crawler", | ||
"version": "0.3.16", | ||
"version": "0.3.17", | ||
"description": "Web crawler for Node.js", | ||
@@ -5,0 +5,0 @@ "main": "crawler.js", |
@@ -180,4 +180,2 @@ js-crawler | ||
* `shouldCrawl` - function that specifies whether an url should be crawled, returns `true` or `false`. | ||
* `maxRequestsPerSecond` - the maximum number of HTTP requests per second that can be made by the crawler, default value is 100 | ||
@@ -187,4 +185,15 @@ | ||
Example: | ||
* `shouldCrawl` - function that specifies whether a url should be crawled/requested, returns `true` or `false`, | ||
argument is the current `url` the crawler considers for crawling | ||
* `shouldCrawlLinksFrom` - function that specifies whether the crawler should crawl links found at a given url, returns `true` or `false`, argument is the current `url` being crawled | ||
Note: `shouldCrawl` determines if a given URL should be requested/visited at all, whereas `shouldCrawlLinksFrom` determines if the links on a given URL should be harvested/added to the crawling queue. | ||
Many users may find that setting `shouldCrawl` is sufficient, as URLs cannot be spidered if they are never visited/requested. | ||
A common use case for having these functions separated: if a user would like to check external links on a site for errors, without crawling those external links, the user could create a `shouldCrawlLinksFrom` function that restricts link harvesting to the original URL. | ||
**Examples:** | ||
The following will crawl the specified URL, but not allow external URLs to be visited/requested, and therefore not search for additional links to crawl on the external URLs: | ||
```javascript | ||
@@ -204,4 +213,19 @@ var Crawler = require("js-crawler"); | ||
Default value is a function that always returns `true`. | ||
The following will crawl the specified URL, allow external URLs to be visited/requested, but will not search for additional links to crawl on the external URLs: | ||
```javascript | ||
var Crawler = require("js-crawler"); | ||
var crawler = new Crawler().configure({ | ||
shouldCrawlLinksFrom: function(url) { | ||
return url.indexOf("reddit.com") >= 0; | ||
} | ||
}); | ||
crawler.crawl("http://www.reddit.com/r/javascript", function(page) { | ||
console.log(page.url); | ||
}); | ||
``` | ||
The default value for each is a function that always returns `true`. | ||
#### Development | ||
@@ -208,0 +232,0 @@ |
@@ -435,2 +435,73 @@ var Crawler = require('../crawler'); | ||
}); | ||
describe('shouldCrawl', function() { | ||
describe('should not crawl url', function() { | ||
beforeEach(function() { | ||
crawler.shouldCrawl = function(urlToCrawl) { | ||
return urlToCrawl != url; | ||
}; | ||
}); | ||
it('should not call onSuccess', function() { | ||
crawler._crawlUrl(url, referer, depth); | ||
expect(crawler.onSuccess).not.toHaveBeenCalled(); | ||
}); | ||
}); | ||
describe('should crawl url', function() { | ||
beforeEach(function() { | ||
crawler.shouldCrawl = function(urlToCrawl) { | ||
return urlToCrawl == url; | ||
}; | ||
}); | ||
it('should call onSuccess', function() { | ||
crawler._crawlUrl(url, referer, depth); | ||
expect(crawler.onSuccess).toHaveBeenCalledWith({ | ||
url: url, | ||
status: OK, | ||
content: body, | ||
error: null, | ||
response: response, | ||
body: body, | ||
referer: referer | ||
}); | ||
}); | ||
}); | ||
}); | ||
describe('shouldCrawlLinksFrom', function() { | ||
describe('should not crawl links from url', function() { | ||
beforeEach(function() { | ||
crawler.shouldCrawlLinksFrom = function(urlToCrawl) { | ||
return urlToCrawl != url; | ||
}; | ||
}); | ||
it('should not call _crawlUrls', function() { | ||
crawler._crawlUrl(url, referer, depth); | ||
expect(crawler._crawlUrls).not.toHaveBeenCalled(); | ||
}); | ||
}); | ||
describe('should crawl links from url', function() { | ||
beforeEach(function() { | ||
crawler.shouldCrawlLinksFrom = function(urlToCrawl) { | ||
return urlToCrawl == url; | ||
}; | ||
}); | ||
it('should call _crawlUrls with the correct list of urls', function() { | ||
crawler._crawlUrl(url, referer, depth); | ||
expect(crawler._crawlUrls).toHaveBeenCalledWith(['url1', 'url2', 'url3'], url, depth - 1); | ||
}); | ||
}); | ||
}); | ||
}); | ||
@@ -437,0 +508,0 @@ }); |
Sorry, the diff of this file is not supported yet
5012319
1329
287