js-crawler
Advanced tools
Comparing version 0.3.16 to 0.3.17
@@ -86,5 +86,8 @@ var request = require('request'); | ||
this.maxRequestsPerSecond = DEFAULT_MAX_REQUESTS_PER_SECOND; | ||
this.shouldCrawl = function() { | ||
this.shouldCrawl = function(url) { | ||
return true; | ||
}; | ||
this.shouldCrawlLinksFrom = function(url) { | ||
return true; | ||
}; | ||
//Urls that are queued for crawling, for some of them HTTP requests may not yet have been issued | ||
@@ -106,2 +109,3 @@ this._currentUrlsToCrawl = []; | ||
this.shouldCrawl = (options && options.shouldCrawl) || this.shouldCrawl; | ||
this.shouldCrawlLinksFrom = (options && options.shouldCrawlLinksFrom) || this.shouldCrawlLinksFrom; | ||
this.onSuccess = _.noop; | ||
@@ -242,3 +246,3 @@ this.onFailure = _.noop; | ||
self.crawledUrls.push(lastUrlInRedirectChain); | ||
if (depth > 1 && isTextContent) { | ||
if (self.shouldCrawlLinksFrom(lastUrlInRedirectChain) && depth > 1 && isTextContent) { | ||
self._crawlUrls(self._getAllUrls(lastUrlInRedirectChain, body), lastUrlInRedirectChain, depth - 1); | ||
@@ -245,0 +249,0 @@ } |
@@ -200,2 +200,5 @@ var Crawler = require('../crawler'); | ||
//TODO: Test for the correct referer value in a chain of visited pages | ||
//TODO: Test for the shouldCrawlLinksFrom function | ||
//TODO: Test for shouldCrawl | ||
//TODO: Redirect with another HTTP code? 301? | ||
@@ -202,0 +205,0 @@ //TODO: Binary content, links are not analyzed in binary content, binary content itself is not returned (as it can be too large)(?) |
{ | ||
"name": "js-crawler", | ||
"version": "0.3.16", | ||
"version": "0.3.17", | ||
"description": "Web crawler for Node.js", | ||
@@ -5,0 +5,0 @@ "main": "crawler.js", |
@@ -180,4 +180,2 @@ js-crawler | ||
* `shouldCrawl` - function that specifies whether an url should be crawled, returns `true` or `false`. | ||
* `maxRequestsPerSecond` - the maximum number of HTTP requests per second that can be made by the crawler, default value is 100 | ||
@@ -187,4 +185,15 @@ | ||
Example: | ||
* `shouldCrawl` - function that specifies whether a url should be crawled/requested, returns `true` or `false`, | ||
argument is the current `url` the crawler considers for crawling | ||
* `shouldCrawlLinksFrom` - function that specifies whether the crawler should crawl links found at a given url, returns `true` or `false`, argument is the current `url` being crawled | ||
Note: `shouldCrawl` determines if a given URL should be requested/visited at all, whereas `shouldCrawlLinksFrom` determines if the links on a given URL should be harvested/added to the crawling queue. | ||
Many users may find that setting `shouldCrawl` is sufficient, as URLs cannot be spidered if they are never visited/requested. | ||
A common use case for having these functions separated: if a user would like to check external links on a site for errors, without crawling those external links, the user could create a `shouldCrawlLinksFrom` function that restricts link harvesting to the original URL. | ||
**Examples:** | ||
The following will crawl the specified URL, but not allow external URLs to be visited/requested, and therefore not search for additional links to crawl on the external URLs: | ||
```javascript | ||
@@ -204,4 +213,19 @@ var Crawler = require("js-crawler"); | ||
Default value is a function that always returns `true`. | ||
The following will crawl the specified URL, allow external URLs to be visited/requested, but will not search for additional links to crawl on the external URLs: | ||
```javascript | ||
var Crawler = require("js-crawler"); | ||
var crawler = new Crawler().configure({ | ||
shouldCrawlLinksFrom: function(url) { | ||
return url.indexOf("reddit.com") >= 0; | ||
} | ||
}); | ||
crawler.crawl("http://www.reddit.com/r/javascript", function(page) { | ||
console.log(page.url); | ||
}); | ||
``` | ||
The default value for each is a function that always returns `true`. | ||
#### Development | ||
@@ -208,0 +232,0 @@ |
@@ -435,2 +435,73 @@ var Crawler = require('../crawler'); | ||
}); | ||
describe('shouldCrawl', function() { | ||
describe('should not crawl url', function() { | ||
beforeEach(function() { | ||
crawler.shouldCrawl = function(urlToCrawl) { | ||
return urlToCrawl != url; | ||
}; | ||
}); | ||
it('should not call onSuccess', function() { | ||
crawler._crawlUrl(url, referer, depth); | ||
expect(crawler.onSuccess).not.toHaveBeenCalled(); | ||
}); | ||
}); | ||
describe('should crawl url', function() { | ||
beforeEach(function() { | ||
crawler.shouldCrawl = function(urlToCrawl) { | ||
return urlToCrawl == url; | ||
}; | ||
}); | ||
it('should call onSuccess', function() { | ||
crawler._crawlUrl(url, referer, depth); | ||
expect(crawler.onSuccess).toHaveBeenCalledWith({ | ||
url: url, | ||
status: OK, | ||
content: body, | ||
error: null, | ||
response: response, | ||
body: body, | ||
referer: referer | ||
}); | ||
}); | ||
}); | ||
}); | ||
describe('shouldCrawlLinksFrom', function() { | ||
describe('should not crawl links from url', function() { | ||
beforeEach(function() { | ||
crawler.shouldCrawlLinksFrom = function(urlToCrawl) { | ||
return urlToCrawl != url; | ||
}; | ||
}); | ||
it('should not call _crawlUrls', function() { | ||
crawler._crawlUrl(url, referer, depth); | ||
expect(crawler._crawlUrls).not.toHaveBeenCalled(); | ||
}); | ||
}); | ||
describe('should crawl links from url', function() { | ||
beforeEach(function() { | ||
crawler.shouldCrawlLinksFrom = function(urlToCrawl) { | ||
return urlToCrawl == url; | ||
}; | ||
}); | ||
it('should call _crawlUrls with the correct list of urls', function() { | ||
crawler._crawlUrl(url, referer, depth); | ||
expect(crawler._crawlUrls).toHaveBeenCalledWith(['url1', 'url2', 'url3'], url, depth - 1); | ||
}); | ||
}); | ||
}); | ||
}); | ||
@@ -437,0 +508,0 @@ }); |
Sorry, the diff of this file is not supported yet
5012319
1329
287