Socket
Socket
Sign inDemoInstall

js-crawler

Package Overview
Dependencies
57
Maintainers
1
Versions
23
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 0.3.16 to 0.3.17

8

crawler.js

@@ -86,5 +86,8 @@ var request = require('request');

this.maxRequestsPerSecond = DEFAULT_MAX_REQUESTS_PER_SECOND;
this.shouldCrawl = function() {
this.shouldCrawl = function(url) {
return true;
};
this.shouldCrawlLinksFrom = function(url) {
return true;
};
//Urls that are queued for crawling, for some of them HTTP requests may not yet have been issued

@@ -106,2 +109,3 @@ this._currentUrlsToCrawl = [];

this.shouldCrawl = (options && options.shouldCrawl) || this.shouldCrawl;
this.shouldCrawlLinksFrom = (options && options.shouldCrawlLinksFrom) || this.shouldCrawlLinksFrom;
this.onSuccess = _.noop;

@@ -242,3 +246,3 @@ this.onFailure = _.noop;

self.crawledUrls.push(lastUrlInRedirectChain);
if (depth > 1 && isTextContent) {
if (self.shouldCrawlLinksFrom(lastUrlInRedirectChain) && depth > 1 && isTextContent) {
self._crawlUrls(self._getAllUrls(lastUrlInRedirectChain, body), lastUrlInRedirectChain, depth - 1);

@@ -245,0 +249,0 @@ }

@@ -200,2 +200,5 @@ var Crawler = require('../crawler');

//TODO: Test for the correct referer value in a chain of visited pages
//TODO: Test for the shouldCrawlLinksFrom function
//TODO: Test for shouldCrawl
//TODO: Redirect with another HTTP code? 301?

@@ -202,0 +205,0 @@ //TODO: Binary content, links are not analyzed in binary content, binary content itself is not returned (as it can be too large)(?)

{
"name": "js-crawler",
"version": "0.3.16",
"version": "0.3.17",
"description": "Web crawler for Node.js",

@@ -5,0 +5,0 @@ "main": "crawler.js",

@@ -180,4 +180,2 @@ js-crawler

* `shouldCrawl` - function that specifies whether a url should be crawled, returns `true` or `false`.
* `maxRequestsPerSecond` - the maximum number of HTTP requests per second that can be made by the crawler, default value is 100

@@ -187,4 +185,15 @@

Example:
* `shouldCrawl` - function that specifies whether a url should be crawled/requested, returns `true` or `false`,
argument is the current `url` the crawler considers for crawling
* `shouldCrawlLinksFrom` - function that specifies whether the crawler should crawl links found at a given url, returns `true` or `false`, argument is the current `url` being crawled
Note: `shouldCrawl` determines if a given URL should be requested/visited at all, whereas `shouldCrawlLinksFrom` determines if the links on a given URL should be harvested/added to the crawling queue.
Many users may find that setting `shouldCrawl` is sufficient, as URLs cannot be spidered if they are never visited/requested.
A common use case for having these functions separated: if a user would like to check external links on a site for errors, without crawling those external links, the user could create a `shouldCrawlLinksFrom` function that restricts link harvesting to the original URL.
**Examples:**
The following will crawl the specified URL, but not allow external URLs to be visited/requested, and therefore not search for additional links to crawl on the external URLs:
```javascript

@@ -204,4 +213,19 @@ var Crawler = require("js-crawler");

Default value is a function that always returns `true`.
The following will crawl the specified URL, allow external URLs to be visited/requested, but will not search for additional links to crawl on the external URLs:
```javascript
var Crawler = require("js-crawler");
var crawler = new Crawler().configure({
shouldCrawlLinksFrom: function(url) {
return url.indexOf("reddit.com") < 0;
}
});
crawler.crawl("http://www.reddit.com/r/javascript", function(page) {
console.log(page.url);
});
```
The default value for each is a function that always returns `true`.
#### Development

@@ -208,0 +232,0 @@

@@ -435,2 +435,73 @@ var Crawler = require('../crawler');

});
describe('shouldCrawl', function() {
describe('should not crawl url', function() {
beforeEach(function() {
crawler.shouldCrawl = function(urlToCrawl) {
return urlToCrawl != url;
};
});
it('should not call onSuccess', function() {
crawler._crawlUrl(url, referer, depth);
expect(crawler.onSuccess).not.toHaveBeenCalled();
});
});
describe('should crawl url', function() {
beforeEach(function() {
crawler.shouldCrawl = function(urlToCrawl) {
return urlToCrawl == url;
};
});
it('should call onSuccess', function() {
crawler._crawlUrl(url, referer, depth);
expect(crawler.onSuccess).toHaveBeenCalledWith({
url: url,
status: OK,
content: body,
error: null,
response: response,
body: body,
referer: referer
});
});
});
});
describe('shouldCrawlLinksFrom', function() {
describe('should not crawl links from url', function() {
beforeEach(function() {
crawler.shouldCrawlLinksFrom = function(urlToCrawl) {
return urlToCrawl != url;
};
});
it('should not call _crawlUrls', function() {
crawler._crawlUrl(url, referer, depth);
expect(crawler._crawlUrls).not.toHaveBeenCalled();
});
});
describe('should crawl links from url', function() {
beforeEach(function() {
crawler.shouldCrawlLinksFrom = function(urlToCrawl) {
return urlToCrawl == url;
};
});
it('should call _crawlUrls with the correct list of urls', function() {
crawler._crawlUrl(url, referer, depth);
expect(crawler._crawlUrls).toHaveBeenCalledWith(['url1', 'url2', 'url3'], url, depth - 1);
});
});
});
});

@@ -437,0 +508,0 @@ });

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc