
js-crawler

Comparing version 0.3.17 to 0.3.18


crawler.js

@@ -193,3 +193,7 @@ var request = require('request');
   //console.log('Should skip? url = ', url, _.contains(self.knownUrls, url) || !self.shouldCrawl(url));
-  return _.contains(self.knownUrls, url) || !self.shouldCrawl(url);
+  var shouldCrawlUrl = self.shouldCrawl(url);
+  if (!shouldCrawlUrl) {
+    self._finishedCrawling(url);
+  }
+  return _.contains(self.knownUrls, url) || !shouldCrawlUrl;
 });
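
This change calls `self._finishedCrawling(url)` for URLs rejected by `shouldCrawl`, so a crawl whose last discovered URL is rejected can still report completion through `onAllFinished` (the behaviour covered by the new tests further down). A minimal sketch of that scenario, assuming the documented js-crawler API (`new Crawler().configure(...)`, `crawl(url, onSuccess, onFailure, onAllFinished)`) and reusing the localhost test URL from the spec below for illustration:

var Crawler = require('js-crawler');

var crawler = new Crawler().configure({
  shouldCrawl: function(url) {
    // Reject everything except the starting page; before this fix the
    // rejected links were never reported as finished.
    return url.indexOf('page1.html') >= 0;
  }
});

crawler.crawl('http://localhost:3000/simple_cycle/page1.html',
  function onSuccess(page) {
    console.log('Visited', page.url);
  },
  function onFailure(page) {
    console.log('Failed', page.url);
  },
  function onAllFinished(crawledUrls) {
    // With the fix this fires even though the discovered links were skipped.
    console.log('Finished crawling', crawledUrls);
  }
);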

@@ -208,3 +212,3 @@ };
   url: url,
-  encoding: null, // Added by @tibetty so as to avoid request treating body as a string by default
+  encoding: null, // Added by @tibetty so as to avoid request treating body as a string by default
   rejectUnauthorized : false,

@@ -211,0 +215,0 @@ followRedirect: true,
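
For context on the `encoding: null` option seen in this hunk: with the `request` library, the default encoding delivers the response body as a decoded string, while `encoding: null` delivers a raw Buffer that the crawler can decode itself. A small standalone sketch, not part of the diff (the URL is an illustrative placeholder):

var request = require('request');

request({
  url: 'http://example.com/',        // illustrative placeholder URL
  encoding: null,                    // body is delivered as a Buffer, not a string
  rejectUnauthorized: false,         // accept self-signed certificates, as in crawler.js
  followRedirect: true
}, function(error, response, body) {
  if (!error) {
    console.log(Buffer.isBuffer(body)); // true because encoding is null
  }
});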

@@ -199,2 +199,49 @@ var Crawler = require('../crawler');
+describe('shouldCrawl', () => {
+  it('should call onAllFinished when last url should not be crawled', (done) => {
+    var expectedUrls = [
+      'http://localhost:3000/simple_cycle/page1.html',
+      'http://localhost:3000/simple_cycle/page2.html'
+    ];
+    crawler.configure({
+      shouldCrawl: function(url) {
+        //Omit page3.html
+        return url.indexOf('page3.html') < 0;
+      }
+    })
+    crawler.crawl('http://localhost:3000/simple_cycle/page1.html',
+      function onSuccess(page) {
+      },
+      function onFailure() {
+        expect('Errors while crawling').to.be('');
+      },
+      function onAllFinished(crawledUrls) {
+        expect(crawledUrls.sort()).toEqual(expectedUrls.sort());
+        done();
+      }
+    );
+  });
+  it('should call onAllFinished when no urls should be crawled', (done) => {
+    crawler.configure({
+      shouldCrawl: function(url) {
+        return false;
+      }
+    })
+    crawler.crawl('http://localhost:3000/simple_cycle/page1.html',
+      function onSuccess(page) {
+      },
+      function onFailure() {
+        expect('Errors while crawling').to.be('');
+      },
+      function onAllFinished(crawledUrls) {
+        expect(crawledUrls.length).toEqual(0);
+        done();
+      }
+    );
+  });
+});
 //TODO: Test for the correct referer value in a chain of visited pages

@@ -201,0 +248,0 @@ //TODO: Test for the shouldCrawlLinksFrom function

package.json

 {
   "name": "js-crawler",
-  "version": "0.3.17",
+  "version": "0.3.18",
   "description": "Web crawler for Node.js",

@@ -5,0 +5,0 @@ "main": "crawler.js",

README.md

@@ -161,5 +161,5 @@ js-crawler
-#### Forgetting crawled urls
+#### Reusing the same crawler instance for repeated crawling: forgetting crawled urls

-By default a crawler instance will remember all the urls it ever crawled and will not crawl them again. In order to make it forget all the crawled urls the method `forgetCrawled` can be used. There is another way to solve the same problem: create a new instance of a crawler.
+By default a crawler instance will remember all the urls it ever crawled and will not crawl them again. In order to make it forget all the crawled urls the method `forgetCrawled` can be used. There is another way to solve the same problem: create a new instance of a crawler. Example https://github.com/antivanov/js-crawler/blob/master/examples/github_forgetting_crawled_urls.js
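
The linked example is not reproduced here; the following is a minimal sketch of the same pattern, assuming the `forgetCrawled` method and crawl callbacks described in this README:

var Crawler = require('js-crawler');

var crawler = new Crawler().configure({depth: 2});

crawler.crawl('https://github.com', function onSuccess(page) {
  console.log(page.url);
}, function onFailure() {
}, function onAllFinished() {
  // The same instance would otherwise skip every URL it has already visited;
  // forgetting them allows the instance to be reused for a fresh crawl.
  crawler.forgetCrawled();
  crawler.crawl('https://github.com', function onSuccess(page) {
    console.log('re-crawled', page.url);
  });
});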

@@ -190,5 +190,3 @@ #### Supported options
-Note: `shouldCrawl` determines if a given URL should be requested/visited at all, where as `shouldSpider` determines if the links on a given URL should be harvested/added to the crawling queue.
-Many users may find that setting `shouldCrawl` is sufficient, as URLs cannot be spidered if they are never visited/requested.
-A common use case for having these functions separated: if a user would like to check external links on a site for errors, without crawling those external links, the user could create a `shouldSpider` function that restricts spidering to the original URL.
+Note: `shouldCrawl` determines if a given URL should be requested/visited at all, where as `shouldCrawlLinksFrom` determines if the links on a given URL should be harvested/added to the crawling queue. Many users may find that using `shouldCrawl` is sufficient, as links from a page cannot be crawled if the page is never visited/requested in the first place. A common use case for having these functions separated: if a user would like to check external links on a site for errors without crawling those external links, the user could create a `shouldCrawlLinksFrom` function that restricts crawling to the original url without visiting external links.

@@ -195,0 +193,0 @@
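
A minimal sketch of the use case described in the updated note above, assuming the `shouldCrawl` and `shouldCrawlLinksFrom` options documented in this README (the site URL is an illustrative placeholder):

var Crawler = require('js-crawler');

var crawler = new Crawler().configure({
  // Visit every discovered URL, including external ones, so broken external
  // links surface through the failure callback...
  shouldCrawl: function(url) {
    return true;
  },
  // ...but only harvest further links from pages on the original site, so the
  // crawl does not spread across the external sites themselves.
  shouldCrawlLinksFrom: function(url) {
    return url.indexOf('http://mysite.example.com') === 0; // illustrative site root
  }
});

crawler.crawl('http://mysite.example.com/', function onSuccess(page) {
  console.log('OK', page.url);
}, function onFailure(page) {
  console.log('Broken link', page.url);
});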
