js-crawler
Comparing version 0.3.17 to 0.3.18
@@ -193,3 +193,7 @@ var request = require('request');
     //console.log('Should skip? url = ', url, _.contains(self.knownUrls, url) || !self.shouldCrawl(url));
-    return _.contains(self.knownUrls, url) || !self.shouldCrawl(url);
+    var shouldCrawlUrl = self.shouldCrawl(url);
+    if (!shouldCrawlUrl) {
+      self._finishedCrawling(url);
+    }
+    return _.contains(self.knownUrls, url) || !shouldCrawlUrl;
   });
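The effect of this change: a url rejected by `shouldCrawl` is now also reported as finished via `_finishedCrawling`, so the `onAllFinished` callback fires even when every remaining url is filtered out. A minimal sketch of the fixed behavior, mirroring the new specs further down in this diff (the localhost url is the fixture those specs use):

```js
var Crawler = require('js-crawler');

var crawler = new Crawler().configure({
  shouldCrawl: function(url) {
    return false; // reject every url
  }
});

crawler.crawl('http://localhost:3000/simple_cycle/page1.html',
  function onSuccess(page) {},
  function onFailure() {},
  function onAllFinished(crawledUrls) {
    // Before 0.3.18 this callback could never fire when all urls were rejected
    console.log(crawledUrls.length); // 0
  }
);
```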
@@ -208,3 +212,3 @@ };
     url: url,
-    encoding: null, // Added by @tibetty so as to avoid request treating body as a string by default
+    encoding: null, // Added by @tibetty so as to avoid request treating body as a string by default
     rejectUnauthorized : false,
@@ -211,0 +215,0 @@ followRedirect: true,
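For context, these options are passed through to the `request` library when the crawler fetches a page. A standalone sketch of the same configuration, with an illustrative url and callback body that are not part of this diff:

```js
var request = require('request');

request({
  url: 'http://localhost:3000/',
  encoding: null,            // body arrives as a Buffer rather than a decoded string
  rejectUnauthorized: false, // do not fail on self-signed or invalid TLS certificates
  followRedirect: true
}, function(error, response, body) {
  // Because encoding is null, body is a Buffer and must be decoded explicitly
  if (!error) {
    console.log(body.toString('utf8'));
  }
});
```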
@@ -199,2 +199,49 @@ var Crawler = require('../crawler');
+  describe('shouldCrawl', () => {
+
+    it('should call onAllFinished when last url should not be crawled', (done) => {
+      var expectedUrls = [
+        'http://localhost:3000/simple_cycle/page1.html',
+        'http://localhost:3000/simple_cycle/page2.html'
+      ];
+
+      crawler.configure({
+        shouldCrawl: function(url) {
+          //Omit page3.html
+          return url.indexOf('page3.html') < 0;
+        }
+      });
+      crawler.crawl('http://localhost:3000/simple_cycle/page1.html',
+        function onSuccess(page) {
+        },
+        function onFailure() {
+          expect('Errors while crawling').to.be('');
+        },
+        function onAllFinished(crawledUrls) {
+          expect(crawledUrls.sort()).toEqual(expectedUrls.sort());
+          done();
+        }
+      );
+    });
+
+    it('should call onAllFinished when no urls should be crawled', (done) => {
+      crawler.configure({
+        shouldCrawl: function(url) {
+          return false;
+        }
+      });
+      crawler.crawl('http://localhost:3000/simple_cycle/page1.html',
+        function onSuccess(page) {
+        },
+        function onFailure() {
+          expect('Errors while crawling').to.be('');
+        },
+        function onAllFinished(crawledUrls) {
+          expect(crawledUrls.length).toEqual(0);
+          done();
+        }
+      );
+    });
+  });
+
   //TODO: Test for the correct referer value in a chain of visited pages
@@ -201,0 +248,0 @@ //TODO: Test for the shouldCrawlLinksFrom function
 {
   "name": "js-crawler",
-  "version": "0.3.17",
+  "version": "0.3.18",
   "description": "Web crawler for Node.js",
@@ -5,0 +5,0 @@ "main": "crawler.js",
@@ -161,5 +161,5 @@ js-crawler
-#### Forgetting crawled urls
+#### Reusing the same crawler instance for repeated crawling: forgetting crawled urls
-By default a crawler instance will remember all the urls it ever crawled and will not crawl them again. In order to make it forget all the crawled urls the method `forgetCrawled` can be used. There is another way to solve the same problem: create a new instance of a crawler.
+By default a crawler instance will remember all the urls it ever crawled and will not crawl them again. In order to make it forget all the crawled urls the method `forgetCrawled` can be used. There is another way to solve the same problem: create a new instance of a crawler. Example https://github.com/antivanov/js-crawler/blob/master/examples/github_forgetting_crawled_urls.js
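A minimal sketch of the two approaches described in that README passage (the url is a placeholder; see the linked example file for the full version):

```js
var Crawler = require('js-crawler');
var crawler = new Crawler();

crawler.crawl('http://example.com', function onSuccess(page) {
  console.log(page.url);
});

// Later, to re-crawl the same site with the same instance:
crawler.forgetCrawled();
crawler.crawl('http://example.com', function onSuccess(page) {
  console.log(page.url);
});

// Alternative: a fresh instance remembers nothing, so no reset is needed
var freshCrawler = new Crawler();
```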
@@ -190,5 +190,3 @@ #### Supported options
-Note: `shouldCrawl` determines if a given URL should be requested/visited at all, where as `shouldSpider` determines if the links on a given URL should be harvested/added to the crawling queue.
-Many users may find that setting `shouldCrawl` is sufficient, as URLs cannot be spidered if they are never visited/requested.
-A common use case for having these functions separated: if a user would like to check external links on a site for errors, without crawling those external links, the user could create a `shouldSpider` function that restricts spidering to the original URL.
+Note: `shouldCrawl` determines if a given URL should be requested/visited at all, whereas `shouldCrawlLinksFrom` determines if the links on a given URL should be harvested/added to the crawling queue. Many users may find that using `shouldCrawl` is sufficient, as links from a page cannot be crawled if the page is never visited/requested in the first place. A common use case for having these functions separated: if a user would like to check external links on a site for errors without crawling those external links, the user could create a `shouldCrawlLinksFrom` function that restricts crawling to the original url without visiting external links.
@@ -195,0 +193,0 @@
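A sketch of that use case, assuming the `configure`/`crawl` API shown elsewhere in this diff; `http://mysite.example` is a placeholder, and the failure callback receiving the failed page is an assumption here:

```js
var Crawler = require('js-crawler');

new Crawler().configure({
  // Visit every discovered url, including external ones, so broken links surface
  shouldCrawl: function(url) {
    return true;
  },
  // ...but only follow links found on pages of the original site
  shouldCrawlLinksFrom: function(url) {
    return url.indexOf('http://mysite.example') === 0;
  }
}).crawl('http://mysite.example',
  function onSuccess(page) {},
  function onFailure(page) {
    // Assumed: the failure callback receives the page that failed to load
    console.log('Broken link: ', page.url);
  },
  function onAllFinished(crawledUrls) {
    console.log('Checked', crawledUrls.length, 'urls');
  }
);
```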