js-crawler
Comparing version 0.3.17 to 0.3.18
@@ -193,3 +193,7 @@ var request = require('request');
     //console.log('Should skip? url = ', url, _.contains(self.knownUrls, url) || !self.shouldCrawl(url));
-    return _.contains(self.knownUrls, url) || !self.shouldCrawl(url);
+    var shouldCrawlUrl = self.shouldCrawl(url);
+    if (!shouldCrawlUrl) {
+      self._finishedCrawling(url);
+    }
+    return _.contains(self.knownUrls, url) || !shouldCrawlUrl;
   });
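The effect of this change: a url rejected by `shouldCrawl` is now also reported as finished via `_finishedCrawling`, so the `onAllFinished` callback fires even when every remaining url is filtered out. A minimal sketch of the fixed behavior, mirroring the new specs further down in this diff (the localhost url is the fixture those specs use):

```js
var Crawler = require('js-crawler');

var crawler = new Crawler().configure({
  shouldCrawl: function(url) {
    return false; // reject every url
  }
});

crawler.crawl('http://localhost:3000/simple_cycle/page1.html',
  function onSuccess(page) {},
  function onFailure() {},
  function onAllFinished(crawledUrls) {
    // Before 0.3.18 this callback could never fire when all urls were rejected
    console.log(crawledUrls.length); // 0
  }
);
```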
@@ -208,3 +212,3 @@ };
     url: url,
-    encoding: null, // Added by @tibetty so as to avoid request treating body as a string by default
+    encoding: null, // Added by @tibetty so as to avoid request treating body as a string by default
     rejectUnauthorized : false,
@@ -211,0 +215,0 @@ followRedirect: true,
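For context, these options are passed through to the `request` library when the crawler fetches a page. A standalone sketch of the same configuration, with an illustrative url and callback body that are not part of this diff:

```js
var request = require('request');

request({
  url: 'http://localhost:3000/',
  encoding: null,            // body arrives as a Buffer rather than a decoded string
  rejectUnauthorized: false, // do not fail on self-signed or invalid TLS certificates
  followRedirect: true
}, function(error, response, body) {
  // Because encoding is null, body is a Buffer and must be decoded explicitly
  if (!error) {
    console.log(body.toString('utf8'));
  }
});
```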
@@ -199,2 +199,49 @@ var Crawler = require('../crawler');
+  describe('shouldCrawl', () => {
+
+    it('should call onAllFinished when last url should not be crawled', (done) => {
+      var expectedUrls = [
+        'http://localhost:3000/simple_cycle/page1.html',
+        'http://localhost:3000/simple_cycle/page2.html'
+      ];
+
+      crawler.configure({
+        shouldCrawl: function(url) {
+          //Omit page3.html
+          return url.indexOf('page3.html') < 0;
+        }
+      });
+      crawler.crawl('http://localhost:3000/simple_cycle/page1.html',
+        function onSuccess(page) {
+        },
+        function onFailure() {
+          expect('Errors while crawling').to.be('');
+        },
+        function onAllFinished(crawledUrls) {
+          expect(crawledUrls.sort()).toEqual(expectedUrls.sort());
+          done();
+        }
+      );
+    });
+
+    it('should call onAllFinished when no urls should be crawled', (done) => {
+      crawler.configure({
+        shouldCrawl: function(url) {
+          return false;
+        }
+      });
+      crawler.crawl('http://localhost:3000/simple_cycle/page1.html',
+        function onSuccess(page) {
+        },
+        function onFailure() {
+          expect('Errors while crawling').to.be('');
+        },
+        function onAllFinished(crawledUrls) {
+          expect(crawledUrls.length).toEqual(0);
+          done();
+        }
+      );
+    });
+  });
+
   //TODO: Test for the correct referer value in a chain of visited pages
@@ -201,0 +248,0 @@ //TODO: Test for the shouldCrawlLinksFrom function
 {
   "name": "js-crawler",
-  "version": "0.3.17",
+  "version": "0.3.18",
   "description": "Web crawler for Node.js",
@@ -5,0 +5,0 @@ "main": "crawler.js",
@@ -161,5 +161,5 @@ js-crawler
-#### Forgetting crawled urls
+#### Reusing the same crawler instance for repeated crawling: forgetting crawled urls
-By default a crawler instance will remember all the urls it ever crawled and will not crawl them again. In order to make it forget all the crawled urls the method `forgetCrawled` can be used. There is another way to solve the same problem: create a new instance of a crawler.
+By default a crawler instance will remember all the urls it ever crawled and will not crawl them again. In order to make it forget all the crawled urls the method `forgetCrawled` can be used. There is another way to solve the same problem: create a new instance of a crawler. Example https://github.com/antivanov/js-crawler/blob/master/examples/github_forgetting_crawled_urls.js
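A minimal sketch of the two approaches described in that README passage (the url is a placeholder; see the linked example file for the full version):

```js
var Crawler = require('js-crawler');
var crawler = new Crawler();

crawler.crawl('http://example.com', function onSuccess(page) {
  console.log(page.url);
});

// Later, to re-crawl the same site with the same instance:
crawler.forgetCrawled();
crawler.crawl('http://example.com', function onSuccess(page) {
  console.log(page.url);
});

// Alternative: a fresh instance remembers nothing, so no reset is needed
var freshCrawler = new Crawler();
```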
@@ -190,5 +190,3 @@ #### Supported options
-Note: `shouldCrawl` determines if a given URL should be requested/visited at all, where as `shouldSpider` determines if the links on a given URL should be harvested/added to the crawling queue.
-Many users may find that setting `shouldCrawl` is sufficient, as URLs cannot be spidered if they are never visited/requested.
-A common use case for having these functions separated: if a user would like to check external links on a site for errors, without crawling those external links, the user could create a `shouldSpider` function that restricts spidering to the original URL.
+Note: `shouldCrawl` determines if a given URL should be requested/visited at all, whereas `shouldCrawlLinksFrom` determines if the links on a given URL should be harvested/added to the crawling queue. Many users may find that using `shouldCrawl` is sufficient, as links from a page cannot be crawled if the page is never visited/requested in the first place. A common use case for having these functions separated: if a user would like to check external links on a site for errors without crawling those external links, the user could create a `shouldCrawlLinksFrom` function that restricts crawling to the original url without visiting external links.
@@ -195,0 +193,0 @@
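A sketch of that use case, assuming the `configure`/`crawl` API shown elsewhere in this diff; `http://mysite.example` is a placeholder, and the failure callback receiving the failed page is an assumption here:

```js
var Crawler = require('js-crawler');

new Crawler().configure({
  // Visit every discovered url, including external ones, so broken links surface
  shouldCrawl: function(url) {
    return true;
  },
  // ...but only follow links found on pages of the original site
  shouldCrawlLinksFrom: function(url) {
    return url.indexOf('http://mysite.example') === 0;
  }
}).crawl('http://mysite.example',
  function onSuccess(page) {},
  function onFailure(page) {
    // Assumed: the failure callback receives the page that failed to load
    console.log('Broken link: ', page.url);
  },
  function onAllFinished(crawledUrls) {
    console.log('Checked', crawledUrls.length, 'urls');
  }
);
```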