js-crawler - npm Package Compare versions

js-crawler

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Comparing version 0.3.13 to 0.3.14

e2e/static/non_http_https_links/page1.html

e2e/static/non_http_https_links/page2.html

crawler.js

		@@ -257,4 +257,4 @@ var request = require('request');
		/**
		 * Tell whether a response carries HTML text, judged by its Content-Type header.
		 *
		 * Always returns a real boolean, and tolerates a missing response, missing
		 * headers or a missing content-type header (all of which yield `false`).
		 *
		 * @param {Object} [response] - response object; only `headers['content-type']` is read.
		 * @returns {boolean} `true` when the content type starts with `text/html`.
		 */
		Crawler.prototype._isTextContent = function(response) {
		  // Short-circuits to a falsy value as soon as any link of the chain is absent.
		  var contentType = response && response.headers && response.headers['content-type'];

		  // `match` yields an array or null; coerce so callers get strictly true/false.
		  return Boolean(contentType && contentType.match(/^text\/html.*$/));
		};
		@@ -290,2 +290,7 @@

		/**
		 * Decide whether a link uses a protocol this crawler can follow.
		 *
		 * A link is supported when it has no explicit scheme (relative, root-relative,
		 * protocol-relative `//host/...`, fragment or empty links) or when its scheme
		 * is `http` or `https` (case-insensitive). Links with any other leading scheme,
		 * such as `mailto:`, `ftp:`, `javascript:` or `tel:`, are not supported.
		 *
		 * Only the scheme at the start of the link is inspected, so a URL that merely
		 * appears inside a path or query string (e.g. `/redirect?to=http://x`) does not
		 * affect the decision.
		 *
		 * @param {string} link - raw link value as found in the page.
		 * @returns {boolean} `true` if the link may be crawled, `false` otherwise.
		 */
		Crawler.prototype._isLinkProtocolSupported = function(link) {
		  if (typeof link !== 'string') {
		    return false;
		  }

		  // RFC 3986 scheme: a letter followed by letters, digits, "+", "-" or ".", then ":".
		  var schemeMatch = /^\s*([a-z][a-z0-9+.\-]*):/i.exec(link);
		  if (!schemeMatch) {
		    // No scheme at all: the link is resolved against the current page.
		    return true;
		  }

		  var scheme = schemeMatch[1].toLowerCase();
		  return scheme === 'http' || scheme === 'https';
		};

		Crawler.prototype._getAllUrls = function(defaultBaseUrl, body) {
		@@ -308,3 +313,5 @@ var self = this;
		.uniq()
		.filter(this.shouldCrawl)
		.filter(function(link) {
		return self._isLinkProtocolSupported(link) && self.shouldCrawl(link);
		})
		.value();
		@@ -311,0 +318,0 @@

e2e/crawler.spec.js

		@@ -172,2 +172,29 @@ var Crawler = require('../crawler');

		// End-to-end check: pages that link to non-http(s) resources.
		describe('references contain links to non-http resources', () => {
		  // Crawls page1.html and expects exactly page1.html and page2.html to be reported.
		  it('should ignore mailto link', (done) => {
		    var expectedUrls = [
		      'http://localhost:3000/non_http_https_links/page1.html',
		      'http://localhost:3000/non_http_https_links/page2.html'
		    ];
		    // Filled by the success callback; the assertion below uses the list
		    // handed to `finished`, not this one.
		    var visitedUrls = [];

		    function recordPage(page) {
		      visitedUrls.push(page.url);
		    }

		    // Any crawl error fails the spec through a deliberately false expectation.
		    function failOnError(error) {
		      console.log(error);
		      expect('Error while crawling').toEqual('');
		      done();
		    }

		    // Compare order-independently by sorting both lists first.
		    function verifyCrawled(reportedUrls) {
		      var actual = reportedUrls.sort();
		      expect(actual).toEqual(expectedUrls.sort());
		      done();
		    }

		    crawler.crawl({
		      url: 'http://localhost:3000/non_http_https_links/page1.html',
		      success: recordPage,
		      failure: failOnError,
		      finished: verifyCrawled
		    });
		  });
		});

		//TODO: Redirect with another HTTP code? 301?
		@@ -174,0 +201,0 @@ //TODO: Binary content, links are not analyzed in binary content, binary content itself is not returned (as it can be too large)(?)

package.json

		{
		"name": "js-crawler",
		"version": "0.3.13",
		"version": "0.3.14",
		"description": "Web crawler for Node.js",
		@@ -5,0 +5,0 @@ "main": "crawler.js",

spec/crawler.spec.js

		@@ -70,2 +70,12 @@ var Crawler = require('../crawler');

		// An anchor whose href is a mailto: address must yield no URLs.
		it('should ignore mailto links', function() {
		  var html = '<a href="mailto:someone@somewhere.com"></a>';
		  var urls = crawler._getAllUrls(baseUrl, html);

		  expect(urls).toEqual([]);
		});

		// An anchor whose href points at an ftp:// server must yield no URLs.
		it('should ignore ftp links', function() {
		  var html = '<a href="ftp://myserver.org"></a>';
		  var urls = crawler._getAllUrls(baseUrl, html);

		  expect(urls).toEqual([]);
		});

		describe('ignoreRelative option', function() {
		@@ -391,2 +401,6 @@
		});

		// Calling _isTextContent with no argument must give exactly `false`, not throw.
		it('if response is not defined, content is not considered to be text', function() {
		  var isText = crawler._isTextContent();

		  expect(isText).toBe(false);
		});
		});
		@@ -393,0 +407,0 @@

Improved metrics