js-crawler
Comparing version 0.3.11 to 0.3.13
@@ -211,3 +211,3 @@ var request = require('request');
 }
-}, function(error, response, body) {
+}, function(error, response) {
 if (self.knownUrls[url]) {
@@ -221,2 +221,5 @@ //Was already crawled while the request has been processed, no need to call callbacks
 });
+var isTextContent = self._isTextContent(response);
+var body = isTextContent ? self._getDecodedBody(response) : '<<...binary content (omitted by js-crawler)...>>';
 if (!error && (response.statusCode === 200)) {
@@ -237,12 +240,4 @@ //If no redirects, then response.request.uri.href === url, otherwise last url
 self.crawledUrls.push(lastUrlInRedirectChain);
-/*
-  Some minor changes made by @tibetty to:
-  1. ensure that further link analysis is only performed on html content;
-  2. convert the binary buffer to a properly encoded string to facilitate analysis.
-*/
-if (depth > 1 && response.headers['content-type'].match(/^text\/html.*$/)) {
-  var encoding = 'utf8';
-  if (response.headers['content-encoding']) encoding = response.headers['content-encoding'];
-  var encodedBody = body.toString(encoding);
-  self._crawlUrls(self._getAllUrls(lastUrlInRedirectChain, encodedBody), lastUrlInRedirectChain, depth - 1);
+if (depth > 1 && isTextContent) {
+  self._crawlUrls(self._getAllUrls(lastUrlInRedirectChain, body), lastUrlInRedirectChain, depth - 1);
 }
@@ -254,2 +249,3 @@ }
 status: response ? response.statusCode : undefined,
+content: body,
 error: error,
@@ -264,2 +260,16 @@ response: response,
+Crawler.prototype._isTextContent = function(response) {
+  return response.headers && response.headers['content-type']
+    && response.headers['content-type'].match(/^text\/html.*$/);
+};
+
+Crawler.prototype._getDecodedBody = function(response) {
+  var encoding = 'utf8';
+  if (response.headers['content-encoding']) {
+    encoding = response.headers['content-encoding'];
+  }
+  return response.body.toString(encoding);
+};
+
 Crawler.prototype._stripComments = function(str) {
@@ -269,5 +279,21 @@ return str.replace(/<!--.*?-->/g, '');
-Crawler.prototype._getAllUrls = function(baseUrl, body) {
+Crawler.prototype._getBaseUrl = function(defaultBaseUrl, body) {
+  /*
+   * Resolving the base url following the algorithm from
+   * https://www.w3.org/TR/html5/document-metadata.html#the-base-element
+   */
+  var baseUrlRegex = /<base href="(.*?)">/;
+  var baseUrlInPage = body.match(baseUrlRegex);
+  if (!baseUrlInPage) {
+    return defaultBaseUrl;
+  }
+  return url.resolve(defaultBaseUrl, baseUrlInPage[1]);
+};
+
+Crawler.prototype._getAllUrls = function(defaultBaseUrl, body) {
 var self = this;
 body = this._stripComments(body);
+var baseUrl = this._getBaseUrl(defaultBaseUrl, body);
 var linksRegex = self.ignoreRelative ? /<a[^>]+?href=".*?:\/\/.*?"/gmi : /<a[^>]+?href=".*?"/gmi;
@@ -274,0 +300,0 @@ var links = body.match(linksRegex) || [];
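A note on the new decoding path: `_getDecodedBody` reuses the HTTP `content-encoding` header as the argument to `Buffer#toString`. `content-encoding` actually names a compression scheme such as `gzip` (the charset, if any, lives in `content-type`), and `Buffer#toString` accepts only character encodings, so a value like `gzip` would throw against a real buffer. A minimal sketch of a more defensive decode, assuming the same response shape (`response.headers`, `response.body` as a `Buffer`); this is not part of js-crawler:

```js
// Minimal sketch (not js-crawler code): derive the charset from the
// content-type header instead of content-encoding, which names a
// compression scheme (gzip, deflate), not a character encoding.
function decodeBody(response) {
  var contentType = (response.headers && response.headers['content-type']) || '';
  var match = contentType.match(/charset=([^;]+)/i);
  var charset = match ? match[1].trim().toLowerCase() : 'utf8';
  // Buffer#toString only understands character encodings ('utf8',
  // 'latin1', ...); fall back to utf8 for anything it does not know.
  if (!Buffer.isEncoding(charset)) {
    charset = 'utf8';
  }
  return response.body.toString(charset);
}
```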
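The new `_getBaseUrl` defers to Node's `url.resolve`, which already implements the relative/absolute distinction that the HTML5 base-element algorithm requires. A quick illustration against the two cases the specs in this diff exercise:

```js
var url = require('url');

// An absolute base href replaces the page's own base entirely:
url.resolve('http://localhost:8080/defaultbase/',
            'http://localhost:8080/specifiedabsolutebase/');
// => 'http://localhost:8080/specifiedabsolutebase/'

// A relative base href is first resolved against the page's own url:
url.resolve('http://localhost:8080/defaultbase/', 'specifiedrelativebase/');
// => 'http://localhost:8080/defaultbase/specifiedrelativebase/'
```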
@@ -70,8 +70,111 @@ var Crawler = require('../crawler');
+describe('simple cycle', () => {
+  it('should crawl all urls in a cycle only once', (done) => {
+    var crawledUrls = [];
+    var expectedUrls = [
+      'http://localhost:3000/simple_cycle/page1.html',
+      'http://localhost:3000/simple_cycle/page2.html',
+      'http://localhost:3000/simple_cycle/page3.html'
+    ];
+    crawler.crawl('http://localhost:3000/simple_cycle/page1.html',
+      function onSuccess(page) {
+        crawledUrls.push(page.url);
+      },
+      function onFailure() {
+        fail('Errors while crawling');
+      },
+      function onAllFinished(crawledUrls) {
+        expect(crawledUrls.sort()).toEqual(expectedUrls.sort());
+        done();
+      }
+    );
+  });
+});
+describe('page success', () => {
+  it('should return url, content, status', (done) => {
+    crawler.crawl('http://localhost:3000/one_page_graph/page1.html',
+      function onSuccess(page) {
+        expect(page.url).toEqual('http://localhost:3000/one_page_graph/page1.html');
+        expect(page.status).toEqual(200);
+        expect(page.content).toEqual('<html><body>One page graph.</body></html>');
+        expect(page.error).toBeNull();
+        expect(page.response).not.toBeNull();
+        expect(page.body).toEqual(page.content);
+        done();
+      }
+    );
+  });
+});
+describe('page error', () => {
+  it('should return error', (done) => {
+    var HTTP_NOT_FOUND = 404;
+    crawler.crawl('http://localhost:3000/one_page_graph/no_such_page.html', null,
+      function onError(page) {
+        expect(page.url).toEqual('http://localhost:3000/one_page_graph/no_such_page.html');
+        expect(page.status).toEqual(HTTP_NOT_FOUND);
+        expect(page.content).toEqual('Cannot GET /one_page_graph/no_such_page.html\n');
+        expect(page.error).toBeNull();
+        expect(page.response).not.toBeNull();
+        expect(page.body).toEqual(page.content);
+        done();
+      }
+    );
+  });
+});
+describe('base tag', () => {
+  it('should use base url as the base for relative urls', (done) => {
+    var crawledUrls = [];
+    var expectedUrls = [
+      'http://localhost:3000/base_tag/index/page1.html',
+      'http://localhost:3000/base_tag/page2.html'
+    ];
+    crawler.crawl({
+      url: 'http://localhost:3000/base_tag/index/page1.html',
+      success: function(page) {
+        crawledUrls.push(page.url);
+      },
+      finished: function(crawledUrls) {
+        expect(crawledUrls.sort()).toEqual(expectedUrls.sort());
+        done();
+      }
+    });
+  });
+  it('should resolve relative base url', (done) => {
+    var crawledUrls = [];
+    var expectedUrls = [
+      'http://localhost:3000/base_tag/index/page1relativebase.html',
+      'http://localhost:3000/base_tag/index/relative_base_tag/page3.html'
+    ];
+    crawler.crawl({
+      url: 'http://localhost:3000/base_tag/index/page1relativebase.html',
+      success: function(page) {
+        crawledUrls.push(page.url);
+      },
+      finished: function(crawledUrls) {
+        expect(crawledUrls.sort()).toEqual(expectedUrls.sort());
+        done();
+      }
+    });
+  });
+});
+//TODO: Redirect with another HTTP code? 301?
+//TODO: Cycles
+//TODO: Binary content: links are not analyzed in binary content, and the binary content itself is not returned (as it can be too large)(?)
+//TODO: Test for throughput limitation
+//TODO: Test for depth limitation
+//TODO: Forgetting crawled urls
+//TODO: Reusing the same crawler: no new urls will be crawled
+//TODO: Test for crawling 1000 links (generate them in server.js)
 });
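The base-tag crawl tests above imply fixture pages that set a `<base href>`. The fixtures themselves are not part of this diff, so the following reconstruction is hypothetical, but the expected urls pin down how resolution must behave:

```js
// Hypothetical fixture content (the real page1relativebase.html is not
// shown in this diff), served at /base_tag/index/page1relativebase.html:
var fragment =
  '<base href="relative_base_tag/">' + // relative base, resolved against the page url
  '<a href="page3.html"></a>';

// The crawler would then resolve the link as:
// crawler._getAllUrls('http://localhost:3000/base_tag/index/page1relativebase.html', fragment)
// => ['http://localhost:3000/base_tag/index/relative_base_tag/page3.html']
```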
 {
   "name": "js-crawler",
-  "version": "0.3.11",
+  "version": "0.3.13",
   "description": "Web crawler for Node.js",
@@ -5,0 +5,0 @@ "main": "crawler.js",
@@ -152,2 +152,43 @@ var Crawler = require('../crawler');
 });
+describe('base url specified in HTML', () => {
+  var defaultBaseUrl = 'http://localhost:8080/defaultbase/';
+  var specifiedAbsoluteBaseUrl = 'http://localhost:8080/specifiedabsolutebase/';
+  var specifiedRelativeBaseUrl = 'specifiedrelativebase/';
+  it('should resolve relative urls using base url', function() {
+    var fragment = '<base href="' + specifiedAbsoluteBaseUrl + '">\
+      <a href="resource/1"></a>\
+      <a href="resource/2"></a>\
+      <a href="resource/3"></a>';
+    expect(crawler._getAllUrls(defaultBaseUrl, fragment))
+      .toEqual([
+        'http://localhost:8080/specifiedabsolutebase/resource/1',
+        'http://localhost:8080/specifiedabsolutebase/resource/2',
+        'http://localhost:8080/specifiedabsolutebase/resource/3'
+      ]);
+  });
+  it('should resolve absolute urls to themselves', function() {
+    var fragment = '<base href="' + specifiedAbsoluteBaseUrl + '">\
+      <a href="/resource/1"></a>';
+    expect(crawler._getAllUrls(defaultBaseUrl, fragment))
+      .toEqual([
+        'http://localhost:8080/resource/1'
+      ]);
+  });
+  it('should resolve relative urls with relative base url specified', function() {
+    var fragment = '<base href="' + specifiedRelativeBaseUrl + '">\
+      <a href="resource/1"></a>';
+    expect(crawler._getAllUrls(defaultBaseUrl, fragment))
+      .toEqual([
+        'http://localhost:8080/defaultbase/specifiedrelativebase/resource/1'
+      ]);
+  });
+});
 });
@@ -223,6 +264,8 @@
 var errorStatusCode = 404;
+var errorBody = 'Server error';
 var errorResponse = {
-  statusCode: errorStatusCode
+  headers: {'content-type': 'text/html'},
+  statusCode: errorStatusCode,
+  body: errorBody
 };
-var errorBody = 'Server error';
@@ -244,2 +287,3 @@ describe('error', function() {
 status: errorStatusCode,
+content: errorBody,
 error: error,
@@ -274,3 +318,3 @@ response: errorResponse,
 headers: {
-  'content-type': ''
+  'content-type': 'text/html'
 },
@@ -281,3 +325,4 @@ request: {
 }
-}
+},
+body: body
 };
@@ -336,4 +381,4 @@ spyOn(crawler, 'onSuccess');
 response.headers['content-type'] = 'text/html';
-body = jasmine.createSpyObj('bodyBuffer', ['toString']);
-body.toString.and.returnValue(decodedBody);
+response.body = jasmine.createSpyObj('bodyBuffer', ['toString']);
+response.body.toString.and.returnValue(decodedBody);
 });
@@ -343,3 +388,3 @@
 crawler._crawlUrl(url, referer, depth);
-expect(body.toString).toHaveBeenCalledWith('utf8');
+expect(response.body.toString).toHaveBeenCalledWith('utf8');
 });
@@ -350,3 +395,3 @@
 crawler._crawlUrl(url, referer, depth);
-expect(body.toString).toHaveBeenCalledWith('gzip');
+expect(response.body.toString).toHaveBeenCalledWith('gzip');
 });
@@ -353,0 +398,0 @@ });
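Taken together, the spec changes pin down the shape of the page object that the success and failure callbacks receive as of 0.3.13. A summary sketch inferred from the assertions in this diff (not official documentation; the example values are illustrative):

```js
// Page object passed to the crawl callbacks, as exercised by the specs:
var page = {
  url: 'http://localhost:3000/page1.html', // the crawled url
  status: 200,                  // response.statusCode, undefined when there was no response
  content: '<html>...</html>',  // decoded body, or the binary-content placeholder string
  error: null,                  // the request error, if any
  response: {/* raw response object */},
  body: '<html>...</html>'      // the specs assert that body equals content
};
```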