Socket
Socket
Sign inDemoInstall

js-crawler

Package Overview
Dependencies
57
Maintainers
1
Versions
23
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 0.3.11 to 0.3.13

e2e/static/base_tag/index/page1.html

50

crawler.js

@@ -211,3 +211,3 @@ var request = require('request');

}
}, function(error, response, body) {
}, function(error, response) {
if (self.knownUrls[url]) {

@@ -221,2 +221,5 @@ //Was already crawled while the request has been processed, no need to call callbacks

});
var isTextContent = self._isTextContent(response);
var body = isTextContent ? self._getDecodedBody(response) : '<<...binary content (omitted by js-crawler)...>>';
if (!error && (response.statusCode === 200)) {

@@ -237,12 +240,4 @@ //If no redirects, then response.request.uri.href === url, otherwise last url

self.crawledUrls.push(lastUrlInRedirectChain);
/*
Some minor changes made by @tibetty to:
1. ensure further link analysis only make upon html content;
2. convert binary buffer to properly an encoded string to facilitate analysis.
*/
if (depth > 1 && response.headers['content-type'].match(/^text\/html.*$/)) {
var encoding = 'utf8';
if (response.headers['content-encoding']) encoding = response.headers['content-encoding'];
var encodedBody = body.toString(encoding);
self._crawlUrls(self._getAllUrls(lastUrlInRedirectChain, encodedBody), lastUrlInRedirectChain, depth - 1);
if (depth > 1 && isTextContent) {
self._crawlUrls(self._getAllUrls(lastUrlInRedirectChain, body), lastUrlInRedirectChain, depth - 1);
}

@@ -254,2 +249,3 @@ }

status: response ? response.statusCode : undefined,
content: body,
error: error,

@@ -264,2 +260,16 @@ response: response,

/**
 * Tells whether a response is declared as HTML, i.e. whether its
 * `content-type` header starts with `text/html`.
 *
 * @param {Object} [response] - response passed to the request callback;
 *   may be missing when the request itself failed.
 * @returns {boolean} true only when a `text/html` content type is present.
 */
Crawler.prototype._isTextContent = function(response) {
  // A failed request can leave `response` undefined; report "not text"
  // instead of throwing while reading `response.headers`.
  if (!response || !response.headers) {
    return false;
  }
  var contentType = response.headers['content-type'];
  // Media type names are case-insensitive, hence the `i` flag.
  return typeof contentType === 'string' && /^text\/html/i.test(contentType);
};
/**
 * Converts `response.body` to a string via its `toString` method.
 * The encoding name passed to `toString` is the `content-encoding`
 * response header when it is set, and 'utf8' otherwise.
 *
 * NOTE(review): `content-encoding` normally names a compression scheme
 * (e.g. gzip) rather than a character set - confirm this is intended.
 *
 * @param {Object} response - must expose `headers` and `body`.
 * @param {*} body - not used; the body is read from `response.body`.
 * @returns {*} whatever `response.body.toString(encoding)` returns.
 */
Crawler.prototype._getDecodedBody = function(response, body) {
  var headerEncoding = response.headers['content-encoding'];
  return response.body.toString(headerEncoding ? headerEncoding : 'utf8');
};
Crawler.prototype._stripComments = function(str) {

@@ -269,5 +279,21 @@ return str.replace(/<!--.*?-->/g, '');

Crawler.prototype._getAllUrls = function(baseUrl, body) {
/**
 * Resolves the base url for a page following the algorithm from
 * https://www.w3.org/TR/html5/document-metadata.html#the-base-element
 *
 * The first `<base>` tag carrying an `href` attribute wins. The tag name is
 * matched case-insensitively, `href` may be preceded or followed by other
 * attributes, and its value may be wrapped in double or single quotes.
 *
 * @param {string} defaultBaseUrl - url used when the page declares no base,
 *   and the url a relative base `href` is resolved against.
 * @param {string} body - HTML text of the page.
 * @returns {string} the base url to resolve the page's relative links against.
 */
Crawler.prototype._getBaseUrl = function(defaultBaseUrl, body) {
  // Capture group 1 holds a double-quoted href, group 2 a single-quoted one.
  // Stopping at the closing quote keeps trailing attributes out of the value.
  var baseUrlRegex = /<base(?:\s[^>]*?)?\shref\s*=\s*(?:"([^"]*)"|'([^']*)')/i;
  var baseUrlInPage = body.match(baseUrlRegex);
  if (!baseUrlInPage) {
    return defaultBaseUrl;
  }
  var href = baseUrlInPage[1] !== undefined ? baseUrlInPage[1] : baseUrlInPage[2];
  return url.resolve(defaultBaseUrl, href);
};
Crawler.prototype._getAllUrls = function(defaultBaseUrl, body) {
var self = this;
body = this._stripComments(body);
var baseUrl = this._getBaseUrl(defaultBaseUrl, body);
var linksRegex = self.ignoreRelative ? /<a[^>]+?href=".*?:\/\/.*?"/gmi : /<a[^>]+?href=".*?"/gmi;

@@ -274,0 +300,0 @@ var links = body.match(linksRegex) || [];

@@ -70,8 +70,111 @@ var Crawler = require('../crawler');

// Crawls a group of pages served under /simple_cycle/ and checks that the
// list handed to the "all finished" callback holds each of the three page
// urls exactly once.
describe('simple cycle', () => {
  it('should crawl all urls in a cycle only once', (done) => {
    var crawledUrls = [];
    var expectedUrls = [
      'http://localhost:3000/simple_cycle/page1.html',
      'http://localhost:3000/simple_cycle/page2.html',
      'http://localhost:3000/simple_cycle/page3.html'
    ];
    crawler.crawl('http://localhost:3000/simple_cycle/page1.html',
      function onSuccess(page) {
        crawledUrls.push(page.url);
      },
      function onFailure() {
        // Deliberately failing expectation: no page is expected to fail here.
        // Uses the same matcher style as the other assertions in this file.
        expect('Errors while crawling').toEqual('');
      },
      // Parameter named differently from the local array above so the two
      // lists are not confused; the assertion targets the crawler's own list.
      function onAllFinished(allCrawledUrls) {
        expect(allCrawledUrls.sort()).toEqual(expectedUrls.sort());
        done();
      }
    );
  });
});
// Verifies the fields of the page object handed to the success callback
// after crawling a single existing page.
describe('page success', () => {
  it('should return url, content, status', (done) => {
    var pageUrl = 'http://localhost:3000/one_page_graph/page1.html';
    var pageContent = '<html><body>One page graph.</body></html>';
    var HTTP_OK = 200;

    crawler.crawl(pageUrl, function onSuccess(page) {
      expect(page.url).toEqual(pageUrl);
      expect(page.status).toEqual(HTTP_OK);
      expect(page.content).toEqual(pageContent);
      expect(page.error).toBeNull();
      expect(page.response).not.toBeNull();
      // `body` is expected to carry the same value as `content`.
      expect(page.body).toEqual(page.content);
      done();
    });
  });
});
// Verifies the fields of the page object handed to the failure callback
// when the requested page does not exist.
describe('page error', () => {
  it('should return error', (done) => {
    var HTTP_NOT_FOUND = 404;
    var missingPageUrl = 'http://localhost:3000/one_page_graph/no_such_page.html';
    var notFoundContent = 'Cannot GET /one_page_graph/no_such_page.html\n';

    // No success callback is supplied (null); only the failure path is checked.
    crawler.crawl(missingPageUrl, null, function onError(page) {
      expect(page.url).toEqual(missingPageUrl);
      expect(page.status).toEqual(HTTP_NOT_FOUND);
      expect(page.content).toEqual(notFoundContent);
      expect(page.error).toBeNull();
      expect(page.response).not.toBeNull();
      expect(page.body).toEqual(page.content);
      done();
    });
  });
});
// End-to-end checks that relative links are resolved against the url taken
// from the page's <base> tag, for both absolute and relative base urls.
describe('base tag', () => {
  // Crawls from `startUrl` and, once the crawl is finished, compares the
  // sorted list of crawled urls with the sorted `expectedUrls`.
  function crawlAndExpect(startUrl, expectedUrls, done) {
    var visitedUrls = [];
    crawler.crawl({
      url: startUrl,
      success: function(page) {
        visitedUrls.push(page.url);
      },
      finished: function(crawledUrls) {
        expect(crawledUrls.sort()).toEqual(expectedUrls.sort());
        done();
      }
    });
  }

  it('should use base url as the base for relative urls', (done) => {
    crawlAndExpect(
      'http://localhost:3000/base_tag/index/page1.html',
      [
        'http://localhost:3000/base_tag/index/page1.html',
        'http://localhost:3000/base_tag/page2.html'
      ],
      done
    );
  });

  it('should resolve relative base url', (done) => {
    crawlAndExpect(
      'http://localhost:3000/base_tag/index/page1relativebase.html',
      [
        'http://localhost:3000/base_tag/index/page1relativebase.html',
        'http://localhost:3000/base_tag/index/relative_base_tag/page3.html'
      ],
      done
    );
  });
});
//TODO: Redirect with another HTTP code? 301?
//TODO: Cycles
//TODO: Binary content
//TODO: Binary content, links are not analyzed in binary content, binary content itself is not returned (as it can be too large)(?)
//TODO: Test for throughput limitation
//TODO: Test for depth limitation
//TODO: Forgetting crawled urls
//TODO: Reusing the same crawler, no new urls will be crawled
//TODO: Test for crawling 1000 links (generate them in server.js)
});
{
"name": "js-crawler",
"version": "0.3.11",
"version": "0.3.13",
"description": "Web crawler for Node.js",

@@ -5,0 +5,0 @@ "main": "crawler.js",

@@ -152,2 +152,43 @@ var Crawler = require('../crawler');

});
// Unit tests for crawler._getAllUrls when the HTML fragment declares its own
// base url through a <base href="..."> tag.
describe('base url specified in HTML', () => {
// Base url passed to _getAllUrls as the fallback / resolution root.
var defaultBaseUrl = 'http://localhost:8080/defaultbase/';
// Absolute url placed in the <base> tag of the test fragments.
var specifiedAbsoluteBaseUrl = 'http://localhost:8080/specifiedabsolutebase/';
// Relative url placed in the <base> tag of the test fragments.
var specifiedRelativeBaseUrl = 'specifiedrelativebase/';
// Relative links are expected to be resolved against the absolute base url
// from the fragment rather than against defaultBaseUrl.
it('should resolve relative urls using base url', function() {
var fragment = '<base href="' +specifiedAbsoluteBaseUrl + '">\
<a href="resource/1"></a>\
<a href="resource/2"></a>\
<a href="resource/3"></a>';
expect(crawler._getAllUrls(defaultBaseUrl, fragment))
.toEqual([
'http://localhost:8080/specifiedabsolutebase/resource/1',
'http://localhost:8080/specifiedabsolutebase/resource/2',
'http://localhost:8080/specifiedabsolutebase/resource/3'
]);
});
// A link starting with "/" is expected to resolve from the host root, so the
// path part of the base url does not appear in the result.
it('should resolve absolute urls to themselves', function() {
var fragment = '<base href="' +specifiedAbsoluteBaseUrl + '">\
<a href="/resource/1"></a>';
expect(crawler._getAllUrls(defaultBaseUrl, fragment))
.toEqual([
'http://localhost:8080/resource/1'
]);
});
// A relative base url is expected to be resolved against defaultBaseUrl
// first; links are then resolved against the result.
it('should resolve relative urls with relative base url specified', function() {
var fragment = '<base href="' +specifiedRelativeBaseUrl + '">\
<a href="resource/1"></a>';
expect(crawler._getAllUrls(defaultBaseUrl, fragment))
.toEqual([
'http://localhost:8080/defaultbase/specifiedrelativebase/resource/1'
]);
});
});
});

@@ -223,6 +264,8 @@

var errorStatusCode = 404;
var errorBody = 'Server error';
var errorResponse = {
statusCode: errorStatusCode
headers: {'content-type': 'text/html'},
statusCode: errorStatusCode,
body: errorBody
};
var errorBody = 'Server error';

@@ -244,2 +287,3 @@ describe('error', function() {

status: errorStatusCode,
content: errorBody,
error: error,

@@ -274,3 +318,3 @@ response: errorResponse,

headers: {
'content-type': ''
'content-type': 'text/html'
},

@@ -281,3 +325,4 @@ request: {

}
}
},
body: body
};

@@ -336,4 +381,4 @@ spyOn(crawler, 'onSuccess');

response.headers['content-type'] = 'text/html';
body = jasmine.createSpyObj('bodyBuffer', ['toString']);
body.toString.and.returnValue(decodedBody);
response.body = jasmine.createSpyObj('bodyBuffer', ['toString']);
response.body.toString.and.returnValue(decodedBody);
});

@@ -343,3 +388,3 @@

crawler._crawlUrl(url, referer, depth);
expect(body.toString).toHaveBeenCalledWith('utf8');
expect(response.body.toString).toHaveBeenCalledWith('utf8');
});

@@ -350,3 +395,3 @@

crawler._crawlUrl(url, referer, depth);
expect(body.toString).toHaveBeenCalledWith('gzip');
expect(response.body.toString).toHaveBeenCalledWith('gzip');
});

@@ -353,0 +398,0 @@ });

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc