js-crawler
Comparing version 0.3.11 to 0.3.13
@@ -211,3 +211,3 @@ var request = require('request');
 }
-}, function(error, response, body) {
+}, function(error, response) {
 if (self.knownUrls[url]) {
@@ -221,2 +221,5 @@ //Was already crawled while the request has been processed, no need to call callbacks
 });
+var isTextContent = self._isTextContent(response);
+var body = isTextContent ? self._getDecodedBody(response) : '<<...binary content (omitted by js-crawler)...>>';
 if (!error && (response.statusCode === 200)) {
@@ -237,12 +240,4 @@ //If no redirects, then response.request.uri.href === url, otherwise last url
 self.crawledUrls.push(lastUrlInRedirectChain);
-/*
-  Some minor changes made by @tibetty to:
-  1. ensure that further link analysis is only performed on html content;
-  2. convert the binary buffer to a properly encoded string to facilitate analysis.
-*/
-if (depth > 1 && response.headers['content-type'].match(/^text\/html.*$/)) {
-  var encoding = 'utf8';
-  if (response.headers['content-encoding']) encoding = response.headers['content-encoding'];
-  var encodedBody = body.toString(encoding);
-  self._crawlUrls(self._getAllUrls(lastUrlInRedirectChain, encodedBody), lastUrlInRedirectChain, depth - 1);
+if (depth > 1 && isTextContent) {
+  self._crawlUrls(self._getAllUrls(lastUrlInRedirectChain, body), lastUrlInRedirectChain, depth - 1);
 }
@@ -254,2 +249,3 @@ }
 status: response ? response.statusCode : undefined,
+content: body,
 error: error,
@@ -264,2 +260,16 @@ response: response,
+Crawler.prototype._isTextContent = function(response) {
+  return response.headers && response.headers['content-type']
+    && response.headers['content-type'].match(/^text\/html.*$/);
+};
+
+Crawler.prototype._getDecodedBody = function(response) {
+  var encoding = 'utf8';
+  if (response.headers['content-encoding']) {
+    encoding = response.headers['content-encoding'];
+  }
+  return response.body.toString(encoding);
+};
+
 Crawler.prototype._stripComments = function(str) {
@@ -269,5 +279,21 @@ return str.replace(/<!--.*?-->/g, '');
-Crawler.prototype._getAllUrls = function(baseUrl, body) {
+Crawler.prototype._getBaseUrl = function(defaultBaseUrl, body) {
+  /*
+   * Resolving the base url following the algorithm from
+   * https://www.w3.org/TR/html5/document-metadata.html#the-base-element
+   */
+  var baseUrlRegex = /<base href="(.*?)">/;
+  var baseUrlInPage = body.match(baseUrlRegex);
+  if (!baseUrlInPage) {
+    return defaultBaseUrl;
+  }
+  return url.resolve(defaultBaseUrl, baseUrlInPage[1]);
+};
+
+Crawler.prototype._getAllUrls = function(defaultBaseUrl, body) {
 var self = this;
 body = this._stripComments(body);
+var baseUrl = this._getBaseUrl(defaultBaseUrl, body);
 var linksRegex = self.ignoreRelative ? /<a[^>]+?href=".*?:\/\/.*?"/gmi : /<a[^>]+?href=".*?"/gmi;
@@ -274,0 +300,0 @@ var links = body.match(linksRegex) || [];
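A note on the new decoding path: `_getDecodedBody` reuses the HTTP `content-encoding` header as the argument to `Buffer#toString`. `content-encoding` actually names a compression scheme such as `gzip` (the charset, if any, lives in `content-type`), and `Buffer#toString` accepts only character encodings, so a value like `gzip` would throw against a real buffer. A minimal sketch of a more defensive decode, assuming the same response shape (`response.headers`, `response.body` as a `Buffer`); this is not part of js-crawler:

```js
// Minimal sketch (not js-crawler code): derive the charset from the
// content-type header instead of content-encoding, which names a
// compression scheme (gzip, deflate), not a character encoding.
function decodeBody(response) {
  var contentType = (response.headers && response.headers['content-type']) || '';
  var match = contentType.match(/charset=([^;]+)/i);
  var charset = match ? match[1].trim().toLowerCase() : 'utf8';
  // Buffer#toString only understands character encodings ('utf8',
  // 'latin1', ...); fall back to utf8 for anything it does not know.
  if (!Buffer.isEncoding(charset)) {
    charset = 'utf8';
  }
  return response.body.toString(charset);
}
```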
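The new `_getBaseUrl` defers to Node's `url.resolve`, which already implements the relative/absolute distinction that the HTML5 base-element algorithm requires. A quick illustration against the two cases the specs in this diff exercise:

```js
var url = require('url');

// An absolute base href replaces the page's own base entirely:
url.resolve('http://localhost:8080/defaultbase/',
            'http://localhost:8080/specifiedabsolutebase/');
// => 'http://localhost:8080/specifiedabsolutebase/'

// A relative base href is first resolved against the page's own url:
url.resolve('http://localhost:8080/defaultbase/', 'specifiedrelativebase/');
// => 'http://localhost:8080/defaultbase/specifiedrelativebase/'
```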
@@ -70,8 +70,111 @@ var Crawler = require('../crawler');
+describe('simple cycle', () => {
+  it('should crawl all urls in a cycle only once', (done) => {
+    var crawledUrls = [];
+    var expectedUrls = [
+      'http://localhost:3000/simple_cycle/page1.html',
+      'http://localhost:3000/simple_cycle/page2.html',
+      'http://localhost:3000/simple_cycle/page3.html'
+    ];
+    crawler.crawl('http://localhost:3000/simple_cycle/page1.html',
+      function onSuccess(page) {
+        crawledUrls.push(page.url);
+      },
+      function onFailure() {
+        fail('Errors while crawling');
+      },
+      function onAllFinished(crawledUrls) {
+        expect(crawledUrls.sort()).toEqual(expectedUrls.sort());
+        done();
+      }
+    );
+  });
+});
+describe('page success', () => {
+  it('should return url, content, status', (done) => {
+    crawler.crawl('http://localhost:3000/one_page_graph/page1.html',
+      function onSuccess(page) {
+        expect(page.url).toEqual('http://localhost:3000/one_page_graph/page1.html');
+        expect(page.status).toEqual(200);
+        expect(page.content).toEqual('<html><body>One page graph.</body></html>');
+        expect(page.error).toBeNull();
+        expect(page.response).not.toBeNull();
+        expect(page.body).toEqual(page.content);
+        done();
+      }
+    );
+  });
+});
+describe('page error', () => {
+  it('should return error', (done) => {
+    var HTTP_NOT_FOUND = 404;
+    crawler.crawl('http://localhost:3000/one_page_graph/no_such_page.html', null,
+      function onError(page) {
+        expect(page.url).toEqual('http://localhost:3000/one_page_graph/no_such_page.html');
+        expect(page.status).toEqual(HTTP_NOT_FOUND);
+        expect(page.content).toEqual('Cannot GET /one_page_graph/no_such_page.html\n');
+        expect(page.error).toBeNull();
+        expect(page.response).not.toBeNull();
+        expect(page.body).toEqual(page.content);
+        done();
+      }
+    );
+  });
+});
+describe('base tag', () => {
+  it('should use base url as the base for relative urls', (done) => {
+    var crawledUrls = [];
+    var expectedUrls = [
+      'http://localhost:3000/base_tag/index/page1.html',
+      'http://localhost:3000/base_tag/page2.html'
+    ];
+    crawler.crawl({
+      url: 'http://localhost:3000/base_tag/index/page1.html',
+      success: function(page) {
+        crawledUrls.push(page.url);
+      },
+      finished: function(crawledUrls) {
+        expect(crawledUrls.sort()).toEqual(expectedUrls.sort());
+        done();
+      }
+    });
+  });
+  it('should resolve relative base url', (done) => {
+    var crawledUrls = [];
+    var expectedUrls = [
+      'http://localhost:3000/base_tag/index/page1relativebase.html',
+      'http://localhost:3000/base_tag/index/relative_base_tag/page3.html'
+    ];
+    crawler.crawl({
+      url: 'http://localhost:3000/base_tag/index/page1relativebase.html',
+      success: function(page) {
+        crawledUrls.push(page.url);
+      },
+      finished: function(crawledUrls) {
+        expect(crawledUrls.sort()).toEqual(expectedUrls.sort());
+        done();
+      }
+    });
+  });
+});
+//TODO: Redirect with another HTTP code? 301?
+//TODO: Cycles
+//TODO: Binary content: links are not analyzed in binary content, and the binary content itself is not returned (as it can be too large)(?)
+//TODO: Test for throughput limitation
+//TODO: Test for depth limitation
+//TODO: Forgetting crawled urls
+//TODO: Reusing the same crawler: no new urls will be crawled
+//TODO: Test for crawling 1000 links (generate them in server.js)
 });
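The base-tag crawl tests above imply fixture pages that set a `<base href>`. The fixtures themselves are not part of this diff, so the following reconstruction is hypothetical, but the expected urls pin down how resolution must behave:

```js
// Hypothetical fixture content (the real page1relativebase.html is not
// shown in this diff), served at /base_tag/index/page1relativebase.html:
var fragment =
  '<base href="relative_base_tag/">' + // relative base, resolved against the page url
  '<a href="page3.html"></a>';

// The crawler would then resolve the link as:
// crawler._getAllUrls('http://localhost:3000/base_tag/index/page1relativebase.html', fragment)
// => ['http://localhost:3000/base_tag/index/relative_base_tag/page3.html']
```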
 {
   "name": "js-crawler",
-  "version": "0.3.11",
+  "version": "0.3.13",
   "description": "Web crawler for Node.js",
@@ -5,0 +5,0 @@ "main": "crawler.js",
@@ -152,2 +152,43 @@ var Crawler = require('../crawler');
 });
+describe('base url specified in HTML', () => {
+  var defaultBaseUrl = 'http://localhost:8080/defaultbase/';
+  var specifiedAbsoluteBaseUrl = 'http://localhost:8080/specifiedabsolutebase/';
+  var specifiedRelativeBaseUrl = 'specifiedrelativebase/';
+  it('should resolve relative urls using base url', function() {
+    var fragment = '<base href="' + specifiedAbsoluteBaseUrl + '">\
+      <a href="resource/1"></a>\
+      <a href="resource/2"></a>\
+      <a href="resource/3"></a>';
+    expect(crawler._getAllUrls(defaultBaseUrl, fragment))
+      .toEqual([
+        'http://localhost:8080/specifiedabsolutebase/resource/1',
+        'http://localhost:8080/specifiedabsolutebase/resource/2',
+        'http://localhost:8080/specifiedabsolutebase/resource/3'
+      ]);
+  });
+  it('should resolve absolute urls to themselves', function() {
+    var fragment = '<base href="' + specifiedAbsoluteBaseUrl + '">\
+      <a href="/resource/1"></a>';
+    expect(crawler._getAllUrls(defaultBaseUrl, fragment))
+      .toEqual([
+        'http://localhost:8080/resource/1'
+      ]);
+  });
+  it('should resolve relative urls with relative base url specified', function() {
+    var fragment = '<base href="' + specifiedRelativeBaseUrl + '">\
+      <a href="resource/1"></a>';
+    expect(crawler._getAllUrls(defaultBaseUrl, fragment))
+      .toEqual([
+        'http://localhost:8080/defaultbase/specifiedrelativebase/resource/1'
+      ]);
+  });
+});
 });
@@ -223,6 +264,8 @@
 var errorStatusCode = 404;
+var errorBody = 'Server error';
 var errorResponse = {
-  statusCode: errorStatusCode
+  headers: {'content-type': 'text/html'},
+  statusCode: errorStatusCode,
+  body: errorBody
 };
-var errorBody = 'Server error';
@@ -244,2 +287,3 @@ describe('error', function() {
 status: errorStatusCode,
+content: errorBody,
 error: error,
@@ -274,3 +318,3 @@ response: errorResponse,
 headers: {
-  'content-type': ''
+  'content-type': 'text/html'
 },
@@ -281,3 +325,4 @@ request: {
 }
-}
+},
+body: body
 };
@@ -336,4 +381,4 @@ spyOn(crawler, 'onSuccess');
 response.headers['content-type'] = 'text/html';
-body = jasmine.createSpyObj('bodyBuffer', ['toString']);
-body.toString.and.returnValue(decodedBody);
+response.body = jasmine.createSpyObj('bodyBuffer', ['toString']);
+response.body.toString.and.returnValue(decodedBody);
 });
@@ -343,3 +388,3 @@
 crawler._crawlUrl(url, referer, depth);
-expect(body.toString).toHaveBeenCalledWith('utf8');
+expect(response.body.toString).toHaveBeenCalledWith('utf8');
 });
@@ -350,3 +395,3 @@
 crawler._crawlUrl(url, referer, depth);
-expect(body.toString).toHaveBeenCalledWith('gzip');
+expect(response.body.toString).toHaveBeenCalledWith('gzip');
 });
@@ -353,0 +398,0 @@ });
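Taken together, the spec changes pin down the shape of the page object that the success and failure callbacks receive as of 0.3.13. A summary sketch inferred from the assertions in this diff (not official documentation; the example values are illustrative):

```js
// Page object passed to the crawl callbacks, as exercised by the specs:
var page = {
  url: 'http://localhost:3000/page1.html', // the crawled url
  status: 200,                  // response.statusCode, undefined when there was no response
  content: '<html>...</html>',  // decoded body, or the binary-content placeholder string
  error: null,                  // the request error, if any
  response: {/* raw response object */},
  body: '<html>...</html>'      // the specs assert that body equals content
};
```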