js-crawler
Advanced tools
Comparing version 0.3.13 to 0.3.14
@@ -257,4 +257,4 @@ var request = require('request'); | ||
/**
 * Tells whether a response is HTML text, judged by its content-type header.
 *
 * @param {Object} [response] - response object; may be omitted entirely.
 * @returns {boolean} true only when `response.headers['content-type']`
 *   starts with "text/html"; false when the response, its headers or the
 *   content-type header is missing.
 */
Crawler.prototype._isTextContent = function(response) {
  // Boolean(...) turns the undefined / null / match-array results of the
  // && chain into a strict true/false, and the leading `response` guard
  // keeps a call without arguments from throwing.
  return Boolean(response && response.headers && response.headers['content-type']
    && response.headers['content-type'].match(/^text\/html.*$/));
};
@@ -290,2 +290,7 @@ | ||
/**
 * Decides whether a link may be followed, based on its URL scheme.
 *
 * Links without a scheme (relative paths, protocol-relative "//host/..."
 * references) are accepted. Links with an explicit scheme are accepted only
 * for http and https, compared case-insensitively; mailto:, ftp:,
 * javascript:, tel:, data: and so on are rejected.
 *
 * @param {string} link - raw href value extracted from a page.
 * @returns {boolean} true when the link has no scheme or an http(s) scheme;
 *   false otherwise, including when `link` is not a string.
 */
Crawler.prototype._isLinkProtocolSupported = function(link) {
  if (typeof link !== 'string') {
    return false;
  }
  // Scheme syntax per RFC 3986: a letter followed by letters, digits,
  // "+", "-" or ".", terminated by ":". Only the start of the link is
  // inspected, so "http://" inside a query string of another scheme's URL
  // no longer makes that URL look supported.
  var schemeMatch = /^\s*([a-z][a-z0-9+.\-]*):/i.exec(link);
  if (!schemeMatch) {
    return true;
  }
  var scheme = schemeMatch[1].toLowerCase();
  return scheme === 'http' || scheme === 'https';
};
Crawler.prototype._getAllUrls = function(defaultBaseUrl, body) { | ||
@@ -308,3 +313,5 @@ var self = this; | ||
.uniq() | ||
.filter(this.shouldCrawl) | ||
.filter(function(link) { | ||
return self._isLinkProtocolSupported(link) && self.shouldCrawl(link); | ||
}) | ||
.value(); | ||
@@ -311,0 +318,0 @@ |
@@ -172,2 +172,29 @@ var Crawler = require('../crawler'); | ||
// End-to-end spec: crawling page1 must finish with exactly page1 and page2.
describe('references contain links to non-http resources', () => {
  it('should ignore mailto link', (done) => {
    var startUrl = 'http://localhost:3000/non_http_https_links/page1.html';
    var expectedUrls = [
      startUrl,
      'http://localhost:3000/non_http_https_links/page2.html'
    ];
    // Collected from `success` callbacks; the assertion below uses the list
    // handed to `finished`, not this array.
    var successUrls = [];

    function onSuccess(page) {
      successUrls.push(page.url);
    }

    function onFailure(error) {
      console.log(error);
      // Deliberately failing expectation: any crawl error fails the spec.
      expect('Error while crawling').toEqual('');
      done();
    }

    function onFinished(allCrawledUrls) {
      // Both lists are sorted so the comparison ignores crawl order.
      expect(allCrawledUrls.sort()).toEqual(expectedUrls.sort());
      done();
    }

    crawler.crawl({
      url: startUrl,
      success: onSuccess,
      failure: onFailure,
      finished: onFinished
    });
  });
});
//TODO: Redirect with another HTTP code? 301? | ||
@@ -174,0 +201,0 @@ //TODO: Binary content, links are not analyzed in binary content, binary content itself is not returned (as it can be too large)(?) |
{ | ||
"name": "js-crawler", | ||
"version": "0.3.13", | ||
"version": "0.3.14", | ||
"description": "Web crawler for Node.js", | ||
@@ -5,0 +5,0 @@ "main": "crawler.js", |
@@ -70,2 +70,12 @@ var Crawler = require('../crawler'); | ||
// A mailto: href must not be reported as a crawlable URL.
it('should ignore mailto links', function() {
  var html = '<a href="mailto:someone@somewhere.com"></a>';
  var urls = crawler._getAllUrls(baseUrl, html);
  expect(urls).toEqual([]);
});
// An ftp:// href must not be reported as a crawlable URL.
it('should ignore ftp links', function() {
  var html = '<a href="ftp://myserver.org"></a>';
  var urls = crawler._getAllUrls(baseUrl, html);
  expect(urls).toEqual([]);
});
describe('ignoreRelative option', function() { | ||
@@ -391,2 +401,6 @@ | ||
}); | ||
// Calling _isTextContent with no argument must yield a strict `false`.
it('if response is not defined, content is not considered to be text', function() {
  var isText = crawler._isTextContent();
  expect(isText).toBe(false);
});
}); | ||
@@ -393,0 +407,0 @@ |
5006812
40
1238