crawler-ninja
Advanced tools
Comparing version 0.2.5 to 0.2.6
@@ -351,3 +351,3 @@ var timers = require('timers'); | ||
var $ = html.isHTML(result.body) ? html.$(result.body) : null; | ||
var $ = html.$(result); | ||
@@ -354,0 +354,0 @@ // Analyse the HTTP response in order to check the content (links, images, ...) |
@@ -7,10 +7,31 @@ /** | ||
module.exports.isHTML = function (body) { | ||
function analyzeContent(result) { | ||
return body.match(/^\s*</) !== null; | ||
}; | ||
if (! result.responseHeaders["content-type"]) { | ||
result.body = result.body.toString(); | ||
result.isHTML = result.body.match(/^\s*</) !== null; | ||
return result; | ||
} | ||
if (result.responseHeaders["content-type"].indexOf("html")>0 || result.responseHeaders["content-type"].indexOf("octet-stream")>0) { | ||
module.exports.$ = function (body) { | ||
result.body = result.body.toString(); | ||
result.isHTML = result.body.match(/^\s*</) !== null; | ||
return result; | ||
} | ||
else { | ||
result.isHTML = false; | ||
return result; | ||
} | ||
} | ||
module.exports.$ = function (result) { | ||
result = analyzeContent(result); | ||
if (! result.isHTML) { | ||
return null; | ||
} | ||
var options = { | ||
@@ -21,4 +42,4 @@ normalizeWhitespace: false, | ||
}; | ||
return cheerio.load(body, options); | ||
return cheerio.load(result.body, options); | ||
}; |
@@ -122,5 +122,2 @@ | ||
// This hack solves some issues with Cheerio | ||
// TODO : Check if it is still necessary | ||
result.body = result.body.toString(); | ||
@@ -127,0 +124,0 @@ // Add the options used for the previous request in the result |
{ | ||
"name": "crawler-ninja", | ||
"version": "0.2.5", | ||
"version": "0.2.6", | ||
"description": "A web crawler made for the SEO based on plugins. Please wait or contribute ... still in beta", | ||
@@ -5,0 +5,0 @@ "main": "index.js", |
@@ -537,1 +537,5 @@ Crawler Ninja | ||
- Better support for HTTP redirects (300+). | ||
0.2.6 | ||
- Review how content is analyzed depending on the response content type. | |
- Add unit tests for URLs containing special characters. |
@@ -33,3 +33,3 @@ var assert = require("assert"); | ||
it('should return only one page stat for an HTML page without tag', function(done) { | ||
it('should return zero HTML page for a page without tag', function(done) { | ||
var end = function(){ | ||
@@ -51,3 +51,53 @@ | ||
// Queues a single extension-less URL (/index) and checks that the memstat plugin
// reports exactly one crawled URL and exactly one HTML page.
it('should return only one page stat for an HTML page without extension', function(done) {
// Callback passed to crawler.init below; it runs the assertions and then signals
// the end of the async test via done().
// NOTE(review): presumably invoked by the crawler once the queue is drained — confirm against crawler.init.
var end = function(){
assert(stat.data.numberOfUrls === 1, "Incorrect number of crawled urls : " + stat.data.numberOfUrls);
//assert(stat.data.contentTypes['text/html; charset=UTF-8'] == 1);
assert(stat.data.numberOfHTMLs === 1, "Incorrect number of crawled HTML pages : " + stat.data.numberOfHTMLs);
done();
};
crawler.init(null, end);
// `stat` is read inside `end` above; the `var` declaration is hoisted and the value is
// assigned here, which works as long as `end` is not called synchronously by init.
var stat = new memstat.Plugin();
crawler.registerPlugin(stat);
// NOTE(review): assumes a test HTTP server is listening on localhost:9999 — it is not started in this block.
crawler.queue({url : "http://localhost:9999/index"});
});
// Queues a single extension-less URL (/text) and checks that the memstat plugin
// reports one crawled URL but zero HTML pages.
it('should return zero html page without extension for a text file', function(done) {
// Callback passed to crawler.init below; it runs the assertions and then signals
// the end of the async test via done().
// NOTE(review): presumably invoked by the crawler once the queue is drained — confirm against crawler.init.
var end = function(){
assert(stat.data.numberOfUrls === 1, "Incorrect number of crawled urls : " + stat.data.numberOfUrls);
//assert(stat.data.contentTypes['text/html; charset=UTF-8'] == 1);
assert(stat.data.numberOfHTMLs === 0, "Incorrect number of crawled HTML pages : " + stat.data.numberOfHTMLs);
done();
};
crawler.init(null, end);
// `stat` is read inside `end` above; the `var` declaration is hoisted and the value is
// assigned here, which works as long as `end` is not called synchronously by init.
var stat = new memstat.Plugin();
crawler.registerPlugin(stat);
// NOTE(review): assumes the test server on localhost:9999 serves /text as a non-HTML text resource — verify in the server fixture.
crawler.queue({url : "http://localhost:9999/text"});
});
// Queues a single extension-less URL (/200x200-image) and checks that the memstat
// plugin reports one crawled URL but zero HTML pages.
it('should return zero html page for a image url without extension', function(done) {
// Callback passed to crawler.init below; it runs the assertions and then signals
// the end of the async test via done().
// NOTE(review): presumably invoked by the crawler once the queue is drained — confirm against crawler.init.
var end = function(){
assert(stat.data.numberOfUrls === 1, "Incorrect number of crawled urls : " + stat.data.numberOfUrls);
//assert(stat.data.contentTypes['text/html; charset=UTF-8'] == 1);
assert(stat.data.numberOfHTMLs === 0, "Incorrect number of crawled HTML pages : " + stat.data.numberOfHTMLs);
done();
};
crawler.init(null, end);
// `stat` is read inside `end` above; the `var` declaration is hoisted and the value is
// assigned here, which works as long as `end` is not called synchronously by init.
var stat = new memstat.Plugin();
crawler.registerPlugin(stat);
// NOTE(review): assumes the test server on localhost:9999 serves /200x200-image as an image — verify in the server fixture.
crawler.queue({url : "http://localhost:9999/200x200-image"});
});
it('should return only one page stat for a text page', function(done) { | ||
@@ -54,0 +104,0 @@ var end = function(){ |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
8231064
88
11158
541