Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in | Demo | Install
Socket

crawler-ninja

Package Overview
Dependencies
Maintainers
1
Versions
28
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

crawler-ninja - npm Package Compare versions

Comparing version 0.2.5 to 0.2.6

test/crawl-special-char-test.js

2

index.js

@@ -351,3 +351,3 @@ var timers = require('timers');

var $ = html.isHTML(result.body) ? html.$(result.body) : null;
var $ = html.$(result);

@@ -354,0 +354,0 @@ // Analyse the HTTP response in order to check the content (links, images, ...)

@@ -7,10 +7,31 @@ /**

module.exports.isHTML = function (body) {
function analyzeContent(result) {
return body.match(/^\s*</) !== null;
};
if (! result.responseHeaders["content-type"]) {
result.body = result.body.toString();
result.isHTML = result.body.match(/^\s*</) !== null;
return result;
}
if (result.responseHeaders["content-type"].indexOf("html")>0 || result.responseHeaders["content-type"].indexOf("octet-stream")>0) {
module.exports.$ = function (body) {
result.body = result.body.toString();
result.isHTML = result.body.match(/^\s*</) !== null;
return result;
}
else {
result.isHTML = false;
return result;
}
}
module.exports.$ = function (result) {
result = analyzeContent(result);
if (! result.isHTML) {
return null;
}
var options = {

@@ -21,4 +42,4 @@ normalizeWhitespace: false,

};
return cheerio.load(body, options);
return cheerio.load(result.body, options);
};

@@ -122,5 +122,2 @@

// This hack solves some issues with Cheerio
// TODO : Check if it is still necessary
result.body = result.body.toString();

@@ -127,0 +124,0 @@ // Add the options used for the previous request in the result

{
"name": "crawler-ninja",
"version": "0.2.5",
"version": "0.2.6",
"description": "A web crawler made for the SEO based on plugins. Please wait or contribute ... still in beta",

@@ -5,0 +5,0 @@ "main": "index.js",

@@ -537,1 +537,5 @@ Crawler Ninja

- Better support for HTTP redirects (300+).
0.2.6
- Review how content is analyzed depending on the response content type.
- Add unit tests for URLs containing special characters.

@@ -33,3 +33,3 @@ var assert = require("assert");

it('should return only one page stat for an HTML page without tag', function(done) {
it('should return zero HTML page for a page without tag', function(done) {
var end = function(){

@@ -51,3 +51,53 @@

it('should return only one page stat for an HTML page without extension', function(done) {
var end = function(){
assert(stat.data.numberOfUrls === 1, "Incorrect number of crawled urls : " + stat.data.numberOfUrls);
//assert(stat.data.contentTypes['text/html; charset=UTF-8'] == 1);
assert(stat.data.numberOfHTMLs === 1, "Incorrect number of crawled HTML pages : " + stat.data.numberOfHTMLs);
done();
};
crawler.init(null, end);
var stat = new memstat.Plugin();
crawler.registerPlugin(stat);
crawler.queue({url : "http://localhost:9999/index"});
});
it('should return zero html page without extension for a text file', function(done) {
var end = function(){
assert(stat.data.numberOfUrls === 1, "Incorrect number of crawled urls : " + stat.data.numberOfUrls);
//assert(stat.data.contentTypes['text/html; charset=UTF-8'] == 1);
assert(stat.data.numberOfHTMLs === 0, "Incorrect number of crawled HTML pages : " + stat.data.numberOfHTMLs);
done();
};
crawler.init(null, end);
var stat = new memstat.Plugin();
crawler.registerPlugin(stat);
crawler.queue({url : "http://localhost:9999/text"});
});
it('should return zero html page for a image url without extension', function(done) {
var end = function(){
assert(stat.data.numberOfUrls === 1, "Incorrect number of crawled urls : " + stat.data.numberOfUrls);
//assert(stat.data.contentTypes['text/html; charset=UTF-8'] == 1);
assert(stat.data.numberOfHTMLs === 0, "Incorrect number of crawled HTML pages : " + stat.data.numberOfHTMLs);
done();
};
crawler.init(null, end);
var stat = new memstat.Plugin();
crawler.registerPlugin(stat);
crawler.queue({url : "http://localhost:9999/200x200-image"});
});
it('should return only one page stat for a text page', function(done) {

@@ -54,0 +104,0 @@ var end = function(){

Socket | Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc