simplecrawler - npm Package Compare versions

Comparing version 0.2.2 to 0.2.3
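
Before the diff itself, a minimal sketch of how the package is driven, based only on the constructor call and events that appear in the test changes further down (host, initial path, and port arguments; the discoverycomplete and complete events):

    var Crawler = require("simplecrawler");

    // Host, initial path, and port, mirroring the test file's
    // new Crawler("127.0.0.1","/",3000)
    var crawler = new Crawler("127.0.0.1", "/", 3000);

    crawler.on("discoverycomplete", function() {
        // Fired each time link discovery finishes for a fetched resource.
    });

    crawler.on("complete", function() {
        console.log("Crawl finished;", crawler.queue.length, "items queued");
    });

    crawler.start();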


lib/crawler.js

@@ -99,10 +99,10 @@ // Simplecrawler

this.authPass = "";
// Support for retaining cookies for parse duration
this.acceptCookies = true;
this.cookies = new CookieJar();
// Support for custom headers...
this.customHeaders = {};
// Domain Whitelist
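
The properties above are plain instance settings. A hedged sketch of configuring them (the header name below is illustrative, not from the source):

    var crawler = new Crawler("example.com", "/", 80);

    // Retain cookies for the duration of the crawl, per the comment above.
    crawler.acceptCookies = true;

    // Merged into every request's headers by the loop later in this diff.
    crawler.customHeaders = {
        "X-Example-Header": "value" // hypothetical header, for illustration
    };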

@@ -132,3 +132,3 @@ // We allow domains to be whitelisted, so cross-domain requests can be made.

this.downloadUnsupported = true;
// URL Encoding setting...

@@ -256,3 +256,3 @@ this.urlEncoding = "unicode";

var newURL, crawler = this;
// If the URL didn't contain anything, don't fetch it.

@@ -266,7 +266,7 @@ if (!URL.replace(/\s+/ig,"").length) return false;

.normalize();
if (crawler.urlEncoding === "iso8859") {
newURL = newURL.iso8859();
}
} catch(e) {
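
The default assigned earlier in the file is "unicode"; per the branch above, selecting "iso8859" re-encodes each discovered URL through URIjs's iso8859() before it is queued. Switching modes is a single assignment:

    // The only two values visible in this diff are "unicode" and "iso8859".
    crawler.urlEncoding = "iso8859";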

@@ -327,2 +327,3 @@ // Couldn't process the URL, since URIjs choked on it.

.replace(/["'\)]$/i,"")
+ .replace(/^\/\//, queueItem.protocol + "://")
.split(/\s+/g)

@@ -333,3 +334,3 @@ .shift()

}
// Clean links

@@ -369,3 +370,3 @@ function cleanAndQueue(urlMatch) {

}
// Rough scan for URLs

@@ -592,3 +593,3 @@ return discoverRegex

crawler.openRequests ++;
// Variable declarations

@@ -625,3 +626,3 @@ var fetchData = false,

};
// Add cookie header from cookie jar if we're configured to

@@ -633,3 +634,3 @@ // send/accept cookies

}
// Add auth headers if we need them

@@ -643,3 +644,3 @@ if (crawler.needsAuth) {

}
// And if we've got any custom headers available

@@ -649,7 +650,7 @@ if (crawler.customHeaders) {

if (!crawler.customHeaders.hasOwnProperty(header)) continue;
requestOptions.headers[header] = crawler.customHeaders[header];
}
}
// Emit fetchstart event - gives the user time to mangle the request options

@@ -662,3 +663,3 @@ // if required.

timeCommenced = Date.now();
// Get the resource!
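
The fetchstart comment above notes that the event "gives the user time to mangle the request options". The event's exact signature is not visible in this diff, so the (queueItem, requestOptions) arguments below are an assumption:

    crawler.on("fetchstart", function(queueItem, requestOptions) {
        // Assumed signature; adjust if the event passes only the queue item.
        requestOptions.headers["User-Agent"] = "my-crawler/1.0"; // illustrative
    });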

@@ -669,6 +670,6 @@ clientRequest =

});
clientRequest.on("error",function(errorData) {
crawler.openRequests --;
// Emit 5xx / 4xx event

@@ -680,3 +681,3 @@ crawler.emit("fetchclienterror",queueItem,errorData);

});
return crawler;
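
The emit above fixes the event's arguments as (queueItem, errorData), so a listener for client-level failures can be written directly:

    crawler.on("fetchclienterror", function(queueItem, errorData) {
        console.error("Client error while fetching", queueItem.url, errorData);
    });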

@@ -690,5 +691,5 @@ });

handle downloading the resource, queueing of linked items, etc.
Examples
// Passing in a response from `request`

@@ -713,6 +714,6 @@ request(queueItem.url,function(err,res,body) {

stateData = queueItem.stateData;
// Record what time we first received the header information
timeHeadersReceived = Date.now();
// If we weren't passed a time of commencement, assume Now()
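
The doc comment above ("Passing in a response from `request`") shows that an externally fetched response can be handed to the crawler for processing. The handler's name is cut off in this compare view, so handleResponse below is an assumption about the API:

    var request = require("request");

    request(queueItem.url, function(err, res, body) {
        if (err) return;
        // Assumed method name; only the doc comment and body are visible here.
        crawler.handleResponse(queueItem, res);
    });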

@@ -731,3 +732,3 @@ timeCommenced = timeCommenced || Date.now();

stateData.headers = response.headers;
// Do we need to save cookies? Were we sent any?

@@ -737,3 +738,3 @@ if (crawler.acceptCookies &&

crawler.cookies.addFromHeaders(response.headers["set-cookie"]);
// Emit header receive event
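
The fetchheaders emit in the next hunk shows (queueItem, response) arguments, so received headers can be inspected as they arrive:

    crawler.on("fetchheaders", function(queueItem, response) {
        console.log(queueItem.url, "answered with status", response.statusCode);
    });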

@@ -790,6 +791,6 @@ crawler.emit("fetchheaders",queueItem,response);

if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) {
// Start by creating a new buffer, which will be our main
// buffer from now on...
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length);
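
The size check above refuses chunks that would push the buffer past maxResourceSize, so oversized resources are abandoned rather than grown without bound. Raising the cap is one assignment (the 5 MB figure is illustrative):

    crawler.maxResourceSize = 5 * 1024 * 1024; // bytes; illustrative value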

@@ -812,3 +813,3 @@

// resource.
- //
+ //
// Throw error event and ignore.

@@ -832,3 +833,3 @@ //

!dataReceived) {
// Slice the buffer to chop off any unused space

@@ -845,3 +846,3 @@ responseBuffer = responseBuffer.slice(0,responseLengthReceived);

responseLength <= crawler.maxResourceSize) {
queueItem.status = "headers";

@@ -872,3 +873,3 @@

response.headers.location) {
queueItem.fetched = true;

@@ -908,3 +909,3 @@ queueItem.status = "redirected";

}
return crawler;

@@ -1017,2 +1018,2 @@ };

- module.exports = Crawler;
+ module.exports = Crawler;
package.json

{
"name": "simplecrawler",
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
- "version": "0.2.2",
+ "version": "0.2.3",
"homepage": "http://github.com/cgiffard/node-simplecrawler",

@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",

@@ -6,20 +6,24 @@ // Routes for testing server

"/": function(write) {
- write(200,"Home. <a href='/stage2'>stage2</a>");
+ write(200,"Home. <a href='stage2'>stage2</a>");
},
"/stage2": function(write) {
write(200,"Stage2. http://127.0.0.1:3000/stage/3");
},
"/stage/3": function(write) {
- write(200,"Stage3. <a href='../stage4'>stage4</a>");
+ write(200,"Stage3. <a href='//127.0.0.1:3000/stage/4'>stage4</a>");
},
- "/stage4": function(write,redir) {
- redir("/stage5");
+ "/stage/4": function(write) {
+ write(200,"Stage4. <a href='../stage5'>stage5</a>");
},
- "/stage5": function(write) {
+ "/stage5": function(write,redir) {
+ redir("/stage6");
},
+ "/stage6": function(write) {
+ write(200,"Crawl complete!");
+ }
- };
+ };
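
For context, a hedged sketch of the test server that would serve these routes; testserver.js itself is not shown in this compare, so everything beyond the write/redir helper shapes and port 3000 (both visible in the tests) is an assumption:

    var http = require("http");
    var routes = require("./routes"); // hypothetical module path

    http.createServer(function(req, res) {
        var route = routes[req.url];
        if (!route) {
            res.writeHead(404);
            return res.end("Not found");
        }
        route(
            // write(status, body): respond with an HTML body
            function write(status, body) {
                res.writeHead(status, {"Content-Type": "text/html"});
                res.end(body);
            },
            // redir(location): issue a redirect, as /stage5 does above
            function redir(location) {
                res.writeHead(301, {"Location": location});
                res.end();
            }
        );
    }).listen(3000, "127.0.0.1");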

@@ -5,3 +5,3 @@ // Runs a very simple crawl on an HTTP server

var chai = require("chai");
- chai.should();
+ chai.should();

@@ -11,55 +11,47 @@ var testserver = require("./lib/testserver.js");

describe("Test Crawl",function() {
var Crawler = require("../");
// Create a new crawler to crawl this server
var localCrawler = new Crawler("127.0.0.1","/",3000);
+ var linksDiscovered = 0;
it("should be able to be started",function(done) {
localCrawler.on("crawlstart",done);
localCrawler.start();
localCrawler.running.should.be.truthy;
});
it("should have a queue with at least the initial crawl path",function() {
localCrawler.queue.length.should.be.greaterThan(0);
});
- it("should complete the fetch queue",function(done) {
+ it("should discover all linked resources in the queue",function(done) {
+ localCrawler.on("discoverycomplete",function() {
+ linksDiscovered ++;
+ });
localCrawler.on("complete",function() {
+ linksDiscovered.should.equal(5);
done();
});
});
- // Suddenly feeling very incompatible with the mocha philosophy.
- // Will try to make it work first, then look for another framework that
- // supports parallell tests.
- //
- // it("should be able to discover link resources",function(done) {
- // var linksDiscovered = 0;
- //
- // localCrawler.on("discoverycomplete",function() {
- // linksDiscovered ++;
- //
- // if (!linksDiscovered) done();
- // });
- // });
- //
- //
- //
- // Todo: test how simple error conditions, content types, and responses
- // are handled.
+ // TODO
+ // Test how simple error conditions, content types, and responses are handled.
+ // Test encodings.
+ // Test URL detection
+ // Test handling binary data
- // test bad content length
- });
+ // Test bad content length
+ });

