simplecrawler
Comparing version 0.2.2 to 0.2.3
@@ -99,10 +99,10 @@ // Simplecrawler
this.authPass = "";
// Support for retaining cookies for parse duration
this.acceptCookies = true;
this.cookies = new CookieJar();
// Support for custom headers...
this.customHeaders = {};
// Domain Whitelist
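Note: the options touched in this hunk are instance properties set in the crawler constructor. A minimal sketch of configuring them from user code, assuming the three-argument constructor used in the test further down; the header name and host details are illustrative, not taken from this diff:

    var Crawler = require("simplecrawler");
    var crawler = new Crawler("example.com", "/", 80); // host, initial path, port

    crawler.acceptCookies = true;            // keep cookies for the duration of the crawl
    crawler.customHeaders = {                // merged into every request the crawler makes
        "X-Example-Header": "example-value"  // illustrative header, not part of the diff
    };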
@@ -132,3 +132,3 @@ // We allow domains to be whitelisted, so cross-domain requests can be made.
this.downloadUnsupported = true;
// URL Encoding setting...
@@ -256,3 +256,3 @@ this.urlEncoding = "unicode";
var newURL, crawler = this;
// If the URL didn't contain anything, don't fetch it.
@@ -266,7 +266,7 @@ if (!URL.replace(/\s+/ig,"").length) return false;
.normalize();
if (crawler.urlEncoding === "iso8859") {
newURL = newURL.iso8859();
}
} catch(e) {
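Note: the hunk above shows URL processing honouring the urlEncoding option (default "unicode", per the hunk header). A sketch of opting into the ISO-8859 branch, assuming a crawler instance as in the earlier sketch:

    // Processed URLs are then passed through URIjs' iso8859() conversion
    // (the newURL.iso8859() call above) instead of being left as unicode.
    crawler.urlEncoding = "iso8859";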
@@ -327,2 +327,3 @@ // Couldn't process the URL, since URIjs choked on it.
.replace(/["'\)]$/i,"")
.replace(/^\/\//, queueItem.protocol + "://")
.split(/\s+/g)
@@ -333,3 +334,3 @@ .shift()
}
// Clean links
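Note: the replace chain above is part of link clean-up; the added .replace(/^\/\//, queueItem.protocol + "://") step expands protocol-relative URLs before they are queued. A standalone illustration of that single substitution, with assumed values matching the test server change further down:

    var protocol = "http"; // stand-in for queueItem.protocol
    var cleaned = "//127.0.0.1:3000/stage/4".replace(/^\/\//, protocol + "://");
    console.log(cleaned); // "http://127.0.0.1:3000/stage/4"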
@@ -369,3 +370,3 @@ function cleanAndQueue(urlMatch) {
}
// Rough scan for URLs
@@ -592,3 +593,3 @@ return discoverRegex
crawler.openRequests ++;
// Variable declarations
@@ -625,3 +626,3 @@ var fetchData = false,
};
// Add cookie header from cookie jar if we're configured to
@@ -633,3 +634,3 @@ // send/accept cookies
}
// Add auth headers if we need them
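Note: the auth headers referenced here are driven by the crawler's auth properties (authPass appears in the constructor hunk above, needsAuth in the hunk header below; authUser is assumed to exist alongside them and is not shown in this diff). A sketch of enabling basic auth:

    crawler.needsAuth = true;
    crawler.authUser = "username"; // assumed companion to authPass
    crawler.authPass = "secret";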
@@ -643,3 +644,3 @@ if (crawler.needsAuth) {
}
// And if we've got any custom headers available
@@ -649,7 +650,7 @@ if (crawler.customHeaders) {
if (!crawler.customHeaders.hasOwnProperty(header)) continue;
requestOptions.headers[header] = crawler.customHeaders[header];
}
}
// Emit fetchstart event - gives the user time to mangle the request options
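Note: as the comment says, fetchstart is the hook for adjusting a request before it goes out. A sketch of a listener, assuming the event is emitted with the queue item and the mutable request options the comment refers to:

    crawler.on("fetchstart", function(queueItem, requestOptions) {
        // requestOptions can be "mangled" here before the request is made
        requestOptions.headers = requestOptions.headers || {};
        requestOptions.headers["X-Requested-By"] = "example-bot"; // illustrative
    });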
@@ -662,3 +663,3 @@ // if required.
timeCommenced = Date.now();
// Get the resource!
@@ -669,6 +670,6 @@ clientRequest =
});
clientRequest.on("error",function(errorData) {
crawler.openRequests --;
// Emit 5xx / 4xx event
@@ -680,3 +681,3 @@ crawler.emit("fetchclienterror",queueItem,errorData);
});
return crawler;
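Note: the error handler above emits fetchclienterror (see the hunk header) when the underlying client request fails. A sketch of listening for it, assuming the listener receives the queue item and error data passed to emit:

    crawler.on("fetchclienterror", function(queueItem, errorData) {
        console.error("Client error while fetching", queueItem.url, errorData);
    });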
@@ -690,5 +691,5 @@ });
handle downloading the resource, queueing of linked items, etc.
Examples
// Passing in a response from `request`
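Note: this doc-comment describes handing an externally fetched response to the crawler, and the example it introduces continues in the hunk below. A hedged reconstruction of that usage; the handler name handleResponse is assumed and not itself visible in this diff:

    var request = require("request");

    request(queueItem.url, function(err, res, body) {
        if (err) return;
        // Let the crawler process the response: store state data, queue
        // discovered links, emit events, etc. (method name assumed).
        crawler.handleResponse(queueItem, res);
    });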
@@ -713,6 +714,6 @@ request(queueItem.url,function(err,res,body) {
stateData = queueItem.stateData;
// Record what time we first received the header information
timeHeadersReceived = Date.now();
// If we weren't passed a time of commencement, assume Now()
@@ -731,3 +732,3 @@ timeCommenced = timeCommenced || Date.now();
stateData.headers = response.headers;
// Do we need to save cookies? Were we sent any?
@@ -737,3 +738,3 @@ if (crawler.acceptCookies &&
crawler.cookies.addFromHeaders(response.headers["set-cookie"]);
// Emit header receive event
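Note: once the headers are stored (and any set-cookie values absorbed into the cookie jar when acceptCookies is on), the fetchheaders event shown in the next hunk header fires. A sketch of observing it, assuming the listener receives the queue item and the raw response:

    crawler.on("fetchheaders", function(queueItem, response) {
        console.log(queueItem.url, "answered with status", response.statusCode);
    });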
@@ -790,6 +791,6 @@ crawler.emit("fetchheaders",queueItem,response);
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) {
// Start by creating a new buffer, which will be our main
// buffer from now on...
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length);
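Note: the buffer growth above is bounded by maxResourceSize. A sketch of tightening that cap; the default is not shown in this diff, and the value here is illustrative:

    crawler.maxResourceSize = 5 * 1024 * 1024; // refuse to buffer bodies over ~5 MB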
@@ -812,3 +813,3 @@
// resource.
//
//
// Throw error event and ignore.
@@ -832,3 +833,3 @@ //
!dataReceived) {
// Slice the buffer to chop off any unused space
@@ -845,3 +846,3 @@ responseBuffer = responseBuffer.slice(0,responseLengthReceived);
responseLength <= crawler.maxResourceSize) {
queueItem.status = "headers";
@@ -872,3 +873,3 @@
response.headers.location) {
queueItem.fetched = true;
@@ -908,3 +909,3 @@ queueItem.status = "redirected";
}
return crawler;
@@ -1017,2 +1018,2 @@ };
module.exports = Crawler;
module.exports = Crawler;
{
"name": "simplecrawler",
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.", | ||
"version": "0.2.2", | ||
"version": "0.2.3", | ||
"homepage": "http://github.com/cgiffard/node-simplecrawler", | ||
@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>", |
@@ -6,20 +6,24 @@ // Routes for testing server | ||
"/": function(write) { | ||
write(200,"Home. <a href='/stage2'>stage2</a>"); | ||
write(200,"Home. <a href='stage2'>stage2</a>"); | ||
}, | ||
"/stage2": function(write) { | ||
write(200,"Stage2. http://127.0.0.1:3000/stage/3"); | ||
}, | ||
"/stage/3": function(write) { | ||
write(200,"Stage3. <a href='../stage4'>stage4</a>"); | ||
write(200,"Stage3. <a href='//127.0.0.1:3000/stage/4'>stage4</a>"); | ||
}, | ||
"/stage4": function(write,redir) { | ||
redir("/stage5"); | ||
"/stage/4": function(write) { | ||
write(200,"Stage4. <a href='../stage5'>stage5</a>"); | ||
}, | ||
"/stage5": function(write) { | ||
"/stage5": function(write,redir) { | ||
redir("/stage6"); | ||
}, | ||
"/stage6": function(write) { | ||
write(200,"Crawl complete!"); | ||
} | ||
}; | ||
}; |
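Note: the route table above maps paths to handlers that receive a write(status, body) helper and, for redirects, a redir(path) helper. testserver.js itself is not part of this diff; a minimal sketch of how such a table might be served, with all server details assumed:

    var http = require("http");

    http.createServer(function(req, res) {
        var handler = routes[req.url]; // "routes" is the table above
        if (!handler) { res.writeHead(404); return res.end("Not found"); }
        handler(
            function write(status, body) {
                res.writeHead(status, {"Content-Type": "text/html"});
                res.end(body);
            },
            function redir(path) {
                res.writeHead(302, {"Location": path});
                res.end();
            }
        );
    }).listen(3000); // the test crawl targets 127.0.0.1:3000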
@@ -5,3 +5,3 @@ // Runs a very simple crawl on an HTTP server
var chai = require("chai");
chai.should();
chai.should();
@@ -11,55 +11,47 @@ var testserver = require("./lib/testserver.js");
describe("Test Crawl",function() {
var Crawler = require("../");
// Create a new crawler to crawl this server
var localCrawler = new Crawler("127.0.0.1","/",3000);
var linksDiscovered = 0;
it("should be able to be started",function(done) {
localCrawler.on("crawlstart",done);
localCrawler.start();
localCrawler.running.should.be.truthy;
});
it("should have a queue with at least the initial crawl path",function() {
localCrawler.queue.length.should.be.greaterThan(0);
});
it("should complete the fetch queue",function(done) {
it("should discover all linked resources in the queue",function(done) {
localCrawler.on("discoverycomplete",function() {
linksDiscovered ++;
});
localCrawler.on("complete",function() {
linksDiscovered.should.equal(5);
done();
});
});
// Suddenly feeling very incompatible with the mocha philosophy.
// Will try to make it work first, then look for another framework that
// supports parallel tests.
//
// it("should be able to discover link resources",function(done) {
// var linksDiscovered = 0;
//
// localCrawler.on("discoverycomplete",function() {
// linksDiscovered ++;
//
// if (!linksDiscovered) done();
// });
// });
//
//
//
// Todo: test how simple error conditions, content types, and responses
// are handled.
// TODO
// Test how simple error conditions, content types, and responses are handled.
// Test encodings.
// Test URL detection
// Test handling binary data
// test bad content length
});
// Test bad content length
});