simplecrawler
Comparing version 0.2.5 to 0.2.6
@@ -37,2 +37,4 @@ // Simplecrawler
var Crawler = function(host,initialPath,initialPort,interval) {
var crawler = this;
// Data integrity checks
@@ -46,8 +48,8 @@ if (initialPort && isNaN(initialPort))
// Domain to crawl
this.host = host || "";
crawler.host = host || "";
// Gotta start crawling *somewhere*
this.initialPath = initialPath || "/";
this.initialPort = initialPort || 80;
this.initialProtocol = "http";
crawler.initialPath = initialPath || "/";
crawler.initialPort = initialPort || 80;
crawler.initialProtocol = "http";
@@ -57,13 +59,13 @@ // Internal 'tick' interval for spawning new requests
// One request will be spooled per tick, up to the concurrency threshold.
this.interval = interval || 250;
crawler.interval = interval || 250;
// Maximum request concurrency. Be sensible. Five ties in with node's
// default maxSockets value.
this.maxConcurrency = 5;
crawler.maxConcurrency = 5;
// Maximum time we'll wait for headers
this.timeout = 5 * 60 * 1000;
crawler.timeout = 5 * 60 * 1000;
// User Agent
this.userAgent =
crawler.userAgent =
"Node/" + MetaInfo.name + " " + MetaInfo.version +
@@ -74,3 +76,3 @@ " (" + MetaInfo.repository.url + ")";
// (but it's basically just an array)
this.queue = new FetchQueue();
crawler.queue = new FetchQueue();
@@ -80,40 +82,40 @@ // Do we filter by domain?
// recommend leaving this on!
this.filterByDomain = true;
crawler.filterByDomain = true;
// Do we scan subdomains?
this.scanSubdomains = false;
crawler.scanSubdomains = false;
// Treat WWW subdomain the same as the main domain (and don't count
// it as a separate subdomain)
this.ignoreWWWDomain = true;
crawler.ignoreWWWDomain = true;
// Or go even further and strip WWW subdomain from domains altogether!
this.stripWWWDomain = false;
crawler.stripWWWDomain = false;
// Internal cachestore
this.cache = null;
crawler.cache = null;
// Use an HTTP Proxy?
this.useProxy = false;
this.proxyHostname = "127.0.0.1";
this.proxyPort = 8123;
crawler.useProxy = false;
crawler.proxyHostname = "127.0.0.1";
crawler.proxyPort = 8123;
// Support for HTTP basic auth
this.needsAuth = false;
this.authUser = "";
this.authPass = "";
crawler.needsAuth = false;
crawler.authUser = "";
crawler.authPass = "";
// Support for retaining cookies for parse duration
this.acceptCookies = true;
this.cookies = new CookieJar();
crawler.acceptCookies = true;
crawler.cookies = new CookieJar();
// Support for custom headers...
this.customHeaders = {};
crawler.customHeaders = {};
// Domain Whitelist
// We allow domains to be whitelisted, so cross-domain requests can be made.
this.domainWhitelist = [];
crawler.domainWhitelist = [];
// Supported Protocols
this.allowedProtocols = [
crawler.allowedProtocols = [
/^http(s)?$/i, // HTTP & HTTPS
@@ -124,7 +126,7 @@ /^(rss|atom|feed)(\+xml)?$/i // RSS / XML
// Max file size to download/store
this.maxResourceSize = 1024 * 1024 * 16; // 16mb
crawler.maxResourceSize = 1024 * 1024 * 16; // 16mb
// Supported MIME-types
// Matching MIME-types will be scanned for links
this.supportedMimeTypes = [
crawler.supportedMimeTypes = [
/^text\//i,
@@ -137,13 +139,36 @@ /^application\/(rss|html|xhtml)?[\+\/\-]?xml/i,
// Download linked, but unsupported files (binary - images, documents, etc)
this.downloadUnsupported = true;
crawler.downloadUnsupported = true;
// URL Encoding setting...
this.urlEncoding = "unicode";
crawler.urlEncoding = "unicode";
// STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
this.openRequests = 0;
this.fetchConditions = [];
crawler._openRequests = 0;
crawler._fetchConditions = [];
crawler._openListeners = 0;
crawler._listenerMap = {};
// Run the EventEmitter constructor
EventEmitter.call(this);
EventEmitter.call(crawler);
crawler._emitSpecial = function() {
var args = Array.prototype.slice.call(arguments,0),
event = args[0],
eventArgsLen = args.length-1,
asyncListenerCount = 0;
crawler.listeners(event).forEach(function(listener) {
if (listener.length > eventArgsLen)
asyncListenerCount++;
});
crawler._openListeners += asyncListenerCount|0;
crawler.emit.apply(crawler,args.concat([
function listenerComplete() {
if (crawler._openListeners > 0)
crawler._openListeners --;
}
]));
};
};
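
The headline change in this constructor is the new _emitSpecial helper: it compares each listener's arity with the number of arguments the event carries, counts any listener that declares one extra parameter as asynchronous, and appends a completion callback that decrements _openListeners. A minimal sketch of opting into that behaviour from consumer code (the host, port, and timing below are illustrative, not taken from the diff):

// Sketch only: an asynchronous "fetchcomplete" listener. Because it declares
// one more argument than the event normally supplies, _emitSpecial counts it
// in _openListeners until evtDone() is called, so the crawl is not reported
// complete while deferred work is still running.
var Crawler = require("simplecrawler");
var crawler = new Crawler("127.0.0.1", "/", 3000); // illustrative host/port

crawler.on("fetchcomplete", function(queueItem, data, response, evtDone) {
    setTimeout(function() {
        // ...deferred processing of the response body goes here...
        evtDone(); // signals completion, decrementing _openListeners
    }, 250);
});

crawler.start();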
@@ -171,10 +196,10 @@
// only if we haven't already got stuff in our queue...
if (!this.queue.length) {
if (!crawler.queue.length) {
// Initialise our queue by pushing the initial request data into it...
this.queue.add(
this.initialProtocol,
this.host,
this.initialPort,
this.initialPath,
crawler.queue.add(
crawler.initialProtocol,
crawler.host,
crawler.initialPort,
crawler.initialPath,
function(error) {
@@ -185,8 +210,8 @@ if (error) throw error;
this.crawlIntervalID = setInterval(function() {
crawler.crawlIntervalID = setInterval(function() {
crawler.crawl.call(crawler);
},this.interval);
},crawler.interval);
this.emit("crawlstart");
this.running = true;
crawler._emitSpecial("crawlstart");
crawler.running = true;
@@ -198,3 +223,3 @@ // Now kick off the initial crawl
return this;
return crawler;
};
@@ -216,3 +241,3 @@
Crawler.prototype.protocolSupported = function(URL) {
var protocol;
var protocol, crawler = this;
@@ -227,3 +252,3 @@ try {
return this.allowedProtocols.reduce(function(prev,protocolCheck) {
return crawler.allowedProtocols.reduce(function(prev,protocolCheck) {
return prev || !!protocolCheck.exec(protocol);
@@ -247,5 +272,6 @@ },false);
Crawler.prototype.mimeTypeSupported = function(MIMEType) {
var crawler = this;
return (
this.supportedMimeTypes.reduce(function(prev,mimeCheck) {
crawler.supportedMimeTypes.reduce(function(prev,mimeCheck) {
return prev || !!mimeCheck.exec(MIMEType);
@@ -271,3 +297,12 @@ },false)
var newURL, crawler = this;
if (!context || typeof(context) !== "object")
context = {
url: (
crawler.initialProtocol + "://" +
crawler.host + ":" +
crawler.initialPort + "/"
)
};
// If the URL didn't contain anything, don't fetch it.
@@ -499,12 +534,12 @@ if (!URL.replace(/\s+/ig,"").length) return false;
Crawler.prototype.queueLinkedItems = function(resourceData,queueItem) {
var resources = this.discoverResources(resourceData,queueItem),
crawler = this;
var crawler = this,
resources = crawler.discoverResources(resourceData,queueItem);
// Emit discovered resources. ie: might be useful in building a graph of
// page relationships.
this.emit("discoverycomplete",queueItem,resources);
crawler._emitSpecial("discoverycomplete",queueItem,resources);
resources.forEach(function(url){ crawler.queueURL(url,queueItem); });
return this;
return crawler;
};
@@ -537,9 +572,9 @@
}
// Pass this URL past fetch conditions to ensure the user thinks it's valid
var fetchDenied = false;
fetchDenied = crawler.fetchConditions.reduce(function(prev,callback) {
fetchDenied = crawler._fetchConditions.reduce(function(prev,callback) {
return fetchDenied || !callback(parsedURL);
},false);
if (fetchDenied) {
@@ -549,3 +584,3 @@ // Fetch Conditions conspired to block URL
}
// Check the domain is valid before adding it to the queue
@@ -562,6 +597,6 @@ if (crawler.domainValid(parsedURL.host)) {
// We received an error condition when adding the callback
crawler.emit("queueerror",error,parsedURL);
crawler._emitSpecial("queueerror",error,parsedURL);
} else {
crawler.emit("queueadd",newQueueItem,parsedURL);
newQueueItem.referrer = queueItem.url;
crawler._emitSpecial("queueadd",newQueueItem,parsedURL);
newQueueItem.referrer = queueItem ? queueItem.url : null;
}
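
A related change: queueURL now builds a default context from the crawler's initialProtocol, host, and initialPort when no referring queue item is supplied, and the referrer falls back to null instead of reading queueItem.url unconditionally. A short sketch of seeding a crawl directly this way (the URL and port mirror the test fixtures):

// Sketch: queueing a URL with no referring queue item. The relative-URL
// context defaults to initialProtocol://host:initialPort/, and the new
// queue item's referrer is set to null.
var crawler = new Crawler("127.0.0.1", "/", 3000); // illustrative host/port
crawler.queueURL("http://127.0.0.1:3000/async-stage1");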
@@ -572,3 +607,3 @@ }
// If we caught an error, emit queueerror
crawler.emit("queueerror",error,parsedURL);
crawler._emitSpecial("queueerror",error,parsedURL);
return false;
@@ -608,3 +643,3 @@ }
var crawler = this;
crawler.openRequests ++;
crawler._openRequests ++;
@@ -670,3 +705,3 @@ // Variable declarations
// if required.
crawler.emit("fetchstart",queueItem,requestOptions);
crawler._emitSpecial("fetchstart",queueItem,requestOptions);
@@ -684,6 +719,6 @@ process.nextTick(function() {
clientRequest.on("error",function(errorData) {
crawler.openRequests --;
crawler._openRequests --;
// Emit 5xx / 4xx event
crawler.emit("fetchclienterror",queueItem,errorData);
crawler._emitSpecial("fetchclienterror",queueItem,errorData);
queueItem.fetched = true;
@@ -748,3 +783,3 @@ queueItem.stateData.code = 599;
// Emit header receive event
crawler.emit("fetchheaders",queueItem,response);
crawler._emitSpecial("fetchheaders",queueItem,response);
@@ -772,3 +807,3 @@ // Ensure response length is reasonable...
crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
crawler._emitSpecial("fetchcomplete",queueItem,responseBuffer,response);
@@ -788,3 +823,3 @@ // First, save item to cache (if we're using a cache!)
crawler.openRequests --;
crawler._openRequests --;
}
@@ -827,3 +862,3 @@
crawler.emit("fetchdataerror",queueItem,response);
crawler._emitSpecial("fetchdataerror",queueItem,response);
}
@@ -869,3 +904,3 @@ } else {
crawler.cache.getCacheData(queueItem,function(cacheObject) {
crawler.emit("notmodified",queueItem,response,cacheObject);
crawler._emitSpecial("notmodified",queueItem,response,cacheObject);
});
@@ -875,3 +910,3 @@ } else {
// we don't send any data.
crawler.emit("notmodified",queueItem,response);
crawler._emitSpecial("notmodified",queueItem,response);
}
@@ -890,3 +925,3 @@
// Emit redirect event
crawler.emit("fetchredirect",queueItem,parsedURL,response);
crawler._emitSpecial("fetchredirect",queueItem,parsedURL,response);
@@ -896,3 +931,3 @@ // Clean URL, add to queue...
crawler.openRequests --;
crawler._openRequests --;
@@ -905,5 +940,5 @@ // Ignore this request, but record that we had a 404
// Emit 404 event
crawler.emit("fetch404",queueItem,response);
crawler._emitSpecial("fetch404",queueItem,response);
crawler.openRequests --;
crawler._openRequests --;
@@ -916,5 +951,5 @@ // And oh dear. Handle this one as well. (other 400s, 500s, etc)
// Emit 5xx / 4xx event
crawler.emit("fetcherror",queueItem,response);
crawler._emitSpecial("fetcherror",queueItem,response);
crawler.openRequests --;
crawler._openRequests --;
}
@@ -941,3 +976,3 @@
if (crawler.openRequests > crawler.maxConcurrency) return;
if (crawler._openRequests > crawler.maxConcurrency) return;
@@ -947,3 +982,6 @@ crawler.queue.oldestUnfetchedItem(function(err,queueItem) {
crawler.fetchQueueItem(queueItem);
} else if (crawler.openRequests === 0) {
} else if ( !crawler._openRequests &&
!crawler._openListeners) {
crawler.queue.complete(function(err,completeCount) {
@@ -972,5 +1010,6 @@ if (completeCount === crawler.queue.length) {
Crawler.prototype.stop = function() {
clearInterval(this.crawlIntervalID);
this.running = false;
return this;
var crawler = this;
clearInterval(crawler.crawlIntervalID);
crawler.running = false;
return crawler;
};
@@ -1000,5 +1039,6 @@
Crawler.prototype.addFetchCondition = function(callback) {
var crawler = this;
if (callback instanceof Function) {
this.fetchConditions.push(callback);
return this.fetchConditions.length - 1;
crawler._fetchConditions.push(callback);
return crawler._fetchConditions.length - 1;
} else {
@@ -1024,6 +1064,7 @@ throw new Error("Fetch Condition must be a function.");
Crawler.prototype.removeFetchCondition = function(index) {
if (this.fetchConditions[index] &&
this.fetchConditions[index] instanceof Function) {
var crawler = this;
if (crawler._fetchConditions[index] &&
crawler._fetchConditions[index] instanceof Function) {
return !!this.fetchConditions.splice(index,1);
return !!crawler._fetchConditions.splice(index,1);
} else {
@@ -1030,0 +1071,0 @@ throw new Error("Unable to find indexed Fetch Condition.");
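
Although the storage array is renamed to _fetchConditions, the public addFetchCondition/removeFetchCondition API is unchanged: the condition still receives the parsed URL, and a falsy return value blocks the fetch. A sketch, assuming the parsed URL object exposes a path property as it does elsewhere in the library:

// Sketch: fetch conditions after the 0.2.6 rename; behaviour is unchanged.
var crawler = new Crawler("127.0.0.1", "/", 3000); // illustrative host/port
var conditionID = crawler.addFetchCondition(function(parsedURL) {
    return !parsedURL.path.match(/\.pdf$/i); // e.g. skip PDFs (illustrative rule)
});

// The index returned above removes the same condition again later.
crawler.removeFetchCondition(conditionID);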
{
"name": "simplecrawler",
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
"version": "0.2.5",
"version": "0.2.6",
"homepage": "http://github.com/cgiffard/node-simplecrawler",
@@ -16,3 +16,3 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
"scripts": {
"test": "mocha -R spec"
"test": "mocha -R spec -t 4000"
},
@@ -19,0 +19,0 @@ "bin": {
@@ -27,3 +27,15 @@ // Routes for testing server
write(200,"Crawl complete!");
},
"/async-stage1": function(write) {
write(200,"http://127.0.0.1:3000/async-stage2");
},
"/async-stage2": function(write) {
write(200,"http://127.0.0.1:3000/async-stage3");
},
"/async-stage3": function(write) {
write(200,"Complete!");
}
};
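
The three new routes chain single-URL bodies so the async test can feed each response straight back into the queue. Each handler receives a write(status, body) callback; the following is only a hypothetical sketch of a server for a routes table shaped like this one, since the project's real test server is not shown here and may differ:

// Hypothetical sketch only — not the fixture's actual server code.
var http = require("http");
var routes = require("./routes"); // assumed path to the routes fixture

http.createServer(function(req, res) {
    var handler = routes[req.url];

    // Matches the write(status, body) contract used by the route handlers;
    // the Content-Type is an assumption.
    function write(status, body) {
        res.writeHead(status, { "Content-Type": "text/html" });
        res.end(body);
    }

    if (handler) handler(write);
    else write(404, "Not found");
}).listen(3000);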
@@ -14,9 +14,13 @@ // Runs a very simple crawl on an HTTP server
// Create a new crawler to crawl this server
var localCrawler = new Crawler("127.0.0.1","/",3000);
var localCrawler = new Crawler("127.0.0.1","/",3000),
asyncCrawler = new Crawler("127.0.0.1","/",3000);
var linksDiscovered = 0;
it("should be able to be started",function(done) {
localCrawler.on("crawlstart",done);
localCrawler.on("crawlstart",function() { done() });
localCrawler.on("discoverycomplete",function() {
linksDiscovered ++;
});
localCrawler.start();
@@ -33,6 +37,2 @@ localCrawler.running.should.be.truthy;
localCrawler.on("discoverycomplete",function() {
linksDiscovered ++;
});
localCrawler.on("complete",function() {
@@ -43,4 +43,31 @@ linksDiscovered.should.equal(5);
});
it("should support async event listeners for manual discovery",function(done) {
// Use a different crawler this time
asyncCrawler.discoverResources = false;
asyncCrawler.queueURL("http://127.0.0.1:3000/async-stage1");
asyncCrawler.start();
asyncCrawler.on("fetchcomplete",function(queueItem,data,res,evtDone) {
setTimeout(function(){
linksDiscovered ++;
if (String(data).match(/complete/i))
return evtDone();
// Taking advantage of the fact that for these, the sum total
// of the body data is a URL.
asyncCrawler.queueURL(String(data)).should.be.true;
evtDone();
},250);
});
asyncCrawler.on("complete",function() {
linksDiscovered.should.equal(8);
done();
});
});
// TODO
@@ -47,0 +74,0 @@
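
The expected count of 8 combines the 5 links discovered by the first crawl with one increment per async stage, and the raised mocha timeout (-t 4000 in package.json) presumably accommodates the deliberate 250 ms delay each stage adds. Because _emitSpecial distinguishes the two listener styles purely by arity, existing three-argument handlers keep working unchanged, as in this sketch (the logging is illustrative):

// Sketch: a synchronous "fetchcomplete" listener. With only three declared
// arguments it is not counted in _openListeners and needs no callback.
localCrawler.on("fetchcomplete", function(queueItem, data, response) {
    console.log("Fetched", queueItem.url, "-", data.length, "bytes");
});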