simplecrawler - npm package version comparison

Comparing version 0.1.7 to 0.2.0

example/savetodisk.js

lib/crawler.js

@@ -9,2 +9,3 @@ // Simplecrawler

Cache = require("./cache.js"),
CookieJar = require("./cookies.js"),
MetaInfo = require("../package.json");

@@ -19,3 +20,3 @@

Public: Constructor for the crawler.
host - Initial hostname/domain to begin crawling from. By

@@ -26,7 +27,7 @@ default, the crawl will be locked to this hostname.

interval - Request interval for the crawler. Defaults to 250ms.
Examples
var crawler = new Crawler("example.com","/",80,500);
var crawler = new Crawler("example.com");
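
The constructor examples above show the signature shrinking from (host, path, port, interval) to a single host argument; per the "SETTINGS TO STUFF WITH" comment below, everything else is now configured on the instance after construction. A minimal sketch of the newer style (the require form and the .Crawler export are assumptions not shown in this diff):

var Crawler = require("simplecrawler").Crawler; // export shape assumed; adjust to match lib/index.js
var crawler = new Crawler("example.com");       // new single-argument form shown above
crawler.interval = 500;                         // request interval in ms; the diff documents a 250ms default
crawler.maxConcurrency = 5;                     // "Be sensible. Five ties in with node's default maxSockets value."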

@@ -41,6 +42,6 @@

throw new Error("Port must be a number!");
// SETTINGS TO STUFF WITH
// (not here! Do it when you create a `new Crawler()`)
// Domain to crawl

@@ -58,3 +59,3 @@ this.host = host || "";

this.interval = interval || 250;
// Maximum request concurrency. Be sensible. Five ties in with node's

@@ -75,3 +76,3 @@ // default maxSockets value.

this.queue = new FetchQueue();
// Do we filter by domain?

@@ -99,8 +100,15 @@ // Unless you want to be crawling the entire internet, I would

this.proxyPort = 8123;
// Support for HTTP basic auth
this.needsAuth = false;
this.authUser = "";
this.authPass = "";
// Support for retaining cookies for parse duration
this.acceptCookies = true;
this.cookies = new CookieJar();
// Support for custom headers...
this.customHeaders = {};
// Domain Whitelist
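
This hunk introduces per-crawler support for HTTP basic auth, a cookie jar, and arbitrary custom headers. A hedged sketch of turning these on (the credential values and the header below are placeholders):

crawler.needsAuth = true;
crawler.authUser = "username";    // placeholder credentials
crawler.authPass = "password";
crawler.acceptCookies = true;     // already the default in this diff
crawler.customHeaders = {
    "accept-language": "en"       // every entry here is merged into each request's headers
};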

@@ -130,2 +138,5 @@ // We allow domains to be whitelisted, so cross-domain requests can be made.

this.downloadUnsupported = true;
// URL Encoding setting...
this.urlEncoding = "unicode";
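
0.2.0 also adds a urlEncoding setting; later in this diff, processURL re-encodes resolved URLs with URIjs's iso8859() when it is set to "iso8859". A one-line sketch:

crawler.urlEncoding = "iso8859"; // default is "unicode" per the line above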

@@ -145,3 +156,3 @@ // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH

commences.
Examples

@@ -156,6 +167,6 @@

var crawler = this;
// only if we haven't already got stuff in our queue...
if (!this.queue.length) {
// Initialise our queue by pushing the initial request data into it...

@@ -171,10 +182,10 @@ this.queue.add(

}
this.crawlIntervalID = setInterval(function() {
crawler.crawl.call(crawler);
},this.interval);
this.emit("crawlstart");
this.running = true;
// Now kick off the initial crawl

@@ -184,3 +195,3 @@ process.nextTick(function() {

});
return this;
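
start() now emits a crawlstart event, kicks off the crawl interval, and returns the crawler for chaining. A short usage sketch:

crawler.on("crawlstart", function() {
    console.log("Crawl of " + crawler.host + " started");
});
crawler.start();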

@@ -193,3 +204,3 @@ };

URL - URL with a protocol, for testing.
Examples

@@ -205,6 +216,6 @@

var protocol;
try {
protocol = URI(URL).protocol();
} catch(e) {

@@ -214,3 +225,3 @@ // If URIjs died, we definitely /do not/ support the protocol.

}
return this.allowedProtocols.reduce(function(prev,protocolCheck) {

@@ -235,3 +246,3 @@ return prev || !!protocolCheck.exec(protocol);

Crawler.prototype.mimeTypeSupported = function(MIMEType) {
return (
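
protocolSupported and mimeTypeSupported are simple predicates that reduce over allowedProtocols and supportedMimeTypes respectively. A sketch (the default contents of those two lists are not shown in this diff):

crawler.protocolSupported("https://example.com/page"); // true if a pattern in allowedProtocols matches the protocol
crawler.mimeTypeSupported("text/html");                // true if a pattern in supportedMimeTypes matches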

@@ -258,12 +269,17 @@ this.supportedMimeTypes.reduce(function(prev,mimeCheck) {

Crawler.prototype.processURL = function(URL,context) {
var newURL;
var newURL, crawler = this;
// If the URL didn't contain anything, don't fetch it.
if (!URL.replace(/\s+/ig,"").length) return false;
try {
newURL =
URI(URL)
.absoluteTo(context.url)
.normalize();
if (crawler.urlEncoding === "iso8859") {
newURL = newURL.iso8859();
}
} catch(e) {

@@ -273,3 +289,3 @@ // Couldn't process the URL, since URIjs choked on it.

}
// simplecrawler uses slightly different terminology to URIjs. Sorry!
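
processURL resolves a discovered URL against the queue item it was found on, normalises it, applies the urlEncoding setting, and returns false when URIjs cannot parse it; otherwise it returns a plain object whose field names, as the comment notes, differ slightly from URIjs. A hedged sketch, assuming queueItem is an existing item with a url property:

var parsed = crawler.processURL("../images/logo.png", queueItem);
if (parsed) {
    // queueURL accepts either a raw string or an object already produced by processURL
    crawler.queueURL(parsed, queueItem);
}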

@@ -304,3 +320,3 @@ return {

crawler = this;
// Regular expressions for finding URL items in HTML and text

@@ -311,3 +327,3 @@ var discoverRegex = [

/url\([^)]+/ig,
// This might be a bit of a gamble... but get hard-coded

@@ -319,3 +335,3 @@ // strings out of javacript: URLs. They're often popup-image

];
function cleanURL(URL) {

@@ -337,7 +353,7 @@ return URL

if (!urlMatch) return [];
return urlMatch
.map(cleanURL)
.reduce(function(list,URL) {
// Ensure URL is whole and complete

@@ -353,9 +369,9 @@ try {

}
// If we hit an empty item, don't add return it
if (!URL.length) return list;
// If we don't support the protocol in question
if (!crawler.protocolSupported(URL)) return list;
// Does the item already exist in the list?

@@ -398,9 +414,9 @@ if (resources.reduce(function(prev,current) {

crawlerHost = crawler.host;
// If we're ignoring the WWW domain, remove the WWW for comparisons...
if (crawler.ignoreWWWDomain)
host = host.replace(/^www\./i,"");
function domainInWhitelist(host) {
// If there's no whitelist, or the whitelist is of zero length,

@@ -410,12 +426,12 @@ // just return false.

!crawler.domainWhitelist.length) return false;
// Otherwise, scan through it.
return !!crawler.domainWhitelist.reduce(function(prev,cur,index,array) {
// If we already located the relevant domain in the whitelist...
if (prev) return prev;
// If the domain is just equal, return true.
if (host === cur) return true;
// If we're ignoring WWW subdomains, and both domains,

@@ -425,3 +441,3 @@ // less www. are the same, return true.

return true;
// Otherwise, sorry. No dice.
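
domainValid strips a leading "www." when ignoreWWWDomain is set, compares the host against the crawl host, and falls back to the domain whitelist for cross-domain requests. A sketch of the two settings involved (their defaults are not shown in this diff):

crawler.ignoreWWWDomain = true;                // treat www.example.com and example.com as the same host
crawler.domainWhitelist = ["cdn.example.com"]; // extra hosts allowed despite domain filtering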

@@ -434,7 +450,7 @@ return false;

function isSubdomainOf(subdomain,host) {
// Comparisons must be case-insensitive
subdomain = subdomain.toLowerCase();
host = host.toLowerCase();
// If we're ignoring www, remove it from both

@@ -446,3 +462,3 @@ // (if www is the first domain component...)

}
// They should be the same flipped around!

@@ -500,3 +516,3 @@ return (

resources.forEach(function(url){ crawler.queueURL(url,queueItem); });
return this;

@@ -525,3 +541,3 @@ };

typeof(url) === "object" ? url : crawler.processURL(url,queueItem);
// URL Parser decided this URL was junky. Next please!

@@ -542,3 +558,3 @@ if (!parsedURL) {

}
// Check the domain is valid before adding it to the queue

@@ -568,3 +584,3 @@ if (crawler.domainValid(parsedURL.host)) {

}
return true;
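
queueURL runs a string through processURL (unless it is already a parsed object), checks domainValid, and returns true once the item has been added to the queue. A sketch:

var queued = crawler.queueURL("http://example.com/about", queueItem);
// the comments above imply a falsy result for unparseable URLs or rejected domains; true once queued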

@@ -577,5 +593,5 @@ };

It kicks off resource discovery and queues any new resources found.
queueItem - Queue item to be fetched.
Emits

@@ -602,6 +618,3 @@ fetchstart

crawler.openRequests ++;
// Emit fetchstart event
crawler.emit("fetchstart",queueItem);
// Variable declarations

@@ -611,10 +624,3 @@ var fetchData = false,

clientRequest,
timeCommenced,
timeHeadersReceived,
timeDataReceived,
parsedURL,
responseBuffer,
responseLength,
responseLengthReceived,
contentType;
timeCommenced;

@@ -647,5 +653,13 @@ // Mark as spooled

if(crawler.needsAuth) {
// Add cookie header from cookie jar if we're configured to
// send/accept cookies
if (crawler.acceptCookies && crawler.cookies.getAsHeader()) {
requestOptions.headers.cookie =
crawler.cookies.getAsHeader(requestHost,requestPath);
}
// Add auth headers if we need them
if (crawler.needsAuth) {
var auth = crawler.authUser + ":" + crawler.authPass;
// Generate auth header

@@ -656,189 +670,252 @@ auth = 'Basic ' + (new Buffer(auth).toString('base64'));

// Record what time we started this request
timeCommenced = (new Date().getTime());
// And if we've got any custom headers available
if (crawler.customHeaders) {
for (var header in crawler.customHeaders) {
if (!crawler.customHeaders.hasOwnProperty(header)) continue;
requestOptions.headers[header] = crawler.customHeaders[header];
}
}
// Emit fetchstart event - gives the user time to mangle the request options
// if required.
crawler.emit("fetchstart",queueItem,requestOptions);
// Get the resource!
clientRequest = client.get(requestOptions,function(response) {
var dataReceived = false,
stateData = queueItem.stateData;
process.nextTick(function() {
// Record what time we started this request
timeCommenced = Date.now();
responseLengthReceived = 0;
// Get the resource!
clientRequest =
client.get(requestOptions,function(response) {
crawler.handleResponse(queueItem,response,timeCommenced);
});
clientRequest.on("error",function(errorData) {
crawler.openRequests --;
// Emit 5xx / 4xx event
crawler.emit("fetchclienterror",queueItem,errorData);
queueItem.fetched = true;
queueItem.stateData.code = 599;
queueItem.status = "failed";
});
return crawler;
});
};
// Record what time we first received the header information
timeHeadersReceived = (new Date().getTime());
responseLength = parseInt(response.headers["content-length"],10);
responseLength = !isNaN(responseLength) ? responseLength : 0;
/*
Public: Given a queueItem and a matching response object, the crawler will
handle downloading the resource, queueing of linked items, etc.
Examples
// Passing in a response from `request`
request(queueItem.url,function(err,res,body) {
crawler.handleResponse(queueItem,res);
});
// Save timing and content some header information into queue
stateData.requestLatency = (timeHeadersReceived - timeCommenced);
stateData.requestTime = (timeHeadersReceived - timeCommenced);
stateData.contentLength = responseLength;
stateData.contentType = contentType = response.headers["content-type"];
stateData.code = response.statusCode;
Returns the crawler object for chaining.
// Save entire headers, in less scannable way
stateData.headers = response.headers;
*/
Crawler.prototype.handleResponse = function(queueItem,response,timeCommenced) {
var crawler = this,
dataReceived = false,
timeHeadersReceived,
timeDataReceived,
parsedURL,
responseBuffer,
responseLength,
responseLengthReceived = 0,
contentType,
stateData = queueItem.stateData;
// Record what time we first received the header information
timeHeadersReceived = Date.now();
// If we weren't passed a time of commencement, assume Now()
timeCommenced = timeCommenced || Date.now();
// Emit header receive event
crawler.emit("fetchheaders",queueItem,response);
responseLength = parseInt(response.headers["content-length"],10);
responseLength = !isNaN(responseLength) ? responseLength : 0;
// Ensure response length is reasonable...
responseLength =
responseLength > 0 ? responseLength : crawler.maxResourceSize;
queueItem.stateData.contentLength = responseLength;
// Save timing and content some header information into queue
stateData.requestLatency = (timeHeadersReceived - timeCommenced);
stateData.requestTime = (timeHeadersReceived - timeCommenced);
stateData.contentLength = responseLength;
stateData.contentType = contentType = response.headers["content-type"];
stateData.code = response.statusCode;
stateData.headers = response.headers;
// Do we need to save cookies? Were we sent any?
if (crawler.acceptCookies &&
response.headers.hasOwnProperty('set-cookie'))
crawler.cookies.addFromHeaders(response.headers["set-cookie"]);
// Emit header receive event
crawler.emit("fetchheaders",queueItem,response);
// Function for dealing with 200 responses
function processReceivedData() {
if (queueItem.fetched) return;
timeDataReceived = (new Date().getTime());
// Ensure response length is reasonable...
responseLength =
responseLength > 0 ? responseLength : crawler.maxResourceSize;
queueItem.fetched = true;
queueItem.status = "downloaded";
// Save state information
stateData.downloadTime = (timeDataReceived - timeHeadersReceived);
stateData.requestTime = (timeDataReceived - timeCommenced);
stateData.actualDataSize = responseBuffer.length;
stateData.sentIncorrectSize = responseBuffer.length !== responseLength;
queueItem.stateData.contentLength = responseLength;
crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
// Function for dealing with 200 responses
function processReceivedData() {
if (queueItem.fetched) return;
// First, save item to cache (if we're using a cache!)
if (crawler.cache !== null &&
crawler.cache.setCacheData instanceof Function) {
crawler.cache.setCacheData(queueItem,responseBuffer);
}
timeDataReceived = (new Date().getTime());
// We only process the item if it's of a valid mimetype
// and only if the crawler is set to discover its own resources
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
crawler.queueLinkedItems(responseBuffer,queueItem);
}
crawler.openRequests --;
queueItem.fetched = true;
queueItem.status = "downloaded";
// Save state information
stateData.downloadTime = (timeDataReceived - timeHeadersReceived);
stateData.requestTime = (timeDataReceived - timeCommenced);
stateData.actualDataSize = responseBuffer.length;
stateData.sentIncorrectSize = responseBuffer.length !== responseLength;
crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
// First, save item to cache (if we're using a cache!)
if (crawler.cache !== null &&
crawler.cache.setCacheData instanceof Function) {
crawler.cache.setCacheData(queueItem,responseBuffer);
}
function receiveData(chunk) {
if (chunk && chunk.length && !dataReceived) {
if (responseLengthReceived + chunk.length > responseBuffer.length) {
// Oh dear. We've been sent more data than we were initially told.
// This could be a mis-calculation, or a streaming resource.
// Let's increase the size of our buffer to match, as long as it isn't
// larger than our maximum resource size.
// We only process the item if it's of a valid mimetype
// and only if the crawler is set to discover its own resources
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
crawler.queueLinkedItems(responseBuffer,queueItem);
}
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) {
// Start by creating a new buffer, which will be our main buffer going forward...
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length);
crawler.openRequests --;
}
// Copy all our old data into it...
responseBuffer.copy(tmpNewBuffer,0,0,responseBuffer.length);
function receiveData(chunk) {
if (chunk && chunk.length && !dataReceived) {
if (responseLengthReceived + chunk.length > responseBuffer.length) {
// Oh dear. We've been sent more data than we were initially told.
// This could be a mis-calculation, or a streaming resource.
// Let's increase the size of our buffer to match, as long as it isn't
// larger than our maximum resource size.
// And now the new chunk
chunk.copy(tmpNewBuffer,responseBuffer.length,0,chunk.length);
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) {
// Start by creating a new buffer, which will be our main
// buffer from now on...
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length);
// And now make the response buffer our new buffer, leaving the original for GC
responseBuffer = tmpNewBuffer;
// Copy all our old data into it...
responseBuffer.copy(tmpNewBuffer,0,0,responseBuffer.length);
} else {
// Oh dear oh dear! The response is not only more data than we were initially told,
// but it also exceeds the maximum amount of data we're prepared to download per resource.
// Throw error event and ignore.
//
// We'll then deal with the data that we have.
// And now the new chunk
chunk.copy(tmpNewBuffer,responseBuffer.length,0,chunk.length);
crawler.emit("fetchdataerror",queueItem,response);
}
// And now make the response buffer our new buffer,
// leaving the original for GC
responseBuffer = tmpNewBuffer;
} else {
// Copy the chunk data into our main buffer
chunk.copy(responseBuffer,responseLengthReceived,0,chunk.length);
// Oh dear oh dear! The response is not only more data
// than we were initially told, but it also exceeds the
// maximum amount of data we're prepared to download per
// resource.
//
// Throw error event and ignore.
//
// We'll then deal with the data that we have.
crawler.emit("fetchdataerror",queueItem,response);
}
// Increment our data received counter
responseLengthReceived += chunk.length;
} else {
// Copy the chunk data into our main buffer
chunk.copy(responseBuffer,responseLengthReceived,0,chunk.length);
}
// Increment our data received counter
responseLengthReceived += chunk.length;
}
if ((responseLengthReceived >= responseLength || response.complete) && !dataReceived) {
// Slice the buffer to chop off any unused space
responseBuffer = responseBuffer.slice(0,responseLengthReceived);
dataReceived = true;
processReceivedData();
}
}
// If we should just go ahead and get the data
if (response.statusCode >= 200 && response.statusCode < 300 && responseLength <= crawler.maxResourceSize) {
queueItem.status = "headers";
if ((responseLengthReceived >= responseLength || response.complete) &&
!dataReceived) {
// Create a buffer with our response length
responseBuffer = new Buffer(responseLength);
// Slice the buffer to chop off any unused space
responseBuffer = responseBuffer.slice(0,responseLengthReceived);
response.on("data",receiveData);
response.on("end",receiveData);
dataReceived = true;
processReceivedData();
}
}
// We've got a not-modified response back
} else if (response.statusCode === 304) {
// If we should just go ahead and get the data
if (response.statusCode >= 200 && response.statusCode < 300 &&
responseLength <= crawler.maxResourceSize) {
queueItem.status = "headers";
if (crawler.cache !== null && crawler.cache.getCacheData) {
// We've got access to a cache
crawler.cache.getCacheData(queueItem,function(cacheObject) {
crawler.emit("notmodified",queueItem,response,cacheObject);
});
} else {
// Emit notmodified event. We don't have a cache available, so we don't send any data.
crawler.emit("notmodified",queueItem,response);
}
// Create a buffer with our response length
responseBuffer = new Buffer(responseLength);
// If we should queue a redirect
} else if (response.statusCode >= 300 && response.statusCode < 400 && response.headers.location) {
queueItem.fetched = true;
queueItem.status = "redirected";
// Parse the redirect URL ready for adding to the queue...
parsedURL = crawler.processURL(response.headers.location,queueItem);
response.on("data",receiveData);
response.on("end",receiveData);
// Emit redirect event
crawler.emit("fetchredirect",queueItem,parsedURL,response);
// We've got a not-modified response back
} else if (response.statusCode === 304) {
// Clean URL, add to queue...
crawler.queueURL(parsedURL,queueItem);
if (crawler.cache !== null && crawler.cache.getCacheData) {
// We've got access to a cache
crawler.cache.getCacheData(queueItem,function(cacheObject) {
crawler.emit("notmodified",queueItem,response,cacheObject);
});
} else {
// Emit notmodified event. We don't have a cache available, so
// we don't send any data.
crawler.emit("notmodified",queueItem,response);
}
crawler.openRequests --;
// If we should queue a redirect
} else if (response.statusCode >= 300 && response.statusCode < 400 &&
response.headers.location) {
queueItem.fetched = true;
queueItem.status = "redirected";
// Ignore this request, but record that we had a 404
} else if (response.statusCode === 404) {
queueItem.fetched = true;
queueItem.status = "notfound";
// Parse the redirect URL ready for adding to the queue...
parsedURL = crawler.processURL(response.headers.location,queueItem);
// Emit 404 event
crawler.emit("fetch404",queueItem,response);
// Emit redirect event
crawler.emit("fetchredirect",queueItem,parsedURL,response);
crawler.openRequests --;
// Clean URL, add to queue...
crawler.queueURL(parsedURL,queueItem);
// And oh dear. Handle this one as well. (other 400s, 500s, etc)
} else {
queueItem.fetched = true;
queueItem.status = "failed";
crawler.openRequests --;
// Emit 5xx / 4xx event
crawler.emit("fetcherror",queueItem,response);
// Ignore this request, but record that we had a 404
} else if (response.statusCode === 404) {
queueItem.fetched = true;
queueItem.status = "notfound";
crawler.openRequests --;
}
});
// Emit 404 event
crawler.emit("fetch404",queueItem,response);
clientRequest.on("error",function(errorData) {
crawler.openRequests --;
// Emit 5xx / 4xx event
crawler.emit("fetchclienterror",queueItem,errorData);
// And oh dear. Handle this one as well. (other 400s, 500s, etc)
} else {
queueItem.fetched = true;
queueItem.stateData.code = 599;
queueItem.status = "failed";
});
// Emit 5xx / 4xx event
crawler.emit("fetcherror",queueItem,response);
crawler.openRequests --;
}
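
The rewritten fetch path moves response handling into the public handleResponse method and reports progress through events; fetchstart now also receives the request options so they can be adjusted before the request is made. A hedged sketch of listeners for the events emitted in this hunk (the header tweak is illustrative only):

crawler.on("fetchstart", function(queueItem, requestOptions) {
    requestOptions.headers["accept-language"] = "en"; // adjust the outgoing request here if needed
});
crawler.on("fetchheaders", function(queueItem, response) {
    console.log(response.statusCode, queueItem.url);
});
crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
    console.log("Downloaded", responseBuffer.length, "bytes from", queueItem.url);
});
crawler.on("fetchredirect", function(queueItem, parsedURL, response) { /* 3xx with a location header */ });
crawler.on("fetch404", function(queueItem, response) { /* resource not found */ });
crawler.on("fetcherror", function(queueItem, response) { /* other 4xx/5xx responses */ });
crawler.on("fetchdataerror", function(queueItem, response) { /* resource exceeded maxResourceSize */ });
crawler.on("fetchclienterror", function(queueItem, errorData) { /* network failure; stateData.code is set to 599 */ });
crawler.on("notmodified", function(queueItem, response, cacheObject) {
    // 304; cacheObject is only supplied when a cache with getCacheData is configured
});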

@@ -853,3 +930,3 @@ return crawler;

to do so and there are unfetched items in the queue.
Examples

@@ -864,5 +941,5 @@

var crawler = this;
if (crawler.openRequests > crawler.maxConcurrency) return;
crawler.queue.oldestUnfetchedItem(function(err,queueItem) {

@@ -880,3 +957,3 @@ if (queueItem) {

});
return crawler;

@@ -941,3 +1018,3 @@ };

Returns true if the fetch condition was removed, and throws an error if it
could not be found.

@@ -949,3 +1026,3 @@

this.fetchConditions[index] instanceof Function) {
return !!this.fetchConditions.splice(index,1);
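
removeFetchCondition checks that fetchConditions[index] is a function, splices it out, and returns true, throwing if the condition cannot be found. A hedged sketch; the addFetchCondition pairing and the shape of the parsedURL argument are assumptions based on the surrounding API rather than lines shown in this diff:

// Assumed: addFetchCondition stores the function and returns its index
var index = crawler.addFetchCondition(function(parsedURL) {
    return !/\.pdf$/i.test(parsedURL.path); // skip PDFs; the `path` field is assumed from processURL's output
});
// ...later, when the condition is no longer wanted...
crawler.removeFetchCondition(index); // true on success; throws if the condition could not be found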

@@ -952,0 +1029,0 @@ } else {

package.json

{
"name": "simplecrawler",
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
"version": "0.1.7",
"homepage": "http://github.com/cgiffard/node-simplecrawler",
"author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
"keywords": [
"crawler",
"spider",
"cache",
"queue",
"simplecrawler",
"eventemitter"
],
"scripts": {
"test": "mocha -R spec"
},
"bin": {
"crawl": "./lib/cli.js"
},
"repository": {
"type": "git",
"url": "http://github.com/cgiffard/node-simplecrawler.git"
},
"bugs": {
"url": "https://github.com/cgiffard/node-simplecrawler/issues"
},
"main": "./lib/index.js",
"engines": {
"node": ">=0.4.0"
},
"devDependencies": {
"mocha": "~1.8.1",
"jshint": "~0.7.x",
"chai": "~1.2.0"
},
"dependencies": {
"iconv": "~1.2.4",
"URIjs": "~1.8.3"
}
}

{
"name": "simplecrawler",
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
"version": "0.2.0",
"homepage": "http://github.com/cgiffard/node-simplecrawler",
"author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
"keywords": [
"crawler",
"spider",
"cache",
"queue",
"simplecrawler",
"eventemitter"
],
"scripts": {
"test": "mocha -R spec"
},
"bin": {
"crawl": "./lib/cli.js"
},
"repository": {
"type": "git",
"url": "http://github.com/cgiffard/node-simplecrawler.git"
},
"bugs": {
"url": "https://github.com/cgiffard/node-simplecrawler/issues"
},
"main": "./lib/index.js",
"engines": {
"node": ">=0.4.0"
},
"devDependencies": {
"mocha": "~1.8.1",
"jshint": "~0.7.x",
"chai": "~1.2.0"
},
"dependencies": {
"iconv": "~1.2.4",
"URIjs": "~1.8.3"
}
}

@@ -18,2 +18,3 @@ // Tests to ensure crawler code is well formed

"cli",
"cookies",
"crawler",

@@ -20,0 +21,0 @@ "index",

