simplecrawler
Comparing version 0.5.3 to 0.5.4
lib/cache-backend-fs.js

/*
 * Simplecrawler - FS cache backend
 * https://github.com/cgiffard/node-simplecrawler
 *
 * Copyright (c) 2011-2015, Christopher Giffard
 *
 */

// Tries to ensure a local 'cache' of a website is as close as possible to a mirror of the website itself.
// The idea is that it is then possible to re-serve the website just using the cache.

var fs = require("fs"),
    crypto = require("crypto");

// Factory for FSBackend
var backend = function backend(loadParameter) {
    return new FSBackend(loadParameter);
};
var FSBackend = function FSBackend(loadParameter) {
    this.loaded = false;
    this.index = [];
    this.location = typeof loadParameter === "string" && loadParameter.length > 0 ? loadParameter : process.cwd() + "/cache/";
    this.location = this.location.substr(this.location.length - 1) === "/" ? this.location : this.location + "/";
};
function sanitisePath(path, queueObject) {
    // Remove first slash (as we set one later.)
    path = path.replace(/^\//, "");

    var pathStack = [];

    // Trim whitespace. If no path is present - assume index.html.
    var sanitisedPath = path.length ? path.replace(/\s*$/ig, "") : "index.html";
    var headers = queueObject.stateData.headers, sanitisedPathParts;

    if (sanitisedPath.match(/\?/)) {
        sanitisedPathParts = sanitisedPath.split(/\?/g);
        var resource = sanitisedPathParts.shift();
        var hashedQS = crypto.createHash("sha1").update(sanitisedPathParts.join("?")).digest("hex");
        sanitisedPath = resource + "?" + hashedQS;
    }

    pathStack = sanitisedPath.split(/\//g);
    pathStack = pathStack.map(function(pathChunk) {
        if (pathChunk.length >= 250) {
            return crypto.createHash("sha1").update(pathChunk).digest("hex");
        }

        return pathChunk;
    });

    sanitisedPath = pathStack.join("/");

    // Try to get a file extension for the file - for ease of identification
    // We run through this if we either:
    // 1) haven't got a file extension at all, or:
    // 2) have an HTML file without an HTML file extension (might be .php, .aspx, .do, or some other server-processed type)

    if (!sanitisedPath.match(/\.[a-z0-9]{1,6}$/i) || headers["content-type"] && headers["content-type"].match(/text\/html/i) && !sanitisedPath.match(/\.htm[l]?$/i)) {
        var subMimeType = "";
        var mimeParts = [];

        if (headers["content-type"] && headers["content-type"].match(/text\/html/i)) {
            if (sanitisedPath.match(/\/$/)) {
                sanitisedPath += "index.html";
            } else {
                sanitisedPath += ".html";
            }
        } else if (headers["content-type"] && (mimeParts = headers["content-type"].match(/(image|video|audio|application)\/([a-z0-9]+)/i))) {
            subMimeType = mimeParts[2];
            sanitisedPath += "." + subMimeType;
        }
    }

    return sanitisedPath;
}
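
// Illustrative sketch (not part of the module): how sanitisePath maps a fetched
// URL path onto a filesystem-safe path. The queue object below is a minimal
// hand-built stand-in for what the crawler normally supplies.
var exampleQueueObject = {
    stateData: {
        headers: { "content-type": "text/html; charset=utf-8" }
    }
};

// "/blog/post" has no file extension and an HTML content-type, so ".html" is
// appended; a querystring would be replaced by a SHA-1 hash of its contents.
console.log(sanitisePath("/blog/post", exampleQueueObject)); // -> "blog/post.html"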
FSBackend.prototype.fileExists = function(location) {
    try {
        fs.statSync(location);
        return true;
    } catch (er) {
        return false;
    }
};
FSBackend.prototype.isDirectory = function(location) {
    try {
        if (fs.statSync(location).isDirectory()) {
            return true;
        }

        return false;
    } catch (er) {
        return false;
    }
};
FSBackend.prototype.load = function() {
    var backend = this;

    if (!backend.fileExists(backend.location) && backend.isDirectory(backend.location)) {
        throw new Error("Unable to verify cache location exists.");
    }

    try {
        var fileData;
        if ((fileData = fs.readFileSync(backend.location + "cacheindex.json")) && fileData.length) {
            backend.index = JSON.parse(fileData.toString("utf8"));
            backend.loaded = true;
        }
    } catch (error) {
        if (error.code === "ENOENT") {
            // Cache index doesn't exist. Assume this is a new cache.
            // Just leave the memory index empty for now.
            backend.loaded = true;
        } else {
            throw error;
        }
    }

    // Flush store to disk when closing.
    process.on("exit", function() {
        backend.saveCache.apply(backend);
    });
};
FSBackend.prototype.saveCache = function(callback) {
    fs.writeFile(this.location + "cacheindex.json", JSON.stringify(this.index), callback);
};
FSBackend.prototype.setItem = function(queueObject, data, callback) {
    callback = callback instanceof Function ? callback : function() {};

    var backend = this;
    var pathStack = [queueObject.protocol, queueObject.host, queueObject.port];
    pathStack = pathStack.concat(sanitisePath(queueObject.path, queueObject).split(/\/+/g));

    var cacheItemExists = false;
    var firstInstanceIndex = NaN;
    if (backend.index.reduce(function(prev, current, index) {
        firstInstanceIndex = !isNaN(firstInstanceIndex) ? firstInstanceIndex : index;
        return prev || current.url === queueObject.url;
    }, false)) {
        cacheItemExists = true;
    }

    var writeFileData = function(currentPath, data) {
        fs.writeFile(currentPath, data, function(error) {
            if (error) {
                throw error;
            }
            fs.writeFile(currentPath + ".cacheData.json", JSON.stringify(queueObject), function(error) {
                if (error) {
                    throw error;
                }

                var cacheObject = {
                    url: queueObject.url,
                    etag: queueObject.stateData.headers.etag,
                    lastModified: queueObject.stateData.headers["last-modified"],
                    dataFile: currentPath,
                    metaFile: currentPath + ".cacheData.json"
                };

                if (cacheItemExists) {
                    backend.index[firstInstanceIndex] = cacheObject;
                } else {
                    backend.index.push(cacheObject);
                }

                callback(cacheObject);
            });
        });
    };

    pathStack.forEach(function(pathChunk, count) {
        var currentPath = backend.location + pathStack.slice(0, count + 1).join("/");
        if (backend.fileExists(backend.location + pathStack.slice(0, count + 1).join("/"))) {
            if (!backend.isDirectory(currentPath)) {
                if (count === pathStack.length - 1) {
                    // Just overwrite the file...
                    writeFileData(currentPath, data);
                } else {
                    throw new Error("Cache storage of resource (%s) blocked by file: %s", queueObject.url, currentPath);
                }
            }
        } else {
            if (count === pathStack.length - 1) {
                // Write the file data in
                writeFileData(currentPath, data);
            } else {
                fs.mkdirSync(currentPath);
            }
        }
    });
};
FSBackend.prototype.getItem = function(queueObject, callback) {
    var cacheItemResult = this.index.filter(function(item) {
        return item.url === queueObject.url;
    });

    if (cacheItemResult.length) {
        var cacheItem = cacheItemResult.shift();

        callback({
            url: cacheItem.url,
            etag: cacheItem.etag,
            lastModified: cacheItem.lastModified,
            getData: function(callback) {
                fs.readFile(cacheItem.dataFile, function(error, data) {
                    if (error) {
                        callback(error);
                        return false;
                    }

                    callback(null, data);
                });
            },
            getMetadata: function(callback) {
                fs.readFile(cacheItem.metaFile, function(error, data) {
                    if (error) {
                        callback(error);
                        return false;
                    }

                    callback(null, JSON.parse(data.toString("utf8")));
                });
            }
        });
    } else {
        callback(null);
    }

    return false;
};
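
// Usage sketch (illustrative only): driving the FS backend directly. It assumes
// this file exports the factory function above as module.exports (not shown in
// this excerpt) and that the cache directory already exists on disk.
var createFSBackend = require("./cache-backend-fs.js");
var store = createFSBackend("./cache/");

store.load(); // reads ./cache/cacheindex.json into memory, if present
store.getItem({ url: "http://example.com/" }, function(cacheItem) {
    // cacheItem is null on a miss; on a hit it exposes getData()/getMetadata()
    console.log(cacheItem);
});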
lib/cache.js

/*
 * Simplecrawler - cache module
 * https://github.com/cgiffard/node-simplecrawler
 *
 * Copyright (c) 2011-2015, Christopher Giffard
 *
 */

var fs = require("fs");
var EventEmitter = require("events").EventEmitter;
var FilesystemBackend = require("./cache-backend-fs.js");
// var RedisBackend = require("cache-backend-redis.js");
// var MongoBackend = require("cache-backend-mongo.js");

// Init cache wrapper for backend...
var Cache = function Cache(cacheLoadParameter, cacheBackend) {

    // Ensure parameters are how we want them...
    cacheBackend = typeof cacheBackend === "object" ? cacheBackend : FilesystemBackend;
    cacheLoadParameter = cacheLoadParameter instanceof Array ? cacheLoadParameter : [cacheLoadParameter];

    // Now we can just run the factory.
    this.datastore = cacheBackend.apply(cacheBackend, cacheLoadParameter);

    // Instruct the backend to load up.
    this.datastore.load();
};
// Set up data import and export functions
Cache.prototype.setCacheData = function(queueObject, data, callback) {
    this.datastore.setItem(queueObject, data, callback);
    this.emit("setcache", queueObject, data);
};

Cache.prototype.getCacheData = function(queueObject, callback) {
    this.datastore.getItem(queueObject, callback);
};

Cache.prototype.saveCache = function() {
    this.datastore.saveCache();
};
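
// Usage sketch (illustrative only): the wrapper defaults to the filesystem
// backend when no backend is supplied, so a single load parameter is enough.
// The queueObject below is a trimmed-down stand-in for a real queue item, and
// the emit() call above assumes Cache inherits EventEmitter elsewhere in the file.
var cache = new Cache("./cache/");

cache.setCacheData(
    { url: "http://example.com/", protocol: "http", host: "example.com", port: 80,
      path: "/", stateData: { headers: { "content-type": "text/html" } } },
    new Buffer("<html></html>"),
    function(cacheObject) {
        console.log("stored", cacheObject.dataFile);
    });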
lib/cookies.js

/*
 * Simplecrawler - Cookie Jar Functionality
 * https://github.com/cgiffard/node-simplecrawler
 *
 * Copyright (c) 2011-2015, Christopher Giffard
 *
 */

var EventEmitter = require("events").EventEmitter,
    util = require("util");

/*
    Public: Constructor for the cookie jar.

    Examples

        var cookieJar = new CookieJar();

    Returns the cookie jar object which has now been constructed.
*/
function CookieJar() {
    var cookies = [];
    this.__defineGetter__("cookies", function() {
        return cookies;
    });

    // Run the EventEmitter constructor
    EventEmitter.call(this);
}

util.inherits(CookieJar, EventEmitter);
/*
    Public: Adds a new cookie to the jar, either by creating a new Cookie() object
    from specific details such as name, value, etc., accepting a string from a
    Set-Cookie header, or by passing in an existing Cookie() object.

    name     - The name of the cookie to add. Alternately, set-cookie
               header as string, or an existing cookie object.
    value    - The value of the cookie.
    expiry   - Expiry timestamp in milliseconds.
    path     - Limit cookie to path (defaults to "/")
    domain   - Limit cookie to domain
    httponly - Boolean value specifying httponly
    cb       - Optional callback.

    Emits

        addcookie - Emitted with new cookie object as an argument.

    Examples

        cookieJar.add("mycookie","myValue",Date.now(),"/","test.com",false);

    Returns the cookie jar object for chaining.
*/
CookieJar.prototype.add = function(name, value, expiry, path, domain, httponly, cb) {

    var existingIndex = -1, newCookie;

    if (arguments.length > 1) {
        newCookie = new Cookie(name, value, expiry, path, domain, httponly);
    } else if (name instanceof Cookie) {
        newCookie = name;
    } else {
        newCookie = Cookie.fromString(name);
    }

    // Are we updating an existing cookie or adding a new one?
    this.cookies.forEach(function(cookie, index) {
        if (cookie.name === newCookie.name &&
            cookie.matchDomain(newCookie.domain)) {

            existingIndex = index;
        }
    });

    if (existingIndex < 0) {
        this.cookies.push(newCookie);
    } else {
        this.cookies[existingIndex] = newCookie;
    }

    this.emit("addcookie", newCookie);

    if (cb && cb instanceof Function) {
        cb(null, newCookie);
    }

    return this;
};
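
// Illustrative example: the three accepted call styles for add(). The header
// string form is what gets used when a raw Set-Cookie header is passed in.
var jar = new CookieJar();

jar.add("session", "abc123", Date.now() + 3600000, "/", "example.com", false);
jar.add("token=xyz; Path=/; Domain=example.com; Httponly");
jar.add(new Cookie("theme", "dark"));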
/*
    Public: Removes cookies from the cookie jar. If no domain and name are
    specified, all cookies in the jar are removed.

    name   - The name of the cookie(s) to remove
    domain - The domain from which to remove cookies.
    cb     - Optional callback.

    Emits

        removecookie - Emitted with array of removed cookies.

    Examples

        cookieJar.remove(null,"nytimes.com");

    Returns an array of removed cookies.
*/
CookieJar.prototype.remove = function(name, domain, cb) {
    var cookiesRemoved = [],
        jar = this;

    jar.cookies.forEach(function(cookie, index) {

        // If the names don't match, we're not removing this cookie
        if (!!name && cookie.name !== name) {
            return false;
        }

        // If the domains don't match, we're not removing this cookie
        if (!!domain && !cookie.matchDomain(domain)) {
            return false;
        }

        // Matched. Remove!
        cookiesRemoved.push(jar.cookies.splice(index, 1));
    });

    jar.emit("removecookie", cookiesRemoved);

    if (cb && cb instanceof Function) {
        cb(null, cookiesRemoved);
    }

    return cookiesRemoved;
};
/*
    Public: Gets an array of cookies based on name and domain.

    name   - The name of the cookie(s) to retrieve
    domain - The domain from which to retrieve cookies.
    cb     - Optional callback.

    Examples

        cookieJar.get(null,"nytimes.com");

    Returns an array of cookies.
*/
CookieJar.prototype.get = function(name, domain, cb) {

    var cookies = this.cookies.filter(function(cookie) {

        // If the names don't match, we're not returning this cookie
        if (!!name && cookie.name !== name) {
            return false;
        }

        // If the domains don't match, we're not returning this cookie
        if (!!domain && !cookie.matchDomain(domain)) {
            return false;
        }

        return true;
    });

    if (cb && cb instanceof Function) {
        cb(null, cookies);
    }

    return cookies;
};
/*
    Public: Generates an array of headers based on the value of the cookie jar.

    domain - The domain from which to generate cookies.
    path   - Filter headers to cookies applicable to this path.
    cb     - Optional callback.

    Examples

        cookieJar.getAsHeader("nytimes.com","/myaccount");

    Returns an array of cookie headers.
*/
CookieJar.prototype.getAsHeader = function(domain, path, cb) {

    var headers =
        this.cookies.filter(function(cookie) {
            if (cookie.isExpired()) {
                return false;
            }
            if (!domain && !path) {
                return true;
            }
            if (domain) {
                return cookie.matchDomain(domain);
            }
            if (path) {
                return cookie.matchPath(path);
            }
        })
        .map(function(cookie) {
            return cookie.toString();
        });

    if (cb && cb instanceof Function) {
        cb(null, headers);
    }

    return headers;
};
/*
    Public: Adds cookies to the cookie jar based on an array of 'set-cookie'
    headers provided by a webserver. Duplicate cookies are overwritten.

    headers - An array of 'set-cookie' headers
    cb      - Optional callback.

    Examples

        cookieJar.addFromHeaders(res.headers["set-cookie"]);

    Returns the cookie jar for chaining.
*/
CookieJar.prototype.addFromHeaders = function(headers, cb) {
    var jar = this;

    if (!(headers instanceof Array)) {
        headers = [headers];
    }

    headers.forEach(function(header) {
        jar.add(header);
    });

    if (cb && cb instanceof Function) {
        cb(jar);
    }

    return jar;
};
/*
    Public: Outputs a linefeed-separated list of set-cookie headers representing
    the entire contents of the cookie jar.

    Examples

        cookieJar.toString();

    Returns a list of headers in string form.
*/
CookieJar.prototype.toString = function() {
    return this.getAsHeader().join("\n");
};
/*
    Public: Constructor for the Cookie() object: create a new cookie.

    name     - The name of the cookie to add.
    value    - The value of the cookie.
    expires  - Expiry timestamp in milliseconds.
    path     - Limit cookie to path (defaults to "/")
    domain   - Limit cookie to domain
    httponly - Boolean value specifying httponly

    Examples

        var myCookie = new Cookie("mycookie","myValue",Date.now(),"/","test.com",false);

    Returns the newly created Cookie object.
*/
function Cookie(name, value, expires, path, domain, httponly) {

    if (!name) {
        throw new Error("A name is required to create a cookie.");
    }

    // Parse date to timestamp - consider it never expiring if timestamp is not
    // passed to the function
    if (expires) {

        if (typeof expires !== "number") {
            expires = (new Date(expires)).getTime();
        }

    } else {
        expires = -1;
    }

    this.name = name;
    this.value = value || "";
    this.expires = expires;
    this.path = path || "/";
    this.domain = domain || "*";
    this.httponly = !!httponly;
}
/*
    Public, Static: Returns a new Cookie() object based on a header string.

    string - A set-cookie header string

    Examples

        var myCookie = Cookie.fromString(response.headers["set-cookie"][0]);

    Returns the newly created Cookie object.
*/
Cookie.fromString = function(string) {

    if (!string || typeof string !== "string") {
        throw new Error("String must be supplied to generate a cookie.");
    }

    function parseKeyVal(input) {
        var key = input.split(/\=/).shift(),
            val = input.split(/\=/).slice(1).join("=");

        return [key, val];
    }

    string = string.replace(/^\s*set\-cookie\s*\:\s*/i, "");

    var parts = string.split(/\s*\;\s*/i),
        name = parseKeyVal(parts.shift()),
        keyValParts = {};

    keyValParts.name = name[0];
    keyValParts.value = name[1];

    parts
        .filter(function(input) {
            return !!input.replace(/\s+/ig, "").length;
        })
        .map(parseKeyVal)
        .forEach(function(keyval) {
            var key = String(keyval[0]).toLowerCase().replace(/[^a-z0-9]/ig, "");
            keyValParts[key] = keyval[1];
        });

    return new Cookie(
        keyValParts.name,
        keyValParts.value,
        keyValParts.expires || keyValParts.expiry,
        keyValParts.path,
        keyValParts.domain,
        keyValParts.hasOwnProperty("httponly")
    );
};
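
// Illustrative example: parsing a raw Set-Cookie header into a Cookie object.
// Attribute names are lower-cased and stripped of punctuation, so "Max-Age"
// would arrive as the key "maxage" (and is ignored by the constructor).
var parsed = Cookie.fromString("sid=31d4d96e407aad42; Path=/; Domain=example.com; HttpOnly");

console.log(parsed.name);     // "sid"
console.log(parsed.domain);   // "example.com"
console.log(parsed.httponly); // true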
/*
    Public: Outputs the cookie as a string, in the form of a set-cookie header.

    includeHeader - Boolean value specifying whether to include the
                    'Set-Cookie: ' header name at the beginning of the
                    string.

    Examples

        var header = myCookie.toString(true);

    Returns the header string.
*/
Cookie.prototype.toString = function(includeHeader) {
    var string = "";

    if (includeHeader) {
        string = "Set-Cookie: ";
    }

    string += this.name + "=" + this.value + "; ";

    if (this.expires > 0) {
        string += "Expires=" + (new Date(this.expires)).toGMTString() + "; ";
    }

    if (this.path) {
        string += "Path=" + this.path + "; ";
    }

    if (this.domain) {
        string += "Domain=" + this.domain + "; ";
    }

    if (this.httponly) {
        string += "Httponly; ";
    }

    return string;
};
/*
    Public: Determines whether a cookie has expired or not.

    Examples

        if (myCookie.isExpired()) { ... }

    Returns a boolean value specifying whether the cookie has expired (true) or
    whether it is still valid (false.)
*/
Cookie.prototype.isExpired = function() {
    if (this.expires < 0) {
        return false;
    }
    return this.expires < Date.now();
};
/*
    Public: Determines whether a cookie matches a given domain.

    Examples

        if (myCookie.matchDomain("example.com")) { ... }

    Returns a boolean value specifying whether the cookie matches (true) or
    doesn't match (false.)
*/
Cookie.prototype.matchDomain = function(domain) {
    var reverseDomain = this.domain.split("").reverse().join(""),
        reverseDomainComp = domain.split("").reverse().join("");

    return reverseDomain.indexOf(reverseDomainComp) === 0;
};
/*
    Public: Determines whether a cookie matches a given path.

    Examples

        if (myCookie.matchPath("/test/account")) { ... }

    Returns a boolean value specifying whether the cookie matches (true) or
    doesn't match (false.)
*/
Cookie.prototype.matchPath = function(path) {
    if (!this.path) {
        return true;
    }

    return path.indexOf(this.path) === 0;
};
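
// Illustrative example: matchDomain() compares the two domains reversed, so it
// returns true when the cookie's domain ends with the domain being queried,
// while matchPath() is a simple prefix match.
var scoped = new Cookie("pref", "1", null, "/account", "www.example.com");

console.log(scoped.matchDomain("example.com"));   // true  ("www.example.com" ends with "example.com")
console.log(scoped.matchDomain("another.com"));   // false
console.log(scoped.matchPath("/account/orders")); // true  ("/account" is a prefix)
console.log(scoped.matchPath("/"));               // false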
lib/crawler.js
/*
 * Simplecrawler
 * https://github.com/cgiffard/node-simplecrawler
 *
 * Copyright (c) 2011-2015, Christopher Giffard
 *
 */

// Queue Dependency
var FetchQueue = require("./queue.js"),
    CookieJar = require("./cookies.js"),
    MetaInfo = require("../package.json");

var http = require("http"),
    https = require("https"),
    EventEmitter = require("events").EventEmitter,
    uri = require("urijs"),
    zlib = require("zlib"),
    util = require("util");
var QUEUE_ITEM_INITIAL_DEPTH = 1;

/*
    Public: Constructor for the crawler.

    host        - Initial hostname/domain to begin crawling from. By
                  default, the crawl will be locked to this hostname.
    initialPath - Initial path to begin crawling from.
    initialPort - Port to begin crawling from.
    interval    - Request interval for the crawler. Defaults to 250ms.

    Examples

        var crawler = new Crawler("example.com","/",80,500);

        var crawler = new Crawler("example.com");

    Returns the crawler object which has now been constructed.
*/
var Crawler = function(host, initialPath, initialPort, interval) {
    var crawler = this;

    // Data integrity checks
    if (initialPort && isNaN(initialPort)) {
        throw new Error("Port must be a number!");
    }

    // SETTINGS TO STUFF WITH
    // (not here! Do it when you create a `new Crawler()`)

    // Domain to crawl
    crawler.host = host || "";

    // Gotta start crawling *somewhere*
    crawler.initialPath = initialPath || "/";
    crawler.initialPort = initialPort || 80;
    crawler.initialProtocol = "http";

    // Internal 'tick' interval for spawning new requests
    // (as long as concurrency is under cap)
    // One request will be spooled per tick, up to the concurrency threshold.
    crawler.interval = interval || 250;

    // Maximum request concurrency. Be sensible. Five ties in with node's
    // default maxSockets value.
    crawler.maxConcurrency = 5;

    // Maximum time we'll wait for headers
    crawler.timeout = 5 * 60 * 1000;

    // Maximum time we'll wait for async listeners.
    crawler.listenerTTL = 10 * 1000;

    // User Agent
    crawler.userAgent =
        "Node/" + MetaInfo.name + " " + MetaInfo.version +
        " (" + MetaInfo.repository.url + ")";

    // Queue for requests - FetchQueue gives us stats and other sugar
    // (but it's basically just an array)
    crawler.queue = new FetchQueue();

    // Do we filter by domain?
    // Unless you want to be crawling the entire internet, I would
    // recommend leaving this on!
    crawler.filterByDomain = true;

    // Do we scan subdomains?
    crawler.scanSubdomains = false;

    // Treat WWW subdomain the same as the main domain (and don't count
    // it as a separate subdomain)
    crawler.ignoreWWWDomain = true;

    // Or go even further and strip WWW subdomain from domains altogether!
    crawler.stripWWWDomain = false;

    // Internal cachestore
    crawler.cache = null;

    // Use an HTTP Proxy?
    crawler.useProxy = false;
    crawler.proxyHostname = "127.0.0.1";
    crawler.proxyPort = 8123;
    crawler.proxyUser = null;
    crawler.proxyPass = null;

    // Support for HTTP basic auth
    crawler.needsAuth = false;
    crawler.authUser = "";
    crawler.authPass = "";

    // Support for retaining cookies for parse duration
    crawler.acceptCookies = true;
    crawler.cookies = new CookieJar();

    // Support for custom headers...
    crawler.customHeaders = {};

    // Domain Whitelist
    // We allow domains to be whitelisted, so cross-domain requests can be made.
    crawler.domainWhitelist = [];

    // Supported Protocols
    crawler.allowedProtocols = [
        /^http(s)?$/i,               // HTTP & HTTPS
        /^(rss|atom|feed)(\+xml)?$/i // RSS / XML
    ];

    // Max file size to download/store
    crawler.maxResourceSize = 1024 * 1024 * 16; // 16mb

    // Supported MIME-types
    // Matching MIME-types will be scanned for links
    crawler.supportedMimeTypes = [
        /^text\//i,
        /^application\/(rss|html|xhtml)?[\+\/\-]?xml/i,
        /^application\/javascript/i,
        /^xml/i
    ];

    // Download linked, but unsupported files (binary - images, documents, etc)
    crawler.downloadUnsupported = true;

    // URL Encoding setting...
    crawler.urlEncoding = "unicode";

    // Strip Querystring Parameters from URL
    crawler.stripQuerystring = false;

    // Regular expressions for finding URL items in HTML and text
    crawler.discoverRegex = [
        /\s?(?:href|src)\s?=\s?(["']).*?\1/ig,
        /\s?(?:href|src)\s?=\s?[^"'][^\s>]+/ig,
        /\s?url\((["']).*?\1\)/ig,
        /\s?url\([^"'].*?\)/ig,

        // This could easily duplicate matches above, e.g. in the case of
        // href="http://example.com"
        /http(s)?\:\/\/[^?\s><\'\"]+/ig,

        // This might be a bit of a gamble... but get hard-coded
        // strings out of javacript: URLs. They're often popup-image
        // or preview windows, which would otherwise be unavailable to us.
        // Worst case scenario is we make some junky requests.
        /^javascript\:[a-z0-9\$\_\.]+\(['"][^'"\s]+/ig
    ];

    // Whether to parse inside HTML comments
    crawler.parseHTMLComments = true;

    // Whether to parse inside script tags
    crawler.parseScriptTags = true;

    // Max depth parameter
    crawler.maxDepth = 0;

    // Whether to allow 'resources' greater than the max depth to be downloaded
    crawler.fetchWhitelistedMimeTypesBelowMaxDepth = false;

    // Ignore invalid SSL certificates
    crawler.ignoreInvalidSSL = false;

    // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
    var hiddenProps = {
        _openRequests: 0,
        _fetchConditions: [],
        _openListeners: 0
    };

    // Run the EventEmitter constructor
    EventEmitter.call(crawler);

    // Apply all the hidden props
    Object.keys(hiddenProps).forEach(function(key) {
        Object.defineProperty(crawler, key, {
            writable: true,
            enumerable: false,
            value: hiddenProps[key]
        });
    });
};

util.inherits(Crawler, EventEmitter);
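
// Usage sketch (illustrative only): construct a crawler and adjust a few of the
// settings above before starting. All of these properties are plain fields on
// the instance, so they can be overridden freely after construction.
var crawler = new Crawler("example.com", "/", 80, 500);

crawler.maxConcurrency = 2;      // stay well under the default socket pool
crawler.maxDepth = 3;            // 0 means "no depth limit"
crawler.stripQuerystring = true; // treat /page?a=1 and /page?a=2 as one resource
crawler.domainWhitelist = ["cdn.example.com"];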
/*
    Public: Starts or resumes the crawl. If the queue is empty, it adds a new
    queue item from which to begin crawling based on the initial configuration
    of the crawler itself. The crawler waits for process.nextTick to begin, so
    handlers and other properties can be altered or addressed before the crawl
    commences.

    Examples

        crawler.start();

    Returns the crawler object, to enable chaining.
*/
Crawler.prototype.start = function() {
    var crawler = this;

    // only if we haven't already got stuff in our queue...
    crawler.queue.getLength(function(err, length) {
        if (err) {
            throw err;
        }

        if (!length) {

            // Initialise our queue by pushing the initial request data into it...
            crawler.queue.add(
                crawler.initialProtocol,
                crawler.host,
                crawler.initialPort,
                crawler.initialPath,
                QUEUE_ITEM_INITIAL_DEPTH,
                function(error) {
                    if (error) {
                        throw error;
                    }
                });
        }

        crawler.crawlIntervalID =
            setInterval(
                function() {
                    crawler.crawl(crawler);
                },
                crawler.interval);

        crawler.emit("crawlstart");
        crawler.running = true;

        // Now kick off the initial crawl
        process.nextTick(function() {
            crawler.crawl();
        });
    });

    return crawler;
};
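
// Usage sketch (illustrative only): start() defers the first crawl to
// process.nextTick, so listeners attached immediately after calling it are
// still registered in time. The "fetchcomplete" event name is assumed from
// simplecrawler's documented API; only "crawlstart" is emitted in the code above.
crawler.start();

crawler.on("crawlstart", function() {
    console.log("crawl started against", crawler.host);
});

crawler.on("fetchcomplete", function(queueItem, responseBuffer) {
    console.log("fetched", queueItem.url, responseBuffer.length, "bytes");
});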
/*
    Public: Determines whether the protocol is supported, given a URL.

    URL - URL with a protocol, for testing.

    Examples

        crawler.protocolSupported("http://google.com/") // true, by default
        crawler.protocolSupported("wss://google.com/") // false, by default

    Returns a boolean, true if the protocol is supported - false if not.
*/
Crawler.prototype.protocolSupported = function(URL) {
    var protocol,
        crawler = this;

    try {
        protocol = uri(URL).protocol();

        // Unspecified protocol. Assume http
        if (!protocol) {
            protocol = "http";
        }

    } catch (e) {
        // If URIjs died, we definitely /do not/ support the protocol.
        return false;
    }

    return crawler.allowedProtocols.reduce(function(prev, protocolCheck) {
        return prev || !!protocolCheck.exec(protocol);
    }, false);
};
/*
    Public: Determines whether the mimetype is supported, given a mimetype

    MIMEType - String containing MIME type to test

    Examples

        crawler.mimeTypeSupported("text/html") // true, by default
        crawler.mimeTypeSupported("application/octet-stream") // false, by default

    Returns a boolean, true if the MIME type is supported — false if not.
*/
Crawler.prototype.mimeTypeSupported = function(MIMEType) {
    var crawler = this;

    return crawler.supportedMimeTypes.reduce(function(prev, mimeCheck) {
        return prev || !!mimeCheck.exec(MIMEType);
    }, false);
};
/* | ||
Public: Determines whether the queueItem can be fetched from its depth | ||
Public: Determines whether the queueItem can be fetched from its depth | ||
In fact, the queueItem needs to be fetched before calling this (because we | ||
need its MIME type). This will just determine if we need to send an event | ||
for this item & if we need to fetch linked resources. | ||
In fact, the queueItem needs to be fetched before calling this (because we | ||
need its MIME type). This will just determine if we need to send an event | ||
for this item & if we need to fetch linked resources. | ||
If the queue item is a CSS or JS file, it will always be fetched (we need | ||
all images in CSS files, even if max depth is already reached). If it's an | ||
HTML page, we will check if max depth is reached or not. | ||
If the queue item is a CSS or JS file, it will always be fetched (we need | ||
all images in CSS files, even if max depth is already reached). If it's an | ||
HTML page, we will check if max depth is reached or not. | ||
queueItem - Queue item object to check | ||
queueItem - Queue item object to check | ||
Returns a boolean, true if the queue item can be fetched - false if not. | ||
Returns a boolean, true if the queue item can be fetched - false if not. | ||
*/ | ||
Crawler.prototype.depthAllowed = function(queueItem) { | ||
var crawler = this; | ||
var crawler = this; | ||
// Items matching this pattern will always be fetched, even if max depth | ||
// is reached | ||
var mimeTypesWhitelist = [ | ||
/^text\/(css|javascript|ecmascript)/i, | ||
/^application\/javascript/i, | ||
/^application\/x-font/i, | ||
/^application\/font/i, | ||
/^image\//i, | ||
/^font\//i | ||
]; | ||
// Items matching this pattern will always be fetched, even if max depth | ||
// is reached | ||
var mimeTypesWhitelist = [ | ||
/^text\/(css|javascript|ecmascript)/i, | ||
/^application\/javascript/i, | ||
/^application\/x-font/i, | ||
/^application\/font/i, | ||
/^image\//i, | ||
/^font\//i | ||
]; | ||
return ( | ||
crawler.maxDepth === 0 || | ||
queueItem.depth <= crawler.maxDepth || | ||
( | ||
crawler.fetchWhitelistedMimeTypesBelowMaxDepth && | ||
mimeTypesWhitelist.reduce(function(prev,mimeCheck) { | ||
return prev || !!mimeCheck.exec(queueItem.stateData.contentType); | ||
}, false) | ||
) | ||
); | ||
return crawler.maxDepth === 0 || | ||
queueItem.depth <= crawler.maxDepth || | ||
crawler.fetchWhitelistedMimeTypesBelowMaxDepth && | ||
mimeTypesWhitelist.reduce(function(prev, mimeCheck) { | ||
return prev || !!mimeCheck.exec(queueItem.stateData.contentType); | ||
}, false); | ||
}; | ||
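Because `depthAllowed` only reads `maxDepth`, `fetchWhitelistedMimeTypesBelowMaxDepth` and the item's `depth` and `stateData.contentType`, it can be sketched against a hand-built queue item (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.maxDepth = 2;
crawler.fetchWhitelistedMimeTypesBelowMaxDepth = true;

// An HTML page past maxDepth is rejected...
crawler.depthAllowed({ depth: 5, stateData: { contentType: "text/html" } });  // false
// ...but CSS at the same depth still passes, because it matches the
// whitelist of always-fetched MIME types above.
crawler.depthAllowed({ depth: 5, stateData: { contentType: "text/css" } });   // true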
/* | ||
Public: Extracts protocol, host, port and resource (path) given a URL string. | ||
Public: Extracts protocol, host, port and resource (path) given a URL string. | ||
URL - String containing URL to process | ||
URL - String containing URL to process | ||
Examples | ||
Examples | ||
var URLInfo = crawler.processURL("http://www.google.com/fish"); | ||
var URLInfo = crawler.processURL("http://www.google.com/fish"); | ||
Returns an object containing keys and values for "protocol", "host", "port", | ||
and "path". | ||
Returns an object containing keys and values for "protocol", "host", "port", | ||
and "path". | ||
*/ | ||
Crawler.prototype.processURL = function(URL,context) { | ||
var newURL, crawler = this; | ||
Crawler.prototype.processURL = function(URL, context) { | ||
var newURL, | ||
crawler = this; | ||
if (!context || typeof(context) !== "object") | ||
context = { | ||
url: ( | ||
crawler.initialProtocol + "://" + | ||
crawler.host + ":" + | ||
crawler.initialPort + "/" | ||
), | ||
depth: QUEUE_ITEM_INITIAL_DEPTH | ||
}; | ||
if (!context || typeof context !== "object") { | ||
context = { | ||
url: crawler.initialProtocol + "://" + | ||
crawler.host + ":" + | ||
crawler.initialPort + "/", | ||
depth: QUEUE_ITEM_INITIAL_DEPTH | ||
}; | ||
} | ||
// If the URL didn't contain anything, don't fetch it. | ||
if (!URL.replace(/\s+/ig,"").length) return false; | ||
// If the URL didn't contain anything, don't fetch it. | ||
if (!(URL && URL.replace(/\s+/ig, "").length)) { | ||
return false; | ||
} | ||
// Check if querystring should be ignored | ||
if (crawler.stripQuerystring === true) | ||
URL = crawler.removeQuerystring(URL); | ||
// Check if querystring should be ignored | ||
if (crawler.stripQuerystring === true) { | ||
URL = crawler.removeQuerystring(URL); | ||
} | ||
try { | ||
newURL = | ||
URI(URL) | ||
.absoluteTo(context.url) | ||
.normalize(); | ||
if (crawler.stripWWWDomain && URL.match(/https?\:\/\/(www\.).*/i)) { | ||
URL = URL.replace("www.", ""); | ||
} | ||
if (crawler.urlEncoding === "iso8859") { | ||
newURL = newURL.iso8859(); | ||
} | ||
try { | ||
newURL = | ||
uri(URL) | ||
.absoluteTo(context.url) | ||
.normalize(); | ||
} catch(e) { | ||
// Couldn't process the URL, since URIjs choked on it. | ||
return false; | ||
} | ||
if (crawler.urlEncoding === "iso8859") { | ||
newURL = newURL.iso8859(); | ||
} | ||
// simplecrawler uses slightly different terminology to URIjs. Sorry! | ||
return { | ||
"protocol": newURL.protocol() || "http", | ||
"host": newURL.hostname(), | ||
"port": newURL.port() || 80, | ||
"path": newURL.resource(), | ||
"uriPath": newURL.path(), | ||
"depth": context.depth + 1 | ||
}; | ||
} catch (e) { | ||
// Couldn't process the URL, since URIjs choked on it. | ||
return false; | ||
} | ||
// simplecrawler uses slightly different terminology to URIjs. Sorry! | ||
return { | ||
protocol: newURL.protocol() || "http", | ||
host: newURL.hostname(), | ||
port: newURL.port() || 80, | ||
path: newURL.resource(), | ||
uriPath: newURL.path(), | ||
depth: context.depth + 1 | ||
}; | ||
}; | ||
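A sketch of the shape `processURL` returns, resolving a relative link against a hand-built context object (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

var parsed = crawler.processURL("/fish?type=trout",
    { url: "http://example.com/", depth: 1 });

// Expected result, per the mapping above:
// { protocol: "http", host: "example.com", port: 80,
//   path: "/fish?type=trout", uriPath: "/fish", depth: 2 }

crawler.processURL("   ");   // false - blank URLs are rejected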
/* | ||
Public: Discovers linked resources in an HTML, XML or text document. | ||
Private: Perform string replace operations on a URL string. Eg. removes | ||
HTML attribute fluff around actual URL, replaces leading "//" with | ||
absolute protocol etc. | ||
resourceData - String containing document with linked resources. | ||
queueItem - Queue item corresponding to document being searched. | ||
queueItem - Queue item corresponding to where the resource was found | ||
URL - String to be cleaned up | ||
Examples | ||
Examples | ||
crawler.discoverResources("http://www.google.com") | ||
crawler.discoverResources("<a href='...'>test</a>") | ||
cleanURL({protocol: "http"}, "url('//example.com/about') ") | ||
Returns an array of the (string) resource URLs found in the document. If none | ||
were found, the array will be empty. | ||
Returns a string. | ||
*/ | ||
function cleanURL (queueItem, URL) { | ||
return URL | ||
.replace(/^(?:\s*href|\s*src)\s*=+\s*/i, "") | ||
.replace(/^\s*/, "") | ||
.replace(/^url\((.*)\)/i, "$1") | ||
.replace(/^javascript\:\s*[a-z0-9]+\((.*)/i, "$1") | ||
.replace(/^(['"])(.*)\1$/, "$2") | ||
.replace(/^\((.*)\)$/, "$1") | ||
.replace(/^\/\//, queueItem.protocol + "://") | ||
.replace(/\&amp;/gi, "&") | ||
.replace(/\&#38;/gi, "&") | ||
.replace(/\&#x00026;/gi, "&") | ||
.split("#") | ||
.shift() | ||
.trim(); | ||
} | ||
/* | ||
Public: Clean up a list of resources (normally provided by discoverResources). | ||
Also expands URL's that are relative to the current page. | ||
urlMatch - Array of string resources | ||
queueItem - Queue item corresponding to where the resources were retrieved from | ||
Examples | ||
crawler.cleanExpandResources(["http://www.google.com", "/about", "mailto: example@example.com"]) | ||
Returns an array of URL strings. | ||
*/ | ||
Crawler.prototype.discoverResources = function(resourceData,queueItem) { | ||
// Convert to UTF-8 | ||
// TODO: account for text-encoding. | ||
var resources = [], | ||
resourceText = resourceData.toString("utf8"), | ||
crawler = this; | ||
Crawler.prototype.cleanExpandResources = function (urlMatch, queueItem) { | ||
var crawler = this, | ||
resources = []; | ||
if (!queueItem) | ||
queueItem = {}; | ||
if (!urlMatch) { | ||
return []; | ||
} | ||
if (!queueItem.protocol) | ||
queueItem.protocol = "http"; | ||
return urlMatch | ||
.map(cleanURL.bind(this, queueItem)) | ||
.reduce(function(list, URL) { | ||
if (!crawler.parseHTMLComments) { | ||
resourceText = resourceText.replace(/<!--([\s\S]+?)-->/g, ""); | ||
} | ||
// Ensure URL is whole and complete | ||
try { | ||
URL = uri(URL) | ||
.absoluteTo(queueItem.url || "") | ||
.normalize() | ||
.toString(); | ||
} catch (e) { | ||
// But if URI.js couldn't parse it - nobody can! | ||
return list; | ||
} | ||
if (!crawler.parseScriptTags) { | ||
resourceText = resourceText.replace(/<script(.*?)>([\s\S]+?)<\/script>/gi, ""); | ||
} | ||
// If we hit an empty item, don't return it | ||
if (!URL.length) { | ||
return list; | ||
} | ||
function cleanURL(URL) { | ||
return URL | ||
.replace(/^(?:\s*href|\s*src)\s*=+\s*/i,"") | ||
.replace(/^\s*/,"") | ||
.replace(/^url\((.*)\)/i,"$1") | ||
.replace(/^javascript\:\s*[a-z0-9]+\((.*)/i,"$1") | ||
.replace(/^(['"])(.*)\1$/,"$2") | ||
.replace(/^\((.*)\)$/,"$1") | ||
.replace(/^\/\//, queueItem.protocol + "://") | ||
.replace(/\&amp;/gi,"&") | ||
.replace(/\&#38;/gi,"&") | ||
.replace(/\&#x00026;/gi,"&") | ||
.split("#") | ||
.shift(); | ||
} | ||
// If we don't support the protocol in question | ||
if (!crawler.protocolSupported(URL)) { | ||
return list; | ||
} | ||
// Clean links | ||
function cleanAndQueue(urlMatch) { | ||
if (!urlMatch) return []; | ||
// Does the item already exist in the list? | ||
if (resources.reduce(function(prev, current) { | ||
return prev || current === URL; | ||
}, false)) { | ||
return list; | ||
} | ||
return urlMatch | ||
.map(cleanURL) | ||
.reduce(function(list,URL) { | ||
var tmpURL; | ||
return list.concat(URL); | ||
}, []); | ||
}; | ||
// Ensure URL is whole and complete | ||
try { | ||
tmpURL = URI(URL); | ||
/* | ||
Public: Discovers linked resources in an HTML, XML or text document. | ||
if (queueItem.url) { | ||
URL = tmpURL | ||
.absoluteTo(queueItem.url) | ||
.normalize() | ||
.toString(); | ||
} else { | ||
URL = tmpURL | ||
.normalize() | ||
.toString(); | ||
} | ||
resourceData - String containing document with linked resources. | ||
queueItem - Queue item corresponding to document being searched. | ||
} catch(e) { | ||
// But if URI.js couldn't parse it - nobody can! | ||
return list; | ||
} | ||
Examples | ||
// If we hit an empty item, don't return it | ||
if (!URL.length) return list; | ||
crawler.discoverResources("http://www.google.com") | ||
crawler.discoverResources("<a href='...'>test</a>") | ||
// If we don't support the protocol in question | ||
if (!crawler.protocolSupported(URL)) return list; | ||
Returns an array of the (string) resource URLs found in the document. If none | ||
were found, the array will be empty. | ||
// Does the item already exist in the list? | ||
if (resources.reduce(function(prev,current) { | ||
return prev || current === URL; | ||
},false)) | ||
return list; | ||
*/ | ||
Crawler.prototype.discoverResources = function(resourceData, queueItem) { | ||
// Convert to UTF-8 | ||
// TODO: account for text-encoding. | ||
var resourceText = resourceData.toString("utf8"), | ||
crawler = this; | ||
return list.concat(URL); | ||
},[]); | ||
} | ||
if (!queueItem) { | ||
queueItem = {}; | ||
} | ||
// Rough scan for URLs | ||
return crawler.discoverRegex | ||
.reduce(function(list,regex) { | ||
return list.concat( | ||
cleanAndQueue( | ||
resourceText.match(regex))); | ||
},[]) | ||
.reduce(function(list,check) { | ||
if (list.indexOf(check) < 0) | ||
return list.concat([check]); | ||
if (!queueItem.protocol) { | ||
queueItem.protocol = "http"; | ||
} | ||
return list; | ||
},[]); | ||
if (!crawler.parseHTMLComments) { | ||
resourceText = resourceText.replace(/<!--([\s\S]+?)-->/g, ""); | ||
} | ||
if (!crawler.parseScriptTags) { | ||
resourceText = resourceText.replace(/<script(.*?)>([\s\S]*?)<\/script>/gi, ""); | ||
} | ||
// Rough scan for URLs | ||
return crawler.discoverRegex | ||
.reduce(function(list, regex) { | ||
return list.concat( | ||
crawler.cleanExpandResources( | ||
resourceText.match(regex), queueItem)); | ||
}, []) | ||
.reduce(function(list, check) { | ||
if (list.indexOf(check) < 0) { | ||
return list.concat([check]); | ||
} | ||
return list; | ||
}, []); | ||
}; | ||
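Taken together: `discoverResources` does the rough regex scan, and `cleanExpandResources`/`cleanURL` strip attribute fluff, absolutise and de-duplicate what it finds. A sketch with a hand-built queue item; exactly which matches come back depends on the crawler's `discoverRegex` patterns, which are not part of this hunk.

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

var page = { url: "http://example.com/docs/", protocol: "http" };

var found = crawler.discoverResources(
    "<a href='/about'>About</a> <img src=\"//cdn.example.com/logo.png\">",
    page);

// "found" should include the relative link resolved against page.url and the
// protocol-relative URL given page.protocol, with duplicates removed, e.g.:
// ["http://example.com/about", "http://cdn.example.com/logo.png"]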
/* | ||
Public: Determines based on crawler state whether a domain is valid for | ||
crawling. | ||
Public: Determines based on crawler state whether a domain is valid for | ||
crawling. | ||
host - String containing the hostname of the resource to be fetched. | ||
host - String containing the hostname of the resource to be fetched. | ||
Examples | ||
Examples | ||
crawler.domainValid("127.0.0.1"); | ||
crawler.domainValid("google.com"); | ||
crawler.domainValid("test.example.com"); | ||
crawler.domainValid("127.0.0.1"); | ||
crawler.domainValid("google.com"); | ||
crawler.domainValid("test.example.com"); | ||
Returns true if the domain is valid for crawling, false if not. | ||
Returns true if the domain is valid for crawling, false if not. | ||
*/ | ||
Crawler.prototype.domainValid = function(host) { | ||
var crawler = this, | ||
crawlerHost = crawler.host; | ||
var crawler = this; | ||
// If we're ignoring the WWW domain, remove the WWW for comparisons... | ||
if (crawler.ignoreWWWDomain) | ||
host = host.replace(/^www\./i,""); | ||
// If we're ignoring the WWW domain, remove the WWW for comparisons... | ||
if (crawler.ignoreWWWDomain) { | ||
host = host.replace(/^www\./i, ""); | ||
} | ||
function domainInWhitelist(host) { | ||
function domainInWhitelist(host) { | ||
// If there's no whitelist, or the whitelist is of zero length, | ||
// just return false. | ||
if (!crawler.domainWhitelist || | ||
!crawler.domainWhitelist.length) return false; | ||
// If there's no whitelist, or the whitelist is of zero length, | ||
// just return false. | ||
if (!crawler.domainWhitelist || !crawler.domainWhitelist.length) { | ||
return false; | ||
} | ||
// Otherwise, scan through it. | ||
return !!crawler.domainWhitelist.reduce(function(prev,cur,index,array) { | ||
// Otherwise, scan through it. | ||
return !!crawler.domainWhitelist.reduce(function(prev, cur) { | ||
// If we already located the relevant domain in the whitelist... | ||
if (prev) return prev; | ||
// If we already located the relevant domain in the whitelist... | ||
if (prev) { | ||
return prev; | ||
} | ||
// If the domain is just equal, return true. | ||
if (host === cur) return true; | ||
// If the domain is just equal, return true. | ||
if (host === cur) { | ||
return true; | ||
} | ||
// If we're ignoring WWW subdomains, and both domains, | ||
// less www. are the same, return true. | ||
if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i,"")) | ||
return true; | ||
// If we're ignoring WWW subdomains, and both domains, | ||
// less www. are the same, return true. | ||
if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i, "")) { | ||
return true; | ||
} | ||
// Otherwise, sorry. No dice. | ||
return false; | ||
},false); | ||
} | ||
// Otherwise, sorry. No dice. | ||
return false; | ||
}, false); | ||
} | ||
// Checks if the first domain is a subdomain of the second | ||
function isSubdomainOf(subdomain,host) { | ||
// Checks if the first domain is a subdomain of the second | ||
function isSubdomainOf(subdomain, host) { | ||
// Comparisons must be case-insensitive | ||
subdomain = subdomain.toLowerCase(); | ||
host = host.toLowerCase(); | ||
// Comparisons must be case-insensitive | ||
subdomain = subdomain.toLowerCase(); | ||
host = host.toLowerCase(); | ||
// If we're ignoring www, remove it from both | ||
// (if www is the first domain component...) | ||
if (crawler.ignoreWWWDomain) { | ||
subdomain = subdomain.replace(/^www./ig, ""); | ||
host = host.replace(/^www./ig, ""); | ||
} | ||
// If we're ignoring www, remove it from both | ||
// (if www is the first domain component...) | ||
if (crawler.ignoreWWWDomain) { | ||
subdomain = subdomain.replace(/^www./ig, ""); | ||
host = host.replace(/^www./ig, ""); | ||
} | ||
// They should be the same flipped around! | ||
return ( | ||
subdomain.split("").reverse().join("").substr(0,host.length) === | ||
host.split("").reverse().join("")); | ||
} | ||
// They should be the same flipped around! | ||
return subdomain.split("").reverse().join("").substr(0, host.length) === | ||
host.split("").reverse().join(""); | ||
} | ||
// If we're not filtering by domain, just return true. | ||
return (!crawler.filterByDomain || | ||
// Or if the domain is just the right one, return true. | ||
(host === crawler.host) || | ||
// Or if we're ignoring WWW subdomains, and both domains, | ||
// less www. are the same, return true. | ||
( | ||
crawler.ignoreWWWDomain && | ||
crawler.host.replace(/^www\./i,"") === | ||
host.replace(/^www\./i,"") | ||
) || | ||
// Or if the domain in question exists in the domain whitelist, | ||
// return true. | ||
domainInWhitelist(host) || | ||
// Or if we're scanning subdomains, and this domain is a subdomain | ||
// of the crawler's set domain, return true. | ||
(crawler.scanSubdomains && isSubdomainOf(host,crawler.host))); | ||
// If we're not filtering by domain, just return true. | ||
return !crawler.filterByDomain || | ||
// Or if the domain is just the right one, return true. | ||
host === crawler.host || | ||
// Or if we're ignoring WWW subdomains, and both domains, | ||
// less www. are the same, return true. | ||
crawler.ignoreWWWDomain && | ||
crawler.host.replace(/^www\./i, "") === | ||
host.replace(/^www\./i, "") || | ||
// Or if the domain in question exists in the domain whitelist, | ||
// return true. | ||
domainInWhitelist(host) || | ||
// Or if we're scanning subdomains, and this domain is a subdomain | ||
// of the crawler's set domain, return true. | ||
crawler.scanSubdomains && isSubdomainOf(host, crawler.host); | ||
}; | ||
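A sketch of how the host checks combine; the flags are set explicitly here rather than relying on defaults, and the constructor is assumed as above.

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.filterByDomain = true;
crawler.ignoreWWWDomain = true;

crawler.domainValid("example.com");        // true - the crawler's own host
crawler.domainValid("www.example.com");    // true - www. is ignored for comparison
crawler.domainValid("other.org");          // false - not whitelisted, not a subdomain

crawler.domainWhitelist = ["other.org"];
crawler.domainValid("other.org");          // now true

crawler.scanSubdomains = true;
crawler.domainValid("blog.example.com");   // true - subdomain of crawler.host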
/* | ||
Public: Given a text or HTML document, initiates discovery of linked | ||
resources in the text, and queues the resources if applicable. Emits | ||
"discoverycomplete". Not to be confused with `crawler.discoverResources`, | ||
despite the `discoverResources` function being the main component of this | ||
one, since this function queues the resources in addition to | ||
discovering them. | ||
Public: Given a text or HTML document, initiates discovery of linked | ||
resources in the text, and queues the resources if applicable. Emits | ||
"discoverycomplete". Not to be confused with `crawler.discoverResources`, | ||
despite the `discoverResources` function being the main component of this | ||
one, since this function queues the resources in addition to | ||
discovering them. | ||
resourceData - Text document containing linked resource URLs. | ||
queueItem - Queue item from which the resource document was derived. | ||
decompressed - Content is already decompressed (default: false) | ||
resourceData - Text document containing linked resource URLs. | ||
queueItem - Queue item from which the resource document was derived. | ||
decompressed - Content is already decompressed (default: false) | ||
Emits | ||
Emits | ||
gziperr | ||
discoverycomplete | ||
gziperr | ||
discoverycomplete | ||
Examples | ||
Examples | ||
crawler.queueLinkedItems("<a href='...'>test</a>",queueItem); | ||
crawler.queueLinkedItems("<a href='...'>test</a>",queueItem); | ||
Returns the crawler object for chaining. | ||
Returns the crawler object for chaining. | ||
*/ | ||
Crawler.prototype.queueLinkedItems = function(resourceData,queueItem,decompressed) { | ||
var crawler = this, | ||
resources = []; | ||
Crawler.prototype.queueLinkedItems = function(resourceData, queueItem, decompressed) { | ||
var crawler = this, | ||
resources = []; | ||
if (!decompressed && | ||
queueItem.stateData && | ||
queueItem.stateData.headers['content-encoding'] && ( | ||
queueItem.stateData.headers['content-encoding'].match(/gzip/) || | ||
queueItem.stateData.headers['content-encoding'].match(/deflate/))) { | ||
if (!decompressed && | ||
queueItem.stateData && | ||
queueItem.stateData.headers["content-encoding"] && ( | ||
queueItem.stateData.headers["content-encoding"].match(/gzip/) || | ||
queueItem.stateData.headers["content-encoding"].match(/deflate/))) { | ||
return zlib.unzip(resourceData,function(err,newData) { | ||
if (err) { | ||
return crawler.emit("gziperror", queueItem, err, resourceData); | ||
} | ||
return zlib.unzip(resourceData, function(err, newData) { | ||
if (err) { | ||
return crawler.emit("gziperror", queueItem, err, resourceData); | ||
} | ||
crawler.queueLinkedItems(newData,queueItem,true); | ||
}); | ||
} | ||
crawler.queueLinkedItems(newData, queueItem, true); | ||
}); | ||
} | ||
resources = crawler.discoverResources(resourceData,queueItem); | ||
resources = crawler.discoverResources(resourceData, queueItem); | ||
// Emit discovered resources. ie: might be useful in building a graph of | ||
// page relationships. | ||
crawler.emit("discoverycomplete",queueItem,resources); | ||
// Emit discovered resources. ie: might be useful in building a graph of | ||
// page relationships. | ||
crawler.emit("discoverycomplete", queueItem, resources); | ||
resources.forEach(function(url){ crawler.queueURL(url,queueItem); }); | ||
resources.forEach(function(url) { | ||
crawler.queueURL(url, queueItem); | ||
}); | ||
return crawler; | ||
return crawler; | ||
}; | ||
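Since `queueLinkedItems` both discovers and queues, the useful hook is the `discoverycomplete` event it emits; a small sketch (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.on("discoverycomplete", function(queueItem, resources) {
    // "resources" is the de-duplicated URL list from discoverResources;
    // each entry is then passed to queueURL against this queueItem.
    console.log("Found %d links on %s", resources.length, queueItem.url);
});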
/* | ||
Public: Given a single URL, this function cleans, validates, parses it and | ||
adds it to the queue. This is the best and simplest way to add an item to | ||
the queue. | ||
Public: Given a single URL, this function cleans, validates, parses it and | ||
adds it to the queue. This is the best and simplest way to add an item to | ||
the queue. | ||
url - URL to be queued. | ||
queueItem - Queue item from which the resource was linked. | ||
url - URL to be queued. | ||
queueItem - Queue item from which the resource was linked. | ||
Emits | ||
Emits | ||
queueduplicate | ||
queueerror | ||
queueadd | ||
queueduplicate | ||
queueerror | ||
queueadd | ||
Examples | ||
Examples | ||
crawler.queueURL("http://www.google.com/",queueItem); | ||
crawler.queueURL("http://www.google.com/",queueItem); | ||
Returns a boolean value indicating whether the URL was successfully queued | ||
or not. | ||
Returns a boolean value indicating whether the URL was successfully queued | ||
or not. | ||
*/ | ||
Crawler.prototype.queueURL = function(url,queueItem) { | ||
var crawler = this; | ||
var parsedURL = | ||
typeof(url) === "object" ? url : crawler.processURL(url,queueItem); | ||
Crawler.prototype.queueURL = function(url, queueItem) { | ||
var crawler = this, | ||
parsedURL = typeof url === "object" ? url : crawler.processURL(url, queueItem); | ||
// URL Parser decided this URL was junky. Next please! | ||
if (!parsedURL) { | ||
return false; | ||
} | ||
// URL Parser decided this URL was junky. Next please! | ||
if (!parsedURL) { | ||
return false; | ||
} | ||
// Pass this URL past fetch conditions to ensure the user thinks it's valid | ||
var fetchDenied = false; | ||
fetchDenied = crawler._fetchConditions.reduce(function(prev,callback) { | ||
return prev || !callback(parsedURL); | ||
},false); | ||
// Pass this URL past fetch conditions to ensure the user thinks it's valid | ||
var fetchDenied = false; | ||
fetchDenied = crawler._fetchConditions.reduce(function(prev, callback) { | ||
return prev || !callback(parsedURL); | ||
}, false); | ||
if (fetchDenied) { | ||
// Fetch Conditions conspired to block URL | ||
return false; | ||
} | ||
if (fetchDenied) { | ||
// Fetch Conditions conspired to block URL | ||
return false; | ||
} | ||
// Check the domain is valid before adding it to the queue | ||
if (crawler.domainValid(parsedURL.host)) { | ||
crawler.queue.add( | ||
parsedURL.protocol, | ||
parsedURL.host, | ||
parsedURL.port, | ||
parsedURL.path, | ||
parsedURL.depth, | ||
function queueAddCallback(error,newQueueItem) { | ||
if (error) { | ||
// We received an error condition when adding the callback | ||
if (error.code && error.code === "DUP") | ||
return crawler.emit("queueduplicate",parsedURL); | ||
// Check the domain is valid before adding it to the queue | ||
if (crawler.domainValid(parsedURL.host)) { | ||
crawler.queue.add( | ||
parsedURL.protocol, | ||
parsedURL.host, | ||
parsedURL.port, | ||
parsedURL.path, | ||
parsedURL.depth, | ||
function queueAddCallback(error, newQueueItem) { | ||
if (error) { | ||
// We received an error condition when adding the callback | ||
if (error.code && error.code === "DUP") { | ||
return crawler.emit("queueduplicate", parsedURL); | ||
} | ||
return crawler.emit("queueerror",error,parsedURL); | ||
} | ||
return crawler.emit("queueerror", error, parsedURL); | ||
} | ||
crawler.emit("queueadd",newQueueItem,parsedURL); | ||
newQueueItem.referrer = queueItem ? queueItem.url : null; | ||
} | ||
); | ||
} | ||
crawler.emit("queueadd", newQueueItem, parsedURL); | ||
newQueueItem.referrer = queueItem ? queueItem.url : null; | ||
} | ||
); | ||
} | ||
return true; | ||
return true; | ||
}; | ||
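A sketch of queueing a URL directly and watching the related events (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.on("queueadd", function(newQueueItem) {
    console.log("Queued:", newQueueItem.url);
});
crawler.on("queueduplicate", function(parsedURL) {
    console.log("Already queued:", parsedURL.path);
});

crawler.queueURL("http://example.com/about");   // true - parsed, allowed and added
crawler.queueURL("http://example.com/about");   // true again, but emits queueduplicate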
/* | ||
Public: The guts of the crawler: takes a queue item and spools a request for | ||
it, downloads, caches, and fires events based on the result of the request. | ||
It kicks off resource discovery and queues any new resources found. | ||
Public: The guts of the crawler: takes a queue item and spools a request for | ||
it, downloads, caches, and fires events based on the result of the request. | ||
It kicks off resource discovery and queues any new resources found. | ||
queueItem - Queue item to be fetched. | ||
queueItem - Queue item to be fetched. | ||
Emits | ||
fetchstart | ||
fetchheaders | ||
fetchcomplete | ||
fetchdataerror | ||
notmodified | ||
fetchredirect | ||
fetch404 | ||
fetcherror | ||
fetchclienterror | ||
Emits | ||
fetchstart | ||
fetchheaders | ||
fetchcomplete | ||
fetchdataerror | ||
notmodified | ||
fetchredirect | ||
fetch404 | ||
fetcherror | ||
fetchclienterror | ||
Examples | ||
Examples | ||
crawler.fetchQueueItem(queueItem); | ||
crawler.fetchQueueItem(queueItem); | ||
Returns the crawler object for chaining. | ||
Returns the crawler object for chaining. | ||
*/ | ||
Crawler.prototype.fetchQueueItem = function(queueItem) { | ||
var crawler = this; | ||
crawler._openRequests ++; | ||
var crawler = this; | ||
crawler._openRequests++; | ||
// Variable declarations | ||
var fetchData = false, | ||
requestOptions, | ||
clientRequest, | ||
timeCommenced; | ||
// Variable declarations | ||
var requestOptions, | ||
clientRequest, | ||
timeCommenced; | ||
// Mark as spooled | ||
queueItem.status = "spooled"; | ||
var client = (queueItem.protocol === "https" ? https : http); | ||
// Mark as spooled | ||
queueItem.status = "spooled"; | ||
var client = queueItem.protocol === "https" ? https : http; | ||
// Up the socket limit if required. | ||
if (client.globalAgent.maxSockets < crawler.maxConcurrency) { | ||
client.globalAgent.maxSockets = crawler.maxConcurrency; | ||
} | ||
// Up the socket limit if required. | ||
if (client.globalAgent.maxSockets < crawler.maxConcurrency) { | ||
client.globalAgent.maxSockets = crawler.maxConcurrency; | ||
} | ||
// Extract request options from queue; | ||
var requestHost = queueItem.host, | ||
requestPort = queueItem.port, | ||
requestPath = queueItem.path; | ||
// Extract request options from queue; | ||
var requestHost = queueItem.host, | ||
requestPort = queueItem.port, | ||
requestPath = queueItem.path; | ||
// Are we passing through an HTTP proxy? | ||
if (crawler.useProxy) { | ||
requestHost = crawler.proxyHostname; | ||
requestPort = crawler.proxyPort; | ||
requestPath = queueItem.url; | ||
} | ||
// Are we passing through an HTTP proxy? | ||
if (crawler.useProxy) { | ||
requestHost = crawler.proxyHostname; | ||
requestPort = crawler.proxyPort; | ||
requestPath = queueItem.url; | ||
} | ||
// Load in request options | ||
requestOptions = { | ||
method: "GET", | ||
host: requestHost, | ||
port: requestPort, | ||
path: requestPath, | ||
headers: { | ||
"User-Agent": crawler.userAgent, | ||
"Host": queueItem.host + ( | ||
queueItem.port !== 80 ? | ||
":" + queueItem.port : | ||
"" | ||
) | ||
} | ||
}; | ||
// Load in request options | ||
requestOptions = { | ||
method: "GET", | ||
host: requestHost, | ||
port: requestPort, | ||
path: requestPath, | ||
headers: { | ||
"User-Agent": crawler.userAgent, | ||
"Host": queueItem.host + ( | ||
queueItem.port !== 80 ? | ||
":" + queueItem.port : | ||
"" | ||
) | ||
} | ||
}; | ||
if (queueItem.referrer) { | ||
requestOptions.headers.Referer = queueItem.referrer; | ||
} | ||
if (queueItem.referrer) { | ||
requestOptions.headers.Referer = queueItem.referrer; | ||
} | ||
// If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts | ||
if (requestOptions.port === 80 || requestOptions.port === 443) { | ||
delete requestOptions.port; | ||
} | ||
// If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts | ||
if (requestOptions.port === 80 || requestOptions.port === 443) { | ||
delete requestOptions.port; | ||
} | ||
// Add cookie header from cookie jar if we're configured to | ||
// send/accept cookies | ||
if (crawler.acceptCookies && crawler.cookies.getAsHeader()) { | ||
requestOptions.headers.cookie = | ||
crawler.cookies.getAsHeader(queueItem.host,queueItem.path); | ||
} | ||
// Add cookie header from cookie jar if we're configured to | ||
// send/accept cookies | ||
if (crawler.acceptCookies && crawler.cookies.getAsHeader()) { | ||
requestOptions.headers.cookie = | ||
crawler.cookies.getAsHeader(queueItem.host, queueItem.path); | ||
} | ||
// Add auth headers if we need them | ||
if (crawler.needsAuth) { | ||
var auth = crawler.authUser + ":" + crawler.authPass; | ||
// Add auth headers if we need them | ||
if (crawler.needsAuth) { | ||
var auth = crawler.authUser + ":" + crawler.authPass; | ||
// Generate auth header | ||
auth = 'Basic ' + (new Buffer(auth).toString('base64')); | ||
requestOptions.headers.Authorization = auth; | ||
} | ||
// Generate auth header | ||
auth = "Basic " + new Buffer(auth).toString("base64"); | ||
requestOptions.headers.Authorization = auth; | ||
} | ||
// Add proxy auth if we need it | ||
if (crawler.proxyUser !== null && crawler.proxyPass !== null) { | ||
var proxyAuth = crawler.proxyUser + ":" + crawler.proxyPass; | ||
// Add proxy auth if we need it | ||
if (crawler.proxyUser !== null && crawler.proxyPass !== null) { | ||
var proxyAuth = crawler.proxyUser + ":" + crawler.proxyPass; | ||
// Generate auth header | ||
proxyAuth = 'Basic ' + (new Buffer(proxyAuth).toString('base64')); | ||
requestOptions.headers["Proxy-Authorization"] = proxyAuth; | ||
} | ||
// Generate auth header | ||
proxyAuth = "Basic " + new Buffer(proxyAuth).toString("base64"); | ||
requestOptions.headers["Proxy-Authorization"] = proxyAuth; | ||
} | ||
// And if we've got any custom headers available | ||
if (crawler.customHeaders) { | ||
for (var header in crawler.customHeaders) { | ||
if (!crawler.customHeaders.hasOwnProperty(header)) continue; | ||
// And if we've got any custom headers available | ||
if (crawler.customHeaders) { | ||
for (var header in crawler.customHeaders) { | ||
if (!crawler.customHeaders.hasOwnProperty(header)) { | ||
continue; | ||
} | ||
requestOptions.headers[header] = crawler.customHeaders[header]; | ||
} | ||
} | ||
requestOptions.headers[header] = crawler.customHeaders[header]; | ||
} | ||
} | ||
// Apply the ignoreInvalidSSL setting to https connections | ||
if(client === https && crawler.ignoreInvalidSSL === true) { | ||
client.rejectUnauthorized = false; | ||
client.strictSSL = false; | ||
} | ||
// Apply the ignoreInvalidSSL setting to https connections | ||
if (client === https && crawler.ignoreInvalidSSL === true) { | ||
client.rejectUnauthorized = false; | ||
client.strictSSL = false; | ||
} | ||
// Emit fetchstart event - gives the user time to mangle the request options | ||
// if required. | ||
crawler.emit("fetchstart", queueItem, requestOptions); | ||
// Emit fetchstart event - gives the user time to mangle the request options | ||
// if required. | ||
crawler.emit("fetchstart", queueItem, requestOptions); | ||
process.nextTick(function() { | ||
// Record what time we started this request | ||
timeCommenced = Date.now(); | ||
process.nextTick(function() { | ||
// Record what time we started this request | ||
timeCommenced = Date.now(); | ||
// Get the resource! | ||
clientRequest = | ||
client.request(requestOptions,function(response) { | ||
crawler.handleResponse(queueItem,response,timeCommenced); | ||
}); | ||
// Get the resource! | ||
clientRequest = | ||
client.request(requestOptions, function(response) { | ||
crawler.handleResponse(queueItem, response, timeCommenced); | ||
}); | ||
clientRequest.end(); | ||
clientRequest.end(); | ||
clientRequest.setTimeout(crawler.timeout, function () { | ||
if (crawler.running && !queueItem.fetched) { | ||
crawler._openRequests--; | ||
} | ||
clientRequest.setTimeout(crawler.timeout, function() { | ||
if (queueItem.fetched) { | ||
return; | ||
} | ||
queueItem.fetched = true; | ||
queueItem.status = "timeout"; | ||
crawler.emit("fetchtimeout", queueItem, crawler.timeout); | ||
clientRequest._crawlerHandled = true; | ||
clientRequest.abort(); | ||
}); | ||
if (crawler.running && !queueItem.fetched) { | ||
crawler._openRequests--; | ||
} | ||
clientRequest.on("error", function (errorData) { | ||
queueItem.fetched = true; | ||
queueItem.status = "timeout"; | ||
crawler.emit("fetchtimeout", queueItem, crawler.timeout); | ||
clientRequest._crawlerHandled = true; | ||
clientRequest.abort(); | ||
}); | ||
// This event will be thrown if we manually aborted the request, | ||
// but we don't want to do anything in that case. | ||
if (clientRequest._crawlerHandled) | ||
return; | ||
clientRequest.on("error", function(errorData) { | ||
if (crawler.running && !queueItem.fetched) { | ||
crawler._openRequests--; | ||
} | ||
// This event will be thrown if we manually aborted the request, | ||
// but we don't want to do anything in that case. | ||
if (clientRequest._crawlerHandled) { | ||
return; | ||
} | ||
// Emit 5xx / 4xx event | ||
queueItem.fetched = true; | ||
queueItem.stateData.code = 599; | ||
queueItem.status = "failed"; | ||
crawler.emit("fetchclienterror", queueItem, errorData); | ||
}); | ||
if (crawler.running && !queueItem.fetched) { | ||
crawler._openRequests--; | ||
} | ||
return crawler; | ||
}); | ||
// Emit 5xx / 4xx event | ||
queueItem.fetched = true; | ||
queueItem.stateData.code = 599; | ||
queueItem.status = "failed"; | ||
crawler.emit("fetchclienterror", queueItem, errorData); | ||
}); | ||
return crawler; | ||
}); | ||
}; | ||
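The `fetchstart` event above fires with the request options before the request is made (the request itself is only created inside `process.nextTick`), so listeners can adjust headers per item; a sketch (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.on("fetchstart", function(queueItem, requestOptions) {
    // Mutating requestOptions here is safe: all listeners run synchronously,
    // before the next tick in which client.request() is actually called.
    requestOptions.headers["Accept-Language"] = "en";
});
crawler.on("fetchtimeout", function(queueItem, timeout) {
    console.log("Timed out after %dms: %s", timeout, queueItem.url);
});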
@@ -912,405 +962,413 @@ | ||
/* | ||
Public: Given a queueItem and a matching response object, the crawler will | ||
handle downloading the resource, queueing of linked items, etc. | ||
Public: Given a queueItem and a matching response object, the crawler will | ||
handle downloading the resource, queueing of linked items, etc. | ||
Examples | ||
Examples | ||
// Passing in a response from `request` | ||
request(queueItem.url,function(err,res,body) { | ||
crawler.handleResponse(queueItem,res); | ||
}); | ||
// Passing in a response from `request` | ||
request(queueItem.url, function(err, res, body) { | ||
crawler.handleResponse(queueItem, res); | ||
}); | ||
Returns the crawler object for chaining. | ||
Returns the crawler object for chaining. | ||
*/ | ||
Crawler.prototype.handleResponse = function(queueItem,response,timeCommenced) { | ||
var crawler = this, | ||
dataReceived = false, | ||
timeHeadersReceived, | ||
timeDataReceived, | ||
parsedURL, | ||
responseBuffer, | ||
responseLength, | ||
responseLengthReceived = 0, | ||
contentType, | ||
stateData = queueItem.stateData; | ||
Crawler.prototype.handleResponse = function(queueItem, response, timeCommenced) { | ||
var crawler = this, | ||
dataReceived = false, | ||
timeHeadersReceived, | ||
timeDataReceived, | ||
parsedURL, | ||
responseBuffer, | ||
responseLength, | ||
responseLengthReceived = 0, | ||
contentType, | ||
stateData = queueItem.stateData; | ||
// Record what time we first received the header information | ||
timeHeadersReceived = Date.now(); | ||
// Record what time we first received the header information | ||
timeHeadersReceived = Date.now(); | ||
// If we weren't passed a time of commencement, assume Now() | ||
timeCommenced = timeCommenced || Date.now(); | ||
// If we weren't passed a time of commencement, assume Now() | ||
timeCommenced = timeCommenced || Date.now(); | ||
responseLength = parseInt(response.headers["content-length"],10); | ||
responseLength = !isNaN(responseLength) ? responseLength : 0; | ||
responseLength = parseInt(response.headers["content-length"], 10); | ||
responseLength = !isNaN(responseLength) ? responseLength : 0; | ||
// Save timing and content some header information into queue | ||
stateData.requestLatency = (timeHeadersReceived - timeCommenced); | ||
stateData.requestTime = (timeHeadersReceived - timeCommenced); | ||
stateData.contentLength = responseLength; | ||
stateData.contentType = contentType = response.headers["content-type"]; | ||
stateData.code = response.statusCode; | ||
stateData.headers = response.headers; | ||
// Save timing and content some header information into queue | ||
stateData.requestLatency = timeHeadersReceived - timeCommenced; | ||
stateData.requestTime = timeHeadersReceived - timeCommenced; | ||
stateData.contentLength = responseLength; | ||
stateData.contentType = contentType = response.headers["content-type"]; | ||
stateData.code = response.statusCode; | ||
stateData.headers = response.headers; | ||
// Do we need to save cookies? Were we sent any? | ||
if (crawler.acceptCookies && | ||
response.headers.hasOwnProperty('set-cookie')) | ||
crawler.cookies.addFromHeaders(response.headers["set-cookie"]); | ||
// Do we need to save cookies? Were we sent any? | ||
if (crawler.acceptCookies && response.headers.hasOwnProperty("set-cookie")) { | ||
crawler.cookies.addFromHeaders(response.headers["set-cookie"]); | ||
} | ||
// Emit header receive event | ||
crawler.emit("fetchheaders",queueItem,response); | ||
// Emit header receive event | ||
crawler.emit("fetchheaders", queueItem, response); | ||
// Ensure response length is reasonable... | ||
responseLength = | ||
responseLength > 0 ? responseLength : crawler.maxResourceSize; | ||
// Ensure response length is reasonable... | ||
responseLength = | ||
responseLength > 0 ? responseLength : crawler.maxResourceSize; | ||
queueItem.stateData.contentLength = responseLength; | ||
queueItem.stateData.contentLength = responseLength; | ||
// Function for dealing with 200 responses | ||
function processReceivedData() { | ||
if (queueItem.fetched) return; | ||
// Function for dealing with 200 responses | ||
function processReceivedData() { | ||
if (queueItem.fetched) { | ||
return; | ||
} | ||
timeDataReceived = (new Date().getTime()); | ||
timeDataReceived = new Date().getTime(); | ||
queueItem.fetched = true; | ||
queueItem.status = "downloaded"; | ||
queueItem.fetched = true; | ||
queueItem.status = "downloaded"; | ||
// Save state information | ||
stateData.downloadTime = (timeDataReceived - timeHeadersReceived); | ||
stateData.requestTime = (timeDataReceived - timeCommenced); | ||
stateData.actualDataSize = responseBuffer.length; | ||
stateData.sentIncorrectSize = responseBuffer.length !== responseLength; | ||
// Save state information | ||
stateData.downloadTime = timeDataReceived - timeHeadersReceived; | ||
stateData.requestTime = timeDataReceived - timeCommenced; | ||
stateData.actualDataSize = responseBuffer.length; | ||
stateData.sentIncorrectSize = responseBuffer.length !== responseLength; | ||
// First, save item to cache (if we're using a cache!) | ||
if (crawler.cache !== null && | ||
crawler.cache.setCacheData instanceof Function) { | ||
// First, save item to cache (if we're using a cache!) | ||
if (crawler.cache !== null && crawler.cache.setCacheData instanceof Function) { | ||
crawler.cache.setCacheData(queueItem, responseBuffer); | ||
} | ||
crawler.cache.setCacheData(queueItem,responseBuffer); | ||
} | ||
// Is the item allowed by depth conditions ? | ||
if (crawler.depthAllowed(queueItem)) { | ||
crawler.emit("fetchcomplete", queueItem, responseBuffer, response); | ||
// Is the item allowed by depth conditions ? | ||
if(crawler.depthAllowed(queueItem)) { | ||
crawler.emit("fetchcomplete",queueItem,responseBuffer,response); | ||
// We only process the item if it's of a valid mimetype | ||
// and only if the crawler is set to discover its own resources | ||
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) { | ||
crawler.queueLinkedItems(responseBuffer, queueItem); | ||
} | ||
} | ||
// We only process the item if it's of a valid mimetype | ||
// and only if the crawler is set to discover its own resources | ||
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) { | ||
crawler.queueLinkedItems(responseBuffer,queueItem); | ||
} | ||
} | ||
crawler._openRequests--; | ||
} | ||
crawler._openRequests --; | ||
} | ||
function receiveData(chunk) { | ||
if (chunk && chunk.length && !dataReceived) { | ||
if (responseLengthReceived + chunk.length > responseBuffer.length) { | ||
// Oh dear. We've been sent more data than we were initially told. | ||
// This could be a mis-calculation, or a streaming resource. | ||
// Let's increase the size of our buffer to match, as long as it isn't | ||
// larger than our maximum resource size. | ||
function receiveData(chunk) { | ||
if (chunk && chunk.length && !dataReceived) { | ||
if (responseLengthReceived + chunk.length > responseBuffer.length) { | ||
// Oh dear. We've been sent more data than we were initially told. | ||
// This could be a mis-calculation, or a streaming resource. | ||
// Let's increase the size of our buffer to match, as long as it isn't | ||
// larger than our maximum resource size. | ||
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) { | ||
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) { | ||
// Start by creating a new buffer, which will be our main | ||
// buffer from now on... | ||
// Start by creating a new buffer, which will be our main | ||
// buffer from now on... | ||
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length); | ||
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length); | ||
// Copy all our old data into it... | ||
responseBuffer.copy(tmpNewBuffer, 0, 0, responseBuffer.length); | ||
// Copy all our old data into it... | ||
responseBuffer.copy(tmpNewBuffer,0,0,responseBuffer.length); | ||
// And now the new chunk | ||
chunk.copy(tmpNewBuffer, responseBuffer.length, 0, chunk.length); | ||
// And now the new chunk | ||
chunk.copy(tmpNewBuffer,responseBuffer.length,0,chunk.length); | ||
// And now make the response buffer our new buffer, | ||
// leaving the original for GC | ||
responseBuffer = tmpNewBuffer; | ||
// And now make the response buffer our new buffer, | ||
// leaving the original for GC | ||
responseBuffer = tmpNewBuffer; | ||
} else { | ||
// Oh dear oh dear! The response is not only more data | ||
// than we were initially told, but it also exceeds the | ||
// maximum amount of data we're prepared to download per | ||
// resource. | ||
// | ||
// Throw error event and ignore. | ||
// | ||
// We'll then deal with the data that we have. | ||
} else { | ||
// Oh dear oh dear! The response is not only more data | ||
// than we were initially told, but it also exceeds the | ||
// maximum amount of data we're prepared to download per | ||
// resource. | ||
// | ||
// Throw error event and ignore. | ||
// | ||
// We'll then deal with the data that we have. | ||
crawler.emit("fetchdataerror", queueItem, response); | ||
} | ||
} else { | ||
// Copy the chunk data into our main buffer | ||
chunk.copy(responseBuffer, responseLengthReceived, 0, chunk.length); | ||
} | ||
crawler.emit("fetchdataerror",queueItem,response); | ||
} | ||
} else { | ||
// Copy the chunk data into our main buffer | ||
chunk.copy(responseBuffer,responseLengthReceived,0,chunk.length); | ||
} | ||
// Increment our data received counter | ||
responseLengthReceived += chunk.length; | ||
} | ||
// Increment our data received counter | ||
responseLengthReceived += chunk.length; | ||
} | ||
if ((responseLengthReceived >= responseLength || response.complete) && | ||
!dataReceived) { | ||
// Slice the buffer to chop off any unused space | ||
responseBuffer = responseBuffer.slice(0, responseLengthReceived); | ||
if ((responseLengthReceived >= responseLength || response.complete) && | ||
!dataReceived) { | ||
dataReceived = true; | ||
processReceivedData(); | ||
} | ||
} | ||
// Slice the buffer to chop off any unused space | ||
responseBuffer = responseBuffer.slice(0,responseLengthReceived); | ||
// If we should just go ahead and get the data | ||
if (response.statusCode >= 200 && response.statusCode < 300 && | ||
responseLength <= crawler.maxResourceSize) { | ||
dataReceived = true; | ||
processReceivedData(); | ||
} | ||
} | ||
queueItem.status = "headers"; | ||
// If we should just go ahead and get the data | ||
if (response.statusCode >= 200 && response.statusCode < 300 && | ||
responseLength <= crawler.maxResourceSize) { | ||
// Create a buffer with our response length | ||
responseBuffer = new Buffer(responseLength); | ||
queueItem.status = "headers"; | ||
// Only if we're prepared to download non-text resources... | ||
if (crawler.downloadUnsupported || | ||
crawler.mimeTypeSupported(contentType)) { | ||
// Create a buffer with our response length | ||
responseBuffer = new Buffer(responseLength); | ||
response.on("data", receiveData); | ||
response.on("end", receiveData); | ||
} else { | ||
queueItem.fetched = true; | ||
crawler._openRequests--; | ||
// Only if we're prepared to download non-text resources... | ||
if (crawler.downloadUnsupported || | ||
crawler.mimeTypeSupported(contentType)) { | ||
response.socket.end(); | ||
} | ||
response.on("data",receiveData); | ||
response.on("end",receiveData); | ||
} else { | ||
response.socket.end(); | ||
} | ||
// We've got a not-modified response back | ||
} else if (response.statusCode === 304) { | ||
// We've got a not-modified response back | ||
} else if (response.statusCode === 304) { | ||
if (crawler.cache !== null && crawler.cache.getCacheData) { | ||
// We've got access to a cache | ||
crawler.cache.getCacheData(queueItem, function(cacheObject) { | ||
crawler.emit("notmodified", queueItem, response, cacheObject); | ||
}); | ||
} else { | ||
// Emit notmodified event. We don't have a cache available, so | ||
// we don't send any data. | ||
crawler.emit("notmodified", queueItem, response); | ||
} | ||
if (crawler.cache !== null && crawler.cache.getCacheData) { | ||
// We've got access to a cache | ||
crawler.cache.getCacheData(queueItem,function(cacheObject) { | ||
crawler.emit("notmodified",queueItem,response,cacheObject); | ||
}); | ||
} else { | ||
// Emit notmodified event. We don't have a cache available, so | ||
// we don't send any data. | ||
crawler.emit("notmodified",queueItem,response); | ||
} | ||
// If we should queue a redirect | ||
} else if (response.statusCode >= 300 && response.statusCode < 400 && | ||
response.headers.location) { | ||
// If we should queue a redirect | ||
} else if (response.statusCode >= 300 && response.statusCode < 400 && | ||
response.headers.location) { | ||
queueItem.fetched = true; | ||
queueItem.status = "redirected"; | ||
queueItem.fetched = true; | ||
queueItem.status = "redirected"; | ||
// Parse the redirect URL ready for adding to the queue... | ||
parsedURL = crawler.processURL(response.headers.location, queueItem); | ||
// Parse the redirect URL ready for adding to the queue... | ||
parsedURL = crawler.processURL(response.headers.location,queueItem); | ||
// Emit redirect event | ||
crawler.emit("fetchredirect", queueItem, parsedURL, response); | ||
// Emit redirect event | ||
crawler.emit("fetchredirect",queueItem,parsedURL,response); | ||
// Clean URL, add to queue... | ||
crawler.queueURL(parsedURL, queueItem); | ||
response.socket.end(); | ||
// Clean URL, add to queue... | ||
crawler.queueURL(parsedURL,queueItem); | ||
response.socket.end(); | ||
crawler._openRequests--; | ||
crawler._openRequests --; | ||
// Ignore this request, but record that we had a 404 | ||
} else if (response.statusCode === 404 || response.statusCode === 410) { | ||
queueItem.fetched = true; | ||
queueItem.status = "notfound"; | ||
// Ignore this request, but record that we had a 404 | ||
} else if (response.statusCode === 404 || response.statusCode === 410) { | ||
queueItem.fetched = true; | ||
queueItem.status = "notfound"; | ||
// Emit 404 event | ||
crawler.emit("fetch404", queueItem, response); | ||
response.socket.end(); | ||
// Emit 404 event | ||
crawler.emit("fetch404",queueItem,response); | ||
response.socket.end(); | ||
crawler._openRequests--; | ||
crawler._openRequests --; | ||
// And oh dear. Handle this one as well. (other 400s, 500s, etc) | ||
} else { | ||
queueItem.fetched = true; | ||
queueItem.status = "failed"; | ||
// And oh dear. Handle this one as well. (other 400s, 500s, etc) | ||
} else { | ||
queueItem.fetched = true; | ||
queueItem.status = "failed"; | ||
// Emit 5xx / 4xx event | ||
crawler.emit("fetcherror", queueItem, response); | ||
response.socket.end(); | ||
// Emit 5xx / 4xx event | ||
crawler.emit("fetcherror",queueItem,response); | ||
response.socket.end(); | ||
crawler._openRequests--; | ||
} | ||
crawler._openRequests --; | ||
} | ||
return crawler; | ||
return crawler; | ||
}; | ||
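In practice the response handler is consumed through the events it emits; a sketch of the common listeners (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
    console.log(queueItem.url, queueItem.stateData.code, responseBuffer.length + " bytes");
});
crawler.on("fetchredirect", function(queueItem, parsedURL) {
    console.log("Redirect:", queueItem.url, "->", parsedURL.path);
});
crawler.on("fetch404", function(queueItem) {
    console.log("Not found:", queueItem.url);
});
crawler.on("fetcherror", function(queueItem, response) {
    console.log("Failed (" + response.statusCode + "):", queueItem.url);
});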
/* | ||
Public: The main crawler runloop. Fires at the interval specified in the | ||
crawler configuration, when the crawl is running. May be manually fired. | ||
This function initiates fetching of a queue item if there are enough workers | ||
to do so and there are unfetched items in the queue. | ||
Public: The main crawler runloop. Fires at the interval specified in the | ||
crawler configuration, when the crawl is running. May be manually fired. | ||
This function initiates fetching of a queue item if there are enough workers | ||
to do so and there are unfetched items in the queue. | ||
Examples | ||
Examples | ||
crawler.crawl(); | ||
crawler.crawl(); | ||
Returns the crawler object for chaining. | ||
Returns the crawler object for chaining. | ||
*/ | ||
Crawler.prototype.crawl = function() { | ||
var crawler = this; | ||
var crawler = this; | ||
if (crawler._openRequests > crawler.maxConcurrency) return; | ||
if (crawler._openRequests > crawler.maxConcurrency) { | ||
return []; | ||
} | ||
crawler.queue.oldestUnfetchedItem(function(err, queueItem) { | ||
crawler.queue.oldestUnfetchedItem(function(err, queueItem) { // eslint-disable-line | ||
if (queueItem) { | ||
crawler.fetchQueueItem(queueItem); | ||
if (queueItem) { | ||
crawler.fetchQueueItem(queueItem); | ||
} else if ( !crawler._openRequests && | ||
!crawler._openListeners) { | ||
} else if (!crawler._openRequests && !crawler._openListeners) { | ||
crawler.queue.complete(function(err, completeCount) { | ||
if (err) throw err; | ||
crawler.queue.complete(function(err, completeCount) { | ||
if (err) { | ||
throw err; | ||
} | ||
crawler.queue.getLength(function(err, length) { | ||
if (err) throw err; | ||
crawler.queue.getLength(function(err, length) { | ||
if (err) { | ||
throw err; | ||
} | ||
if (completeCount === length) { | ||
crawler.emit("complete"); | ||
crawler.stop(); | ||
} | ||
}); | ||
}); | ||
} | ||
}); | ||
if (completeCount === length) { | ||
crawler.emit("complete"); | ||
crawler.stop(); | ||
} | ||
}); | ||
}); | ||
} | ||
}); | ||
return crawler; | ||
return crawler; | ||
}; | ||
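The crawl runloop is normally driven by the interval timer rather than called by hand; the part worth wiring up is the `complete` event, emitted once every queue item is fetched and no requests or held listeners remain. A sketch - the `start()` method is assumed from elsewhere in the module, it is not part of this hunk:

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.on("complete", function() {
    console.log("Crawl finished - queue fully fetched.");
});
crawler.start();   // assumed: starts the interval that calls crawler.crawl()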
/* | ||
Public: Stops the crawler, terminating the crawl runloop. | ||
Public: Stops the crawler, terminating the crawl runloop. | ||
Examples | ||
Examples | ||
crawler.stop(); | ||
crawler.stop(); | ||
Returns the crawler object for chaining. | ||
Returns the crawler object for chaining. | ||
*/ | ||
Crawler.prototype.stop = function() { | ||
var crawler = this; | ||
clearInterval(crawler.crawlIntervalID); | ||
crawler.running = false; | ||
return crawler; | ||
var crawler = this; | ||
clearInterval(crawler.crawlIntervalID); | ||
crawler.running = false; | ||
return crawler; | ||
}; | ||
/* | ||
Public: Holds the crawler in a 'running' state, preventing the `complete` | ||
event from firing until the callback this function returns has been executed, | ||
or a predetermined timeout (as specified by `crawler.listenerTTL`) has | ||
elapsed. | ||
Public: Holds the crawler in a 'running' state, preventing the `complete` | ||
event from firing until the callback this function returns has been executed, | ||
or a predetermined timeout (as specified by `crawler.listenerTTL`) has | ||
elapsed. | ||
Examples | ||
Examples | ||
crawler.on("fetchcomplete",function(queueItem,data) { | ||
continue = this.wait(); | ||
doSomethingThatTakesAlongTime(function callback() { | ||
continue(); | ||
}); | ||
}); | ||
crawler.on("fetchcomplete",function(queueItem,data) { | ||
continue = this.wait(); | ||
doSomethingThatTakesAlongTime(function callback() { | ||
continue(); | ||
}); | ||
}); | ||
Returns a callback which will allow the crawler to continue. | ||
Returns a callback which will allow the crawler to continue. | ||
*/ | ||
Crawler.prototype.wait = function() { | ||
var crawler = this, | ||
cleared = false, | ||
timeout = | ||
setTimeout(function() { | ||
if (cleared) return; | ||
cleared = true; | ||
crawler._openListeners --; | ||
}, crawler.listenerTTL); | ||
var crawler = this, | ||
cleared = false, | ||
timeout = | ||
setTimeout(function() { | ||
if (cleared) { | ||
return; | ||
} | ||
cleared = true; | ||
crawler._openListeners--; | ||
}, crawler.listenerTTL); | ||
crawler._openListeners ++; | ||
crawler._openListeners++; | ||
return function() { | ||
if (cleared) return; | ||
cleared = true; | ||
crawler._openListeners --; | ||
clearTimeout(timeout); | ||
}; | ||
return function() { | ||
if (cleared) { | ||
return; | ||
} | ||
cleared = true; | ||
crawler._openListeners--; | ||
clearTimeout(timeout); | ||
}; | ||
}; | ||
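Note that the example in the comment above assigns to `continue`, which is a reserved word in JavaScript; a working sketch of the same pattern under a different name (the async work is a placeholder, and the constructor is assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.on("fetchcomplete", function(queueItem, responseBuffer) {
    var resume = this.wait();   // holds the "complete" event back

    // doSomethingThatTakesAlongTime is a placeholder for your own async work.
    doSomethingThatTakesAlongTime(responseBuffer, function callback() {
        resume();   // or let crawler.listenerTTL expire the hold
    });
});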
/* | ||
Public: Given a function, this method adds it to an internal list maintained | ||
by the crawler to be executed against each URL to determine whether it should | ||
be fetched or not. | ||
Public: Given a function, this method adds it to an internal list maintained | ||
by the crawler to be executed against each URL to determine whether it should | ||
be fetched or not. | ||
callback - Function to be called when evaluating a URL. This function is | ||
passed an object containing the protocol, hostname, port, and path | ||
of a resource to be fetched. It can determine whether it should | ||
be requested or not by returning a boolean - false for no, true | ||
for yes. | ||
callback - Function to be called when evaluating a URL. This function is | ||
passed an object containing the protocol, hostname, port, and path | ||
of a resource to be fetched. It can determine whether it should | ||
be requested or not by returning a boolean - false for no, true | ||
for yes. | ||
Examples | ||
Examples | ||
crawler.addFetchCondition(function(parsedURL) { | ||
return (parsedURL.host !== "evildomain.com"); | ||
}); | ||
crawler.addFetchCondition(function(parsedURL) { | ||
return (parsedURL.host !== "evildomain.com"); | ||
}); | ||
Returns the ID of the fetch condition - used for removing it from the crawler | ||
later. | ||
Returns the ID of the fetch condition - used for removing it from the crawler | ||
later. | ||
*/ | ||
Crawler.prototype.addFetchCondition = function(callback) { | ||
var crawler = this; | ||
if (callback instanceof Function) { | ||
crawler._fetchConditions.push(callback); | ||
return crawler._fetchConditions.length - 1; | ||
} else { | ||
throw new Error("Fetch Condition must be a function."); | ||
} | ||
var crawler = this; | ||
if (callback instanceof Function) { | ||
crawler._fetchConditions.push(callback); | ||
return crawler._fetchConditions.length - 1; | ||
} | ||
throw new Error("Fetch Condition must be a function."); | ||
}; | ||
/* | ||
Public: Given the ID of an existing fetch condition, this function removes | ||
it from the crawler's internal list of conditions. | ||
Public: Given the ID of an existing fetch condition, this function removes | ||
it from the crawler's internal list of conditions. | ||
index - ID of fetch condition to be removed. | ||
index - ID of fetch condition to be removed. | ||
Examples | ||
Examples | ||
crawler.removeFetchCondition(3); | ||
crawler.removeFetchCondition(3); | ||
Returns true if the fetch condition was removed, and throws an error if it | ||
could not be found. | ||
Returns true if the fetch condition was removed, and throws an error if it | ||
could not be found. | ||
*/ | ||
Crawler.prototype.removeFetchCondition = function(index) { | ||
var crawler = this; | ||
if (crawler._fetchConditions[index] && | ||
crawler._fetchConditions[index] instanceof Function) { | ||
var crawler = this; | ||
if (crawler._fetchConditions[index] && | ||
crawler._fetchConditions[index] instanceof Function) { | ||
return !!crawler._fetchConditions.splice(index,1); | ||
} else { | ||
throw new Error("Unable to find indexed Fetch Condition."); | ||
} | ||
return !!crawler._fetchConditions.splice(index, 1); | ||
} | ||
throw new Error("Unable to find indexed Fetch Condition."); | ||
}; | ||
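A sketch pairing the two: the index returned by `addFetchCondition` is what `removeFetchCondition` expects (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

var conditionID = crawler.addFetchCondition(function(parsedURL) {
    // Skip anything that looks like an image.
    return !parsedURL.path.match(/\.(png|jpe?g|gif)$/i);
});

// Later, when the condition is no longer wanted:
crawler.removeFetchCondition(conditionID);   // true, or throws if the ID is unknown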
/* | ||
Public: Given a URL it will remove the querystring if it exists. | ||
Public: Given a URL it will remove the querystring if it exists. | ||
url - URL from which to remove the querystring | ||
url - URL from which to remove the querystring | ||
Examples | ||
Examples | ||
crawler.removeQuerystring(url); | ||
crawler.removeQuerystring(url); | ||
Returns URL without querystring if it exists | ||
Returns URL without querystring if it exists | ||
*/ | ||
Crawler.prototype.removeQuerystring = function(url) { | ||
if (url.indexOf("?") > -1) { | ||
return url.substr(0,url.indexOf("?")); | ||
} else { | ||
return url; | ||
} | ||
if (url.indexOf("?") > -1) { | ||
return url.substr(0, url.indexOf("?")); | ||
} | ||
return url; | ||
}; | ||
module.exports = Crawler; |
@@ -1,7 +0,12 @@ | ||
// SimpleCrawler | ||
// Export interfaces | ||
/* | ||
* Simplecrawler - Export interfaces | ||
* https://github.com/cgiffard/node-simplecrawler | ||
* | ||
* Copyright (c) 2011-2015, Christopher Giffard | ||
* | ||
*/ | ||
module.exports = require("./crawler.js"); | ||
// Aliasing for compatibility with legacy code. | ||
// Aliasing for compatibility with legacy code | ||
module.exports.Crawler = module.exports; | ||
@@ -8,0 +13,0 @@ |
lib/queue.js
@@ -1,21 +0,23 @@ | ||
// Simplecrawler - queue module | ||
// Christopher Giffard, 2011 | ||
// | ||
// http://www.github.com/cgiffard/node-simplecrawler | ||
/* | ||
* Simplecrawler - queue module | ||
* https://github.com/cgiffard/node-simplecrawler | ||
* | ||
* Copyright (c) 2011-2015, Christopher Giffard | ||
* | ||
*/ | ||
var fs = require("fs"); | ||
var allowedStatistics = [ | ||
"requestTime", | ||
"requestLatency", | ||
"downloadTime", | ||
"contentLength", | ||
"actualDataSize" | ||
"requestTime", | ||
"requestLatency", | ||
"downloadTime", | ||
"contentLength", | ||
"actualDataSize" | ||
]; | ||
var FetchQueue = function(){ | ||
this.oldestUnfetchedIndex = 0; | ||
this.completeCache = 0; | ||
this.scanIndex = {}; | ||
var FetchQueue = function() { | ||
this.oldestUnfetchedIndex = 0; | ||
this.completeCache = 0; | ||
this.scanIndex = {}; | ||
}; | ||
@@ -28,47 +30,49 @@ | ||
// For legacy reasons | ||
if (depth instanceof Function) { | ||
callback = depth; | ||
depth = 1; | ||
} | ||
depth = depth || 1; | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var self = this; | ||
// For legacy reasons | ||
if (depth instanceof Function) { | ||
callback = depth; | ||
depth = 1; | ||
} | ||
// Ensure all variables conform to reasonable defaults | ||
protocol = protocol === "https" ? "https" : "http"; | ||
depth = depth || 1; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var self = this; | ||
if (isNaN(port) || !port) { | ||
return callback(new Error("Port must be numeric!")); | ||
} | ||
// Ensure all variables conform to reasonable defaults | ||
protocol = protocol === "https" ? "https" : "http"; | ||
var url = protocol + "://" + domain + (port !== 80 ? ":" + port : "") + path; | ||
if (isNaN(port) || !port) { | ||
return callback(new Error("Port must be numeric!")); | ||
} | ||
this.exists(protocol,domain,port,path, | ||
function(err,exists) { | ||
if (err) return callback(err); | ||
var url = protocol + "://" + domain + (port !== 80 ? ":" + port : "") + path; | ||
if (!exists) { | ||
var queueItem = { | ||
"url": url, | ||
"protocol": protocol, | ||
"host": domain, | ||
"port": port, | ||
"path": path, | ||
"depth": depth, | ||
"fetched": false, | ||
"status": "queued", | ||
"stateData": {} | ||
}; | ||
self.exists(protocol, domain, port, path, | ||
function(err, exists) { | ||
if (err) { | ||
return callback(err); | ||
} | ||
self.push(queueItem); | ||
callback(null, queueItem); | ||
} else { | ||
var error = new Error("Resource already exists in queue!"); | ||
error.code = "DUP"; | ||
if (!exists) { | ||
var queueItem = { | ||
url: url, | ||
protocol: protocol, | ||
host: domain, | ||
port: port, | ||
path: path, | ||
depth: depth, | ||
fetched: false, | ||
status: "queued", | ||
stateData: {} | ||
}; | ||
callback(error); | ||
} | ||
}); | ||
self.push(queueItem); | ||
callback(null, queueItem); | ||
} else { | ||
var error = new Error("Resource already exists in queue!"); | ||
error.code = "DUP"; | ||
callback(error); | ||
} | ||
}); | ||
}; | ||
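A hedged sketch of queueing a resource through the add() method refactored above; `crawler.queue` and the example host and path are assumptions. Passing the callback as the fifth argument relies on the legacy depth handling shown at the top of the hunk.

crawler.queue.add("http", "example.com", 80, "/about", function(err, queueItem) {
    if (err) {
        // err.code === "DUP" signals the resource was already in the queue.
        return console.error(err.message);
    }
    console.log("Queued:", queueItem.url);
});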
@@ -78,18 +82,17 @@ | ||
FetchQueue.prototype.exists = function(protocol, domain, port, path, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
port = (port !== 80 ? ":" + port : ""); | ||
port = port !== 80 ? ":" + port : ""; | ||
var url = | ||
(protocol + "://" + domain + port + path) | ||
.toLowerCase(); | ||
var url = (protocol + "://" + domain + port + path).toLowerCase(); | ||
if (!!this.scanIndex[url]) { | ||
callback(null, 1); | ||
return 1; | ||
} else { | ||
this.scanIndex[url] = true; | ||
callback(null, 0); | ||
return 0; | ||
} | ||
if (this.scanIndex[url]) { | ||
callback(null, 1); | ||
return 1; | ||
} | ||
this.scanIndex[url] = true; | ||
callback(null, 0); | ||
return 0; | ||
}; | ||
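Note that exists() both invokes the callback and returns 1 or 0 synchronously, and that it records an unseen URL in scanIndex as a side effect. A small sketch, with the URL components invented for illustration:

// Returns 1 if the URL was already indexed, 0 otherwise (and indexes it).
var seen = crawler.queue.exists("http", "example.com", 80, "/about");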
@@ -99,7 +102,9 @@ | ||
FetchQueue.prototype.last = function(callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var item, self = this; | ||
item = self[self.length-1]; | ||
callback(null, item); | ||
return item; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var item, | ||
self = this; | ||
item = self[self.length - 1]; | ||
callback(null, item); | ||
return item; | ||
}; | ||
@@ -109,10 +114,11 @@ | ||
FetchQueue.prototype.get = function(id, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var item, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var item, | ||
self = this; | ||
if (!isNaN(id) && self.length > id) { | ||
item = self[id]; | ||
callback(null, item); | ||
return item; | ||
} | ||
if (!isNaN(id) && self.length > id) { | ||
item = self[id]; | ||
callback(null, item); | ||
return item; | ||
} | ||
}; | ||
@@ -122,15 +128,16 @@ | ||
FetchQueue.prototype.oldestUnfetchedItem = function(callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var item, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var item, | ||
self = this; | ||
for (var itemIndex = self.oldestUnfetchedIndex; itemIndex < self.length; itemIndex ++) { | ||
if (self[itemIndex].status === "queued") { | ||
self.oldestUnfetchedIndex = itemIndex; | ||
item = self[itemIndex]; | ||
callback(null, item); | ||
return item; | ||
} | ||
} | ||
for (var itemIndex = self.oldestUnfetchedIndex; itemIndex < self.length; itemIndex++) { | ||
if (self[itemIndex].status === "queued") { | ||
self.oldestUnfetchedIndex = itemIndex; | ||
item = self[itemIndex]; | ||
callback(null, item); | ||
return item; | ||
} | ||
} | ||
callback(new Error("No unfetched items remain.")); | ||
callback(new Error("No unfetched items remain.")); | ||
}; | ||
@@ -140,18 +147,19 @@ | ||
FetchQueue.prototype.max = function(statisticName, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var maxStatisticValue = 0, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var maxStatisticValue = 0, | ||
self = this; | ||
if (allowedStatistics.join().indexOf(statisticName) === -1) { | ||
// Not a recognised statistic! | ||
return callback(new Error("Invalid statistic.")); | ||
} | ||
if (allowedStatistics.join().indexOf(statisticName) === -1) { | ||
// Not a recognised statistic! | ||
return callback(new Error("Invalid statistic.")); | ||
} | ||
self.forEach(function(item) { | ||
if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] > maxStatisticValue) { | ||
maxStatisticValue = item.stateData[statisticName]; | ||
} | ||
}); | ||
self.forEach(function(item) { | ||
if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] > maxStatisticValue) { | ||
maxStatisticValue = item.stateData[statisticName]; | ||
} | ||
}); | ||
callback(null, maxStatisticValue); | ||
return maxStatisticValue; | ||
callback(null, maxStatisticValue); | ||
return maxStatisticValue; | ||
}; | ||
@@ -161,19 +169,21 @@ | ||
FetchQueue.prototype.min = function(statisticName, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var minimum, minStatisticValue = Infinity, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var minimum, | ||
minStatisticValue = Infinity, | ||
self = this; | ||
if (allowedStatistics.join().indexOf(statisticName) === -1) { | ||
// Not a recognised statistic! | ||
return callback(new Error("Invalid statistic.")); | ||
} | ||
if (allowedStatistics.join().indexOf(statisticName) === -1) { | ||
// Not a recognised statistic! | ||
return callback(new Error("Invalid statistic.")); | ||
} | ||
self.forEach(function(item) { | ||
if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] < minStatisticValue) { | ||
minStatisticValue = item.stateData[statisticName]; | ||
} | ||
}); | ||
minimum = minStatisticValue === Infinity? 0 : minStatisticValue; | ||
callback(null, minimum); | ||
return minimum; | ||
self.forEach(function(item) { | ||
if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] < minStatisticValue) { | ||
minStatisticValue = item.stateData[statisticName]; | ||
} | ||
}); | ||
minimum = minStatisticValue === Infinity ? 0 : minStatisticValue; | ||
callback(null, minimum); | ||
return minimum; | ||
}; | ||
@@ -183,19 +193,22 @@ | ||
FetchQueue.prototype.avg = function(statisticName, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var average, NumberSum = 0, NumberCount = 0, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var average, | ||
NumberSum = 0, | ||
NumberCount = 0, | ||
self = this; | ||
if (allowedStatistics.join().indexOf(statisticName) === -1) { | ||
// Not a recognised statistic! | ||
return callback(new Error("Invalid statistic.")); | ||
} | ||
if (allowedStatistics.join().indexOf(statisticName) === -1) { | ||
// Not a recognised statistic! | ||
return callback(new Error("Invalid statistic.")); | ||
} | ||
self.forEach(function(item) { | ||
if (item.fetched && item.stateData[statisticName] !== null && !isNaN(item.stateData[statisticName])) { | ||
NumberSum += item.stateData[statisticName]; | ||
NumberCount ++; | ||
} | ||
}); | ||
average = NumberSum / NumberCount; | ||
callback(null, average); | ||
return average; | ||
self.forEach(function(item) { | ||
if (item.fetched && item.stateData[statisticName] !== null && !isNaN(item.stateData[statisticName])) { | ||
NumberSum += item.stateData[statisticName]; | ||
NumberCount++; | ||
} | ||
}); | ||
average = NumberSum / NumberCount; | ||
callback(null, average); | ||
return average; | ||
}; | ||
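The three statistics helpers above share a pattern; a sketch using statistic names taken from the allowedStatistics list earlier in this file, with `crawler.queue` assumed to be a populated FetchQueue:

crawler.queue.max("downloadTime", function(err, slowest) {
    console.log("Slowest download:", slowest);
});
crawler.queue.min("contentLength", function(err, smallest) {
    console.log("Smallest response:", smallest);
});
crawler.queue.avg("actualDataSize", function(err, average) {
    console.log("Average payload size:", average);
});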
@@ -205,13 +218,14 @@ | ||
FetchQueue.prototype.complete = function(callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var NumberComplete = 0, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var NumberComplete = 0, | ||
self = this; | ||
self.forEach(function(item) { | ||
if (item.fetched) { | ||
NumberComplete ++; | ||
} | ||
}); | ||
self.forEach(function(item) { | ||
if (item.fetched) { | ||
NumberComplete++; | ||
} | ||
}); | ||
callback(null, NumberComplete); | ||
return NumberComplete; | ||
callback(null, NumberComplete); | ||
return NumberComplete; | ||
}; | ||
@@ -221,13 +235,14 @@ | ||
FetchQueue.prototype.countWithStatus = function(status, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var queueItemsMatched = 0, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var queueItemsMatched = 0, | ||
self = this; | ||
self.forEach(function(item) { | ||
if (item.status === status) { | ||
queueItemsMatched ++; | ||
} | ||
}); | ||
self.forEach(function(item) { | ||
if (item.status === status) { | ||
queueItemsMatched++; | ||
} | ||
}); | ||
callback(null,queueItemsMatched); | ||
return queueItemsMatched; | ||
callback(null, queueItemsMatched); | ||
return queueItemsMatched; | ||
}; | ||
@@ -237,14 +252,15 @@ | ||
FetchQueue.prototype.getWithStatus = function(status, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var subqueue = [], self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var subqueue = [], | ||
self = this; | ||
self.forEach(function(item,index) { | ||
if (item.status === status) { | ||
subqueue.push(item); | ||
subqueue[subqueue.length-1].queueIndex = index; | ||
} | ||
}); | ||
self.forEach(function(item, index) { | ||
if (item.status === status) { | ||
subqueue.push(item); | ||
subqueue[subqueue.length - 1].queueIndex = index; | ||
} | ||
}); | ||
callback(null,subqueue); | ||
return subqueue; | ||
callback(null, subqueue); | ||
return subqueue; | ||
}; | ||
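countWithStatus() and getWithStatus() filter on the queue item status field; a sketch, with the "failed" status value taken from the errors() helper below:

// Each returned item gains a queueIndex property pointing back into the queue.
var failures = crawler.queue.getWithStatus("failed");
failures.forEach(function(item) {
    console.log(item.queueIndex, item.url);
});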
@@ -254,10 +270,13 @@ | ||
FetchQueue.prototype.errors = function(callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var total, failedCount, notFoundCount, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var total, | ||
failedCount, | ||
notFoundCount, | ||
self = this; | ||
failedCount = self.countWithStatus("failed"); | ||
notFoundCount = self.countWithStatus("notfound"); | ||
total = failedCount + notFoundCount; | ||
callback(null, total); | ||
return total; | ||
failedCount = self.countWithStatus("failed"); | ||
notFoundCount = self.countWithStatus("notfound"); | ||
total = failedCount + notFoundCount; | ||
callback(null, total); | ||
return total; | ||
}; | ||
@@ -267,20 +286,20 @@ | ||
FetchQueue.prototype.getLength = function(callback) { | ||
return callback(null, this.length); | ||
return callback(null, this.length); | ||
}; | ||
// Writes the queue to disk | ||
FetchQueue.prototype.freeze = function(filename,callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var self = this; | ||
FetchQueue.prototype.freeze = function(filename, callback) { | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var self = this; | ||
// Re-queue in-progress items before freezing... | ||
self.forEach(function(item) { | ||
if (item.fetched !== true) { | ||
item.status = "queued"; | ||
} | ||
}); | ||
// Re-queue in-progress items before freezing... | ||
self.forEach(function(item) { | ||
if (item.fetched !== true) { | ||
item.status = "queued"; | ||
} | ||
}); | ||
fs.writeFile(filename,JSON.stringify(self),function(err) { | ||
callback(err, self); | ||
}); | ||
fs.writeFile(filename, JSON.stringify(self), function(err) { | ||
callback(err, self); | ||
}); | ||
}; | ||
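A minimal freeze() sketch; the filename is illustrative:

crawler.queue.freeze("queue-backup.json", function(err) {
    if (err) throw err;
    console.log("Queue state written to disk");
});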
@@ -290,39 +309,44 @@ | ||
FetchQueue.prototype.defrost = function(filename, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var fileData, self = this, defrostedQueue = []; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var self = this, | ||
defrostedQueue = []; | ||
fs.readFile(filename,function(err,fileData) { | ||
if (err) return callback(err); | ||
fs.readFile(filename, function(err, fileData) { | ||
if (err) { | ||
return callback(err); | ||
} | ||
if (!fileData.toString("utf8").length) { | ||
return callback(new Error("Failed to defrost queue from zero-length JSON.")); | ||
} | ||
if (!fileData.toString("utf8").length) { | ||
return callback(new Error("Failed to defrost queue from zero-length JSON.")); | ||
} | ||
try { | ||
defrostedQueue = JSON.parse(fileData.toString("utf8")); | ||
} catch(error) { | ||
return callback(error); | ||
} | ||
try { | ||
defrostedQueue = JSON.parse(fileData.toString("utf8")); | ||
} catch (error) { | ||
return callback(error); | ||
} | ||
self.oldestUnfetchedIndex = Infinity; | ||
self.scanIndex = {}; | ||
self.oldestUnfetchedIndex = Infinity; | ||
self.scanIndex = {}; | ||
for (var index in defrostedQueue) { | ||
if (defrostedQueue.hasOwnProperty(index) && !isNaN(index)) { | ||
var queueItem = defrostedQueue[index]; | ||
self.push(queueItem); | ||
for (var index in defrostedQueue) { | ||
if (defrostedQueue.hasOwnProperty(index) && !isNaN(index)) { | ||
var queueItem = defrostedQueue[index]; | ||
self.push(queueItem); | ||
if (queueItem.status !== "downloaded") | ||
self.oldestUnfetchedIndex = Math.min( | ||
self.oldestUnfetchedIndex, index); | ||
if (queueItem.status !== "downloaded") { | ||
self.oldestUnfetchedIndex = Math.min( | ||
self.oldestUnfetchedIndex, index); | ||
} | ||
self.scanIndex[queueItem.url] = true; | ||
} | ||
} | ||
self.scanIndex[queueItem.url] = true; | ||
} | ||
} | ||
if (self.oldestUnfetchedIndex === Infinity) | ||
self.oldestUnfetchedIndex = 0; | ||
if (self.oldestUnfetchedIndex === Infinity) { | ||
self.oldestUnfetchedIndex = 0; | ||
} | ||
callback(null,self); | ||
}); | ||
callback(null, self); | ||
}); | ||
}; |
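And the matching defrost() call, restoring the same hypothetical file into the queue:

crawler.queue.defrost("queue-backup.json", function(err, queue) {
    if (err) throw err;
    console.log("Restored", queue.length, "queue items");
});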
@@ -1,73 +0,85 @@ | ||
var Crawler = require("./crawler.js"), | ||
URI = require("URIjs"); | ||
/* | ||
* Simplecrawler | ||
* https://github.com/cgiffard/node-simplecrawler | ||
* | ||
* Copyright (c) 2011-2015, Christopher Giffard | ||
* | ||
*/ | ||
var Crawler = require("./crawler.js"), | ||
uri = require("urijs"); | ||
/* | ||
Public: Convenience function for really quick, simple crawls. It generates | ||
a new crawler, parses the URL provided, and sets up the new crawler with | ||
the host and path information extracted from the URL. It returns the crawler | ||
object, so you can set up event handlers, and waits until `process.nextTick` | ||
before kicking off the crawl. | ||
Public: Convenience function for really quick, simple crawls. It generates | ||
a new crawler, parses the URL provided, and sets up the new crawler with | ||
the host and path information extracted from the URL. It returns the crawler | ||
object, so you can set up event handlers, and waits until `process.nextTick` | ||
before kicking off the crawl. | ||
url - URL to begin crawl from. | ||
successCallback - Optional function called once an item is completely | ||
downloaded. Functionally identical to a fetchcomplete | ||
event listener. | ||
failCallback - Optional function to be called if an item fails to | ||
download. Functionally identical to a fetcherror | ||
event listener. | ||
url - URL to begin crawl from. | ||
successCallback - Optional function called once an item is completely | ||
downloaded. Functionally identical to a fetchcomplete | ||
event listener. | ||
failCallback - Optional function to be called if an item fails to | ||
download. Functionally identical to a fetcherror | ||
event listener. | ||
Examples | ||
Examples | ||
Crawler.crawl( | ||
"http://example.com:3000/start", | ||
function(queueItem,data) { | ||
console.log("I got a new item!"); | ||
} | ||
); | ||
Crawler.crawl( | ||
"http://example.com:3000/start", | ||
function(queueItem,data) { | ||
console.log("I got a new item!"); | ||
} | ||
); | ||
Crawler | ||
.crawl("http://www.example.com/") | ||
.on("fetchstart",function(queueItem) { | ||
console.log("Beginning fetch for",queueItem.url); | ||
}); | ||
Crawler | ||
.crawl("http://www.example.com/") | ||
.on("fetchstart",function(queueItem) { | ||
console.log("Beginning fetch for",queueItem.url); | ||
}); | ||
Returns the crawler object which has now been constructed. | ||
Returns the crawler object which has now been constructed. | ||
*/ | ||
module.exports = function crawl(url,successCallback,failCallback) { | ||
// Parse the URL first | ||
url = URI(url); | ||
module.exports = function crawl(url, successCallback, failCallback) { | ||
// If either the protocol, path, or hostname are unset, we can't really | ||
// do much. Die with error. | ||
if (!url.protocol()) | ||
throw new Error("Can't crawl with unspecified protocol."); | ||
// Parse the URL first | ||
url = uri(url); | ||
if (!url.hostname()) | ||
throw new Error("Can't crawl with unspecified hostname."); | ||
// If either the protocol, path, or hostname are unset, | ||
// we can't really do much. Die with error. | ||
if (!url.protocol()) { | ||
throw new Error("Can't crawl with unspecified protocol."); | ||
} | ||
if (!url.path()) | ||
throw new Error("Can't crawl with unspecified path."); | ||
if (!url.hostname()) { | ||
throw new Error("Can't crawl with unspecified hostname."); | ||
} | ||
var tmpCrawler = | ||
new Crawler( | ||
url.hostname(), | ||
url.path(), | ||
url.port() || 80); | ||
if (!url.path()) { | ||
throw new Error("Can't crawl with unspecified path."); | ||
} | ||
// Attach callbacks if they were provided | ||
if (successCallback) tmpCrawler.on("fetchcomplete",successCallback); | ||
if (failCallback) tmpCrawler.on("fetcherror",failCallback); | ||
var tmpCrawler = new Crawler(url.hostname(), url.path(), url.port() || 80); | ||
// Start the crawler on the next runloop | ||
// This enables initial configuration options and event handlers to take | ||
// effect before the first resource is queued. | ||
process.nextTick(function() { | ||
tmpCrawler.start(); | ||
}); | ||
// Attach callbacks if they were provided | ||
if (successCallback) { | ||
tmpCrawler.on("fetchcomplete", successCallback); | ||
} | ||
if (failCallback) { | ||
tmpCrawler.on("fetcherror", failCallback); | ||
} | ||
// Return crawler | ||
return tmpCrawler; | ||
// Start the crawler on the next runloop | ||
// This enables initial configuration options and event handlers to take | ||
// effect before the first resource is queued. | ||
process.nextTick(function() { | ||
tmpCrawler.start(); | ||
}); | ||
// Return crawler | ||
return tmpCrawler; | ||
}; |
{ | ||
"name": "simplecrawler", | ||
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.", | ||
"version": "0.5.3", | ||
"description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.", | ||
"version": "0.5.4", | ||
"homepage": "https://github.com/cgiffard/node-simplecrawler", | ||
"author": "Christopher Giffard <christopher.giffard@cgiffard.com>", | ||
"license": "BSD-2-Clause", | ||
"repository": { | ||
"type": "git", | ||
"url": "https://github.com/cgiffard/node-simplecrawler.git" | ||
"url": "git+https://github.com/cgiffard/node-simplecrawler.git" | ||
}, | ||
@@ -24,3 +25,5 @@ "bugs": { | ||
"scripts": { | ||
"test": "mocha -R spec -t 5000" | ||
"lint": "eslint example/ lib/ test/", | ||
"mocha": "mocha -R spec -t 5000", | ||
"test": "npm run lint && npm run mocha" | ||
}, | ||
@@ -32,8 +35,8 @@ "bin": { | ||
"dependencies": { | ||
"URIjs": "^1.15.0" | ||
"urijs": "^1.16.1" | ||
}, | ||
"devDependencies": { | ||
"chai": "^2.2.0", | ||
"jshint": "^2.7.0", | ||
"mocha": "^2.2.4" | ||
"chai": "^3.2.0", | ||
"eslint": "^1.5.1", | ||
"mocha": "^2.3.2" | ||
}, | ||
@@ -40,0 +43,0 @@ "engines": { |
+ Added urijs@^1.16.1
+ Added urijs@1.19.11 (transitive)
- Removed URIjs@^1.15.0
- Removed URIjs@1.16.1 (transitive)