simplecrawler
Comparing version 0.6.2 to 0.7.0
@@ -0,0 +0,0 @@ /* |
// CLI module for crawling. | ||
// Not yet built. |
@@ -0,0 +0,0 @@ /* |
@@ -85,2 +85,9 @@ /* | ||
// Should we update crawler.host if the first response is a redirect to another domain. | ||
crawler.allowInitialDomainChange = false; | ||
// Set Accept-Encoding header and automatically decompress HTTP responses | ||
// based on Content-Encoding header | ||
crawler.decompressResponses = true; | ||
// Decode HTTP responses based on their Content-Type header or any | ||
@@ -175,2 +182,10 @@ // inline charset definition | ||
// Find srcset links | ||
function (string) { | ||
var result = /\ssrcset\s*=\s*(["'])(.*)\1/.exec(string); | ||
return Array.isArray(result) ? String(result[2]).split(",").map(function (string) { | ||
return string.replace(/\s?\w*$/, "").trim(); | ||
}) : ""; | ||
}, | ||
// Find resources in <meta> redirects. We need to wrap these RegExp's in | ||
@@ -199,2 +214,12 @@ // functions because we only want to return the first capture group, not | ||
// Matching MIME-types will be allowed to fetch further than max depth | ||
crawler.whitelistedMimeTypes = [ | ||
/^text\/(css|javascript|ecmascript)/i, | ||
/^application\/javascript/i, | ||
/^application\/x-font/i, | ||
/^application\/font/i, | ||
/^image\//i, | ||
/^font\//i | ||
]; | ||
// Whether to allow 'resources' greater than the max depth to be downloaded | ||
@@ -212,2 +237,3 @@ crawler.fetchWhitelistedMimeTypesBelowMaxDepth = false; | ||
var hiddenProps = { | ||
_isFirstRequest: true, | ||
_openRequests: 0, | ||
@@ -250,2 +276,6 @@ _fetchConditions: [], | ||
if (crawler.running) { | ||
return crawler; | ||
} | ||
// only if we haven't already got stuff in our queue... | ||
@@ -322,5 +352,5 @@ crawler.queue.getLength(function(err, length) { | ||
return crawler.allowedProtocols.reduce(function(prev, protocolCheck) { | ||
return prev || !!protocolCheck.exec(protocol); | ||
}, false); | ||
return crawler.allowedProtocols.some(function(protocolCheck) { | ||
return protocolCheck.test(protocol); | ||
}); | ||
}; | ||
@@ -344,5 +374,5 @@ | ||
return crawler.supportedMimeTypes.reduce(function(prev, mimeCheck) { | ||
return prev || !!mimeCheck.exec(MIMEType); | ||
}, false); | ||
return crawler.supportedMimeTypes.some(function(mimeCheck) { | ||
return mimeCheck.test(MIMEType); | ||
}); | ||
}; | ||
@@ -357,6 +387,2 @@ | ||
If the queue item is a CSS or JS file, it will always be fetched (we need | ||
all images in CSS files, even if max depth is already reached). If it's an | ||
HTML page, we will check if max depth is reached or not. | ||
queueItem - Queue item object to check | ||
@@ -370,19 +396,16 @@ | ||
// Items matching this pattern will always be fetched, even if max depth | ||
// is reached | ||
var mimeTypesWhitelist = [ | ||
/^text\/(css|javascript|ecmascript)/i, | ||
/^application\/javascript/i, | ||
/^application\/x-font/i, | ||
/^application\/font/i, | ||
/^image\//i, | ||
/^font\//i | ||
]; | ||
var belowMaxDepth = crawler.fetchWhitelistedMimeTypesBelowMaxDepth; | ||
if (typeof belowMaxDepth === "boolean") { | ||
belowMaxDepth = belowMaxDepth === false ? 0 : Infinity; | ||
} | ||
var whitelistedDepth = queueItem.depth - belowMaxDepth; | ||
return crawler.maxDepth === 0 || | ||
queueItem.depth <= crawler.maxDepth || | ||
crawler.fetchWhitelistedMimeTypesBelowMaxDepth && | ||
mimeTypesWhitelist.reduce(function(prev, mimeCheck) { | ||
return prev || !!mimeCheck.exec(queueItem.stateData.contentType); | ||
}, false); | ||
whitelistedDepth <= crawler.maxDepth && | ||
crawler.whitelistedMimeTypes.some(function(mimeCheck) { | ||
return mimeCheck.test(queueItem.stateData.contentType); | ||
}); | ||
}; | ||
@@ -534,5 +557,5 @@ | ||
// Does the item already exist in the list? | ||
if (list.reduce(function(prev, current) { | ||
return prev || current === URL; | ||
}, false)) { | ||
if (list.some(function(entry) { | ||
return entry === URL; | ||
})) { | ||
return list; | ||
@@ -633,23 +656,14 @@ } | ||
// Otherwise, scan through it. | ||
return !!crawler.domainWhitelist.reduce(function(prev, cur) { | ||
// If we already located the relevant domain in the whitelist... | ||
if (prev) { | ||
return prev; | ||
} | ||
return !!crawler.domainWhitelist.some(function(entry) { | ||
// If the domain is just equal, return true. | ||
if (host === cur) { | ||
if (host === entry) { | ||
return true; | ||
} | ||
// If we're ignoring WWW subdomains, and both domains, | ||
// less www. are the same, return true. | ||
if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i, "")) { | ||
if (crawler.ignoreWWWDomain && host === entry.replace(/^www\./i, "")) { | ||
return true; | ||
} | ||
// Otherwise, sorry. No dice. | ||
return false; | ||
}, false); | ||
}); | ||
} | ||
@@ -701,9 +715,7 @@ | ||
resourceData - Text document containing linked resource URLs. | ||
queueItem - Queue item from which the resource document was derived. | ||
decompressed - Content is already decompressed (default: false) | ||
resourceData - Text document containing linked resource URLs. | ||
queueItem - Queue item from which the resource document was derived. | ||
Emits | ||
gziperr | ||
discoverycomplete | ||
@@ -718,23 +730,7 @@ | ||
*/ | ||
Crawler.prototype.queueLinkedItems = function(resourceData, queueItem, decompressed) { | ||
var crawler = this, | ||
resources = []; | ||
Crawler.prototype.queueLinkedItems = function(resourceData, queueItem) { | ||
var crawler = this; | ||
if (!decompressed && | ||
queueItem.stateData && | ||
queueItem.stateData.headers["content-encoding"] && ( | ||
queueItem.stateData.headers["content-encoding"].match(/gzip/) || | ||
queueItem.stateData.headers["content-encoding"].match(/deflate/))) { | ||
var resources = crawler.discoverResources(resourceData, queueItem); | ||
return zlib.unzip(resourceData, function(err, newData) { | ||
if (err) { | ||
return crawler.emit("gziperror", queueItem, err, resourceData); | ||
} | ||
crawler.queueLinkedItems(newData, queueItem, true); | ||
}); | ||
} | ||
resources = crawler.discoverResources(resourceData, queueItem); | ||
// Emit discovered resources. ie: might be useful in building a graph of | ||
@@ -783,6 +779,5 @@ // page relationships. | ||
// Pass this URL past fetch conditions to ensure the user thinks it's valid | ||
var fetchDenied = false; | ||
fetchDenied = crawler._fetchConditions.reduce(function(prev, callback) { | ||
return prev || !callback(parsedURL, queueItem); | ||
}, false); | ||
var fetchDenied = crawler._fetchConditions.some(function(callback) { | ||
return !callback(parsedURL, queueItem); | ||
}); | ||
@@ -877,2 +872,5 @@ if (fetchDenied) { | ||
var isStandardHTTPPort = queueItem.protocol === "http" && queueItem.port !== 80, | ||
isStandardHTTPSPort = queueItem.protocol === "https" && queueItem.port !== 443; | ||
// Load in request options | ||
@@ -886,8 +884,6 @@ requestOptions = { | ||
headers: { | ||
"User-Agent": crawler.userAgent, | ||
"Host": queueItem.host + ( | ||
queueItem.port !== 80 ? | ||
":" + queueItem.port : | ||
"" | ||
) | ||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", | ||
"Accept-Encoding": "gzip, deflate", | ||
"User-Agent": crawler.userAgent, | ||
"Host": queueItem.host + (isStandardHTTPPort || isStandardHTTPSPort ? ":" + queueItem.port : "") | ||
} | ||
@@ -901,3 +897,3 @@ }; | ||
// If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts | ||
if (requestOptions.port === 80 || requestOptions.port === 443) { | ||
if (requestPort === 80 || requestPort === 443) { | ||
delete requestOptions.port; | ||
@@ -934,7 +930,5 @@ } | ||
for (var header in crawler.customHeaders) { | ||
if (!crawler.customHeaders.hasOwnProperty(header)) { | ||
continue; | ||
if (crawler.customHeaders.hasOwnProperty(header)) { | ||
requestOptions.headers[header] = crawler.customHeaders[header]; | ||
} | ||
requestOptions.headers[header] = crawler.customHeaders[header]; | ||
} | ||
@@ -1064,8 +1058,8 @@ } | ||
// Save timing and content some header information into queue | ||
stateData.requestLatency = timeHeadersReceived - timeCommenced; | ||
stateData.requestTime = timeHeadersReceived - timeCommenced; | ||
stateData.contentLength = responseLength; | ||
stateData.contentType = contentType = response.headers["content-type"]; | ||
stateData.code = response.statusCode; | ||
stateData.headers = response.headers; | ||
stateData.requestLatency = timeHeadersReceived - timeCommenced; | ||
stateData.requestTime = timeHeadersReceived - timeCommenced; | ||
stateData.contentLength = responseLength; | ||
stateData.contentType = contentType = response.headers["content-type"]; | ||
stateData.code = response.statusCode; | ||
stateData.headers = response.headers; | ||
@@ -1085,7 +1079,16 @@ // Do we need to save cookies? Were we sent any? | ||
// Ensure response length is reasonable... | ||
responseLength = | ||
responseLength > 0 ? responseLength : crawler.maxResourceSize; | ||
responseLength = responseLength > 0 ? responseLength : crawler.maxResourceSize; | ||
queueItem.stateData.contentLength = responseLength; | ||
function emitFetchComplete(responseBody, decompressedBuffer) { | ||
responseBody = crawler.decodeResponses ? crawler.decodeBuffer(responseBody, stateData.contentType) : responseBody; | ||
crawler.emit("fetchcomplete", queueItem, responseBody, response); | ||
// We only process the item if it's of a valid mimetype | ||
// and only if the crawler is set to discover its own resources | ||
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) { | ||
crawler.queueLinkedItems(decompressedBuffer || responseBody, queueItem); | ||
} | ||
} | ||
// Function for dealing with 200 responses | ||
@@ -1116,13 +1119,19 @@ function processReceivedData() { | ||
// Is the item allowed by depth conditions ? | ||
if (crawler.depthAllowed(queueItem)) { | ||
var responseBody = | ||
crawler.decodeResponses ? crawler.decodeBuffer(responseBuffer, stateData.contentType) : responseBuffer; | ||
crawler.emit("fetchcomplete", queueItem, responseBody, response); | ||
// We only process the item if it's of a valid mimetype | ||
// and only if the crawler is set to discover its own resources | ||
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) { | ||
crawler.queueLinkedItems(responseBuffer, queueItem); | ||
// No matter the value of `crawler.decompressResponses`, we still | ||
// decompress the response if it's gzipped or deflated. This is | ||
// because we always provide the discoverResources method with a | ||
// decompressed buffer | ||
if (/(gzip|deflate)/.test(stateData.headers["content-encoding"])) { | ||
zlib.unzip(responseBuffer, function(error, decompressedBuffer) { | ||
if (error) { | ||
crawler.emit("gziperror", queueItem, error, responseBuffer); | ||
emitFetchComplete(responseBuffer); | ||
} else { | ||
var responseBody = crawler.decompressResponses ? decompressedBuffer : responseBuffer; | ||
emitFetchComplete(responseBody, decompressedBuffer); | ||
} | ||
}); | ||
} else { | ||
emitFetchComplete(responseBuffer); | ||
} | ||
@@ -1196,2 +1205,4 @@ } | ||
crawler._isFirstRequest = false; | ||
// We've got a not-modified response back | ||
@@ -1211,2 +1222,4 @@ } else if (response.statusCode === 304) { | ||
crawler._isFirstRequest = false; | ||
// If we should queue a redirect | ||
@@ -1225,5 +1238,13 @@ } else if (response.statusCode >= 300 && response.statusCode < 400 && | ||
if (crawler._isFirstRequest) { | ||
parsedURL.depth = 1; | ||
} | ||
if (crawler.allowInitialDomainChange && crawler._isFirstRequest) { | ||
crawler.host = parsedURL.host; | ||
} | ||
// Clean URL, add to queue... | ||
crawler.queueURL(parsedURL, queueItem); | ||
response.socket.end(); | ||
response.socket.destroy(); | ||
@@ -1238,7 +1259,9 @@ crawler._openRequests--; | ||
// Emit 404 event | ||
crawler.emit("fetch404", queueItem, response); | ||
response.socket.end(); | ||
crawler.emit("fetch" + response.statusCode, queueItem, response); | ||
response.socket.destroy(); | ||
crawler._openRequests--; | ||
crawler._isFirstRequest = false; | ||
// And oh dear. Handle this one as well. (other 400s, 500s, etc) | ||
@@ -1254,2 +1277,4 @@ } else { | ||
crawler._openRequests--; | ||
crawler._isFirstRequest = false; | ||
} | ||
@@ -1256,0 +1281,0 @@ |
{ | ||
"name": "simplecrawler", | ||
"description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.", | ||
"version": "0.6.2", | ||
"version": "0.7.0", | ||
"homepage": "https://github.com/cgiffard/node-simplecrawler", | ||
@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>", |
README.md
@@ -1,49 +0,57 @@ | ||
# Simple web-crawler for Node.js | ||
# Simple web crawler for node.js | ||
[![NPM version](https://img.shields.io/npm/v/simplecrawler.svg)](https://www.npmjs.com/package/simplecrawler) | ||
[![Build Status](https://img.shields.io/travis/cgiffard/node-simplecrawler/master.svg)](https://travis-ci.org/cgiffard/node-simplecrawler) | ||
[![Linux Build Status](https://img.shields.io/travis/cgiffard/node-simplecrawler/master.svg)](https://travis-ci.org/cgiffard/node-simplecrawler) | ||
[![Windows Build Status](https://img.shields.io/appveyor/ci/cgiffard/node-simplecrawler/master.svg?label=Windows%20build)](https://ci.appveyor.com/project/cgiffard/node-simplecrawler/branch/master) | ||
[![Dependency Status](https://img.shields.io/david/cgiffard/node-simplecrawler.svg)](https://david-dm.org/cgiffard/node-simplecrawler) | ||
[![devDependency Status](https://img.shields.io/david/dev/cgiffard/node-simplecrawler.svg)](https://david-dm.org/cgiffard/node-simplecrawler#info=devDependencies) | ||
Simplecrawler is designed to provide the most basic possible API for crawling | ||
websites, while being as flexible and robust as possible. I wrote simplecrawler | ||
to archive, analyse, and search some very large websites. It has happily chewed | ||
through 50,000 pages and written tens of gigabytes to disk without issue. | ||
simplecrawler is designed to provide a basic, flexible and robust API for | ||
crawling websites. I wrote simplecrawler to archive, analyse, and search some | ||
very large websites. It has happily chewed through hundreds of thousands of | ||
pages and written tens of gigabytes to disk without issue. | ||
#### Example (simple mode) | ||
## What does simplecrawler do? | ||
```js | ||
var Crawler = require("simplecrawler"); | ||
Crawler.crawl("http://example.com/") | ||
.on("fetchcomplete", function(queueItem) { | ||
console.log("Completed fetching resource:", queueItem.url); | ||
}); | ||
``` | ||
### What does simplecrawler do? | ||
* Provides a very simple event driven API using `EventEmitter` | ||
* Extremely configurable base for writing your own crawler | ||
* Provides some simple logic for auto-detecting linked resources - which you can | ||
replace or augment | ||
replace or augment | ||
* Has a flexible queue system which can be frozen to disk and defrosted | ||
* Provides basic statistics on network performance | ||
* Uses buffers for fetching and managing data, preserving binary data (except | ||
when discovering links) | ||
when discovering links) | ||
### Installation | ||
## Documentation | ||
``` | ||
npm install simplecrawler | ||
``` | ||
- [Getting started](#getting-started) | ||
- [Simplified mode](#simplified-mode) | ||
- [Regular mode](#regular-mode) | ||
- [Events](#events) | ||
- [A note about HTTP error conditions](#a-note-about-http-error-conditions) | ||
- [Waiting for asynchronous event listeners](#waiting-for-asynchronous-event-listeners) | ||
- [Configuration](#configuration) | ||
- [Fetch conditions](#fetch-conditions) | ||
- [The queue](#the-queue) | ||
- [Manually adding to the queue](#manually-adding-to-the-queue) | ||
- [Queue items](#queue-items) | ||
- [Queue statistics and reporting](#queue-statistics-and-reporting) | ||
- [Saving and reloading the queue (freeze/defrost)](#saving-and-reloading-the-queue-freezedefrost) | ||
- [Cookies](#cookies) | ||
- [Cookie events](#cookie-events) | ||
- [Link Discovery](#link-discovery) | ||
- [FAQ/Troubleshooting](#faqtroubleshooting) | ||
- [Current Maintainers](#current-maintainers) | ||
- [Contributors](#contributors) | ||
- [License](#license) | ||
### Getting Started | ||
## Getting Started | ||
There are two ways of instantiating a new crawler - a simple but less flexible | ||
method inspired by [anemone](http://anemone.rubyforge.org), and the traditional | ||
method which provides a little more room to configure crawl parameters. | ||
There are two ways of instantiating a new crawler - a simplified but less | ||
flexible method inspired by [anemone](http://anemone.rubyforge.org), and the | ||
traditional method which provides a little more room to configure crawl | ||
parameters. | ||
Regardless of whether you use the simple or traditional methods of instantiation, | ||
you'll need to require simplecrawler: | ||
Regardless of whether you use the simplified or regular method of instantiation, | ||
you'll need to require simplecrawler first: | ||
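That is, at the top of your script (as in the examples throughout this README):

```js
var Crawler = require("simplecrawler");
```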
@@ -54,9 +62,11 @@ ```js | ||
#### Simple Mode | ||
### Simplified Mode | ||
Simple mode generates a new crawler for you, preconfigures it based on a URL you | ||
provide, and returns the crawler to you for further configuration and so you can | ||
attach event handlers. | ||
If all you need is a quick crawl of a small website, the simplified mode of | ||
initiating the crawler provides a slightly quicker way of getting started. It | ||
generates a new crawler for you, preconfigures it based on a URL you provide, | ||
starts the crawl and returns the crawler instance for further configuration and | ||
so that you can attach event handlers. | ||
Simply call `Crawler.crawl`, with a URL first parameter, and two optional | ||
Simply call `Crawler.crawl` with a URL as the first parameter, and two optional | ||
functions that will be added as event listeners for `fetchcomplete` and | ||
@@ -85,21 +95,20 @@ `fetcherror` respectively. | ||
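For instance, a minimal sketch of that call shape (the listener signatures follow the [events](#events) list):

```js
Crawler.crawl("http://example.com/",
    function(queueItem, responseBuffer, response) {
        console.log("Fetched", queueItem.url);
    },
    function(queueItem, response) {
        console.log("Failed to fetch", queueItem.url);
    });
```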
#### Advanced Mode | ||
### Regular Mode | ||
The alternative method of creating a crawler is to call the `simplecrawler` | ||
constructor yourself, and to initiate the crawl manually. | ||
The standard way of creating a crawler is to call the `simplecrawler` | ||
constructor yourself and initiate the crawl manually. | ||
```js | ||
var myCrawler = new Crawler("www.example.com"); | ||
var crawler = new Crawler("www.example.com"); | ||
``` | ||
Nonstandard port? HTTPS? Want to start archiving a specific path? No problem: | ||
Non-standard port? HTTPS? Want to start crawling at a specific path? No problem: | ||
```js | ||
myCrawler.initialPath = "/archive"; | ||
myCrawler.initialPort = 8080; | ||
myCrawler.initialProtocol = "https"; | ||
crawler.initialPath = "/archive"; | ||
crawler.initialPort = 8080; | ||
crawler.initialProtocol = "https"; | ||
// Or: | ||
var myCrawler = new Crawler("www.example.com", "/archive", 8080); | ||
var crawler = new Crawler("www.example.com", "/archive", 8080); | ||
``` | ||
@@ -112,36 +121,30 @@ | ||
```js | ||
myCrawler.interval = 10000; // Ten seconds | ||
myCrawler.maxConcurrency = 1; | ||
crawler.interval = 10000; // Ten seconds | ||
crawler.maxConcurrency = 1; | ||
``` | ||
You can also define a max depth for links to fetch: | ||
```js | ||
myCrawler.maxDepth = 1; // Only first page is fetched (with linked CSS & images) | ||
crawler.maxDepth = 1; // Only first page is fetched (with linked CSS & images) | ||
// Or: | ||
myCrawler.maxDepth = 2; // First page and discovered links from it are fetched | ||
crawler.maxDepth = 2; // First page and discovered links from it are fetched | ||
// Or: | ||
myCrawler.maxDepth = 3; // Etc. | ||
crawler.maxDepth = 3; // Etc. | ||
``` | ||
For brevity, you may also specify the initial path and request interval when | ||
creating the crawler: | ||
For a full list of configurable properties, see the | ||
[configuration section](#configuration). | ||
```js | ||
var myCrawler = new Crawler("www.example.com", "/", 8080, 300); | ||
``` | ||
You'll also need to set up event listeners for the [events](#events) you want to | ||
listen to. `fetchcomplete` and `complete` are a good place to start. | ||
### Running the crawler | ||
First, you'll need to set up an event listener to get the fetched data: | ||
```js | ||
myCrawler.on("fetchcomplete", function(queueItem, responseBuffer, response) { | ||
crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) { | ||
console.log("I just received %s (%d bytes)", queueItem.url, responseBuffer.length); | ||
console.log("It was a resource of type %s", response.headers['content-type']); | ||
// Do something with the data in responseBuffer | ||
}); | ||
``` | ||
Then, when you're satisfied you're ready to go, start the crawler! It'll run | ||
Then, when you're satisfied and ready to go, start the crawler! It'll run | ||
through its queue finding linked resources on the domain to download, until it | ||
@@ -151,3 +154,3 @@ can't find any more. | ||
```js | ||
myCrawler.start(); | ||
crawler.start(); | ||
``` | ||
@@ -188,3 +191,5 @@ | ||
* `fetch404` (queueItem, response) | ||
Fired when a 404 or 410 HTTP status code is returned for a request. | ||
Fired when a 404 HTTP status code is returned for a request. | ||
* `fetch410` (queueItem, response) | ||
Fired when a 410 HTTP status code is returned for a request. | ||
* `fetcherror` (queueItem, response) | ||
@@ -213,8 +218,4 @@ Fired when an alternate 400 or 500 series HTTP status code is returned for a | ||
If this is annoying, and you'd really like to retain error pages by default, let | ||
me know. I didn't include it because I didn't need it - but if it's important to | ||
people I might put it back in. :) | ||
### Waiting for asynchronous event listeners | ||
#### Waiting for Asynchronous Event Listeners | ||
Sometimes, you might want simplecrawler to wait for you while you
@@ -225,3 +226,3 @@ perform some asynchronous tasks in an event listener, instead of having it | ||
Simplecrawler provides a `wait` method you can call at any time. It is available | ||
simplecrawler provides a `wait` method you can call at any time. It is available | ||
via `this` from inside listeners, and on the crawler object itself. It returns | ||
@@ -234,3 +235,3 @@ a callback function. | ||
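A minimal sketch of the pattern, where `doSomethingAsync` stands in for any asynchronous task of your own:

```js
crawler.on("fetchcomplete", function(queueItem, data, response) {
    // Ask the crawler to wait; it hands back a callback to call when we're done
    var resume = this.wait();

    // doSomethingAsync is hypothetical - substitute your own asynchronous work
    doSomethingAsync(data, function() {
        resume();
    });
});
```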
##### Example Asynchronous Event Listener | ||
#### Example asynchronous event listener | ||
@@ -247,156 +248,167 @@ ```js | ||
### Configuring the crawler | ||
## Configuration | ||
Here's a complete list of what you can stuff with at this stage: | ||
simplecrawler is highly configurable and there's a long list of settings you can | ||
change to adapt it to your specific needs. | ||
* `crawler.host` - | ||
* `crawler.host` - | ||
The domain to scan. By default, simplecrawler will restrict all requests to | ||
this domain. | ||
* `crawler.initialPath` - | ||
* `crawler.initialPath="/"` - | ||
The initial path with which the crawler will formulate its first request. | ||
Does not restrict subsequent requests. | ||
* `crawler.initialPort` - | ||
* `crawler.initialPort=80` - | ||
The initial port with which the crawler will formulate its first request. | ||
Does not restrict subsequent requests. | ||
* `crawler.initialProtocol` - | ||
The initial protocol with which the crawler will formulate its first request. | ||
Does not restrict subsequent requests. | ||
* `crawler.interval` - | ||
* `crawler.initialProtocol="http"` - | ||
The initial protocol with which the crawler will formulate its first | ||
request. Does not restrict subsequent requests. | ||
* `crawler.interval=250` - | ||
The interval with which the crawler will spool up new requests (one per | ||
tick.) Defaults to 250 ms. | ||
* `crawler.maxConcurrency` - | ||
tick). | ||
* `crawler.maxConcurrency=5` - | ||
The maximum number of requests the crawler will run simultaneously. Defaults | ||
to 5 - the default number of http agents node will run. | ||
* `crawler.timeout` - | ||
* `crawler.timeout=300000` - | ||
The maximum time in milliseconds the crawler will wait for headers before | ||
aborting the request. | ||
* `crawler.listenerTTL` - | ||
* `crawler.listenerTTL=10000` - | ||
The maximum time in milliseconds the crawler will wait for async listeners. | ||
* `crawler.userAgent` - | ||
The user agent the crawler will report. Defaults to | ||
`Node/SimpleCrawler <version> (https://github.com/cgiffard/node-simplecrawler)`. | ||
* `crawler.decodeResponses` - | ||
The response bodies will be intelligently character converted to standard | ||
JavaScript strings using the `iconv-lite` module. The character encoding | ||
is interpreted from the Content-Type header firstly, and secondly from any | ||
<meta charset="xxx" /> tags. | ||
* `crawler.queue` - | ||
* `crawler.userAgent="Node/simplecrawler <version> (https://github.com/cgiffard/node-simplecrawler)"` - | ||
The user agent the crawler will report. | ||
* `crawler.decompressResponses=true` - | ||
Response bodies that are compressed will be automatically decompressed | ||
before they're emitted in the `fetchcomplete` event. Even if this is falsy, | ||
compressed responses will be decompressed before they're passed to the | ||
`discoverResources` method. | ||
* `crawler.decodeResponses=false` - | ||
Response bodies will be intelligently character converted to standard | ||
JavaScript strings using the | ||
[iconv-lite](https://www.npmjs.com/package/iconv-lite) module. The character | ||
encoding is interpreted from the Content-Type header firstly, and secondly | ||
from any `<meta charset="xxx" />` tags. | ||
* `crawler.queue` - | ||
The queue in use by the crawler (Must implement the `FetchQueue` interface) | ||
* `crawler.filterByDomain` - | ||
* `crawler.allowInitialDomainChange=false` - | ||
If the response for the initial URL is a redirect to another domain (e.g. | ||
from github.net to github.com), update `crawler.host` to continue the | ||
crawling on that domain. | ||
* `crawler.filterByDomain=true` - | ||
Specifies whether the crawler will restrict queued requests to a given | ||
domain/domains. | ||
* `crawler.scanSubdomains` - | ||
Enables scanning subdomains (other than www) as well as the specified domain. | ||
Defaults to false. | ||
* `crawler.ignoreWWWDomain` - | ||
* `crawler.scanSubdomains=false` - | ||
Enables scanning subdomains (other than www) as well as the specified | ||
domain. | ||
* `crawler.ignoreWWWDomain=true` - | ||
Treats the `www` domain the same as the originally specified domain. | ||
Defaults to true. | ||
* `crawler.stripWWWDomain` - | ||
* `crawler.stripWWWDomain=false` - | ||
Or go even further and strip WWW subdomain from requests altogether! | ||
* `crawler.stripQuerystring` - | ||
Specify to strip querystring parameters from URLs. Defaults to false. | ||
* `crawler.discoverResources` - | ||
Use simplecrawler's internal resource discovery function. You can replace it | ||
with your own function, which must accept a buffer and a queueItem, and add | ||
the discovered resources to the crawler queue: | ||
* `crawler.stripQuerystring=false` - | ||
Specify to strip querystring parameters from URLs.
* `crawler.discoverResources` - | ||
simplecrawler's default resource discovery function - | ||
which, given a buffer containing a resource, returns an array of URLs. | ||
For more details about link discovery, see [Link Discovery](#link-discovery) | ||
* `crawler.discoverRegex` - | ||
Array of regular expressions and functions that simplecrawler uses to | ||
discover resources. Functions in this array are expected to return an array. | ||
* `crawler.cache` - | ||
Specify a cache architecture to use when crawling. Must implement | ||
`SimpleCache` interface. You can save the site to disk using the built in | ||
file system cache like this: | ||
```js | ||
crawler.discoverResources = function(buf, queueItem) { | ||
// scan buffer for URLs, and then: | ||
... | ||
crawler.queueURL(aDiscoveredURL, queueItem); | ||
... | ||
}; | ||
crawler.cache = new Crawler.cache('pathToCacheDirectory'); | ||
``` | ||
* `crawler.discoverRegex` - | ||
Array of regular expressions and functions that simplecrawler uses to | ||
discover resources. Functions in this array are expected to return an array. | ||
* `crawler.cache` - | ||
Specify a cache architecture to use when crawling. Must implement | ||
`SimpleCache` interface. You can save the site to disk using the built in file | ||
system cache like this: `crawler.cache = new Crawler.cache('pathToCacheDirectory');` | ||
* `crawler.useProxy` - | ||
* `crawler.useProxy=false` - | ||
The crawler should use an HTTP proxy to make its requests. | ||
* `crawler.proxyHostname` - | ||
* `crawler.proxyHostname="127.0.0.1"` - | ||
The hostname of the proxy to use for requests. | ||
* `crawler.proxyPort` - | ||
* `crawler.proxyPort=8123` - | ||
The port of the proxy to use for requests. | ||
* `crawler.proxyUser` - | ||
The username for HTTP/Basic proxy authentication (leave unset for unauthenticated proxies.) | ||
* `crawler.proxyPass` - | ||
The password for HTTP/Basic proxy authentication (leave unset for unauthenticated proxies.) | ||
* `crawler.domainWhitelist` - | ||
An array of domains the crawler is permitted to crawl from. If other settings | ||
are more permissive, they will override this setting. | ||
* `crawler.supportedMimeTypes` - | ||
* `crawler.proxyUser=null` - | ||
The username for HTTP/Basic proxy authentication (leave unset for | ||
unauthenticated proxies.) | ||
* `crawler.proxyPass=null` - | ||
The password for HTTP/Basic proxy authentication (leave unset for | ||
unauthenticated proxies.) | ||
* `crawler.domainWhitelist` - | ||
An array of domains the crawler is permitted to crawl from. If other | ||
settings are more permissive, they will override this setting. | ||
* `crawler.supportedMimeTypes` - | ||
An array of RegEx objects used to determine supported MIME types (types of | ||
data simplecrawler will scan for links.) If you're not using simplecrawler's | ||
resource discovery function, this won't have any effect. | ||
* `crawler.allowedProtocols` - | ||
An array of RegEx objects used to determine whether a URL protocol is supported. | ||
This is to deal with nonstandard protocol handlers that regular HTTP is | ||
sometimes given, like `feed:`. It does not provide support for non-http | ||
protocols (and why would it!?) | ||
* `crawler.maxResourceSize` - | ||
The maximum resource size, in bytes, which will be downloaded. Defaults to 16MB. | ||
* `crawler.downloadUnsupported` - | ||
Simplecrawler will download files it can't parse. Defaults to true, but if | ||
data simplecrawler will scan for links.) If you're not using | ||
simplecrawler's resource discovery function, this won't have any effect. | ||
* `crawler.allowedProtocols` - | ||
An array of RegExp objects used to determine whether a URL protocol is | ||
supported. This is to deal with nonstandard protocol handlers that regular | ||
HTTP is sometimes given, like `feed:`. It does not provide support for | ||
non-http protocols (and why would it!?) | ||
* `crawler.maxResourceSize=16777216` - | ||
The maximum resource size that will be downloaded, in bytes. Defaults to | ||
16MB. | ||
* `crawler.downloadUnsupported=true` - | ||
simplecrawler will download files it can't parse. Defaults to true, but if | ||
you'd rather save the RAM and GC lag, switch it off. When false, it closes | ||
sockets for unsupported resources. | ||
* `crawler.needsAuth` - | ||
Flag to specify if the domain you are hitting requires basic authentication | ||
* `crawler.authUser` - | ||
Username provided for needsAuth flag | ||
* `crawler.authPass` - | ||
Password provided for needsAuth flag | ||
* `crawler.customHeaders` - | ||
* `crawler.needsAuth=false` - | ||
Flag to specify if the domain you are hitting requires basic authentication. | ||
* `crawler.authUser=""` - | ||
Username provided for `needsAuth` flag. | ||
* `crawler.authPass=""` - | ||
Password provided for `needsAuth` flag. | ||
* `crawler.customHeaders` - | ||
An object specifying a number of custom headers simplecrawler will add to | ||
every request. These override the default headers simplecrawler sets, so | ||
be careful with them. If you want to tamper with headers on a per-request basis, | ||
see the `fetchqueue` event. | ||
* `crawler.acceptCookies` - | ||
Flag to indicate if the crawler should hold on to cookies | ||
* `crawler.urlEncoding` - | ||
Set this to `iso8859` to trigger URIjs' re-encoding of iso8859 URLs to unicode. | ||
Defaults to `unicode`. | ||
* `crawler.parseHTMLComments` - | ||
Whether to scan for URLs inside HTML comments. | ||
Defaults to `true`. | ||
* `crawler.parseScriptTags` - | ||
Whether to scan for URLs inside script tags. | ||
Defaults to `true`. | ||
* `crawler.maxDepth` - | ||
every request. These override the default headers simplecrawler sets, so be | ||
careful with them. If you want to tamper with headers on a per-request | ||
basis, see the `fetchqueue` event. | ||
* `crawler.acceptCookies=true` - | ||
Flag to indicate if the crawler should hold on to cookies. | ||
* `crawler.urlEncoding="unicode"` - | ||
Set this to `iso8859` to trigger | ||
[URI.js](https://medialize.github.io/URI.js/)' re-encoding of iso8859 URLs
to unicode. | ||
* `crawler.parseHTMLComments=true` - | ||
Whether to scan for URLs inside HTML comments.
* `crawler.parseScriptTags=true` - | ||
Whether to scan for URLs inside script tags.
* `crawler.maxDepth=0` - | ||
Defines a maximum distance from the original request at which resources will | ||
be downloaded. Asset files are excluded from this distance condition if | ||
`crawler.fetchWhitelistedMimeTypesBelowMaxDepth` is `true`. Defaults to `0` | ||
— no max depth. | ||
* `crawler.fetchWhitelistedMimeTypesBelowMaxDepth` — Defaults to `false`. If | ||
`true`, then resources (fonts, images, CSS) will be excluded from `maxDepth` | ||
checks. (And therefore downloaded regardless of their depth.) | ||
* `crawler.ignoreInvalidSSL` - | ||
`crawler.fetchWhitelistedMimeTypesBelowMaxDepth` is `true`. Defaults to `0` — | ||
no max depth. | ||
* `crawler.whitelistedMimeTypes` - | ||
An array of RegEx objects used to determine whitelisted MIME types (types of
data simplecrawler will fetch regardless of the `maxDepth` checks).
Defaults to common resource types like styles, fonts, scripts and images. | ||
* `crawler.fetchWhitelistedMimeTypesBelowMaxDepth=false` - | ||
Controls how much deeper than `maxDepth` whitelisted resources may be fetched. If `true`,
resources matching `whitelistedMimeTypes` are always fetched regardless of depth, while
`false` limits them to `maxDepth` like everything else. A numeric value specifies a
concrete offset beyond `maxDepth` (e.g. 1 for one additional depth layer).
* `crawler.ignoreInvalidSSL=false` - | ||
Treat self-signed SSL certificates as valid. SSL certificates will not be | ||
validated against known CAs. Only applies to https requests. You may also have | ||
to set the environment variable NODE_TLS_REJECT_UNAUTHORIZED to '0'. | ||
validated against known CAs. Only applies to https requests. You may also | ||
have to set the environment variable NODE_TLS_REJECT_UNAUTHORIZED to '0'. | ||
For example: `process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';` | ||
Defaults to false. | ||
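As a quick illustration, a handful of these settings combined (the values here are only examples):

```js
var crawler = new Crawler("example.com");

crawler.interval = 500;        // Half a second between spooled requests
crawler.maxConcurrency = 2;    // At most two requests in flight at once
crawler.maxDepth = 3;          // Don't wander more than three hops from the start page
crawler.customHeaders = {
    "Accept-Language": "en-GB" // Merged into the headers of every request
};
```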
#### Excluding certain resources from downloading | ||
## Fetch conditions | ||
Simplecrawler has a mechanism you can use to prevent certain resources from being | ||
fetched, based on the URL, called **Fetch Conditions**. A fetch condition is just | ||
a function, which, when given a parsed URL object, will return a boolean that | ||
indicates whether a given resource should be downloaded. | ||
simplecrawler has a mechanism you can use to prevent certain resources from | ||
being fetched, based on the URL, called fetch conditions. A fetch condition is a | ||
function that, when given a parsed URL object, returns a value that indicates | ||
whether a given resource should be downloaded. | ||
You may add as many fetch conditions as you like, and remove them at runtime. | ||
Simplecrawler will evaluate every single condition against every queued URL, and | ||
should just one of them return a falsy value (this includes `null` and `undefined`, | ||
so remember to always return a value!) then the resource in question will not be | ||
fetched. | ||
simplecrawler will evaluate every fetch condition until one is
encountered that returns a falsy value. If that happens, the resource in | ||
question will not be fetched. | ||
##### Adding a fetch condition | ||
### Adding a fetch condition | ||
This example fetch condition prevents URLs ending in `.pdf` from being downloaded. | ||
Adding a fetch condition assigns it an ID, which the `addFetchCondition` function | ||
returns. You can use this ID to remove the condition later. | ||
This example fetch condition prevents URLs ending in `.pdf` from being
downloaded. Adding a fetch condition assigns it an ID, which the | ||
`addFetchCondition` function returns. You can use this ID to remove the | ||
condition later. | ||
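A rough sketch of such a condition:

```js
// Skip any URL whose path ends in ".pdf"
var conditionID = crawler.addFetchCondition(function(parsedURL, queueItem) {
    return !parsedURL.path.match(/\.pdf$/i);
});
```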
@@ -410,3 +422,4 @@ ```js | ||
Fetch conditions are called with two arguments: `parsedURL` and `queueItem`. | ||
`parsedURL` is the resource to be fetched (or not) and has the following structure: | ||
`parsedURL` represents the resource to be fetched (or not) and has the following | ||
structure: | ||
@@ -424,26 +437,12 @@ ```js | ||
`queueItem` is a representation of the page where this resource was found, it | ||
looks like this: | ||
`queueItem` is a representation of the page where this resource was found. See | ||
the [queue item documentation](#queue-items) for details on its structure. | ||
```js | ||
{ | ||
url: "http://example.com/index.php", | ||
protocol: "http", | ||
host: "example.com", | ||
port: 80, | ||
path: "/index.php", | ||
depth: 1, | ||
fetched: true, | ||
status: "downloaded", | ||
stateData: {...} | ||
} | ||
``` | ||
This information enables you to write sophisticated logic for which pages to | ||
fetch and which to avoid. You could, for example, implement a link checker that | ||
not only checks your site, but also links to external sites, but doesn't continue | ||
crawling those sites by setting `filterByDomain` to false and checking that | ||
With this information, you can write sophisticated logic for determining which | ||
pages to fetch and which to avoid. For example, you could write a link checker | ||
that checks both internal and external links, yet doesn't continue crawling | ||
other domains by setting `filterByDomain` to false and checking that | ||
`queueItem.host` is the same as `crawler.host`. | ||
##### Removing a fetch condition | ||
### Removing a fetch condition | ||
@@ -457,48 +456,41 @@ If you stored the ID of the fetch condition you added earlier, you can remove it | ||
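Roughly:

```js
// conditionID is the value addFetchCondition returned earlier
crawler.removeFetchCondition(conditionID);
```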
##### Excluding resources based on robots.txt | ||
## The queue | ||
Simplecrawler [purposely](https://github.com/cgiffard/node-simplecrawler/issues/153) | ||
doesn't come with any built in support for parsing robots.txt rules. Adding | ||
support manually is very straightforward using fetch conditions however, and | ||
in `examples/robots-txt-example.js` you'll find an example that makes use of | ||
the [robots-parser](https://www.npmjs.com/package/robots-parser) module to do | ||
just that. | ||
Like any other web crawler, simplecrawler has a queue. It can be directly | ||
accessed through `crawler.queue` and is by default only backed by an array, | ||
which means items in the queue can be accessed through array notation. However, | ||
since simplecrawler also supports different backing stores for the queue, the | ||
recommended way of accessing items is through the (pseudo) asynchronous | ||
`crawler.queue.get` method. | ||
### The Simplecrawler Queue | ||
Simplecrawler has a queue like any other web crawler. It can be directly accessed | ||
at `crawler.queue` (assuming you called your `Crawler()` object `crawler`.) It | ||
provides array access, so you can get to queue items just with array notation | ||
and an index. | ||
```js | ||
crawler.queue[5]; | ||
crawler.queue.get(5, function (queueItem) { | ||
// Do something with the queueItem | ||
}); | ||
``` | ||
For compatibility with different backing stores, it now provides an alternate | ||
interface which the crawler core makes use of: | ||
Even though this operation is actually synchronous when the default backing | ||
store is used, this method helps maintain compatibility with asynchronous | ||
backing stores that would let you, for example, store the queue in a database.
```js | ||
crawler.queue.get(5); | ||
``` | ||
### Manually adding to the queue | ||
It's not just an array though. | ||
#### Adding to the queue | ||
The simplest way to add to the queue is to use the crawler's own method, | ||
The simplest way of manually adding to the queue is to use the crawler's method | ||
`crawler.queueURL`. This method takes a complete URL, validates and deconstructs | ||
it, and adds it to the queue. | ||
If you instead want to add a resource by its components, you may call the | ||
`queue.add` method directly: | ||
```js | ||
var customQueueItem = { | ||
url: "http://example.com", | ||
depth: 2 | ||
}; | ||
```js | ||
crawler.queue.add(protocol, hostname, port, path); | ||
crawler.queueURL("/example.html", customQueueItem); | ||
``` | ||
That's it! It's basically just a URL, but comma separated (that's how you can | ||
remember the order.) | ||
If you instead want to add a resource by its components, you may call the | ||
`queue.add` method directly with the signature `protocol`, `hostname`, `port`, | ||
`path`. | ||
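A sketch following the signature given above:

```js
crawler.queue.add("http", "example.com", 80, "/resource.html");
```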
#### Queue items | ||
### Queue items | ||
@@ -509,44 +501,52 @@ Because when working with simplecrawler, you'll constantly be handed queue items, | ||
* `url` - The complete, canonical URL of the resource. | ||
* `url` - The complete, canonical URL of the resource | ||
* `protocol` - The protocol of the resource (http, https) | ||
* `host` - The full domain/hostname of the resource | ||
* `port` - The port of the resource | ||
* `path` - The bit of the URL after the domain - includes the query string. | ||
* `fetched` - Has the request for this item been completed? You can monitor this as requests are processed. | ||
* `status` - The internal status of the item, always a string. This can be one of: | ||
* `queued` - The resource is in the queue to be fetched, but nothing's happened to it yet. | ||
* `spooled` - A request has been made to the remote server, but we're still waiting for a response. | ||
* `headers` - The headers for the resource have been received. | ||
* `downloaded` - The item has been entirely downloaded. | ||
* `redirected` - The resource request returned a 300 series response, with a Location header and a new URL. | ||
* `notfound` - The resource could not be found. (404) | ||
* `failed` - An error occurred when attempting to fetch the resource. | ||
* `stateData` - An object containing state data and other information about the request: | ||
* `requestLatency` - The time taken for headers to be received after the request was made. | ||
* `requestTime` - The total time taken for the request (including download time.) | ||
* `path` - The URL path, including the query string | ||
* `uriPath` - The URL path, excluding the query string | ||
* `depth` - How many steps simplecrawler has taken from the initial page (which | ||
is depth 1) to this resource. | ||
* `fetched` - Has the request for this item been completed? You can monitor this | ||
as requests are processed. | ||
* `status` - The internal status of the item, always a string. This can be one | ||
of: | ||
* `"queued"` - The resource is in the queue to be fetched, but nothing's | ||
happened to it yet. | ||
* `"spooled"` - A request has been made to the remote server, but we're | ||
still waiting for a response. | ||
* `"headers"` - The headers for the resource have been received. | ||
* `"downloaded"` - The item has been entirely downloaded. | ||
* `"redirected"` - The resource request returned a 300 series response, with | ||
a Location header and a new URL. | ||
* `"notfound"` - The resource could not be found. (404) | ||
* `"failed"` - An error occurred when attempting to fetch the resource. | ||
* `stateData` - An object containing state data and other information about the | ||
request: | ||
* `requestLatency` - The time taken for headers to be received after the | ||
request was made. | ||
* `requestTime` - The total time taken for the request (including download | ||
time.) | ||
* `downloadTime` - The total time taken for the resource to be downloaded. | ||
* `contentLength` - The length (in bytes) of the returned content. Calculated based on the `content-length` header. | ||
* `contentLength` - The length (in bytes) of the returned content. | ||
Calculated based on the `content-length` header. | ||
* `contentType` - The MIME type of the content. | ||
* `code` - The HTTP status code returned for the request. | ||
* `headers` - An object containing the header information returned by the server. This is the object node returns as part of the `response` object. | ||
* `actualDataSize` - The length (in bytes) of the returned content. Calculated based on what is actually received, not the `content-length` header. | ||
* `sentIncorrectSize` - True if the data length returned by the server did not match what we were told to expect by the `content-length` header. | ||
* `headers` - An object containing the header information returned by the | ||
server. This is the object node returns as part of the `response` object. | ||
* `actualDataSize` - The length (in bytes) of the returned content. | ||
Calculated based on what is actually received, not the `content-length` | ||
header. | ||
* `sentIncorrectSize` - True if the data length returned by the server did | ||
not match what we were told to expect by the `content-length` header. | ||
You can address these properties like you would any other object: | ||
As you can see, you can get a lot of meta-information out about each request. | ||
This has been put to use by providing some convenient methods for getting simple | ||
aggregate data about the queue. | ||
```js | ||
crawler.queue[52].url; | ||
queueItem.stateData.contentLength; | ||
queueItem.status === "queued"; | ||
``` | ||
### Queue statistics and reporting | ||
As you can see, you can get a lot of meta-information out about each request. The | ||
upside is, the queue actually has some convenient functions for getting simple | ||
aggregate data about the queue... | ||
#### Queue Statistics and Reporting | ||
First of all, the queue can provide some basic statistics about the network | ||
performance of your crawl (so far.) This is done live, so don't check it thirty | ||
times a second. You can test the following properties: | ||
performance of your crawl so far. This is done live, so don't check it 30 times | ||
a second. You can test the following properties: | ||
@@ -559,27 +559,37 @@ * `requestTime` | ||
And you can get the maximum, minimum, and average values for each with the | ||
You can get the maximum, minimum, and average values for each with the | ||
`crawler.queue.max`, `crawler.queue.min`, and `crawler.queue.avg` functions | ||
respectively. Like so: | ||
respectively. Like the `crawler.queue.get` method, these methods are pseudo | ||
asynchronous to support different backing stores for the queue. That means they | ||
will provide both a return value and a callback. | ||
```js | ||
console.log("The maximum request latency was %dms.", crawler.queue.max("requestLatency")); | ||
console.log("The minimum download time was %dms.", crawler.queue.min("downloadTime")); | ||
console.log("The average resource size received is %d bytes.", crawler.queue.avg("actualDataSize")); | ||
crawler.queue.max("requestLatency", function (max) { | ||
console.log("The maximum request latency was %dms.", max); | ||
}); | ||
crawler.queue.min("downloadTime", function (min) { | ||
console.log("The minimum download time was %dms.", min); | ||
}); | ||
crawler.queue.avg("actualDataSize", function (avg) { | ||
console.log("The average resource size received is %d bytes.", avg); | ||
}); | ||
``` | ||
You'll probably often need to determine how many items in the queue have a given | ||
status at any one time, and/or retrieve them. That's easy with | ||
You'll probably often need to determine how many queue items have a given status | ||
and/or retrieve them. That's easily done with the methods | ||
`crawler.queue.countWithStatus` and `crawler.queue.getWithStatus`. | ||
`crawler.queue.countWithStatus` returns the number of queued items with a given | ||
`crawler.queue.countWithStatus` provides the number of queued items with a given | ||
status, while `crawler.queue.getWithStatus` returns an array of the queue items | ||
themselves. | ||
themselves. Again, by default, these methods both return and accept callbacks. | ||
```js | ||
var redirectCount = crawler.queue.countWithStatus("redirected"); | ||
crawler.queue.countWithStatus("redirected", function (redirectCount) { | ||
console.log("The redirect count is %d", redirectCount); | ||
}); | ||
crawler.queue.getWithStatus("failed").forEach(function(queueItem) { | ||
console.log("Whoah, the request for %s failed!", queueItem.url); | ||
// do something... | ||
crawler.queue.getWithStatus("failed", function (failedItems) { | ||
failedItems.forEach(function(queueItem) { | ||
console.log("Whoah, the request for %s failed!", queueItem.url); | ||
}); | ||
}); | ||
@@ -590,31 +600,29 @@ ``` | ||
* `crawler.queue.complete` - returns the number of queue items which have been | ||
completed (marked as fetched) | ||
* `crawler.queue.errors` - returns the number of requests which have failed | ||
(404s and other 400/500 errors, as well as client errors) | ||
* `crawler.queue.complete` - provides the number of queue items which have been | ||
completed (marked as fetched). | ||
* `crawler.queue.errors` - provides the number of requests which have failed | ||
(404s and other 400/500 errors, as well as client errors). | ||
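Like the other queue methods, these are pseudo-asynchronous; a sketch using the callback form shown in the FAQ section:

```js
crawler.queue.complete(function(err, completeCount) {
    if (err) {
        throw err;
    }
    console.log("%d queue items have been fetched so far", completeCount);
});
```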
#### Saving and reloading the queue (freeze/defrost) | ||
### Saving and reloading the queue (freeze/defrost) | ||
You'll probably want to be able to save your progress and reload it later, if | ||
your application fails or you need to abort the crawl for some reason. (Perhaps | ||
you just want to finish off for the night and pick it up tomorrow!) The | ||
`crawler.queue.freeze` and `crawler.queue.defrost` functions perform this task. | ||
It can be convenient to be able to save the crawl progress and later be able to | ||
reload it if your application fails or you need to abort the crawl for some | ||
reason. The `crawler.queue.freeze` and `crawler.queue.defrost` methods will let | ||
you do this. | ||
**A word of warning though** - they are not CPU friendly as they rely on | ||
JSON.parse and JSON.stringify. Use them only when you need to save the queue - | ||
don't call them every request or your application's performance will be incredibly | ||
poor - they block like *crazy*. That said, using them when your crawler commences | ||
and stops is perfectly reasonable. | ||
**A word of warning** - they are not CPU friendly as they rely on `JSON.parse` | ||
and `JSON.stringify`. Use them only when you need to save the queue - don't call | ||
them after every request or your application's performance will be incredibly | ||
poor - they block like *crazy*. That said, using them when your crawler | ||
commences and stops is perfectly reasonable. | ||
Note that the methods themselves are asynchronous, so if you are going to exit the | ||
process after you do the freezing, make sure you wait for callback - otherwise | ||
you'll get an empty file. | ||
Note that the methods themselves are asynchronous, so if you are going to exit | ||
the process after you do the freezing, make sure you wait for callback - | ||
otherwise you'll get an empty file. | ||
```js | ||
// Freeze queue | ||
crawler.queue.freeze("mysavedqueue.json", function() { | ||
crawler.queue.freeze("mysavedqueue.json", function () { | ||
process.exit(); | ||
}); | ||
// Defrost queue | ||
crawler.queue.defrost("mysavedqueue.json"); | ||
@@ -625,53 +633,177 @@ ``` | ||
Simplecrawler now has an internal cookie jar, which collects and resends cookies | ||
automatically, and by default. | ||
simplecrawler has an internal cookie jar, which collects and resends cookies | ||
automatically and by default. If you want to turn this off, set the | ||
`crawler.acceptCookies` option to `false`. The cookie jar is accessible via | ||
`crawler.cookies`, and is an event emitter itself. | ||
If you want to turn this off, set the `crawler.acceptCookies` option to `false`. | ||
### Cookie events | ||
The cookie jar is accessible via `crawler.cookies`, and is an event emitter itself: | ||
* `addcookie` (cookie) - Fired when a new cookie is added to the jar. | ||
* `removecookie` (cookie array) - Fired when one or more cookies are removed from the jar. | ||
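For example, a minimal sketch that logs every cookie the crawler picks up:

```js
crawler.cookies.on("addcookie", function(cookie) {
    console.log("Stored a new cookie:", cookie);
});
```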
### Cookie Events | ||
## Link Discovery | ||
* `addcookie` (cookie) | ||
Fired when a new cookie is added to the jar. | ||
* `removecookie` (cookie array) | ||
Fired when one or more cookies are removed from the jar. | ||
simplecrawler's discovery function is made to be replaceable — you can | ||
easily write your own that discovers only the links you're interested in. | ||
## Contributors | ||
The method must accept a buffer and a [`queueItem`](#queue-items), and | ||
return the resources that are to be added to the queue. | ||
I'd like to extend sincere thanks to: | ||
It is quite common to pair simplecrawler with a module like | ||
[cheerio](https://npmjs.com/package/cheerio) that can correctly parse | ||
HTML and provide a DOM like API for querying — or even a whole headless | ||
browser, like phantomJS. | ||
* [Nick Crohn](https://github.com/ncrohn) for the HTTP Basic auth support, and | ||
initial cookie support. | ||
* [Mike Moulton](https://github.com/mmoulton) for | ||
[fixing a bug in the URL discovery mechanism] | ||
(https://github.com/cgiffard/node-simplecrawler/pull/3), as well as | ||
[adding the `discoverycomplete` event] | ||
(https://github.com/cgiffard/node-simplecrawler/pull/10), | ||
* [Mike Iannacone](https://github.com/mikeiannacone) for correcting a keyword | ||
naming collision with node 0.8's EventEmitter. | ||
* [Greg Molnar](https://github.com/gregmolnar) for | ||
[adding a querystring-free path parameter to parsed URL objects.] | ||
(https://github.com/cgiffard/node-simplecrawler/pull/31) | ||
* [Breck Yunits](https://github.com/breck7) for contributing a useful code | ||
sample demonstrating using simplecrawler for caching a website to disk! | ||
* [Luke Plaster](https://github.com/notatestuser) for enabling protocol-agnostic | ||
link discovery | ||
* [Zeus](https://github.com/distracteddev) for fixing a bug where [default port | ||
info was wrongly specified in requests] | ||
(https://github.com/cgiffard/node-simplecrawler/pull/40) | ||
and for fixing the missing request timeout handling! | ||
* [Graham Hutchinson](https://github.com/ghhutch) for adding | ||
querystring-stripping option | ||
* [Jellyfrog](https://github.com/jellyfrog) for assisting in diagnosing some | ||
nasty EventEmitter issues. | ||
* [Brian Moeskau](https://github.com/bmoeskau) for helping to fix the confusing | ||
'async' events API, and providing invaluable feedback. | ||
The example below demonstrates how one might achieve basic HTML-correct | ||
discovery of only link tags using cheerio. | ||
And everybody else who has helped out in some way! :) | ||
```js | ||
crawler.discoverResources = function(buffer, queueItem) { | ||
var $ = cheerio.load(buffer.toString("utf8")); | ||
## Licence | ||
return $("a[href]").map(function () { | ||
return $(this).attr("href"); | ||
}).get(); | ||
}; | ||
``` | ||
Copyright (c) 2013, Christopher Giffard. | ||
## FAQ/Troubleshooting | ||
There are a couple of questions that pop up more often than others in the issue | ||
tracker. If you're having trouble with simplecrawler, please have a look at the | ||
list below before submitting an issue. | ||
- **Q: Why does simplecrawler discover so many invalid URLs?** | ||
A: simplecrawler's built-in discovery method is purposefully naive - it's a | ||
brute force approach intended to find everything: URLs in comments, binary files, | ||
scripts, image EXIF data, inside CSS documents, and more — useful for archiving | ||
and use cases where it's better to have false positives than fail to discover a | ||
resource. | ||
It's definitely not a solution for every case, though — if you're | ||
writing a link checker or validator, you don't want erroneous 404s | ||
throwing errors. Therefore, simplecrawler allows you to tune discovery in a few | ||
key ways: | ||
- You can either add to (or remove from) the `discoverRegex` array, tweaking | ||
the search patterns to meet your requirements; or | ||
- Swap out the `discoverResources` method. Parsing HTML pages is beyond the | ||
scope of simplecrawler, but it is very common to combine simplecrawler with | ||
a module like [cheerio](https://npmjs.com/package/cheerio) for more | ||
sophisticated resource discovery. | ||
Further documentation is available in the [link discovery](#link-discovery) | ||
section. | ||
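A rough sketch of the first approach, relying on the documented contract that functions in `discoverRegex` return an array (the `data-src` attribute is only an illustration):

```js
// Illustrative only: also discover URLs found in data-src attributes
crawler.discoverRegex.push(function(string) {
    var match = /\sdata-src\s*=\s*(["'])([^"']+)\1/i.exec(String(string));
    return match ? [match[2]] : [];
});
```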
- **Q: Why did simplecrawler complete without fetching any resources?** | ||
A: When this happens, it is usually because the initial request was redirected | ||
to a different domain that wasn't in the `domainWhitelist`. | ||
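If following that initial redirect is actually what you want, the `allowInitialDomainChange` option documented in the [configuration section](#configuration) covers it:

```js
// Let the crawler switch to whatever domain the first response redirects to
crawler.allowInitialDomainChange = true;
```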
- **Q: How do I crawl a site that requires a login?** | ||
A: Logging in to a site is usually fairly simple and only requires an
exchange of credentials over HTTP as well as the storing of a cookie that
allows the client's session to be maintained between requests to the
server. Simplecrawler doesn't have a built-in method for this entire | ||
procedure, but it does have an internal cookie jar that can be used to | ||
store the cookie that's returned from a manual HTTP request. | ||
Here's an example of how to perform a manual login HTTP request with the | ||
[request](https://npmjs.com/package/request) module and then store the | ||
returned cookie in simplecrawler's cookie jar. | ||
```js | ||
var Crawler = require("simplecrawler"), | ||
request = require("request"); | ||
var crawler = new Crawler("example.com", "/"); | ||
crawler.initialProtocol = "https"; | ||
request.post("https://example.com/login", { | ||
form: { | ||
username: "iamauser", | ||
password: "supersecurepw" | ||
} | ||
}, function (error, response, body) { | ||
crawler.cookies.addFromHeaders(response.headers["set-cookie"]); | ||
crawler.start(); | ||
}); | ||
crawler.on("fetchcomplete", function (queueItem, responseBuffer, response) { | ||
console.log("Fetched", queueItem.url); | ||
}); | ||
``` | ||
- **Q: What does it mean that events are asynchronous?** | ||
A: One of the core concepts of node.js is its asynchronous nature. I/O | ||
operations (like network requests) take place outside of the main thread | ||
(which is where your code is executed). This is what makes node fast, the | ||
fact that it can continue executing code while there are multiple HTTP | ||
requests in flight, for example. But to be able to get back the result of | ||
the HTTP request, we need to register a function that will be called when | ||
the result is ready. This is what *asynchronous* means in node - the fact | ||
that code can continue executing while I/O operations are in progress - and | ||
it's the same concept as with AJAX requests in the browser. | ||
- **Q: Promises are nice, can I use them with simplecrawler?** | ||
A: No, not really. Promises are meant as a replacement for callbacks, but | ||
simplecrawler is event driven, not callback driven. Using promises to any
greater extent in simplecrawler wouldn't make much sense, since you normally | ||
need to react more than once to what happens in simplecrawler. | ||
- **Q: Something's happening and I don't see the output I'm expecting!** | ||
Before filing an issue, check to see that you're not just missing something by | ||
logging *all* crawler events with the code below: | ||
```js | ||
var originalEmit = crawler.emit; | ||
crawler.emit = function(evtName, queueItem) { | ||
crawler.queue.complete(function(err, completeCount) { | ||
if (err) { | ||
throw err; | ||
} | ||
crawler.queue.getLength(function(err, length) { | ||
if (err) { | ||
throw err; | ||
} | ||
console.log("fetched %d of %d — %d open requests, %d open listeners", | ||
completeCount, | ||
length, | ||
crawler._openRequests, | ||
crawler._openListeners); | ||
}); | ||
}); | ||
console.log(evtName, queueItem ? queueItem.url ? queueItem.url : queueItem : null); | ||
originalEmit.apply(crawler, arguments); | ||
}; | ||
``` | ||
If you don't see what you need after inserting that code block, and you still need help, | ||
please attach the output of all the events fired with your email/issue. | ||
## Current Maintainers | ||
* [Christopher Giffard](https://github.com/cgiffard) | ||
* [Fredrik Ekelund](https://github.com/fredrikekelund) | ||
* [XhmikosR](https://github.com/XhmikosR) | ||
## Contributors | ||
simplecrawler has benefited from the kind efforts of dozens of contributors, to | ||
whom we are incredibly grateful. We originally listed their individual | ||
contributions but it became pretty unwieldy - the | ||
[full list can be found here.](https://github.com/cgiffard/node-simplecrawler/graphs/contributors) | ||
## License | ||
Copyright (c) 2016, Christopher Giffard. | ||
All rights reserved. | ||
@@ -678,0 +810,0 @@ |