simplecrawler
Comparing version 0.1.4 to 0.1.5
// Simplecrawler
-// Christopher Giffard, 2011
+// Christopher Giffard, 2011 - 2013+
//
@@ -16,3 +16,20 @@ // http://www.github.com/cgiffard/node-simplecrawler
// Crawler Constructor
+/*
+	Public: Constructor for the crawler.
+
+	host        - Initial hostname/domain to begin crawling from. By
+	              default, the crawl will be locked to this hostname.
+	initialPath - Initial path to begin crawling from.
+	initialPort - Port to begin crawling from.
+	interval    - Request interval for the crawler. Defaults to 250ms.
+
+	Examples
+
+		var crawler = new Crawler("example.com","/",80,500);
+		var crawler = new Crawler("example.com");
+
+	Returns the crawler object which has now been constructed.
+*/
var Crawler = function(host,initialPath,initialPort,interval) {
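A minimal construction sketch for the above (requiring the module by its npm package name is an assumption; adjust the path for a local checkout):

    var Crawler = require("simplecrawler");

    // Host only: path, port and request interval fall back to their
    // defaults (the interval defaults to 250ms per the doc comment).
    var crawler = new Crawler("example.com");

    // Fully specified: crawl example.com:80 from "/" at a 500ms interval.
    var politeCrawler = new Crawler("example.com", "/", 80, 500);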
@@ -115,2 +132,16 @@ // Data integrity checks
+/*
+	Public: Starts or resumes the crawl. If the queue is empty, it adds a new
+	queue item from which to begin crawling based on the initial configuration
+	of the crawler itself. The crawler waits for process.nextTick to begin, so
+	handlers and other properties can be altered or addressed before the crawl
+	commences.
+
+	Examples
+
+		crawler.start();
+
+	Returns the crawler object, to enable chaining.
+*/
Crawler.prototype.start = function() {
@@ -144,5 +175,19 @@ var crawler = this;
	});
	return this;
};
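Because start() defers via process.nextTick and returns the crawler for chaining, handlers attached immediately after the call are still in time for the first events. A sketch (treating the handler's queue-item-first argument order as an assumption):

    var crawler = new Crawler("example.com").start();

    // Attached after start(), but still early enough thanks to nextTick.
    crawler.on("fetchcomplete", function(queueItem) {
        console.log("Completed fetching:", queueItem.url);
    });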
-// Determines whether the protocol is supported, given a URL
+/*
+	Public: Determines whether the protocol is supported, given a URL.
+
+	URL - URL with a protocol, for testing.
+
+	Examples
+
+		crawler.protocolSupported("http://google.com/") // true, by default
+		crawler.protocolSupported("wss://google.com/")  // false, by default
+
+	Returns a boolean, true if the protocol is supported - false if not.
+*/
Crawler.prototype.protocolSupported = function(URL) {
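One plausible use is pre-filtering candidate links before queueing them; a sketch (candidateURL is a hypothetical variable):

    var candidateURL = "ftp://example.com/archive.tar.gz";

    // Only queue what the crawler is actually willing to fetch.
    if (crawler.protocolSupported(candidateURL)) {
        crawler.queueURL(candidateURL);
    }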
@@ -164,3 +209,15 @@ var protocol;
-// Determines whether the mimetype is supported, given a... mimetype
+/*
+	Public: Determines whether the mimetype is supported, given a mimetype.
+
+	MIMEType - String containing MIME type to test.
+
+	Examples
+
+		crawler.mimeTypeSupported("text/html")                // true, by default
+		crawler.mimeTypeSupported("application/octet-stream") // false, by default
+
+	Returns a boolean, true if the MIME type is supported - false if not.
+*/
Crawler.prototype.mimeTypeSupported = function(MIMEType) {
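A sketch testing a raw Content-Type header; stripping parameters such as charset before the test is a defensive assumption, since only the bare MIME type is documented here:

    // A raw Content-Type header, e.g. taken from a response object.
    var contentType = "text/html; charset=utf-8";

    // Strip any ";charset=..." parameter before testing.
    var bareType = contentType.split(";")[0].trim();

    if (crawler.mimeTypeSupported(bareType)) {
        // Safe to treat the body as a linked-resource document.
    }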
@@ -175,3 +232,15 @@
-// Takes a URL, and extracts the protocol, host, port, and resource
+/*
+	Public: Extracts protocol, host, port and resource (path) given a URL string.
+
+	URL     - String containing URL to process.
+	context - Queue item the URL was found in, against which relative
+	          URLs are resolved (optional).
+
+	Examples
+
+		var URLInfo = crawler.processURL("http://www.google.com/fish");
+
+	Returns an object containing keys and values for "protocol", "host", "port",
+	and "path".
+*/
Crawler.prototype.processURL = function(URL,context) {
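The doc comment names the returned keys; a sketch of the shape (the concrete values shown are illustrative assumptions):

    var URLInfo = crawler.processURL("http://www.google.com/fish");

    // Illustrative shape - keys per the doc comment, values assumed:
    // { protocol: "http", host: "www.google.com", port: 80, path: "/fish" }
    console.log(URLInfo.protocol, URLInfo.host, URLInfo.port, URLInfo.path);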
@@ -202,5 +271,17 @@ var newURL;
-// Input some text/html and this function will return a bunch of URLs for queueing
-// (if there are actually any in the resource, otherwise it'll return an empty array)
-Crawler.prototype.discoverResources = function(resourceData,queueItem) {
+/*
+	Public: Discovers linked resources in an HTML, XML or text document.
+
+	resourceData - String containing document with linked resources.
+
+	Examples
+
+		crawler.discoverResources("http://www.google.com")
+		crawler.discoverResources("<a href='...'>test</a>")
+
+	Returns an array of the (string) resource URLs found in the document. If none
+	were found, the array will be empty.
+*/
+Crawler.prototype.discoverResources = function(resourceData) {
	var resources = [],
@@ -219,2 +300,3 @@ resourceText = resourceData.toString("utf8"),
	// or preview windows, which would otherwise be unavailable to us.
+	// Worst case scenario is we make some junky requests.
	/^javascript\:[a-z0-9]+\(['"][^'"\s]+/ig
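Per the doc comment, both raw markup and plain text containing bare URLs are valid inputs (the regex above is one of several patterns scanned). A sketch:

    var found = crawler.discoverResources(
        "<a href='/about'>About</a> - see also http://example.com/faq");

    // found is an array of URL strings; empty if nothing matched.
    found.forEach(function(url) {
        console.log("Discovered:", url);
    });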
@@ -269,3 +351,17 @@ ];
-// Checks to see whether domain is valid for crawling.
+/*
+	Public: Determines based on crawler state whether a domain is valid for
+	crawling.
+
+	host - String containing the hostname of the resource to be fetched.
+
+	Examples
+
+		crawler.domainValid("127.0.0.1");
+		crawler.domainValid("google.com");
+		crawler.domainValid("test.example.com");
+
+	Returns true if the domain is valid for crawling, false if not.
+*/
Crawler.prototype.domainValid = function(host) {
@@ -281,3 +377,4 @@ var crawler = this,
-		// If there's no whitelist, or the whitelist is of zero length, just return false.
+		// If there's no whitelist, or the whitelist is of zero length,
+		// just return false.
		if (!crawler.domainWhitelist ||
@@ -288,8 +385,14 @@ !crawler.domainWhitelist.length) return false;
		return !!crawler.domainWhitelist.reduce(function(prev,cur,index,array) {
			// If we already located the relevant domain in the whitelist...
			if (prev) return prev;
			// If the domain is just equal, return true.
			if (host === cur) return true;
-			// If we're ignoring WWW subdomains, and both domains, less www. are the same, return true.
-			if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i,"")) return true;
+			// If we're ignoring WWW subdomains, and both domains,
+			// less www. are the same, return true.
+			if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i,""))
+				return true;
			// Otherwise, sorry. No dice.
@@ -307,3 +410,4 @@ return false;
-		// If we're ignoring www, remove it from both (if www is the first domain component...)
+		// If we're ignoring www, remove it from both
+		// (if www is the first domain component...)
		if (crawler.ignoreWWWDomain) {
@@ -324,12 +428,35 @@ subdomain.replace(/^www./ig,"");
			(host === crawler.host) ||
-			// Or if we're ignoring WWW subdomains, and both domains, less www. are the same, return true.
-			(crawler.ignoreWWWDomain && crawler.host.replace(/^www\./i,"") === host.replace(/^www\./i,"")) ||
-			// Or if the domain in question exists in the domain whitelist, return true.
+			// Or if we're ignoring WWW subdomains, and both domains,
+			// less www. are the same, return true.
+			(
+				crawler.ignoreWWWDomain &&
+				crawler.host.replace(/^www\./i,"") ===
+					host.replace(/^www\./i,"")
+			) ||
+			// Or if the domain in question exists in the domain whitelist,
+			// return true.
			domainInWhitelist(host) ||
-			// Or if we're scanning subdomains, and this domain is a subdomain of the crawler's set domain, return true.
+			// Or if we're scanning subdomains, and this domain is a subdomain
+			// of the crawler's set domain, return true.
			(crawler.scanSubdomains && isSubdomainOf(host,crawler.host)));
};
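The checks above combine several pieces of crawler state; a sketch of how they interact, assuming a crawler constructed for example.com:

    var crawler = new Crawler("example.com");
    crawler.scanSubdomains = true;
    crawler.domainWhitelist = ["cdn.example.net"];

    crawler.domainValid("example.com");        // true - matches crawler.host
    crawler.domainValid("assets.example.com"); // true - subdomain scanning on
    crawler.domainValid("cdn.example.net");    // true - whitelisted
    crawler.domainValid("elsewhere.org");      // false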
-// Input some text/html and this function will delegate resource discovery, check link validity
-// and queue up resources for downloading!
+/*
+	Public: Given a text or HTML document, initiates discovery of linked
+	resources in the text, and queues the resources if applicable. Emits
+	"discoverycomplete". Not to be confused with `crawler.discoverResources`:
+	although `discoverResources` is the main component of this function, this
+	one queues the resources in addition to discovering them.
+
+	resourceData - Text document containing linked resource URLs.
+	queueItem    - Queue item from which the resource document was derived.
+
+	Examples
+
+		crawler.queueLinkedItems("<a href='...'>test</a>",queueItem);
+
+	Returns the crawler object for chaining.
+*/
Crawler.prototype.queueLinkedItems = function(resourceData,queueItem) {
@@ -344,8 +471,26 @@ var resources = this.discoverResources(resourceData,queueItem),
	resources.forEach(function(url){ crawler.queueURL(url,queueItem); });
	return this;
};
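The crawler invokes this itself during fetching (see fetchQueueItem below), so manual calls only make sense in custom pipelines; a sketch feeding a fetched body back through it (treating the data argument as a Buffer is an assumption):

    crawler.on("fetchcomplete", function(queueItem, data) {
        // Re-run discovery and queueing over the fetched body. Normally
        // redundant - the fetch cycle already does this for supported
        // MIME types.
        crawler.queueLinkedItems(data.toString("utf8"), queueItem);
    });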
-// Clean and queue a single URL...
+/*
+	Public: Given a single URL, this function cleans, validates, parses it and
+	adds it to the queue. This is the best and simplest way to add an item to
+	the queue.
+
+	url       - URL to be queued.
+	queueItem - Queue item from which the resource was linked.
+
+	Examples
+
+		crawler.queueURL("http://www.google.com/",queueItem);
+
+	Returns a boolean value indicating whether the URL was successfully queued
+	or not.
+*/
Crawler.prototype.queueURL = function(url,queueItem) {
	var crawler = this;
-	var parsedURL = typeof(url) === "object" ? url : crawler.processURL(url,queueItem);
+	var parsedURL =
+		typeof(url) === "object" ? url : crawler.processURL(url,queueItem);
@@ -389,7 +534,34 @@ // URL Parser decided this URL was junky. Next please!
			crawler.emit("queueerror",error,parsedURL);
			return false;
		}
	}
	return true;
};
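An absolute URL needs no context item to resolve against; a sketch seeding an extra start point before the crawl begins (treating queueItem as optional for absolute URLs is an assumption):

    // Seed an additional absolute URL; the boolean result reports success.
    var queued = crawler.queueURL("http://www.example.com/sitemap");

    if (!queued) {
        console.log("URL was rejected (see any emitted queueerror event).");
    }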
-// Fetch a queue item
+/*
+	Public: The guts of the crawler: takes a queue item and spools a request for
+	it, downloads, caches, and fires events based on the result of the request.
+	It kicks off resource discovery and queues any new resources found.
+
+	queueItem - Queue item to be fetched.
+
+	Emits
+
+		fetchstart
+		fetchheaders
+		fetchcomplete
+		fetchdataerror
+		notmodified
+		fetchredirect
+		fetch404
+		fetcherror
+		fetchclienterror
+
+	Examples
+
+		crawler.fetchQueueItem(queueItem);
+
+	Returns the crawler object for chaining.
+*/
Crawler.prototype.fetchQueueItem = function(queueItem) {
@@ -625,5 +797,19 @@ var crawler = this;
	});
	return crawler;
};
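In practice this method is driven by the runloop, and consumers mostly subscribe to the events it emits. A sketch wiring up two of the events listed above (the handler argument order, queue item first, is an assumption):

    crawler.on("fetchheaders", function(queueItem, response) {
        console.log(response.statusCode, "for", queueItem.url);
    });

    crawler.on("fetch404", function(queueItem) {
        console.log("Dead link:", queueItem.url);
    });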
-// Crawl init
+/*
+	Public: The main crawler runloop. Fires at the interval specified in the
+	crawler configuration, when the crawl is running. May be manually fired.
+	This function initiates fetching of a queue item if there are enough workers
+	to do so and there are unfetched items in the queue.
+
+	Examples
+
+		crawler.crawl();
+
+	Returns the crawler object for chaining.
+*/
Crawler.prototype.crawl = function() {
@@ -646,9 +832,43 @@ var crawler = this;
	});
	return crawler;
};
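Since the doc comment says crawl() may be fired manually, one can nudge the crawler without waiting for the next interval tick; a sketch (urls is a hypothetical array):

    // After bulk-seeding the queue, kick the runloop once by hand.
    urls.forEach(function(url) { crawler.queueURL(url); });
    crawler.crawl();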
+/*
+	Public: Stops the crawler, terminating the crawl runloop.
+
+	Examples
+
+		crawler.stop();
+
+	Returns the crawler object for chaining.
+*/
Crawler.prototype.stop = function() {
	clearInterval(this.crawlIntervalID);
	this.running = false;
	return this;
};
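Because start() "starts or resumes" the crawl, stop() and start() pair up naturally for pausing; a sketch:

    // Pause the crawl, then resume from the existing queue a minute later.
    crawler.stop();

    setTimeout(function() {
        crawler.start();
    }, 60000);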
+/*
+	Public: Given a function, this method adds it to an internal list maintained
+	by the crawler to be executed against each URL to determine whether it should
+	be fetched or not.
+
+	callback - Function to be called when evaluating a URL. This function is
+	           passed an object containing the protocol, hostname, port, and path
+	           of a resource to be fetched. It can determine whether it should
+	           be requested or not by returning a boolean - false for no, true
+	           for yes.
+
+	Examples
+
+		crawler.addFetchCondition(function(parsedURL) {
+			return (parsedURL.host !== "evildomain.com");
+		});
+
+	Returns the ID of the fetch condition - used for removing it from the crawler
+	later.
+*/
Crawler.prototype.addFetchCondition = function(callback) {
@@ -663,2 +883,16 @@ if (callback instanceof Function) {
+/*
+	Public: Given the ID of an existing fetch condition, this function removes
+	it from the crawler's internal list of conditions.
+
+	index - ID of the fetch condition to be removed.
+
+	Examples
+
+		crawler.removeFetchCondition(3);
+
+	Returns true if the fetch condition was removed, and throws an error if it
+	could not be found.
+*/
Crawler.prototype.removeFetchCondition = function(index) {
@@ -665,0 +899,0 @@ if (this.fetchConditions[index] &&
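The ID returned by addFetchCondition round-trips into removeFetchCondition; a sketch (the .path test relies on the parsed-URL shape documented for processURL):

    // Temporarily skip PDF resources.
    var conditionID = crawler.addFetchCondition(function(parsedURL) {
        return !/\.pdf$/i.test(parsedURL.path);
    });

    // Later, lift the restriction; throws if the ID is unknown.
    crawler.removeFetchCondition(conditionID);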
@@ -35,3 +35,3 @@ var Crawler = require("./crawler.js"),
-	Returns the new Vixen object which has now been constructed.
+	Returns the crawler object which has now been constructed.
@@ -38,0 +38,0 @@ */
{
	"name": "simplecrawler",
	"description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
-	"version": "0.1.4",
+	"version": "0.1.5",
	"homepage": "http://github.com/cgiffard/node-simplecrawler",
@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",