simplecrawler
Comparing version 0.3.10 to 0.3.11
@@ -19,2 +19,4 @@ // Simplecrawler
+var QUEUE_ITEM_INITIAL_DEPTH = 1;
+
 /*
@@ -162,2 +164,11 @@ Public: Constructor for the crawler.
+    // Whether to parse inside HTML comments
+    crawler.parseHTMLComments = true;
+
+    // Whether to parse inside script tags
+    crawler.parseScriptTags = true;
+
+    // Max depth parameter
+    crawler.maxDepth = 0;
+
     // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
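The three new options above are plain properties on the crawler instance. A minimal usage sketch (the host and path are made up for illustration; maxDepth = 0, the default, leaves depth unlimited per the depthAllowed check further down):

    var Crawler = require("simplecrawler");

    var crawler = new Crawler("example.com", "/", 80);
    crawler.parseHTMLComments = false; // skip URLs found inside <!-- ... -->
    crawler.parseScriptTags = false;   // skip URLs found inside <script> ... </script>
    crawler.maxDepth = 3;              // stop discovering HTML pages past depth 3
    crawler.start();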
@@ -214,2 +225,3 @@ var hiddenProps = {
         crawler.initialPath,
+        QUEUE_ITEM_INITIAL_DEPTH,
         function(error) {
@@ -296,2 +308,40 @@ if (error) throw error;
+/*
+    Public: Determines whether a queueItem may be processed, given its depth.
+    In fact, the queueItem needs to have been fetched before calling this
+    (because we need its MIMEType). This just determines whether we should
+    emit an event for the item and whether we should fetch its linked
+    resources.
+
+    If the queue item is a CSS or JS file, it is always processed (we need
+    all the images referenced in CSS files, even if the max depth has
+    already been reached). If it's an HTML page, we check whether the max
+    depth has been reached.
+
+    queueItem - Queue item object to check
+
+    Returns a boolean: true if the queue item may be processed, false if not.
+*/
+Crawler.prototype.depthAllowed = function(queueItem) {
+    var crawler = this;
+
+    // Items matching one of these patterns are always fetched, even if the
+    // max depth has been reached
+    var mimeTypesWhitelist = [
+        /^text\/(css|javascript|ecmascript)/i,
+        /^application\/javascript/i,
+        /^application\/x-font/i,
+        /^application\/font/i,
+        /^image\//i,
+        /^font\//i
+    ];
+
+    return (
+        crawler.maxDepth === 0 ||
+        queueItem.depth <= crawler.maxDepth ||
+        mimeTypesWhitelist.reduce(function(prev, mimeCheck) {
+            return prev || !!mimeCheck.exec(queueItem.stateData.contentType);
+        }, false)
+    );
+};
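To illustrate the whitelist logic (an editor's sketch, not part of the diff): with a depth limit set, HTML past the limit is refused, while whitelisted MIME types pass at any depth.

    var crawler = new Crawler("example.com", "/", 80);
    crawler.maxDepth = 2;

    crawler.depthAllowed({ depth: 3, stateData: { contentType: "text/html" } }); // false: past maxDepth, not whitelisted
    crawler.depthAllowed({ depth: 3, stateData: { contentType: "text/css" } });  // true: CSS is always allowed
    crawler.depthAllowed({ depth: 3, stateData: { contentType: "image/png" } }); // true: images are always allowed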
 /*
     Public: Extracts protocol, host, port and resource (path) given a URL string.
@@ -318,3 +368,4 @@
             crawler.initialPort + "/"
-        )
+        ),
+        depth: QUEUE_ITEM_INITIAL_DEPTH
     };
@@ -347,6 +398,7 @@
         "protocol": newURL.protocol() || "http",
-        "host": newURL.hostname(),
-        "port": newURL.port() || 80,
-        "path": newURL.resource(),
-        "uriPath": newURL.path()
+        "host": newURL.hostname(),
+        "port": newURL.port() || 80,
+        "path": newURL.resource(),
+        "uriPath": newURL.path(),
+        "depth": context.depth + 1
     };
@@ -383,2 +435,10 @@ };
+    if (!crawler.parseHTMLComments) {
+        resourceText = resourceText.replace(/<!--([\s\S]+?)-->/g, "");
+    }
+
+    if (!crawler.parseScriptTags) {
+        resourceText = resourceText.replace(/<script(.*?)>([\s\S]+?)<\/script>/gi, "");
+    }
+
     function cleanURL(URL) {
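A quick sketch of what the two replacements above do to a document before link discovery (illustrative input):

    var resourceText =
        "<!-- <a href='/commented'> --> <script>var a = '/scripted';</script> <a href='/real'>";

    resourceText = resourceText
        .replace(/<!--([\s\S]+?)-->/g, "")
        .replace(/<script(.*?)>([\s\S]+?)<\/script>/gi, "");

    // Only <a href='/real'> survives, so /commented and /scripted are never queued.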
@@ -404,11 +464,20 @@ return URL
     .reduce(function(list, URL) {
+        var tmpURL;
+
         // Ensure URL is whole and complete
         try {
-            URL = URI(URL)
-                .absoluteTo(queueItem.url)
-                .normalize()
-                .toString();
+            tmpURL = URI(URL);
+            if (queueItem.url) {
+                URL = tmpURL
+                    .absoluteTo(queueItem.url)
+                    .normalize()
+                    .toString();
+            } else {
+                URL = tmpURL
+                    .normalize()
+                    .toString();
+            }
         } catch(e) {
             // But if URI.js couldn't parse it - nobody can!
@@ -636,2 +705,3 @@ return list;
         parsedURL.path,
+        parsedURL.depth,
         function queueAddCallback(error, newQueueItem) {
@@ -723,7 +793,10 @@ if (error) {
                 ""
-            ),
-            "Referer": queueItem.referrer
+            )
         }
     };
+
+    if (queueItem.referrer) {
+        requestOptions.headers.Referer = queueItem.referrer;
+    }
+
     // If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts
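Background for the guard above: the first item in a crawl has no referrer, and an undefined header value is rejected by Node's HTTP client (the exact failure mode varies by Node version), so the Referer header is now set only when a referrer exists. A sketch of the failure the old code could hit:

    var http = require("http");

    // With queueItem.referrer undefined, the old code built headers like
    // this, which can make http.request throw an invalid-header-value error:
    var req = http.request({
        host: "example.com",
        path: "/",
        headers: { "Referer": undefined }
    });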
@@ -876,4 +949,2 @@ if (requestOptions.port === 80 || requestOptions.port === 443) {
-    crawler.emit("fetchcomplete", queueItem, responseBuffer, response);
-
     // First, save item to cache (if we're using a cache!)
@@ -886,6 +957,11 @@ if (crawler.cache !== null &&
-    // We only process the item if it's of a valid mimetype
-    // and only if the crawler is set to discover its own resources
-    if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
-        crawler.queueLinkedItems(responseBuffer, queueItem);
-    }
+    // Is the item allowed by the depth conditions?
+    if (crawler.depthAllowed(queueItem)) {
+        crawler.emit("fetchcomplete", queueItem, responseBuffer, response);
+
+        // We only process the item if it's of a valid mimetype
+        // and only if the crawler is set to discover its own resources
+        if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
+            crawler.queueLinkedItems(responseBuffer, queueItem);
+        }
+    }
@@ -26,3 +26,11 @@ // Simplecrawler - queue module
 FetchQueue.prototype = [];

-FetchQueue.prototype.add = function(protocol, domain, port, path, callback) {
+FetchQueue.prototype.add = function(protocol, domain, port, path, depth, callback) {
+
+    // For legacy reasons, accept the old signature with the callback in place of depth
+    if (depth instanceof Function) {
+        callback = depth;
+        depth = 1;
+    }
+
+    depth = depth || 1;
     callback = callback && callback instanceof Function ? callback : function(){};
@@ -51,2 +59,3 @@ var self = this;
         "path": path,
+        "depth": depth,
         "fetched": false,
         "status": "queued",
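A sketch of the two call styles the shim above accepts (the queue variable and values are illustrative; the callback signature matches queueAddCallback earlier in the diff):

    // New six-argument form: depth is stored on the queue item
    queue.add("http", "example.com", 80, "/page", 2, function(error, newQueueItem) {
        console.log(newQueueItem.depth); // 2
    });

    // Legacy five-argument form still works: depth defaults to 1
    queue.add("http", "example.com", 80, "/page", function(error, newQueueItem) {
        console.log(newQueueItem.depth); // 1
    });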
 {
     "name": "simplecrawler",
     "description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
-    "version": "0.3.10",
+    "version": "0.3.11",
     "homepage": "http://github.com/cgiffard/node-simplecrawler",
@@ -34,9 +34,9 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
     "devDependencies": {
-        "mocha": "~1.8.2",
-        "jshint": "~0.7.x",
-        "chai": "~1.2.0"
+        "chai": "^2.2.0",
+        "jshint": "^2.7.0",
+        "mocha": "^2.2.4"
     },
     "dependencies": {
-        "URIjs": "~1.10.2"
+        "URIjs": "^1.15.0"
     }
 }
@@ -11,4 +11,4 @@ // Ensures that cookie support is functional and reliable across
 "Set-cookie: adxcs=-; path=/; domain=.nytimes.com",
-"Set-Cookie: PREF=ID=8c63f2522e22574d:FF=0:TM=1366349569:LM=1366349569:S=p1Urbmfwfs-R573P; expires=Sun, 19-Apr-2015 05:32:49 GMT; path=/; domain=.google.com",
-"Set-Cookie: NID=67=DhLO04YPAMlhETrTIe2oFPqWZfypQXLZfCIPItOvf70zhtUEMEItYfdVh6aROEzRHqtd9jHT6HJ7Oo93eqP3cjYNp8GgShfa6r0WVbsmQQRUvutbjBOPwzo7ybwYcWdB; expires=Sat, 19-Oct-2015 05:32:49 GMT; path=/; domain=.google.com; HttpOnly",
+"Set-Cookie: PREF=ID=8c63f2522e22574d:FF=0:TM=1366349569:LM=1366349569:S=p1Urbmfwfs-R573P; expires=Sun, 19-Apr-2020 05:32:49 GMT; path=/; domain=.google.com",
+"Set-Cookie: NID=67=DhLO04YPAMlhETrTIe2oFPqWZfypQXLZfCIPItOvf70zhtUEMEItYfdVh6aROEzRHqtd9jHT6HJ7Oo93eqP3cjYNp8GgShfa6r0WVbsmQQRUvutbjBOPwzo7ybwYcWdB; expires=Sat, 19-Oct-2020 05:32:49 GMT; path=/; domain=.google.com; HttpOnly",
 "Set-Cookie: fpc=d=Yq1z8hbA9WextmPFlb7suMTfMRgtSc2FyzAB7now1ExfUZ.eW7s4QSwSKlB6ZB0juN8OLZxWf_XXEIcspYaQmVVD0mD0xJ.xpXBCSw5Dl_Ql6n.RLoM.7CnTbNSsiSr2fkNiCN47tRUB4j8iWevNwQdFDn1hB8z8t1xwWt76n.sLIRY9p2_jTBhukfSD4SBpBkJhI1o-&v=2; expires=Sat, 19-Apr-2020 05:48:42 GMT; path=/; domain=www.yahoo.com",
 "Set-Cookie: test=test; path=/test; domain=test.com"
@@ -8,5 +8,11 @@ // Runs a very simple crawl on an HTTP server
-var Crawler = require("../"),
-    crawler = new Crawler(),
+var Crawler = null,
+    crawler = null,
+    discover = null;
+
+beforeEach(function() {
+    Crawler = require("../");
+    crawler = new Crawler();
+    discover = crawler.discoverResources.bind(crawler);
+});
@@ -64,2 +70,36 @@ it("should discover http/s prefixed URLs in the document",function() {
     });
+
+    it("should ignore HTML comments with parseHTMLComments = false", function() {
+        crawler.parseHTMLComments = false;
+        var links =
+            discover("<!-- http://example.com/oneline_comment --> \
+                <a href=google.com> \
+                <!-- \
+                http://example.com/resource \
+                <a href=example.com> \
+                -->");
+
+        links.should.be.an("array");
+        links.length.should.equal(1);
+        links[0].should.equal("google.com");
+    });
+
+    it("should ignore script tags with parseScriptTags = false", function() {
+        crawler.parseScriptTags = false;
+        var links =
+            discover("<script>var a = \"<a href='http://example.com/oneline_script'></a>\";</script> \
+                <a href=google.com> \
+                <script type='text/javascript'> \
+                http://example.com/resource \
+                <a href=example.com> \
+                </SCRIPT>");
+
+        links.should.be.an("array");
+        links.length.should.equal(1);
+        links[0].should.equal("google.com");
+    });
 });
@@ -43,3 +43,48 @@ // Routes for testing server
         // We want to trigger a timeout. Never respond.
     },
+
+    // Routes for depth tests
+    "/depth/1": function(write) {
+        write(200, "<link rel='stylesheet' href='/css'> Home. <a href='/depth/2'>depth2</a>");
+    },
+    "/depth/2": function(write) {
+        write(200, "Depth 2. http://127.0.0.1:3000/depth/3");
+    },
+    "/depth/3": function(write) {
+        write(200, "Depth 3. <link rel='stylesheet' href='/css/2'> <link rel='stylesheet' href='/css/4'>");
+    },
+    "/css": function(write) {
+        write(200, "/* CSS 1 */ @import url('/css/2'); @font-face { url(/font/1) format('woff'); }", "text/css");
+    },
+    "/css/2": function(write) {
+        write(200, "/* CSS 2 */ @import url('/css/3'); .img1 { background-image:url('/img/1'); }", "text/css");
+    },
+    "/css/3": function(write) {
+        write(200, "/* CSS 3 */", "text/css");
+    },
+    "/css/4": function(write) {
+        write(200, "/* CSS 4 */ .img1 { background-image:url('/img/2'); } @font-face { url(/font/2) format('woff'); }", "text/css");
+    },
+    "/img/1": function(write) {
+        write(200, "", "image/png");
+    },
+    "/img/2": function(write) {
+        write(200, "", "image/png");
+    },
+    "/font/1": function(write) {
+        write(200, "", "font/woff");
+    },
+    "/font/2": function(write) {
+        write(200, "", "application/font-woff");
+    }
 };
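These routes form a small graph for the depth tests: /depth/1 links to /depth/2, which links to /depth/3, with stylesheets, images and fonts hanging off each level. A sketch of a crawl against the fixture (an editor's illustration of the expected behaviour, not part of the diff):

    var Crawler = require("simplecrawler");

    var crawler = new Crawler("127.0.0.1", "/depth/1", 3000);
    crawler.maxDepth = 2;

    crawler.on("fetchcomplete", function(queueItem) {
        // /depth/3 sits at depth 3, so no fetchcomplete fires for it and its
        // links are never queued; /css/2 (also depth 3) still completes
        // because text/css is on the MIME whitelist.
        console.log(queueItem.depth, queueItem.path);
    });

    crawler.start();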
@@ -79,2 +79,9 @@ // Runs a very simple crawl on an HTTP server
+    it('should not throw an error if header Referer is undefined', function(done) {
+        var crawler = new Crawler("127.0.0.1", "/depth/1", 3000);
+        crawler.maxDepth = 1;
+
+        crawler.start();
+        crawler.on('complete', function() { done(); });
+    });
+
     // TODO
Dependency changes:

+ Added: URIjs@1.16.1 (transitive)
- Removed: URIjs@1.10.2 (transitive)
Updated: URIjs@^1.15.0