simplecrawler
Comparing version 0.5.4 to 0.6.0
@@ -198,9 +198,7 @@ /*
         }
+    } else if (count === pathStack.length - 1) {
+        // Write the file data in
+        writeFileData(currentPath, data);
     } else {
-        if (count === pathStack.length - 1) {
-            // Write the file data in
-            writeFileData(currentPath, data);
-        }
         fs.mkdirSync(currentPath);
     }
@@ -207,0 +205,0 @@ });
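Note: the old else-branch wrote the cached file and then still fell through to fs.mkdirSync on the same path, which throws once the file exists; the else-if keeps the two cases exclusive. A minimal standalone sketch of that failure mode (paths are illustrative):

    var fs = require("fs");

    var dir = "/tmp/simplecrawler-demo";
    var filePath = dir + "/cache-entry";

    if (!fs.existsSync(dir)) fs.mkdirSync(dir);
    fs.writeFileSync(filePath, "response body");

    try {
        fs.mkdirSync(filePath); // what the 0.5.4 else-branch went on to do
    } catch (e) {
        console.log(e.code); // EEXIST — the file already occupies that path
    }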
@@ -0,0 +0,0 @@ /*
 // CLI module for crawling.
 // Not yet built.
@@ -440,2 +440,6 @@ /*
 Cookie.prototype.matchDomain = function(domain) {
+    if (this.domain === "*") {
+        return true;
+    }
+
     var reverseDomain = this.domain.split("").reverse().join(""),
@@ -442,0 +446,0 @@ reverseDomainComp = domain.split("").reverse().join("");
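The wildcard short-circuit means a cookie whose domain is "*" now matches every host before the reversed-string comparison runs. A rough sketch, assuming the Cookie class is reachable via lib/cookies and that the domain field can be assigned directly:

    var Cookie = require("simplecrawler/lib/cookies").Cookie;

    var cookie = new Cookie("session", "abc123");
    cookie.domain = "*";

    console.log(cookie.matchDomain("example.com"));     // true
    console.log(cookie.matchDomain("sub.example.com")); // true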
@@ -19,3 +19,4 @@ /*
     zlib = require("zlib"),
-    util = require("util");
+    util = require("util"),
+    iconv = require("iconv-lite");
@@ -85,2 +86,6 @@ var QUEUE_ITEM_INITIAL_DEPTH = 1;
+    // Decode HTTP responses based on their Content-Type header or any
+    // inline charset definition
+    crawler.decodeResponses = false;
+
     // Do we filter by domain?
@@ -156,4 +161,4 @@ // Unless you want to be crawling the entire internet, I would
     crawler.discoverRegex = [
-        /\s?(?:href|src)\s?=\s?(["']).*?\1/ig,
-        /\s?(?:href|src)\s?=\s?[^"'][^\s>]+/ig,
+        /\s(?:href|src)\s?=\s?(["']).*?\1/ig,
+        /\s(?:href|src)\s?=\s?[^"'\s][^\s>]+/ig,
         /\s?url\((["']).*?\1\)/ig,
@@ -188,2 +193,6 @@ /\s?url\([^"'].*?\)/ig,
+    // The HTTP / HTTPS agent used to crawl
+    crawler.httpAgent = http.globalAgent;
+    crawler.httpsAgent = https.globalAgent;
+
     // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
@@ -250,16 +259,16 @@ var hiddenProps = {
-        crawler.crawlIntervalID =
-            setInterval(
-                function() {
-                    crawler.crawl(crawler);
-                },
-                crawler.interval);
+        process.nextTick(function() {
+            crawler.crawlIntervalID =
+                setInterval(
+                    function() {
+                        crawler.crawl(crawler);
+                    },
+                    crawler.interval);
-        crawler.emit("crawlstart");
-        crawler.running = true;
-        // Now kick off the initial crawl
-        process.nextTick(function() {
+            // Now kick off the initial crawl
             crawler.crawl();
         });
+        crawler.running = true;
+        crawler.emit("crawlstart");
     });
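Worth noting: besides deferring the interval and the first crawl to the next tick, the rewrite swaps the order of running = true and the crawlstart emission, so a crawlstart listener that inspects crawler.running now sees true. A sketch under that reading of the hunk (host/path constructor arguments assumed):

    var Crawler = require("simplecrawler");

    var crawler = new Crawler("example.com", "/");

    crawler.on("crawlstart", function() {
        // 0.5.4 emitted before setting the flag, so this logged false
        console.log("running:", crawler.running); // true in 0.6.0
    });

    crawler.start();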
@@ -448,4 +457,5 @@
         .replace(/^\s*/, "")
+        .replace(/^(['"])(.*)\1$/, "$2")
         .replace(/^url\((.*)\)/i, "$1")
-        .replace(/^javascript\:\s*[a-z0-9]+\((.*)/i, "$1")
+        .replace(/^javascript\:\s*([a-z0-9]*\(["'](.*)["']\))*.*/i, "$2")
         .replace(/^(['"])(.*)\1$/, "$2")
@@ -476,4 +486,3 @@ .replace(/^\((.*)\)$/, "$1")
 Crawler.prototype.cleanExpandResources = function (urlMatch, queueItem) {
-    var crawler = this,
-        resources = [];
+    var crawler = this;
@@ -510,3 +519,3 @@ if (!urlMatch) {
         // Does the item already exist in the list?
-        if (resources.reduce(function(prev, current) {
+        if (list.reduce(function(prev, current) {
             return prev || current === URL;
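This is a straight bug fix: resources was initialised once and never appended to, so the duplicate check always passed; list is the array the function actually accumulates into. The check itself, replayed standalone:

    var list = ["http://example.com/", "http://example.com/about"];
    var URL = "http://example.com/";

    var alreadyListed = list.reduce(function(prev, current) {
        return prev || current === URL;
    }, false);

    console.log(alreadyListed); // true, so URL is not pushed again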
@@ -827,6 +836,7 @@ }, false)) {
     var client = queueItem.protocol === "https" ? https : http;
+    var agent = queueItem.protocol === "https" ? crawler.httpsAgent : crawler.httpAgent;
     // Up the socket limit if required.
-    if (client.globalAgent.maxSockets < crawler.maxConcurrency) {
-        client.globalAgent.maxSockets = crawler.maxConcurrency;
+    if (agent.maxSockets < crawler.maxConcurrency) {
+        agent.maxSockets = crawler.maxConcurrency;
     }
@@ -852,2 +862,3 @@
         path: requestPath,
+        agent: agent,
         headers: {
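Since requests now go through crawler.httpAgent / crawler.httpsAgent rather than mutating the shared globalAgent, a caller can supply tuned agents without side effects on other code in the process. A hedged sketch using Node's stock Agent options:

    var http = require("http");
    var https = require("https");
    var Crawler = require("simplecrawler");

    var crawler = new Crawler("example.com", "/");

    // Keep-alive agents with an explicit socket cap; per the hunk above,
    // maxSockets is still raised to crawler.maxConcurrency if lower.
    crawler.httpAgent = new http.Agent({ keepAlive: true, maxSockets: 10 });
    crawler.httpsAgent = new https.Agent({ keepAlive: true, maxSockets: 10 });

    crawler.start();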
@@ -918,51 +929,49 @@ "User-Agent": crawler.userAgent,
-    process.nextTick(function() {
-        // Record what time we started this request
-        timeCommenced = Date.now();
+    // Record what time we started this request
+    timeCommenced = Date.now();
-        // Get the resource!
-        clientRequest =
-            client.request(requestOptions, function(response) {
-                crawler.handleResponse(queueItem, response, timeCommenced);
-            });
+    // Get the resource!
+    clientRequest =
+        client.request(requestOptions, function(response) {
+            crawler.handleResponse(queueItem, response, timeCommenced);
+        });
-        clientRequest.end();
+    clientRequest.end();
-        clientRequest.setTimeout(crawler.timeout, function() {
-            if (queueItem.fetched) {
-                return;
-            }
+    clientRequest.setTimeout(crawler.timeout, function() {
+        if (queueItem.fetched) {
+            return;
+        }
-            if (crawler.running && !queueItem.fetched) {
-                crawler._openRequests--;
-            }
+        if (crawler.running && !queueItem.fetched) {
+            crawler._openRequests--;
+        }
-            queueItem.fetched = true;
-            queueItem.status = "timeout";
-            crawler.emit("fetchtimeout", queueItem, crawler.timeout);
-            clientRequest._crawlerHandled = true;
-            clientRequest.abort();
-        });
+        queueItem.fetched = true;
+        queueItem.status = "timeout";
+        crawler.emit("fetchtimeout", queueItem, crawler.timeout);
+        clientRequest._crawlerHandled = true;
+        clientRequest.abort();
+    });
-        clientRequest.on("error", function(errorData) {
+    clientRequest.on("error", function(errorData) {
-            // This event will be thrown if we manually aborted the request,
-            // but we don't want to do anything in that case.
-            if (clientRequest._crawlerHandled) {
-                return;
-            }
+        // This event will be thrown if we manually aborted the request,
+        // but we don't want to do anything in that case.
+        if (clientRequest._crawlerHandled) {
+            return;
+        }
-            if (crawler.running && !queueItem.fetched) {
-                crawler._openRequests--;
-            }
+        if (crawler.running && !queueItem.fetched) {
+            crawler._openRequests--;
+        }
-            // Emit 5xx / 4xx event
-            queueItem.fetched = true;
-            queueItem.stateData.code = 599;
-            queueItem.status = "failed";
-            crawler.emit("fetchclienterror", queueItem, errorData);
-        });
+        // Emit 5xx / 4xx event
+        queueItem.fetched = true;
+        queueItem.stateData.code = 599;
+        queueItem.status = "failed";
+        crawler.emit("fetchclienterror", queueItem, errorData);
+    });
-        return crawler;
-    });
+    return crawler;
 };
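For consumers, the observable surface of this block is the pair of events it emits. A minimal listener sketch (event names and payloads as shown in the hunk):

    var Crawler = require("simplecrawler");

    var crawler = new Crawler("example.com", "/");
    crawler.timeout = 10000; // ms before the abort above kicks in

    crawler.on("fetchtimeout", function(queueItem, timeout) {
        console.log("timed out after " + timeout + "ms: " + queueItem.url);
    });

    crawler.on("fetchclienterror", function(queueItem, errorData) {
        // queueItem.stateData.code is the synthetic 599 set above
        console.log("client error on " + queueItem.url, errorData);
    });

    crawler.start();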
@@ -972,2 +981,27 @@
+/*
+    Decode string buffer based on a complete Content-Type header. Will also look
+    for an embedded <meta> tag with a charset definition, but the Content-Type
+    header is prioritized, see:
+    https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#attr-charset
+
+    Examples
+
+        crawler.decodeBuffer(responseBuffer, "text/html; charset=tis-620");
+
+    Returns the decoded buffer.
+*/
+Crawler.prototype.decodeBuffer = function (buffer, contentTypeHeader) {
+    contentTypeHeader = contentTypeHeader || "";
+
+    var embeddedEncoding = /<meta.*charset=["']{0,1}([^"'>]*)["']{0,1}\s*\/{0,1}>/i.exec(buffer.toString(undefined, 0, 512)) || [],
+        encoding = contentTypeHeader.split("charset=")[1] || embeddedEncoding[1] || contentTypeHeader;
+
+    encoding = iconv.encodingExists(encoding) ? encoding : "utf-8";
+
+    return iconv.decode(buffer, encoding);
+};
+
 /*
 Public: Given a queueItem and a matching response object, the crawler will
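Tying the new pieces together: with decodeResponses enabled, handleResponse (next hunk) routes the raw buffer through decodeBuffer before emitting fetchcomplete, so listeners get a decoded string instead of a Buffer. A sketch of the intended usage:

    var Crawler = require("simplecrawler");

    var crawler = new Crawler("example.com", "/");
    crawler.decodeResponses = true;

    crawler.on("fetchcomplete", function(queueItem, responseBody) {
        // responseBody is now a string decoded per Content-Type or an
        // embedded <meta charset>, rather than a raw Buffer
        console.log(queueItem.url, typeof responseBody);
    });

    crawler.start();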
@@ -1053,4 +1087,7 @@ handle downloading the resource, queueing of linked items, etc.
     if (crawler.depthAllowed(queueItem)) {
-        crawler.emit("fetchcomplete", queueItem, responseBuffer, response);
+        var responseBody =
+            crawler.decodeResponses ? crawler.decodeBuffer(responseBuffer, stateData.contentType) : responseBuffer;
+
+        crawler.emit("fetchcomplete", queueItem, responseBody, response);
         // We only process the item if it's of a valid mimetype
@@ -1142,3 +1179,3 @@ // and only if the crawler is set to discover its own resources
-        response.socket.end();
+        response.socket.destroy();
     }
@@ -1197,3 +1234,3 @@
     crawler.emit("fetcherror", queueItem, response);
-    response.socket.end();
+    response.socket.destroy();
@@ -1200,0 +1237,0 @@ crawler._openRequests--;
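end() only begins a graceful shutdown and leaves the descriptor open until the peer closes its side; destroy() tears the socket down immediately, which is the right call for responses the crawler has finished with. Illustrated on a bare TCP pair:

    var net = require("net");

    var server = net.createServer(function(socket) {
        socket.end("hello");
    });

    server.listen(0, function() {
        var client = net.connect(server.address().port, function() {
            client.destroy(); // immediate teardown, no wait for a FIN round-trip
            server.close();
        });
    });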
@@ -0,0 +0,0 @@ /*
@@ -66,3 +66,3 @@ /*
-    var tmpCrawler = new Crawler(url.hostname(), url.path(), url.port() || 80);
+    var tmpCrawler = new Crawler(url.hostname(), url.path(), url.port() || (url.protocol() === "https" ? 443 : 80));
@@ -69,0 +69,0 @@ // Attach callbacks if they were provided
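With this change the one-shot helper infers port 443 for https URLs instead of always assuming 80. A sketch of the call path, assuming the helper accepts a completion callback as the "Attach callbacks" context suggests:

    var Crawler = require("simplecrawler");

    // 0.5.4 built this crawler against port 80 even for https URLs;
    // 0.6.0 falls back to 443 when the URL carries no explicit port.
    Crawler.crawl("https://example.com/", function(queueItem) {
        console.log("fetched " + queueItem.url);
    });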
 {
     "name": "simplecrawler",
     "description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
-    "version": "0.5.4",
+    "version": "0.6.0",
     "homepage": "https://github.com/cgiffard/node-simplecrawler",
@@ -34,2 +34,3 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
     "dependencies": {
+        "iconv-lite": "^0.4.13",
         "urijs": "^1.16.1"
@@ -36,0 +37,0 @@ },
New author
Supply chain risk: a new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but they do indicate a change to the security surface area of a package.
Found 1 instance in 1 package.
+ Added iconv-lite@^0.4.13
+ Added iconv-lite@0.4.24 (transitive)
+ Added safer-buffer@2.1.2 (transitive)