simplecrawler - npm Package Compare versions

Comparing version 0.0.9 to 0.0.10

index.js

		@@ -7,11 +7,15 @@ // Simplecrawler
		// Queue Dependency
		var FetchQueue = require("./queue.js").queue;
		var Cache = require("./cache.js").Cache;
		var EventEmitter = require('events').EventEmitter;
		var http = require("http"),
		https = require("https");
		var FetchQueue = require("./queue.js").queue,
		Cache = require("./cache.js").Cache
		MetaInfo = require("./package.json");

		var http = require("http"),
		https = require("https"),
		EventEmitter = require('events').EventEmitter;

		// Crawler Constructor
		var Crawler = function(host,initialPath,initialPort,interval) {
		// SETTINGS TO STUFF WITH (not here! Do it when you create a `new Crawler()`)
		// SETTINGS TO STUFF WITH
		// (not here! Do it when you create a `new Crawler()`)

		// Domain to crawl
		@@ -25,7 +29,9 @@ this.host = host || "";

		// Internal 'tick' interval for spawning new requests (as long as concurrency is under cap)
		// Internal 'tick' interval for spawning new requests
		// (as long as concurrency is under cap)
		// One request will be spooled per tick, up to the concurrency threshold.
		this.interval = interval || 250;

		// Maximum request concurrency. Be sensible. Five ties in with node's default maxSockets value.
		// Maximum request concurrency. Be sensible. Five ties in with node's
		// default maxSockets value.
		this.maxConcurrency = 5;
		@@ -37,9 +43,13 @@
		// User Agent
		this.userAgent = "Node/SimpleCrawler 0.0.8 (http://www.github.com/cgiffard/node-simplecrawler)";
		this.userAgent
		= "Node/" + MetaInfo.name + " " + MetaInfo.version +
		" (" + MetaInfo.repository.url + ")";

		// Queue for requests - FetchQueue gives us stats and other sugar (but it's basically just an array)
		// Queue for requests - FetchQueue gives us stats and other sugar
		// (but it's basically just an array)
		this.queue = new FetchQueue();

		// Do we filter by domain?
		// Unless you want to be crawling the entire internet, I would recommend leaving this on!
		// Unless you want to be crawling the entire internet, I would
		// recommend leaving this on!
		this.filterByDomain = true;
		@@ -50,3 +60,4 @@

		// Treat WWW subdomain the same as the main domain (and don't count it as a separate subdomain)
		// Treat WWW subdomain the same as the main domain (and don't count
		// it as a separate subdomain)
		this.ignoreWWWDomain = true;
		@@ -57,3 +68,4 @@

		// Use simplecrawler's internal resource discovery function (switch it off if you'd prefer to discover and queue resources yourself!)
		// Use simplecrawler's internal resource discovery function (switch it off
		// if you'd prefer to discover and queue resources yourself!)
		this.discoverResources = true;
		@@ -121,5 +133,5 @@
		// Check whether we're global, domain-absolute or relative
		if (URL.match(/^http(s)?:\/\//i)) {
		if (URL.match(/^http(s)?:\/\//i) || URL.match(/^\/\//)) {
		// We're global. Try and extract domain and port
		split = URL.replace(/^http(s)?:\/\//i,"").split(/\//g);
		split = URL.replace(/^(http(s)?:)?\/\//i,"").split(/\//g);
		hostData = split[0] && split[0].length ? split[0] : host;
		@@ -126,0 +138,0 @@

package.json

		{
		"name": "simplecrawler",
		"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
		"version": "0.0.9",
		"version": "0.0.10",
		"homepage": "http://github.com/cgiffard/node-simplecrawler",
		@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",

README.markdown

Sorry, the diff of this file is not supported yet

simplecrawler - npm Package Compare versions

Improved metrics