simplecrawler
Advanced tools
Comparing version 0.0.9 to 0.0.10
42
index.js
@@ -7,11 +7,15 @@ // Simplecrawler | ||
// Queue Dependency | ||
var FetchQueue = require("./queue.js").queue; | ||
var Cache = require("./cache.js").Cache; | ||
var EventEmitter = require('events').EventEmitter; | ||
var http = require("http"), | ||
https = require("https"); | ||
var FetchQueue = require("./queue.js").queue, | ||
Cache = require("./cache.js").Cache | ||
MetaInfo = require("./package.json"); | ||
var http = require("http"), | ||
https = require("https"), | ||
EventEmitter = require('events').EventEmitter; | ||
// Crawler Constructor | ||
var Crawler = function(host,initialPath,initialPort,interval) { | ||
// SETTINGS TO STUFF WITH (not here! Do it when you create a `new Crawler()`) | ||
// SETTINGS TO STUFF WITH | ||
// (not here! Do it when you create a `new Crawler()`) | ||
// Domain to crawl | ||
@@ -25,7 +29,9 @@ this.host = host || ""; | ||
// Internal 'tick' interval for spawning new requests (as long as concurrency is under cap) | ||
// Internal 'tick' interval for spawning new requests | ||
// (as long as concurrency is under cap) | ||
// One request will be spooled per tick, up to the concurrency threshold. | ||
this.interval = interval || 250; | ||
// Maximum request concurrency. Be sensible. Five ties in with node's default maxSockets value. | ||
// Maximum request concurrency. Be sensible. Five ties in with node's | ||
// default maxSockets value. | ||
this.maxConcurrency = 5; | ||
@@ -37,9 +43,13 @@ | ||
// User Agent | ||
this.userAgent = "Node/SimpleCrawler 0.0.8 (http://www.github.com/cgiffard/node-simplecrawler)"; | ||
this.userAgent | ||
= "Node/" + MetaInfo.name + " " + MetaInfo.version + | ||
" (" + MetaInfo.repository.url + ")"; | ||
// Queue for requests - FetchQueue gives us stats and other sugar (but it's basically just an array) | ||
// Queue for requests - FetchQueue gives us stats and other sugar | ||
// (but it's basically just an array) | ||
this.queue = new FetchQueue(); | ||
// Do we filter by domain? | ||
// Unless you want to be crawling the entire internet, I would recommend leaving this on! | ||
// Unless you want to be crawling the entire internet, I would | ||
// recommend leaving this on! | ||
this.filterByDomain = true; | ||
@@ -50,3 +60,4 @@ | ||
// Treat WWW subdomain the same as the main domain (and don't count it as a separate subdomain) | ||
// Treat WWW subdomain the same as the main domain (and don't count | ||
// it as a separate subdomain) | ||
this.ignoreWWWDomain = true; | ||
@@ -57,3 +68,4 @@ | ||
// Use simplecrawler's internal resource discovery function (switch it off if you'd prefer to discover and queue resources yourself!) | ||
// Use simplecrawler's internal resource discovery function (switch it off | ||
// if you'd prefer to discover and queue resources yourself!) | ||
this.discoverResources = true; | ||
@@ -121,5 +133,5 @@ | ||
// Check whether we're global, domain-absolute or relative | ||
if (URL.match(/^http(s)?:\/\//i)) { | ||
if (URL.match(/^http(s)?:\/\//i) || URL.match(/^\/\//)) { | ||
// We're global. Try and extract domain and port | ||
split = URL.replace(/^http(s)?:\/\//i,"").split(/\//g); | ||
split = URL.replace(/^(http(s)?:)?\/\//i,"").split(/\//g); | ||
hostData = split[0] && split[0].length ? split[0] : host; | ||
@@ -126,0 +138,0 @@ |
{ | ||
"name": "simplecrawler", | ||
"description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.", | ||
"version": "0.0.9", | ||
"version": "0.0.10", | ||
"homepage": "http://github.com/cgiffard/node-simplecrawler", | ||
@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>", |
Sorry, the diff of this file is not supported yet
53704
996