simplecrawler
Comparing version 0.2.5 to 0.2.6
@@ -37,2 +37,4 @@ // Simplecrawler
var Crawler = function(host,initialPath,initialPort,interval) {
var crawler = this;
// Data integrity checks
@@ -46,8 +48,8 @@ if (initialPort && isNaN(initialPort))
// Domain to crawl
this.host = host || "";
crawler.host = host || "";
// Gotta start crawling *somewhere*
this.initialPath = initialPath || "/";
this.initialPort = initialPort || 80;
this.initialProtocol = "http";
crawler.initialPath = initialPath || "/";
crawler.initialPort = initialPort || 80;
crawler.initialProtocol = "http";
@@ -57,13 +59,13 @@ // Internal 'tick' interval for spawning new requests
// One request will be spooled per tick, up to the concurrency threshold.
this.interval = interval || 250;
crawler.interval = interval || 250;
// Maximum request concurrency. Be sensible. Five ties in with node's
// default maxSockets value.
this.maxConcurrency = 5;
crawler.maxConcurrency = 5;
// Maximum time we'll wait for headers
this.timeout = 5 * 60 * 1000;
crawler.timeout = 5 * 60 * 1000;
// User Agent
this.userAgent =
crawler.userAgent =
"Node/" + MetaInfo.name + " " + MetaInfo.version +
@@ -74,3 +76,3 @@ " (" + MetaInfo.repository.url + ")";
// (but it's basically just an array)
this.queue = new FetchQueue();
crawler.queue = new FetchQueue();
@@ -80,40 +82,40 @@ // Do we filter by domain?
// recommend leaving this on!
this.filterByDomain = true;
crawler.filterByDomain = true;
// Do we scan subdomains?
this.scanSubdomains = false;
crawler.scanSubdomains = false;
// Treat WWW subdomain the same as the main domain (and don't count
// it as a separate subdomain)
this.ignoreWWWDomain = true;
crawler.ignoreWWWDomain = true;
// Or go even further and strip WWW subdomain from domains altogether!
this.stripWWWDomain = false;
crawler.stripWWWDomain = false;
// Internal cachestore
this.cache = null;
crawler.cache = null;
// Use an HTTP Proxy?
this.useProxy = false;
this.proxyHostname = "127.0.0.1";
this.proxyPort = 8123;
crawler.useProxy = false;
crawler.proxyHostname = "127.0.0.1";
crawler.proxyPort = 8123;
// Support for HTTP basic auth
this.needsAuth = false;
this.authUser = "";
this.authPass = "";
crawler.needsAuth = false;
crawler.authUser = "";
crawler.authPass = "";
// Support for retaining cookies for parse duration
this.acceptCookies = true;
this.cookies = new CookieJar();
crawler.acceptCookies = true;
crawler.cookies = new CookieJar();
// Support for custom headers...
this.customHeaders = {};
crawler.customHeaders = {};
// Domain Whitelist
// We allow domains to be whitelisted, so cross-domain requests can be made.
this.domainWhitelist = [];
crawler.domainWhitelist = [];
// Supported Protocols
this.allowedProtocols = [
crawler.allowedProtocols = [
/^http(s)?$/i, // HTTP & HTTPS
@@ -124,7 +126,7 @@ /^(rss|atom|feed)(\+xml)?$/i // RSS / XML
// Max file size to download/store
this.maxResourceSize = 1024 * 1024 * 16; // 16mb
crawler.maxResourceSize = 1024 * 1024 * 16; // 16mb
// Supported MIME-types
// Matching MIME-types will be scanned for links
this.supportedMimeTypes = [
crawler.supportedMimeTypes = [
/^text\//i,
@@ -137,13 +139,36 @@ /^application\/(rss|html|xhtml)?[\+\/\-]?xml/i,
// Download linked, but unsupported files (binary - images, documents, etc)
this.downloadUnsupported = true;
crawler.downloadUnsupported = true;
// URL Encoding setting...
this.urlEncoding = "unicode";
crawler.urlEncoding = "unicode";
// STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
this.openRequests = 0;
this.fetchConditions = [];
crawler._openRequests = 0;
crawler._fetchConditions = [];
crawler._openListeners = 0;
crawler._listenerMap = {};
// Run the EventEmitter constructor
EventEmitter.call(this);
EventEmitter.call(crawler);
crawler._emitSpecial = function() {
var args = Array.prototype.slice.call(arguments,0),
event = args[0],
eventArgsLen = args.length-1,
asyncListenerCount = 0;
crawler.listeners(event).forEach(function(listener) {
if (listener.length > eventArgsLen)
asyncListenerCount++;
});
crawler._openListeners += asyncListenerCount|0;
crawler.emit.apply(crawler,args.concat([
function listenerComplete() {
if (crawler._openListeners > 0)
crawler._openListeners --;
}
]));
};
};
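
The headline change in this constructor is the new _emitSpecial helper: it compares each listener's arity with the number of arguments the event carries, counts any listener that declares one extra parameter as asynchronous, and appends a completion callback that decrements _openListeners. A minimal sketch of opting into that behaviour from consumer code (the host, port, and timing below are illustrative, not taken from the diff):

// Sketch only: an asynchronous "fetchcomplete" listener. Because it declares
// one more argument than the event normally supplies, _emitSpecial counts it
// in _openListeners until evtDone() is called, so the crawl is not reported
// complete while deferred work is still running.
var Crawler = require("simplecrawler");
var crawler = new Crawler("127.0.0.1", "/", 3000); // illustrative host/port

crawler.on("fetchcomplete", function(queueItem, data, response, evtDone) {
    setTimeout(function() {
        // ...deferred processing of the response body goes here...
        evtDone(); // signals completion, decrementing _openListeners
    }, 250);
});

crawler.start();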
@@ -171,10 +196,10 @@
// only if we haven't already got stuff in our queue...
if (!this.queue.length) {
if (!crawler.queue.length) {
// Initialise our queue by pushing the initial request data into it...
this.queue.add(
this.initialProtocol,
this.host,
this.initialPort,
this.initialPath,
crawler.queue.add(
crawler.initialProtocol,
crawler.host,
crawler.initialPort,
crawler.initialPath,
function(error) {
@@ -185,8 +210,8 @@ if (error) throw error;
this.crawlIntervalID = setInterval(function() {
crawler.crawlIntervalID = setInterval(function() {
crawler.crawl.call(crawler);
},this.interval);
},crawler.interval);
this.emit("crawlstart");
this.running = true;
crawler._emitSpecial("crawlstart");
crawler.running = true;
@@ -198,3 +223,3 @@ // Now kick off the initial crawl
return this;
return crawler;
};
@@ -216,3 +241,3 @@
Crawler.prototype.protocolSupported = function(URL) {
var protocol;
var protocol, crawler = this;
@@ -227,3 +252,3 @@ try {
return this.allowedProtocols.reduce(function(prev,protocolCheck) {
return crawler.allowedProtocols.reduce(function(prev,protocolCheck) {
return prev || !!protocolCheck.exec(protocol);
@@ -247,5 +272,6 @@ },false);
Crawler.prototype.mimeTypeSupported = function(MIMEType) {
var crawler = this;
return (
this.supportedMimeTypes.reduce(function(prev,mimeCheck) {
crawler.supportedMimeTypes.reduce(function(prev,mimeCheck) {
return prev || !!mimeCheck.exec(MIMEType);
@@ -271,3 +297,12 @@ },false)
var newURL, crawler = this;
if (!context || typeof(context) !== "object")
context = {
url: (
crawler.initialProtocol + "://" +
crawler.host + ":" +
crawler.initialPort + "/"
)
};
// If the URL didn't contain anything, don't fetch it.
@@ -499,12 +534,12 @@ if (!URL.replace(/\s+/ig,"").length) return false;
Crawler.prototype.queueLinkedItems = function(resourceData,queueItem) {
var resources = this.discoverResources(resourceData,queueItem),
crawler = this;
var crawler = this,
resources = crawler.discoverResources(resourceData,queueItem);
// Emit discovered resources. ie: might be useful in building a graph of
// page relationships.
this.emit("discoverycomplete",queueItem,resources);
crawler._emitSpecial("discoverycomplete",queueItem,resources);
resources.forEach(function(url){ crawler.queueURL(url,queueItem); });
return this;
return crawler;
};
@@ -537,9 +572,9 @@
}
// Pass this URL past fetch conditions to ensure the user thinks it's valid
var fetchDenied = false;
fetchDenied = crawler.fetchConditions.reduce(function(prev,callback) {
fetchDenied = crawler._fetchConditions.reduce(function(prev,callback) {
return fetchDenied || !callback(parsedURL);
},false);
if (fetchDenied) {
@@ -549,3 +584,3 @@ // Fetch Conditions conspired to block URL
}
// Check the domain is valid before adding it to the queue
@@ -562,6 +597,6 @@ if (crawler.domainValid(parsedURL.host)) {
// We received an error condition when adding the callback
crawler.emit("queueerror",error,parsedURL);
crawler._emitSpecial("queueerror",error,parsedURL);
} else {
crawler.emit("queueadd",newQueueItem,parsedURL);
newQueueItem.referrer = queueItem.url;
crawler._emitSpecial("queueadd",newQueueItem,parsedURL);
newQueueItem.referrer = queueItem ? queueItem.url : null;
}
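
A related change: queueURL now builds a default context from the crawler's initialProtocol, host, and initialPort when no referring queue item is supplied, and the referrer falls back to null instead of reading queueItem.url unconditionally. A short sketch of seeding a crawl directly this way (the URL and port mirror the test fixtures):

// Sketch: queueing a URL with no referring queue item. The relative-URL
// context defaults to initialProtocol://host:initialPort/, and the new
// queue item's referrer is set to null.
var crawler = new Crawler("127.0.0.1", "/", 3000); // illustrative host/port
crawler.queueURL("http://127.0.0.1:3000/async-stage1");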
@@ -572,3 +607,3 @@ }
// If we caught an error, emit queueerror
crawler.emit("queueerror",error,parsedURL);
crawler._emitSpecial("queueerror",error,parsedURL);
return false;
@@ -608,3 +643,3 @@ }
var crawler = this;
crawler.openRequests ++;
crawler._openRequests ++;
@@ -670,3 +705,3 @@ // Variable declarations
// if required.
crawler.emit("fetchstart",queueItem,requestOptions);
crawler._emitSpecial("fetchstart",queueItem,requestOptions);
@@ -684,6 +719,6 @@ process.nextTick(function() {
clientRequest.on("error",function(errorData) {
crawler.openRequests --;
crawler._openRequests --;
// Emit 5xx / 4xx event
crawler.emit("fetchclienterror",queueItem,errorData);
crawler._emitSpecial("fetchclienterror",queueItem,errorData);
queueItem.fetched = true;
@@ -748,3 +783,3 @@ queueItem.stateData.code = 599;
// Emit header receive event
crawler.emit("fetchheaders",queueItem,response);
crawler._emitSpecial("fetchheaders",queueItem,response);
@@ -772,3 +807,3 @@ // Ensure response length is reasonable...
crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
crawler._emitSpecial("fetchcomplete",queueItem,responseBuffer,response);
@@ -788,3 +823,3 @@ // First, save item to cache (if we're using a cache!)
crawler.openRequests --;
crawler._openRequests --;
}
@@ -827,3 +862,3 @@
crawler.emit("fetchdataerror",queueItem,response);
crawler._emitSpecial("fetchdataerror",queueItem,response);
}
@@ -869,3 +904,3 @@ } else {
crawler.cache.getCacheData(queueItem,function(cacheObject) {
crawler.emit("notmodified",queueItem,response,cacheObject);
crawler._emitSpecial("notmodified",queueItem,response,cacheObject);
});
@@ -875,3 +910,3 @@ } else {
// we don't send any data.
crawler.emit("notmodified",queueItem,response);
crawler._emitSpecial("notmodified",queueItem,response);
}
@@ -890,3 +925,3 @@
// Emit redirect event
crawler.emit("fetchredirect",queueItem,parsedURL,response);
crawler._emitSpecial("fetchredirect",queueItem,parsedURL,response);
@@ -896,3 +931,3 @@ // Clean URL, add to queue...
crawler.openRequests --;
crawler._openRequests --;
@@ -905,5 +940,5 @@ // Ignore this request, but record that we had a 404
// Emit 404 event
crawler.emit("fetch404",queueItem,response);
crawler._emitSpecial("fetch404",queueItem,response);
crawler.openRequests --;
crawler._openRequests --;
@@ -916,5 +951,5 @@ // And oh dear. Handle this one as well. (other 400s, 500s, etc)
// Emit 5xx / 4xx event
crawler.emit("fetcherror",queueItem,response);
crawler._emitSpecial("fetcherror",queueItem,response);
crawler.openRequests --;
crawler._openRequests --;
}
@@ -941,3 +976,3 @@
if (crawler.openRequests > crawler.maxConcurrency) return;
if (crawler._openRequests > crawler.maxConcurrency) return;
@@ -947,3 +982,6 @@ crawler.queue.oldestUnfetchedItem(function(err,queueItem) {
crawler.fetchQueueItem(queueItem);
} else if (crawler.openRequests === 0) {
} else if ( !crawler._openRequests &&
!crawler._openListeners) {
crawler.queue.complete(function(err,completeCount) {
@@ -972,5 +1010,6 @@ if (completeCount === crawler.queue.length) {
Crawler.prototype.stop = function() {
clearInterval(this.crawlIntervalID);
this.running = false;
return this;
var crawler = this;
clearInterval(crawler.crawlIntervalID);
crawler.running = false;
return crawler;
};
@@ -1000,5 +1039,6 @@
Crawler.prototype.addFetchCondition = function(callback) {
var crawler = this;
if (callback instanceof Function) {
this.fetchConditions.push(callback);
return this.fetchConditions.length - 1;
crawler._fetchConditions.push(callback);
return crawler._fetchConditions.length - 1;
} else {
@@ -1024,6 +1064,7 @@ throw new Error("Fetch Condition must be a function.");
Crawler.prototype.removeFetchCondition = function(index) {
if (this.fetchConditions[index] &&
this.fetchConditions[index] instanceof Function) {
var crawler = this;
if (crawler._fetchConditions[index] &&
crawler._fetchConditions[index] instanceof Function) {
return !!this.fetchConditions.splice(index,1);
return !!crawler._fetchConditions.splice(index,1);
} else {
@@ -1030,0 +1071,0 @@ throw new Error("Unable to find indexed Fetch Condition.");
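
Although the storage array is renamed to _fetchConditions, the public addFetchCondition/removeFetchCondition API is unchanged: the condition still receives the parsed URL, and a falsy return value blocks the fetch. A sketch, assuming the parsed URL object exposes a path property as it does elsewhere in the library:

// Sketch: fetch conditions after the 0.2.6 rename; behaviour is unchanged.
var crawler = new Crawler("127.0.0.1", "/", 3000); // illustrative host/port
var conditionID = crawler.addFetchCondition(function(parsedURL) {
    return !parsedURL.path.match(/\.pdf$/i); // e.g. skip PDFs (illustrative rule)
});

// The index returned above removes the same condition again later.
crawler.removeFetchCondition(conditionID);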
{
"name": "simplecrawler",
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
"version": "0.2.5",
"version": "0.2.6",
"homepage": "http://github.com/cgiffard/node-simplecrawler",
@@ -16,3 +16,3 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
"scripts": {
"test": "mocha -R spec"
"test": "mocha -R spec -t 4000"
},
@@ -19,0 +19,0 @@ "bin": {
@@ -27,3 +27,15 @@ // Routes for testing server
write(200,"Crawl complete!");
},
"/async-stage1": function(write) {
write(200,"http://127.0.0.1:3000/async-stage2");
},
"/async-stage2": function(write) {
write(200,"http://127.0.0.1:3000/async-stage3");
},
"/async-stage3": function(write) {
write(200,"Complete!");
}
};
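
The three new routes chain single-URL bodies so the async test can feed each response straight back into the queue. Each handler receives a write(status, body) callback; the following is only a hypothetical sketch of a server for a routes table shaped like this one, since the project's real test server is not shown here and may differ:

// Hypothetical sketch only — not the fixture's actual server code.
var http = require("http");
var routes = require("./routes"); // assumed path to the routes fixture

http.createServer(function(req, res) {
    var handler = routes[req.url];

    // Matches the write(status, body) contract used by the route handlers;
    // the Content-Type is an assumption.
    function write(status, body) {
        res.writeHead(status, { "Content-Type": "text/html" });
        res.end(body);
    }

    if (handler) handler(write);
    else write(404, "Not found");
}).listen(3000);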
@@ -14,9 +14,13 @@ // Runs a very simple crawl on an HTTP server
// Create a new crawler to crawl this server
var localCrawler = new Crawler("127.0.0.1","/",3000);
var localCrawler = new Crawler("127.0.0.1","/",3000),
asyncCrawler = new Crawler("127.0.0.1","/",3000);
var linksDiscovered = 0;
it("should be able to be started",function(done) {
localCrawler.on("crawlstart",done);
localCrawler.on("crawlstart",function() { done() });
localCrawler.on("discoverycomplete",function() {
linksDiscovered ++;
});
localCrawler.start();
@@ -33,6 +37,2 @@ localCrawler.running.should.be.truthy;
localCrawler.on("discoverycomplete",function() {
linksDiscovered ++;
});
localCrawler.on("complete",function() {
@@ -43,4 +43,31 @@ linksDiscovered.should.equal(5);
});
it("should support async event listeners for manual discovery",function(done) {
// Use a different crawler this time
asyncCrawler.discoverResources = false;
asyncCrawler.queueURL("http://127.0.0.1:3000/async-stage1");
asyncCrawler.start();
asyncCrawler.on("fetchcomplete",function(queueItem,data,res,evtDone) {
setTimeout(function(){
linksDiscovered ++;
if (String(data).match(/complete/i))
return evtDone();
// Taking advantage of the fact that for these, the sum total
// of the body data is a URL.
asyncCrawler.queueURL(String(data)).should.be.true;
evtDone();
},250);
});
asyncCrawler.on("complete",function() {
linksDiscovered.should.equal(8);
done();
});
});
// TODO
@@ -47,0 +74,0 @@
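
The expected count of 8 combines the 5 links discovered by the first crawl with one increment per async stage, and the raised mocha timeout (-t 4000 in package.json) presumably accommodates the deliberate 250 ms delay each stage adds. Because _emitSpecial distinguishes the two listener styles purely by arity, existing three-argument handlers keep working unchanged, as in this sketch (the logging is illustrative):

// Sketch: a synchronous "fetchcomplete" listener. With only three declared
// arguments it is not counted in _openListeners and needs no callback.
localCrawler.on("fetchcomplete", function(queueItem, data, response) {
    console.log("Fetched", queueItem.url, "-", data.length, "bytes");
});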