simplecrawler
Comparing version 0.2.5 with 0.2.6
@@ -37,2 +37,4 @@ // Simplecrawler
 var Crawler = function(host,initialPath,initialPort,interval) {
+var crawler = this;
 // Data integrity checks
@@ -46,8 +48,8 @@ if (initialPort && isNaN(initialPort))
 // Domain to crawl
-this.host = host || "";
+crawler.host = host || "";
 // Gotta start crawling *somewhere*
-this.initialPath = initialPath || "/";
-this.initialPort = initialPort || 80;
-this.initialProtocol = "http";
+crawler.initialPath = initialPath || "/";
+crawler.initialPort = initialPort || 80;
+crawler.initialProtocol = "http";
@@ -57,13 +59,13 @@ // Internal 'tick' interval for spawning new requests
 // One request will be spooled per tick, up to the concurrency threshold.
-this.interval = interval || 250;
+crawler.interval = interval || 250;
 // Maximum request concurrency. Be sensible. Five ties in with node's
 // default maxSockets value.
-this.maxConcurrency = 5;
+crawler.maxConcurrency = 5;
 // Maximum time we'll wait for headers
-this.timeout = 5 * 60 * 1000;
+crawler.timeout = 5 * 60 * 1000;
 // User Agent
-this.userAgent =
+crawler.userAgent =
 "Node/" + MetaInfo.name + " " + MetaInfo.version +
@@ -74,3 +76,3 @@ " (" + MetaInfo.repository.url + ")";
 // (but it's basically just an array)
-this.queue = new FetchQueue();
+crawler.queue = new FetchQueue();
@@ -80,40 +82,40 @@ // Do we filter by domain?
 // recommend leaving this on!
-this.filterByDomain = true;
+crawler.filterByDomain = true;
 // Do we scan subdomains?
-this.scanSubdomains = false;
+crawler.scanSubdomains = false;
 // Treat WWW subdomain the same as the main domain (and don't count
 // it as a separate subdomain)
-this.ignoreWWWDomain = true;
+crawler.ignoreWWWDomain = true;
 // Or go even further and strip WWW subdomain from domains altogether!
-this.stripWWWDomain = false;
+crawler.stripWWWDomain = false;
 // Internal cachestore
-this.cache = null;
+crawler.cache = null;
 // Use an HTTP Proxy?
-this.useProxy = false;
-this.proxyHostname = "127.0.0.1";
-this.proxyPort = 8123;
+crawler.useProxy = false;
+crawler.proxyHostname = "127.0.0.1";
+crawler.proxyPort = 8123;
 // Support for HTTP basic auth
-this.needsAuth = false;
-this.authUser = "";
-this.authPass = "";
+crawler.needsAuth = false;
+crawler.authUser = "";
+crawler.authPass = "";
 // Support for retaining cookies for parse duration
-this.acceptCookies = true;
-this.cookies = new CookieJar();
+crawler.acceptCookies = true;
+crawler.cookies = new CookieJar();
 // Support for custom headers...
-this.customHeaders = {};
+crawler.customHeaders = {};
 // Domain Whitelist
 // We allow domains to be whitelisted, so cross-domain requests can be made.
-this.domainWhitelist = [];
+crawler.domainWhitelist = [];
 // Supported Protocols
-this.allowedProtocols = [
+crawler.allowedProtocols = [
 /^http(s)?$/i, // HTTP & HTTPS
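For orientation, a minimal sketch of how the constructor options shown above are typically adjusted from calling code. The export shape is not part of this diff, so the constructor is assumed to be in scope as in the package's tests; the host, port and override values are purely illustrative.

// Assumes the library's Crawler constructor is in scope (as in the tests below);
// "example.com", port 80 and a 500ms tick interval are made-up values.
var crawler = new Crawler("example.com", "/", 80, 500);

crawler.maxConcurrency = 2;        // default is 5, matching node's maxSockets
crawler.timeout = 30 * 1000;       // wait at most 30s for headers
crawler.scanSubdomains = true;     // also crawl discovered subdomains
crawler.stripWWWDomain = true;     // treat www.example.com as example.com
crawler.domainWhitelist = ["cdn.example.com"];  // permit this cross-domain host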
@@ -124,7 +126,7 @@ /^(rss|atom|feed)(\+xml)?$/i // RSS / XML
 // Max file size to download/store
-this.maxResourceSize = 1024 * 1024 * 16; // 16mb
+crawler.maxResourceSize = 1024 * 1024 * 16; // 16mb
 // Supported MIME-types
 // Matching MIME-types will be scanned for links
-this.supportedMimeTypes = [
+crawler.supportedMimeTypes = [
 /^text\//i,
@@ -137,13 +139,36 @@ /^application\/(rss|html|xhtml)?[\+\/\-]?xml/i,
 // Download linked, but unsupported files (binary - images, documents, etc)
-this.downloadUnsupported = true;
+crawler.downloadUnsupported = true;
 // URL Encoding setting...
-this.urlEncoding = "unicode";
+crawler.urlEncoding = "unicode";
 // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
-this.openRequests = 0;
-this.fetchConditions = [];
+crawler._openRequests = 0;
+crawler._fetchConditions = [];
+crawler._openListeners = 0;
+crawler._listenerMap = {};
 // Run the EventEmitter constructor
-EventEmitter.call(this);
+EventEmitter.call(crawler);
+crawler._emitSpecial = function() {
+var args = Array.prototype.slice.call(arguments,0),
+event = args[0],
+eventArgsLen = args.length-1,
+asyncListenerCount = 0;
+crawler.listeners(event).forEach(function(listener) {
+if (listener.length > eventArgsLen)
+asyncListenerCount++;
+});
+crawler._openListeners += asyncListenerCount|0;
+crawler.emit.apply(crawler,args.concat([
+function listenerComplete() {
+if (crawler._openListeners > 0)
+crawler._openListeners --;
+}
+]));
+};
 };
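The new _emitSpecial helper inspects each listener's arity: a listener that declares one more parameter than the event supplies is counted as asynchronous, _openListeners is incremented for it, and the extra argument it receives is the listenerComplete callback that decrements the counter again. A rough sketch of what such a listener looks like from consuming code, using the fetchcomplete event as in the test added in this release (crawler is an existing instance; doSomethingAsync is hypothetical):

// Declaring a fourth parameter marks this listener as asynchronous, because
// "fetchcomplete" is emitted with (queueItem, responseBuffer, response).
crawler.on("fetchcomplete", function(queueItem, responseBuffer, response, done) {
    doSomethingAsync(responseBuffer, function() {   // doSomethingAsync is made up
        // The crawl cannot be considered complete until every outstanding
        // asynchronous listener has called its completion callback.
        done();
    });
});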
@@ -171,10 +196,10 @@
 // only if we haven't already got stuff in our queue...
-if (!this.queue.length) {
+if (!crawler.queue.length) {
 // Initialise our queue by pushing the initial request data into it...
-this.queue.add(
-this.initialProtocol,
-this.host,
-this.initialPort,
-this.initialPath,
+crawler.queue.add(
+crawler.initialProtocol,
+crawler.host,
+crawler.initialPort,
+crawler.initialPath,
 function(error) {
@@ -185,8 +210,8 @@ if (error) throw error;
-this.crawlIntervalID = setInterval(function() {
+crawler.crawlIntervalID = setInterval(function() {
 crawler.crawl.call(crawler);
-},this.interval);
+},crawler.interval);
-this.emit("crawlstart");
-this.running = true;
+crawler._emitSpecial("crawlstart");
+crawler.running = true;
@@ -198,3 +223,3 @@ // Now kick off the initial crawl
-return this;
+return crawler;
 };
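How start() is driven is unchanged by this release; for context, a minimal usage sketch mirroring the tests further down (the host and port belong to the test server and are purely illustrative):

// Assumes the library's Crawler constructor is in scope, as in the tests below.
var crawler = new Crawler("127.0.0.1", "/", 3000);

crawler.on("crawlstart", function() {
    console.log("crawl started");
});
crawler.on("discoverycomplete", function(queueItem, resources) {
    console.log("found", resources.length, "resources on", queueItem.url);
});
crawler.on("complete", function() {
    console.log("crawl finished");
});

crawler.start();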
@@ -216,3 +241,3 @@
 Crawler.prototype.protocolSupported = function(URL) {
-var protocol;
+var protocol, crawler = this;
@@ -227,3 +252,3 @@ try {
-return this.allowedProtocols.reduce(function(prev,protocolCheck) {
+return crawler.allowedProtocols.reduce(function(prev,protocolCheck) {
 return prev || !!protocolCheck.exec(protocol);
@@ -247,5 +272,6 @@ },false);
 Crawler.prototype.mimeTypeSupported = function(MIMEType) {
+var crawler = this;
 return (
-this.supportedMimeTypes.reduce(function(prev,mimeCheck) {
+crawler.supportedMimeTypes.reduce(function(prev,mimeCheck) {
 return prev || !!mimeCheck.exec(MIMEType);
@@ -271,3 +297,12 @@ },false)
 var newURL, crawler = this;
+if (!context || typeof(context) !== "object")
+context = {
+url: (
+crawler.initialProtocol + "://" +
+crawler.host + ":" +
+crawler.initialPort + "/"
+)
+};
 // If the URL didn't contain anything, don't fetch it.
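With this change, queueURL can be called without a context item: when none is given, a synthetic context is built from the crawler's initialProtocol, host and initialPort, so URLs resolve against the crawl root. A small sketch (crawler is an existing instance; the sitemap path is made up for illustration):

// Relative URL: resolved against initialProtocol://host:initialPort/
crawler.queueURL("/sitemap.xml");

// Absolute URL with no originating queue item also works,
// as done in the async-listener test added in this release.
crawler.queueURL("http://127.0.0.1:3000/async-stage1");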
@@ -499,12 +534,12 @@ if (!URL.replace(/\s+/ig,"").length) return false;
 Crawler.prototype.queueLinkedItems = function(resourceData,queueItem) {
-var resources = this.discoverResources(resourceData,queueItem),
-crawler = this;
+var crawler = this,
+resources = crawler.discoverResources(resourceData,queueItem);
 // Emit discovered resources. ie: might be useful in building a graph of
 // page relationships.
-this.emit("discoverycomplete",queueItem,resources);
+crawler._emitSpecial("discoverycomplete",queueItem,resources);
 resources.forEach(function(url){ crawler.queueURL(url,queueItem); });
-return this;
+return crawler;
 };
@@ -537,9 +572,9 @@
 }
 // Pass this URL past fetch conditions to ensure the user thinks it's valid
 var fetchDenied = false;
-fetchDenied = crawler.fetchConditions.reduce(function(prev,callback) {
+fetchDenied = crawler._fetchConditions.reduce(function(prev,callback) {
 return fetchDenied || !callback(parsedURL);
 },false);
 if (fetchDenied) {
@@ -549,3 +584,3 @@ // Fetch Conditions conspired to block URL
 }
 // Check the domain is valid before adding it to the queue
@@ -562,6 +597,6 @@ if (crawler.domainValid(parsedURL.host)) {
 // We received an error condition when adding the callback
-crawler.emit("queueerror",error,parsedURL);
+crawler._emitSpecial("queueerror",error,parsedURL);
 } else {
-crawler.emit("queueadd",newQueueItem,parsedURL);
-newQueueItem.referrer = queueItem.url;
+crawler._emitSpecial("queueadd",newQueueItem,parsedURL);
+newQueueItem.referrer = queueItem ? queueItem.url : null;
 }
@@ -572,3 +607,3 @@ }
 // If we caught an error, emit queueerror
-crawler.emit("queueerror",error,parsedURL);
+crawler._emitSpecial("queueerror",error,parsedURL);
 return false;
@@ -608,3 +643,3 @@ }
 var crawler = this;
-crawler.openRequests ++;
+crawler._openRequests ++;
@@ -670,3 +705,3 @@ // Variable declarations
 // if required.
-crawler.emit("fetchstart",queueItem,requestOptions);
+crawler._emitSpecial("fetchstart",queueItem,requestOptions);
@@ -684,6 +719,6 @@ process.nextTick(function() {
 clientRequest.on("error",function(errorData) {
-crawler.openRequests --;
+crawler._openRequests --;
 // Emit 5xx / 4xx event
-crawler.emit("fetchclienterror",queueItem,errorData);
+crawler._emitSpecial("fetchclienterror",queueItem,errorData);
 queueItem.fetched = true;
@@ -748,3 +783,3 @@ queueItem.stateData.code = 599;
 // Emit header receive event
-crawler.emit("fetchheaders",queueItem,response);
+crawler._emitSpecial("fetchheaders",queueItem,response);
@@ -772,3 +807,3 @@ // Ensure response length is reasonable...
-crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
+crawler._emitSpecial("fetchcomplete",queueItem,responseBuffer,response);
@@ -788,3 +823,3 @@ // First, save item to cache (if we're using a cache!)
-crawler.openRequests --;
+crawler._openRequests --;
 }
@@ -827,3 +862,3 @@
-crawler.emit("fetchdataerror",queueItem,response);
+crawler._emitSpecial("fetchdataerror",queueItem,response);
 }
@@ -869,3 +904,3 @@ } else {
 crawler.cache.getCacheData(queueItem,function(cacheObject) {
-crawler.emit("notmodified",queueItem,response,cacheObject);
+crawler._emitSpecial("notmodified",queueItem,response,cacheObject);
 });
@@ -875,3 +910,3 @@ } else {
 // we don't send any data.
-crawler.emit("notmodified",queueItem,response);
+crawler._emitSpecial("notmodified",queueItem,response);
 }
@@ -890,3 +925,3 @@
 // Emit redirect event
-crawler.emit("fetchredirect",queueItem,parsedURL,response);
+crawler._emitSpecial("fetchredirect",queueItem,parsedURL,response);
@@ -896,3 +931,3 @@ // Clean URL, add to queue...
-crawler.openRequests --;
+crawler._openRequests --;
@@ -905,5 +940,5 @@ // Ignore this request, but record that we had a 404
 // Emit 404 event
-crawler.emit("fetch404",queueItem,response);
+crawler._emitSpecial("fetch404",queueItem,response);
-crawler.openRequests --;
+crawler._openRequests --;
@@ -916,5 +951,5 @@ // And oh dear. Handle this one as well. (other 400s, 500s, etc)
 // Emit 5xx / 4xx event
-crawler.emit("fetcherror",queueItem,response);
+crawler._emitSpecial("fetcherror",queueItem,response);
-crawler.openRequests --;
+crawler._openRequests --;
 }
@@ -941,3 +976,3 @@
-if (crawler.openRequests > crawler.maxConcurrency) return;
+if (crawler._openRequests > crawler.maxConcurrency) return;
@@ -947,3 +982,6 @@ crawler.queue.oldestUnfetchedItem(function(err,queueItem) {
 crawler.fetchQueueItem(queueItem);
-} else if (crawler.openRequests === 0) {
+} else if ( !crawler._openRequests &&
+!crawler._openListeners) {
 crawler.queue.complete(function(err,completeCount) {
@@ -972,5 +1010,6 @@ if (completeCount === crawler.queue.length) {
 Crawler.prototype.stop = function() {
-clearInterval(this.crawlIntervalID);
-this.running = false;
-return this;
+var crawler = this;
+clearInterval(crawler.crawlIntervalID);
+crawler.running = false;
+return crawler;
 };
@@ -1000,5 +1039,6 @@
 Crawler.prototype.addFetchCondition = function(callback) {
+var crawler = this;
 if (callback instanceof Function) {
-this.fetchConditions.push(callback);
-return this.fetchConditions.length - 1;
+crawler._fetchConditions.push(callback);
+return crawler._fetchConditions.length - 1;
 } else {
@@ -1024,6 +1064,7 @@ throw new Error("Fetch Condition must be a function.");
 Crawler.prototype.removeFetchCondition = function(index) {
-if (this.fetchConditions[index] &&
-this.fetchConditions[index] instanceof Function) {
+var crawler = this;
+if (crawler._fetchConditions[index] &&
+crawler._fetchConditions[index] instanceof Function) {
-return !!this.fetchConditions.splice(index,1);
+return !!crawler._fetchConditions.splice(index,1);
 } else {
@@ -1030,0 +1071,0 @@ throw new Error("Unable to find indexed Fetch Condition.");
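The public fetch-condition API is unchanged by this release; only the backing array was renamed to _fetchConditions. As the reduce over the conditions shows, a condition that returns a falsy value blocks the URL from being queued, and addFetchCondition returns the condition's index. A hedged sketch (crawler is an existing instance; the .path property on the parsed URL object is assumed from how the queue is addressed elsewhere in this module):

// Skip anything that looks like a PDF; the condition receives the parsed URL.
var conditionID = crawler.addFetchCondition(function(parsedURL) {
    // Returning a falsy value prevents this URL from being queued.
    return !parsedURL.path.match(/\.pdf$/i);
});

// Remove the condition later via the index returned above.
crawler.removeFetchCondition(conditionID);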
 {
 "name": "simplecrawler",
 "description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
-"version": "0.2.5",
+"version": "0.2.6",
 "homepage": "http://github.com/cgiffard/node-simplecrawler",
@@ -16,3 +16,3 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
 "scripts": {
-"test": "mocha -R spec"
+"test": "mocha -R spec -t 4000"
 },
@@ -19,0 +19,0 @@ "bin": {
@@ -27,3 +27,15 @@ // Routes for testing server
 write(200,"Crawl complete!");
 },
+"/async-stage1": function(write) {
+write(200,"http://127.0.0.1:3000/async-stage2");
+},
+"/async-stage2": function(write) {
+write(200,"http://127.0.0.1:3000/async-stage3");
+},
+"/async-stage3": function(write) {
+write(200,"Complete!");
+}
 };
@@ -14,9 +14,13 @@ // Runs a very simple crawl on an HTTP server
 // Create a new crawler to crawl this server
-var localCrawler = new Crawler("127.0.0.1","/",3000);
+var localCrawler = new Crawler("127.0.0.1","/",3000),
+asyncCrawler = new Crawler("127.0.0.1","/",3000);
 var linksDiscovered = 0;
 it("should be able to be started",function(done) {
-localCrawler.on("crawlstart",done);
+localCrawler.on("crawlstart",function() { done() });
+localCrawler.on("discoverycomplete",function() {
+linksDiscovered ++;
+});
 localCrawler.start();
@@ -33,6 +37,2 @@ localCrawler.running.should.be.truthy;
-localCrawler.on("discoverycomplete",function() {
-linksDiscovered ++;
-});
 localCrawler.on("complete",function() {
@@ -43,4 +43,31 @@ linksDiscovered.should.equal(5);
 });
+it("should support async event listeners for manual discovery",function(done) {
+// Use a different crawler this time
+asyncCrawler.discoverResources = false;
+asyncCrawler.queueURL("http://127.0.0.1:3000/async-stage1");
+asyncCrawler.start();
+asyncCrawler.on("fetchcomplete",function(queueItem,data,res,evtDone) {
+setTimeout(function(){
+linksDiscovered ++;
+if (String(data).match(/complete/i))
+return evtDone();
+// Taking advantage of the fact that for these, the sum total
+// of the body data is a URL.
+asyncCrawler.queueURL(String(data)).should.be.true;
+evtDone();
+},250);
+});
+asyncCrawler.on("complete",function() {
+linksDiscovered.should.equal(8);
+done();
+});
+});
 // TODO
@@ -47,0 +74,0 @@
URL strings
Supply chain risk: Package contains fragments of external URLs or IP addresses, which the package may be accessing at runtime. Found 1 instance in 1 package.