simplecrawler
Comparing version 0.2.10 to 0.3.0
@@ -65,2 +65,5 @@ // Simplecrawler
 crawler.timeout = 5 * 60 * 1000;
+
+// Maximum time we'll wait for async listeners.
+crawler.listenerTTL = 10 * 1000;
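The new `listenerTTL` sits alongside the existing request `timeout`: it caps how long the crawler stays held open by an async listener that never signals completion. A minimal configuration sketch, assuming the package's `Crawler(host)` constructor and `start()` method (the values are illustrative, not the defaults above):

    var Crawler = require("simplecrawler");
    var crawler = new Crawler("example.com");

    // Abort any single request after two minutes.
    crawler.timeout = 2 * 60 * 1000;

    // Release a held async listener after thirty seconds,
    // even if it never calls back.
    crawler.listenerTTL = 30 * 1000;

    crawler.start();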
@@ -143,6 +146,7 @@ // User Agent
 // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
-crawler._openRequests = 0;
-crawler._fetchConditions = [];
-crawler._openListeners = 0;
-crawler._listenerMap = {};
+var hiddenProps = {
+    "_openRequests": 0,
+    "_fetchConditions": [],
+    "_openListeners": 0
+};
@@ -152,22 +156,10 @@ // Run the EventEmitter constructor
-crawler._emitSpecial = function() {
-    var args = Array.prototype.slice.call(arguments,0),
-        event = args[0],
-        eventArgsLen = args.length-1,
-        asyncListenerCount = 0;
-
-    crawler.listeners(event).forEach(function(listener) {
-        if (listener.length > eventArgsLen)
-            asyncListenerCount++;
-    });
-
-    crawler._openListeners += asyncListenerCount|0;
-
-    crawler.emit.apply(crawler,args.concat([
-        function listenerComplete() {
-            if (crawler._openListeners > 0)
-                crawler._openListeners --;
-        }
-    ]));
-};
+// Apply all the hidden props
+Object.keys(hiddenProps).forEach(function(key) {
+    Object.defineProperty(crawler, key, {
+        "writable": true,
+        "enumerable": false,
+        "value": hiddenProps[key]
+    });
+});
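Replacing the direct assignments with `hiddenProps` applied through `Object.defineProperty` keeps internal state such as `_openRequests` writable but non-enumerable, so it no longer shows up when user code iterates over the crawler. A self-contained sketch of the same pattern (the `worker` object here is illustrative):

    var worker = {};

    Object.defineProperty(worker, "_openRequests", {
        "writable": true,
        "enumerable": false,
        "value": 0
    });

    worker._openRequests++;            // still readable and writable
    console.log(Object.keys(worker));  // [] -- hidden from enumeration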
@@ -212,3 +204,3 @@
-crawler._emitSpecial("crawlstart");
+crawler.emit("crawlstart");
 crawler.running = true;
@@ -553,3 +545,3 @@
 // page relationships.
-crawler._emitSpecial("discoverycomplete",queueItem,resources);
+crawler.emit("discoverycomplete",queueItem,resources);
@@ -615,8 +607,8 @@ resources.forEach(function(url){ crawler.queueURL(url,queueItem); });
 if (error.code && error.code === "DUP")
-    return crawler._emitSpecial("queueduplicate",parsedURL);
+    return crawler.emit("queueduplicate",parsedURL);
 
-return crawler._emitSpecial("queueerror",error,parsedURL);
+return crawler.emit("queueerror",error,parsedURL);
 }
 
-crawler._emitSpecial("queueadd",newQueueItem,parsedURL);
+crawler.emit("queueadd",newQueueItem,parsedURL);
 newQueueItem.referrer = queueItem ? queueItem.url : null;
@@ -723,3 +715,3 @@ }
 // if required.
-crawler._emitSpecial("fetchstart",queueItem,requestOptions);
+crawler.emit("fetchstart",queueItem,requestOptions);
@@ -738,3 +730,3 @@ process.nextTick(function() {
 clientRequest.abort();
-crawler._emitSpecial("fetchtimeout",queueItem,crawler.timeout);
+crawler.emit("fetchtimeout",queueItem,crawler.timeout);
 });
@@ -746,3 +738,3 @@
 // Emit 5xx / 4xx event
-crawler._emitSpecial("fetchclienterror",queueItem,errorData);
+crawler.emit("fetchclienterror",queueItem,errorData);
 queueItem.fetched = true;
@@ -807,3 +799,3 @@ queueItem.stateData.code = 599;
 // Emit header receive event
-crawler._emitSpecial("fetchheaders",queueItem,response);
+crawler.emit("fetchheaders",queueItem,response);
@@ -831,3 +823,3 @@ // Ensure response length is reasonable...
-crawler._emitSpecial("fetchcomplete",queueItem,responseBuffer,response);
+crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
@@ -885,3 +877,3 @@ // First, save item to cache (if we're using a cache!)
-crawler._emitSpecial("fetchdataerror",queueItem,response);
+crawler.emit("fetchdataerror",queueItem,response);
 }
@@ -927,3 +919,3 @@ } else {
 crawler.cache.getCacheData(queueItem,function(cacheObject) {
-    crawler._emitSpecial("notmodified",queueItem,response,cacheObject);
+    crawler.emit("notmodified",queueItem,response,cacheObject);
 });
@@ -933,3 +925,3 @@ } else {
 // we don't send any data.
-crawler._emitSpecial("notmodified",queueItem,response);
+crawler.emit("notmodified",queueItem,response);
 }
@@ -948,3 +940,3 @@
 // Emit redirect event
-crawler._emitSpecial("fetchredirect",queueItem,parsedURL,response);
+crawler.emit("fetchredirect",queueItem,parsedURL,response);
@@ -962,3 +954,3 @@ // Clean URL, add to queue...
 // Emit 404 event
-crawler._emitSpecial("fetch404",queueItem,response);
+crawler.emit("fetch404",queueItem,response);
@@ -973,3 +965,3 @@ crawler._openRequests --;
 // Emit 5xx / 4xx event
-crawler._emitSpecial("fetcherror",queueItem,response);
+crawler.emit("fetcherror",queueItem,response);
@@ -1037,2 +1029,40 @@ crawler._openRequests --;
+/*
+    Public: Holds the crawler in a 'running' state, preventing the `complete`
+    event from firing until the callback this function returns has been
+    executed, or a predetermined timeout (as specified by `crawler.listenerTTL`)
+    has elapsed.
+
+    Examples
+
+        crawler.on("fetchcomplete", function(queueItem, data) {
+            var resume = this.wait();
+            doSomethingThatTakesAlongTime(function callback() {
+                resume();
+            });
+        });
+
+    Returns callback which will allow the crawler to continue.
+*/
+Crawler.prototype.wait = function() {
+    var crawler = this,
+        cleared = false,
+        timeout =
+            setTimeout(function() {
+                if (cleared) return;
+                cleared = true;
+                crawler._openListeners --;
+            }, crawler.listenerTTL);
+
+    crawler._openListeners ++;
+
+    return function() {
+        if (cleared) return;
+        cleared = true;
+        crawler._openListeners --;
+        clearTimeout(timeout);
+    };
+};
 /*
 Public: Given a function, this method adds it to an internal list maintained
@@ -1072,3 +1102,3 @@ by the crawler to be executed against each URL to determine whether it should
-    url - ID of fetch condition to be removed.
+    index - ID of fetch condition to be removed.
@@ -1097,3 +1127,3 @@ Examples
-    url
+    url - URL from which to remove the querystring
@@ -1100,0 +1130,0 @@ Examples
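Taken together, these hunks swap the old arity-sniffing `_emitSpecial` wrapper for plain `emit` calls plus an explicit opt-in: a listener that needs to defer completion calls `wait()`, which bumps `_openListeners` (delaying the `complete` event) and returns a release callback guarded by the `listenerTTL` timer. A sketch of the intended usage, assuming a hypothetical async `saveToDisk` helper:

    crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
        // Increments _openListeners; "complete" cannot fire yet.
        var resume = this.wait();

        saveToDisk(queueItem.url, responseBuffer, function(err) {
            // Decrements _openListeners and clears the TTL timer.
            resume();
        });
    });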
 {
     "name": "simplecrawler",
     "description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
-    "version": "0.2.10",
+    "version": "0.3.0",
     "homepage": "http://github.com/cgiffard/node-simplecrawler",
@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
@@ -42,2 +42,4 @@ // Tests to ensure crawler code is well formed
 "clearInterval": true,
+"setTimeout": true,
+"clearTimeout": true,
 "Buffer": true
@@ -44,0 +46,0 @@ });
@@ -54,3 +54,5 @@ // Runs a very simple crawl on an HTTP server
-asyncCrawler.on("fetchcomplete",function(queueItem,data,res,evtDone) {
+asyncCrawler.on("fetchcomplete",function(queueItem,data,res) {
+    evtDone = this.wait();
     setTimeout(function(){
@@ -57,0 +59,0 @@ linksDiscovered ++;
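The test migration drops the fourth listener argument (the old arity-based completion callback) in favour of `this.wait()`. The hunk is truncated at the `setTimeout`; a hedged reconstruction of the full migrated listener, guessing the body from the `linksDiscovered` counter in the trailing context rather than quoting the test verbatim:

    asyncCrawler.on("fetchcomplete", function(queueItem, data, res) {
        var evtDone = this.wait();

        setTimeout(function() {
            linksDiscovered++;
            evtDone();  // release the crawler so "complete" can fire
        }, 10);
    });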