simplecrawler

Comparing version 0.2.10 to 0.3.0
@@ -65,2 +65,5 @@ // Simplecrawler
 crawler.timeout = 5 * 60 * 1000;
+
+// Maximum time we'll wait for async listeners.
+crawler.listenerTTL = 10 * 1000;
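For context, `listenerTTL` caps how long the crawler will hold itself open for asynchronous event listeners registered through the new `wait()` API added further down in this diff. A minimal sketch of tuning it, assuming the host-only constructor form from the package README (the host name is illustrative):

    var Crawler = require("simplecrawler");
    var crawler = new Crawler("example.com");

    // Give slow asynchronous listeners up to thirty seconds (instead of
    // the ten-second default added above) before the crawler stops
    // waiting for them and allows the queue to drain.
    crawler.listenerTTL = 30 * 1000;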
@@ -143,6 +146,7 @@ // User Agent
 // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
-crawler._openRequests = 0;
-crawler._fetchConditions = [];
-crawler._openListeners = 0;
-crawler._listenerMap = {};
+var hiddenProps = {
+    "_openRequests": 0,
+    "_fetchConditions": [],
+    "_openListeners": 0
+};
@@ -152,22 +156,10 @@ // Run the EventEmitter constructor
-crawler._emitSpecial = function() {
-    var args = Array.prototype.slice.call(arguments,0),
-        event = args[0],
-        eventArgsLen = args.length-1,
-        asyncListenerCount = 0;
-
-    crawler.listeners(event).forEach(function(listener) {
-        if (listener.length > eventArgsLen)
-            asyncListenerCount++;
-    });
-
-    crawler._openListeners += asyncListenerCount|0;
-
-    crawler.emit.apply(crawler,args.concat([
-        function listenerComplete() {
-            if (crawler._openListeners > 0)
-                crawler._openListeners --;
-        }
-    ]));
-};
+// Apply all the hidden props
+Object.keys(hiddenProps).forEach(function(key) {
+    Object.defineProperty(crawler, key, {
+        "writable": true,
+        "enumerable": false,
+        "value": hiddenProps[key]
+    });
+});
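The new block replaces plain property assignments with `Object.defineProperty` and `enumerable: false`, so internal bookkeeping such as `_openRequests` no longer leaks into `Object.keys`, `for...in`, or `JSON.stringify` on a crawler instance. A standalone sketch of that pattern (the object and property names here are illustrative, not from the package):

    var obj = { "visible": 1 };

    Object.defineProperty(obj, "_hidden", {
        "writable": true,
        "enumerable": false,
        "value": 0
    });

    console.log(Object.keys(obj));    // ["visible"] — "_hidden" is skipped
    console.log(JSON.stringify(obj)); // {"visible":1}

    obj._hidden = 42;                 // still readable and writable
    console.log(obj._hidden);         // 42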
@@ -212,3 +204,3 @@
-    crawler._emitSpecial("crawlstart");
+    crawler.emit("crawlstart");
     crawler.running = true;
@@ -553,3 +545,3 @@
     // page relationships.
-    crawler._emitSpecial("discoverycomplete",queueItem,resources);
+    crawler.emit("discoverycomplete",queueItem,resources);
@@ -615,8 +607,8 @@ resources.forEach(function(url){ crawler.queueURL(url,queueItem); });
         if (error.code && error.code === "DUP")
-            return crawler._emitSpecial("queueduplicate",parsedURL);
+            return crawler.emit("queueduplicate",parsedURL);
-        return crawler._emitSpecial("queueerror",error,parsedURL);
+        return crawler.emit("queueerror",error,parsedURL);
     }
-    crawler._emitSpecial("queueadd",newQueueItem,parsedURL);
+    crawler.emit("queueadd",newQueueItem,parsedURL);
     newQueueItem.referrer = queueItem ? queueItem.url : null;
@@ -723,3 +715,3 @@ }
     // if required.
-    crawler._emitSpecial("fetchstart",queueItem,requestOptions);
+    crawler.emit("fetchstart",queueItem,requestOptions);
@@ -738,3 +730,3 @@ process.nextTick(function() {
         clientRequest.abort();
-        crawler._emitSpecial("fetchtimeout",queueItem,crawler.timeout);
+        crawler.emit("fetchtimeout",queueItem,crawler.timeout);
     });
@@ -746,3 +738,3 @@
     // Emit 5xx / 4xx event
-    crawler._emitSpecial("fetchclienterror",queueItem,errorData);
+    crawler.emit("fetchclienterror",queueItem,errorData);
     queueItem.fetched = true;
@@ -807,3 +799,3 @@ queueItem.stateData.code = 599;
     // Emit header receive event
-    crawler._emitSpecial("fetchheaders",queueItem,response);
+    crawler.emit("fetchheaders",queueItem,response);
@@ -831,3 +823,3 @@ // Ensure response length is reasonable...
-        crawler._emitSpecial("fetchcomplete",queueItem,responseBuffer,response);
+        crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
@@ -885,3 +877,3 @@ // First, save item to cache (if we're using a cache!)
-        crawler._emitSpecial("fetchdataerror",queueItem,response);
+        crawler.emit("fetchdataerror",queueItem,response);
     }
@@ -927,3 +919,3 @@ } else {
         crawler.cache.getCacheData(queueItem,function(cacheObject) {
-            crawler._emitSpecial("notmodified",queueItem,response,cacheObject);
+            crawler.emit("notmodified",queueItem,response,cacheObject);
         });
@@ -933,3 +925,3 @@ } else {
         // we don't send any data.
-        crawler._emitSpecial("notmodified",queueItem,response);
+        crawler.emit("notmodified",queueItem,response);
     }
@@ -948,3 +940,3 @@
     // Emit redirect event
-    crawler._emitSpecial("fetchredirect",queueItem,parsedURL,response);
+    crawler.emit("fetchredirect",queueItem,parsedURL,response);
@@ -962,3 +954,3 @@ // Clean URL, add to queue...
     // Emit 404 event
-    crawler._emitSpecial("fetch404",queueItem,response);
+    crawler.emit("fetch404",queueItem,response);
@@ -973,3 +965,3 @@ crawler._openRequests --;
     // Emit 5xx / 4xx event
-    crawler._emitSpecial("fetcherror",queueItem,response);
+    crawler.emit("fetcherror",queueItem,response);
@@ -1037,2 +1029,40 @@ crawler._openRequests --;
+/*
+    Public: Holds the crawler in a 'running' state, preventing the `complete`
+    event from firing until the callback this function returns has been
+    executed, or a predetermined timeout (as specified by `crawler.listenerTTL`)
+    has elapsed.
+
+    Examples
+
+        crawler.on("fetchcomplete",function(queueItem,data) {
+            var resume = this.wait();
+            doSomethingThatTakesAlongTime(function callback() {
+                resume();
+            });
+        });
+
+    Returns callback which will allow the crawler to continue.
+*/
+Crawler.prototype.wait = function() {
+    var crawler = this,
+        cleared = false,
+        timeout =
+            setTimeout(function() {
+                if (cleared) return;
+                cleared = true;
+                crawler._openListeners --;
+            }, crawler.listenerTTL);
+
+    crawler._openListeners ++;
+
+    return function() {
+        if (cleared) return;
+        cleared = true;
+        crawler._openListeners --;
+        clearTimeout(timeout);
+    };
+};
 /*
     Public: Given a function, this method adds it to an internal list maintained
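With `_emitSpecial` gone, events are emitted with their natural argument lists again, and listeners that need to defer completion opt in explicitly through `wait()`. A sketch of the intended pattern, assuming a slow asynchronous `processPage` helper (a hypothetical name, not from the package):

    crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
        // Hold the crawler open: "complete" won't fire until resume()
        // is called, or until listenerTTL elapses as a safety net.
        var resume = this.wait();

        processPage(queueItem, responseBuffer, function done() {
            resume();
        });
    });

If `done` is never invoked, the `setTimeout` inside `wait()` decrements `_openListeners` after `listenerTTL` milliseconds, so a single misbehaving listener cannot wedge the crawl indefinitely.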
@@ -1072,3 +1102,3 @@ by the crawler to be executed against each URL to determine whether it should
-    url   - ID of fetch condition to be removed.
+    index - ID of fetch condition to be removed.
@@ -1097,3 +1127,3 @@ Examples
-    url
+    url - URL from which to remove the querystring
@@ -1100,0 +1130,0 @@ Examples
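The two docblock corrections above belong to the fetch-condition API: `addFetchCondition` registers a predicate against each discovered URL and returns a numeric ID, and `removeFetchCondition` takes that ID back — not a URL, as the old docblock wrongly implied. A short sketch of the round trip (the filter pattern is illustrative):

    // Skip anything under /private/; the return value is the condition's ID.
    var conditionID = crawler.addFetchCondition(function(parsedURL) {
        return !parsedURL.path.match(/^\/private\//);
    });

    // Later: stop filtering by removing the condition via its ID.
    crawler.removeFetchCondition(conditionID);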
 {
     "name": "simplecrawler",
     "description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
-    "version": "0.2.10",
+    "version": "0.3.0",
     "homepage": "http://github.com/cgiffard/node-simplecrawler",
@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
@@ -42,2 +42,4 @@ // Tests to ensure crawler code is well formed
     "clearInterval": true,
+    "setTimeout": true,
+    "clearTimeout": true,
     "Buffer": true
@@ -44,0 +46,0 @@ });
@@ -54,3 +54,5 @@ // Runs a very simple crawl on an HTTP server | ||
asyncCrawler.on("fetchcomplete",function(queueItem,data,res,evtDone) { | ||
asyncCrawler.on("fetchcomplete",function(queueItem,data,res) { | ||
evtDone = this.wait(); | ||
setTimeout(function(){ | ||
@@ -57,0 +59,0 @@ linksDiscovered ++; |
URL strings
Supply chain risk: the package contains fragments of external URLs or IP addresses, which it may be accessing at runtime. Found 1 instance in 1 package.