simplecrawler
Advanced tools
Comparing version 0.2.6 to 0.2.7
@@ -571,23 +571,20 @@ // Simplecrawler | ||
if (crawler.domainValid(parsedURL.host)) { | ||
try { | ||
crawler.queue.add( | ||
parsedURL.protocol, | ||
parsedURL.host, | ||
parsedURL.port, | ||
parsedURL.path, | ||
function queueAddCallback(error,newQueueItem) { | ||
if (error) { | ||
// We received an error condition when adding the callback | ||
crawler._emitSpecial("queueerror",error,parsedURL); | ||
} else { | ||
crawler._emitSpecial("queueadd",newQueueItem,parsedURL); | ||
newQueueItem.referrer = queueItem ? queueItem.url : null; | ||
} | ||
crawler.queue.add( | ||
parsedURL.protocol, | ||
parsedURL.host, | ||
parsedURL.port, | ||
parsedURL.path, | ||
function queueAddCallback(error,newQueueItem) { | ||
if (error) { | ||
// We received an error condition when adding the item to the queue | ||
if (error.code && error.code === "DUP") | ||
return crawler._emitSpecial("queueduplicate",parsedURL); | ||
return crawler._emitSpecial("queueerror",error,parsedURL); | ||
} | ||
); | ||
} catch(error) { | ||
// If we caught an error, emit queueerror | ||
crawler._emitSpecial("queueerror",error,parsedURL); | ||
return false; | ||
} | ||
crawler._emitSpecial("queueadd",newQueueItem,parsedURL); | ||
newQueueItem.referrer = queueItem ? queueItem.url : null; | ||
} | ||
); | ||
} | ||
@@ -658,2 +655,7 @@ | ||
}; | ||
// If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts | ||
if (requestOptions.port === 80 || requestOptions.port === 443) { | ||
delete requestOptions.port; | ||
} | ||
@@ -660,0 +662,0 @@ // Add cookie header from cookie jar if we're configured to |
@@ -29,3 +29,3 @@ // Simplecrawler - queue module | ||
var self = this; | ||
// Ensure all variables conform to reasonable defaults | ||
@@ -37,9 +37,9 @@ protocol = protocol === "https" ? "https" : "http"; | ||
} | ||
var url = protocol + "://" + domain + (port !== 80 ? ":" + port : "") + path; | ||
this.exists(protocol,domain,port,path, | ||
function(err,exists) { | ||
if (err) return callback(err); | ||
if (!exists) { | ||
@@ -56,7 +56,10 @@ var queueItem = { | ||
}; | ||
self.push(queueItem); | ||
callback(null,queueItem); | ||
} else { | ||
callback(new Error("Resource already exists in queue!")); | ||
var error = new Error("Resource already exists in queue!"); | ||
error.code = "DUP"; | ||
callback(error); | ||
} | ||
@@ -69,5 +72,9 @@ }); | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var url = (protocol + "://" + domain + (port !== 80 ? ":" + port : "") + path).toLowerCase(); | ||
port = (port !== 80 ? ":" + port : ""); | ||
var url = | ||
(protocol + "://" + domain + port + path) | ||
.toLowerCase(); | ||
if (!!this.scanIndex[url]) { | ||
@@ -85,3 +92,3 @@ callback(null,1); | ||
var self = this; | ||
callback(null,self[self.length-1]); | ||
@@ -94,3 +101,3 @@ }; | ||
var self = this; | ||
if (!isNaN(id) && self.length > id) { | ||
@@ -105,3 +112,3 @@ return callback(null,self[id]); | ||
var self = this; | ||
for (var itemIndex = self.oldestUnfetchedIndex; itemIndex < self.length; itemIndex ++) { | ||
@@ -113,3 +120,3 @@ if (self[itemIndex].status === "queued") { | ||
} | ||
callback(new Error("No unfetched items remain.")); | ||
@@ -127,3 +134,3 @@ }; | ||
} | ||
self.forEach(function(item) { | ||
@@ -134,3 +141,3 @@ if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] > maxStatisticValue) { | ||
}); | ||
callback(null,maxStatisticValue); | ||
@@ -148,3 +155,3 @@ }; | ||
} | ||
self.forEach(function(item) { | ||
@@ -168,3 +175,3 @@ if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] < minStatisticValue) { | ||
} | ||
self.forEach(function(item) { | ||
@@ -176,3 +183,3 @@ if (item.fetched && item.stateData[statisticName] !== null && !isNaN(item.stateData[statisticName])) { | ||
}); | ||
callback(null,NumberSum / NumberCount); | ||
@@ -191,3 +198,3 @@ }; | ||
}); | ||
callback(null,NumberComplete); | ||
@@ -215,3 +222,3 @@ return NumberComplete; | ||
var subqueue = [], self = this; | ||
self.forEach(function(item,index) { | ||
@@ -223,3 +230,3 @@ if (item.status === status) { | ||
}); | ||
callback(null,subqueue); | ||
@@ -232,3 +239,3 @@ }; | ||
var self = this; | ||
self.countWithStatus("failed",function(err1,failed) { | ||
@@ -245,3 +252,3 @@ self.countWithStatus("notfound",function(err2,notfound) { | ||
var self = this; | ||
// Re-queue in-progress items before freezing... | ||
@@ -263,10 +270,10 @@ self.forEach(function(item) { | ||
var fileData, self = this, defrostedQueue = []; | ||
fs.readFile(filename,function(err,fileData) { | ||
if (err) return callback(err); | ||
if (!fileData.toString("utf8").length) { | ||
return callback(new Error("Failed to defrost queue from zero-length JSON.")); | ||
} | ||
try { | ||
@@ -277,3 +284,3 @@ defrostedQueue = JSON.parse(fileData.toString("utf8")); | ||
} | ||
for (var index in defrostedQueue) { | ||
@@ -285,5 +292,5 @@ if (defrostedQueue.hasOwnProperty(index) && !isNaN(index)) { | ||
} | ||
callback(null,self); | ||
}); | ||
}; |
{ | ||
"name": "simplecrawler", | ||
"description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.", | ||
"version": "0.2.6", | ||
"version": "0.2.7", | ||
"homepage": "http://github.com/cgiffard/node-simplecrawler", | ||
@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>", |
Sorry, the diff of this file is not supported yet
103648
2181
561