simplecrawler
Comparing version 0.2.2 to 0.2.3
@@ -99,10 +99,10 @@ // Simplecrawler
this.authPass = "";
// Support for retaining cookies for parse duration
this.acceptCookies = true;
this.cookies = new CookieJar();
// Support for custom headers...
this.customHeaders = {};
// Domain Whitelist
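Note: the options touched in this hunk are instance properties set in the crawler constructor. A minimal sketch of configuring them from user code, assuming the three-argument constructor used in the test further down; the header name and host details are illustrative, not taken from this diff:

    var Crawler = require("simplecrawler");
    var crawler = new Crawler("example.com", "/", 80); // host, initial path, port

    crawler.acceptCookies = true;            // keep cookies for the duration of the crawl
    crawler.customHeaders = {                // merged into every request the crawler makes
        "X-Example-Header": "example-value"  // illustrative header, not part of the diff
    };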
@@ -132,3 +132,3 @@ // We allow domains to be whitelisted, so cross-domain requests can be made.
this.downloadUnsupported = true;
// URL Encoding setting...
@@ -256,3 +256,3 @@ this.urlEncoding = "unicode";
var newURL, crawler = this;
// If the URL didn't contain anything, don't fetch it.
@@ -266,7 +266,7 @@ if (!URL.replace(/\s+/ig,"").length) return false;
.normalize();
if (crawler.urlEncoding === "iso8859") {
newURL = newURL.iso8859();
}
} catch(e) {
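Note: the hunk above shows URL processing honouring the urlEncoding option (default "unicode", per the hunk header). A sketch of opting into the ISO-8859 branch, assuming a crawler instance as in the earlier sketch:

    // Processed URLs are then passed through URIjs' iso8859() conversion
    // (the newURL.iso8859() call above) instead of being left as unicode.
    crawler.urlEncoding = "iso8859";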
@@ -327,2 +327,3 @@ // Couldn't process the URL, since URIjs choked on it.
.replace(/["'\)]$/i,"")
.replace(/^\/\//, queueItem.protocol + "://")
.split(/\s+/g)
@@ -333,3 +334,3 @@ .shift()
}
// Clean links
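Note: the replace chain above is part of link clean-up; the added .replace(/^\/\//, queueItem.protocol + "://") step expands protocol-relative URLs before they are queued. A standalone illustration of that single substitution, with assumed values matching the test server change further down:

    var protocol = "http"; // stand-in for queueItem.protocol
    var cleaned = "//127.0.0.1:3000/stage/4".replace(/^\/\//, protocol + "://");
    console.log(cleaned); // "http://127.0.0.1:3000/stage/4"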
@@ -369,3 +370,3 @@ function cleanAndQueue(urlMatch) {
}
// Rough scan for URLs
@@ -592,3 +593,3 @@ return discoverRegex
crawler.openRequests ++;
// Variable declarations
@@ -625,3 +626,3 @@ var fetchData = false,
};
// Add cookie header from cookie jar if we're configured to
@@ -633,3 +634,3 @@ // send/accept cookies
}
// Add auth headers if we need them
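Note: the auth headers referenced here are driven by the crawler's auth properties (authPass appears in the constructor hunk above, needsAuth in the hunk header below; authUser is assumed to exist alongside them and is not shown in this diff). A sketch of enabling basic auth:

    crawler.needsAuth = true;
    crawler.authUser = "username"; // assumed companion to authPass
    crawler.authPass = "secret";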
@@ -643,3 +644,3 @@ if (crawler.needsAuth) {
}
// And if we've got any custom headers available
@@ -649,7 +650,7 @@ if (crawler.customHeaders) {
if (!crawler.customHeaders.hasOwnProperty(header)) continue;
requestOptions.headers[header] = crawler.customHeaders[header];
}
}
// Emit fetchstart event - gives the user time to mangle the request options
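Note: as the comment says, fetchstart is the hook for adjusting a request before it goes out. A sketch of a listener, assuming the event is emitted with the queue item and the mutable request options the comment refers to:

    crawler.on("fetchstart", function(queueItem, requestOptions) {
        // requestOptions can be "mangled" here before the request is made
        requestOptions.headers = requestOptions.headers || {};
        requestOptions.headers["X-Requested-By"] = "example-bot"; // illustrative
    });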
@@ -662,3 +663,3 @@ // if required.
timeCommenced = Date.now();
// Get the resource!
@@ -669,6 +670,6 @@ clientRequest =
});
clientRequest.on("error",function(errorData) {
crawler.openRequests --;
// Emit 5xx / 4xx event
@@ -680,3 +681,3 @@ crawler.emit("fetchclienterror",queueItem,errorData);
});
return crawler;
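Note: the error handler above emits fetchclienterror (see the hunk header) when the underlying client request fails. A sketch of listening for it, assuming the listener receives the queue item and error data passed to emit:

    crawler.on("fetchclienterror", function(queueItem, errorData) {
        console.error("Client error while fetching", queueItem.url, errorData);
    });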
@@ -690,5 +691,5 @@ });
handle downloading the resource, queueing of linked items, etc.
Examples
// Passing in a response from `request`
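Note: this doc-comment describes handing an externally fetched response to the crawler, and the example it introduces continues in the hunk below. A hedged reconstruction of that usage; the handler name handleResponse is assumed and not itself visible in this diff:

    var request = require("request");

    request(queueItem.url, function(err, res, body) {
        if (err) return;
        // Let the crawler process the response: store state data, queue
        // discovered links, emit events, etc. (method name assumed).
        crawler.handleResponse(queueItem, res);
    });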
@@ -713,6 +714,6 @@ request(queueItem.url,function(err,res,body) {
stateData = queueItem.stateData;
// Record what time we first received the header information
timeHeadersReceived = Date.now();
// If we weren't passed a time of commencement, assume Now()
@@ -731,3 +732,3 @@ timeCommenced = timeCommenced || Date.now();
stateData.headers = response.headers;
// Do we need to save cookies? Were we sent any?
@@ -737,3 +738,3 @@ if (crawler.acceptCookies &&
crawler.cookies.addFromHeaders(response.headers["set-cookie"]);
// Emit header receive event
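Note: once the headers are stored (and any set-cookie values absorbed into the cookie jar when acceptCookies is on), the fetchheaders event shown in the next hunk header fires. A sketch of observing it, assuming the listener receives the queue item and the raw response:

    crawler.on("fetchheaders", function(queueItem, response) {
        console.log(queueItem.url, "answered with status", response.statusCode);
    });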
@@ -790,6 +791,6 @@ crawler.emit("fetchheaders",queueItem,response);
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) {
// Start by creating a new buffer, which will be our main
// buffer from now on...
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length);
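Note: the buffer growth above is bounded by maxResourceSize. A sketch of tightening that cap; the default is not shown in this diff, and the value here is illustrative:

    crawler.maxResourceSize = 5 * 1024 * 1024; // refuse to buffer bodies over ~5 MB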
@@ -812,3 +813,3 @@
// resource.
//
//
// Throw error event and ignore.
@@ -832,3 +833,3 @@ //
!dataReceived) {
// Slice the buffer to chop off any unused space
@@ -845,3 +846,3 @@ responseBuffer = responseBuffer.slice(0,responseLengthReceived);
responseLength <= crawler.maxResourceSize) {
queueItem.status = "headers";
@@ -872,3 +873,3 @@
response.headers.location) {
queueItem.fetched = true;
@@ -908,3 +909,3 @@ queueItem.status = "redirected";
}
return crawler;
@@ -1017,2 +1018,2 @@ };
module.exports = Crawler;
module.exports = Crawler;
{
"name": "simplecrawler",
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.", | ||
"version": "0.2.2", | ||
"version": "0.2.3", | ||
"homepage": "http://github.com/cgiffard/node-simplecrawler", | ||
@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>", |
@@ -6,20 +6,24 @@ // Routes for testing server | ||
"/": function(write) { | ||
write(200,"Home. <a href='/stage2'>stage2</a>"); | ||
write(200,"Home. <a href='stage2'>stage2</a>"); | ||
}, | ||
"/stage2": function(write) { | ||
write(200,"Stage2. http://127.0.0.1:3000/stage/3"); | ||
}, | ||
"/stage/3": function(write) { | ||
write(200,"Stage3. <a href='../stage4'>stage4</a>"); | ||
write(200,"Stage3. <a href='//127.0.0.1:3000/stage/4'>stage4</a>"); | ||
}, | ||
"/stage4": function(write,redir) { | ||
redir("/stage5"); | ||
"/stage/4": function(write) { | ||
write(200,"Stage4. <a href='../stage5'>stage5</a>"); | ||
}, | ||
"/stage5": function(write) { | ||
"/stage5": function(write,redir) { | ||
redir("/stage6"); | ||
}, | ||
"/stage6": function(write) { | ||
write(200,"Crawl complete!"); | ||
} | ||
}; | ||
}; |
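Note: the route table above maps paths to handlers that receive a write(status, body) helper and, for redirects, a redir(path) helper. testserver.js itself is not part of this diff; a minimal sketch of how such a table might be served, with all server details assumed:

    var http = require("http");

    http.createServer(function(req, res) {
        var handler = routes[req.url]; // "routes" is the table above
        if (!handler) { res.writeHead(404); return res.end("Not found"); }
        handler(
            function write(status, body) {
                res.writeHead(status, {"Content-Type": "text/html"});
                res.end(body);
            },
            function redir(path) {
                res.writeHead(302, {"Location": path});
                res.end();
            }
        );
    }).listen(3000); // the test crawl targets 127.0.0.1:3000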
@@ -5,3 +5,3 @@ // Runs a very simple crawl on an HTTP server
var chai = require("chai");
chai.should();
chai.should();
@@ -11,55 +11,47 @@ var testserver = require("./lib/testserver.js");
describe("Test Crawl",function() {
var Crawler = require("../");
// Create a new crawler to crawl this server
var localCrawler = new Crawler("127.0.0.1","/",3000);
var linksDiscovered = 0;
it("should be able to be started",function(done) {
localCrawler.on("crawlstart",done);
localCrawler.start();
localCrawler.running.should.be.truthy;
});
it("should have a queue with at least the initial crawl path",function() {
localCrawler.queue.length.should.be.greaterThan(0);
});
it("should complete the fetch queue",function(done) {
it("should discover all linked resources in the queue",function(done) {
localCrawler.on("discoverycomplete",function() {
linksDiscovered ++;
});
localCrawler.on("complete",function() {
linksDiscovered.should.equal(5);
done();
});
});
// Suddenly feeling very incompatible with the mocha philosophy.
// Will try to make it work first, then look for another framework that
// supports parallel tests.
//
// it("should be able to discover link resources",function(done) {
// var linksDiscovered = 0;
//
// localCrawler.on("discoverycomplete",function() {
// linksDiscovered ++;
//
// if (!linksDiscovered) done();
// });
// });
//
//
//
// Todo: test how simple error conditions, content types, and responses
// are handled.
// TODO
// Test how simple error conditions, content types, and responses are handled.
// Test encodings.
// Test URL detection
// Test handling binary data
// test bad content length
});
// Test bad content length
});