simplecrawler
Comparing version 0.3.1 to 0.3.2
@@ -16,2 +16,3 @@ // Simplecrawler
 	URI = require("URIjs"),
+	zlib = require("zlib"),
 	util = require("util");
@@ -345,2 +346,4 @@
 	queueItem - Queue item corresponding to document being searched.
+	decompressed - The data being passed in has already been decompressed.
+	               Don't try again.
@@ -356,5 +359,5 @@ Examples
 */
-Crawler.prototype.discoverResources = function(resourceData,queueItem) {
+Crawler.prototype.discoverResources = function(resourceData,queueItem,decompressed) {
 	var resources = [],
-		resourceText = resourceData.toString("utf8"),
+		resourceText,
 		crawler = this;
@@ -367,2 +370,21 @@
 	queueItem.protocol = "http";
+
+	if (!decompressed &&
+		queueItem.stateData &&
+		queueItem.stateData.headers['content-encoding'] && (
+			queueItem.stateData.headers['content-encoding'].match(/gzip/) ||
+			queueItem.stateData.headers['content-encoding'].match(/deflate/))) {
+
+		return zlib.unzip(resourceData,function(err,newData) {
+			if (err) {
+				return crawler.emit("fetcherror",queueItem);
+			}
+
+			crawler.discoverResources(newData,queueItem,true);
+		});
+	}
+
+	// Convert to UTF-8
+	// TODO: account for text-encoding.
+	resourceText = resourceData.toString("utf8");
@@ -369,0 +391,0 @@ function cleanURL(URL) {
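
The decompression path above leans entirely on Node's zlib convenience callbacks. As a minimal sketch (not part of the patch, using made-up sample data), this shows why a single zlib.unzip() call covers both content-encoding values the new check matches:

	var zlib = require("zlib");

	// Sample markup standing in for a fetched response body.
	var html = "<a href=\"http://example.com/\">example</a>";

	zlib.gzip(html, function(err, gzipped) {
		if (err) throw err;

		// unzip() sniffs the header and inflates either a gzip or a
		// zlib/deflate stream, so one call handles both encodings
		// before the link-discovery logic runs over the text.
		zlib.unzip(gzipped, function(err, plain) {
			if (err) throw err;
			console.log(plain.toString("utf8")); // original markup again
		});
	});

If unzip() reports an error, the patch surfaces it as a "fetcherror" event on the crawler rather than scanning compressed bytes for URLs.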
package.json
 {
 	"name": "simplecrawler",
 	"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
-	"version": "0.3.1",
+	"version": "0.3.2",
 	"homepage": "http://github.com/cgiffard/node-simplecrawler",
@@ -30,3 +30,3 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
 	"engines": {
-		"node": ">=0.4.0"
+		"node": ">=0.8.0"
 	},
@@ -33,0 +33,0 @@ "devDependencies": {
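
The engines floor moves from 0.4 to 0.8 alongside the new zlib-based decompression. As a tiny, hypothetical runtime check (not something the package itself performs), this is what ">=0.8.0" amounts to in practice:

	// Warn if the running Node predates the 0.8 line required by
	// simplecrawler 0.3.2's package.json "engines" field.
	var parts = process.versions.node.split(".").map(Number);

	if (parts[0] === 0 && parts[1] < 8) {
		console.warn("simplecrawler 0.3.2 expects Node >= 0.8.0; running " + process.version);
	}

npm typically only warns on an engines mismatch unless strict engine checking is enabled, so a check like this is purely illustrative.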