simplecrawler
Comparing version 0.1.7 to 0.2.0
@@ -9,2 +9,3 @@ // Simplecrawler
Cache = require("./cache.js"),
CookieJar = require("./cookies.js"),
MetaInfo = require("../package.json");
@@ -19,3 +20,3 @@
Public: Constructor for the crawler.
host - Initial hostname/domain to begin crawling from. By
@@ -26,7 +27,7 @@ default, the crawl will be locked to this hostname.
interval - Request interval for the crawler. Defaults to 250ms.
Examples
var crawler = new Crawler("example.com","/",80,500);
var crawler = new Crawler("example.com");
@@ -41,6 +42,6 @@
throw new Error("Port must be a number!");
// SETTINGS TO STUFF WITH
// (not here! Do it when you create a `new Crawler()`)
// Domain to crawl
@@ -58,3 +59,3 @@ this.host = host || "";
this.interval = interval || 250;
// Maximum request concurrency. Be sensible. Five ties in with node's
@@ -75,3 +76,3 @@ // default maxSockets value.
this.queue = new FetchQueue();
// Do we filter by domain?
@@ -99,8 +100,15 @@ // Unless you want to be crawling the entire internet, I would
this.proxyPort = 8123;
// Support for HTTP basic auth
this.needsAuth = false;
this.authUser = "";
this.authPass = "";
// Support for retaining cookies for parse duration
this.acceptCookies = true;
this.cookies = new CookieJar();
// Support for custom headers...
this.customHeaders = {};
// Domain Whitelist
@@ -130,2 +138,5 @@ // We allow domains to be whitelisted, so cross-domain requests can be made.
this.downloadUnsupported = true;
// URL Encoding setting...
this.urlEncoding = "unicode";
@@ -145,3 +156,3 @@ // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
commences.
Examples
@@ -156,6 +167,6 @@
var crawler = this;
// only if we haven't already got stuff in our queue...
if (!this.queue.length) {
// Initialise our queue by pushing the initial request data into it...
@@ -171,10 +182,10 @@ this.queue.add(
}
this.crawlIntervalID = setInterval(function() {
crawler.crawl.call(crawler);
},this.interval);
this.emit("crawlstart");
this.running = true;
// Now kick off the initial crawl
@@ -184,3 +195,3 @@ process.nextTick(function() {
});
return this;
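// Editor's illustrative sketch (not part of this diff): start() kicks off the
// interval-driven crawl above, and progress is observed through the events
// emitted in this file, such as "crawlstart" and "fetchcomplete". The export
// shape of the module is an assumption.
var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");
crawler.on("crawlstart", function() {
    console.log("Crawl started");
});
crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
    console.log("Fetched %s (%d bytes)", queueItem.url, responseBuffer.length);
});
crawler.start();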
@@ -193,3 +204,3 @@ };
URL - URL with a protocol, for testing.
Examples
@@ -205,6 +216,6 @@
var protocol;
try {
protocol = URI(URL).protocol();
} catch(e) {
@@ -214,3 +225,3 @@ // If URIjs died, we definitely /do not/ support the protocol.
}
return this.allowedProtocols.reduce(function(prev,protocolCheck) {
@@ -235,3 +246,3 @@ return prev || !!protocolCheck.exec(protocol);
Crawler.prototype.mimeTypeSupported = function(MIMEType) {
return (
@@ -258,12 +269,17 @@ this.supportedMimeTypes.reduce(function(prev,mimeCheck) {
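// Editor's illustrative sketch (not part of this diff): allowedProtocols and
// supportedMimeTypes are both reduced over as lists of patterns, so they are
// assumed to accept additional RegExp entries (the default patterns are not
// shown in this diff). `crawler` is an instance as constructed earlier.
crawler.supportedMimeTypes.push(/^application\/pdf/i);
console.log(crawler.mimeTypeSupported("application/pdf"));      // true
console.log(crawler.protocolSupported("https://example.com/")); // true if https is in allowedProtocols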
Crawler.prototype.processURL = function(URL,context) {
var newURL;
var newURL, crawler = this;
// If the URL didn't contain anything, don't fetch it.
if (!URL.replace(/\s+/ig,"").length) return false;
try {
newURL =
URI(URL)
.absoluteTo(context.url)
.normalize();
if (crawler.urlEncoding === "iso8859") {
newURL = newURL.iso8859();
}
} catch(e) {
@@ -273,3 +289,3 @@ // Couldn't process the URL, since URIjs choked on it.
}
// simplecrawler uses slightly different terminology to URIjs. Sorry!
@@ -304,3 +320,3 @@ return {
crawler = this;
// Regular expressions for finding URL items in HTML and text
@@ -311,3 +327,3 @@ var discoverRegex = [
/url\([^)]+/ig,
// This might be a bit of a gamble... but get hard-coded
@@ -319,3 +335,3 @@ // strings out of javascript: URLs. They're often popup-image
];
function cleanURL(URL) {
@@ -337,7 +353,7 @@ return URL
if (!urlMatch) return [];
return urlMatch
.map(cleanURL)
.reduce(function(list,URL) {
// Ensure URL is whole and complete
@@ -353,9 +369,9 @@ try {
}
// If we hit an empty item, don't add it to the list
if (!URL.length) return list;
// If we don't support the protocol in question
if (!crawler.protocolSupported(URL)) return list;
// Does the item already exist in the list?
@@ -398,9 +414,9 @@ if (resources.reduce(function(prev,current) {
crawlerHost = crawler.host;
// If we're ignoring the WWW domain, remove the WWW for comparisons...
if (crawler.ignoreWWWDomain)
host = host.replace(/^www\./i,"");
function domainInWhitelist(host) {
// If there's no whitelist, or the whitelist is of zero length,
@@ -410,12 +426,12 @@ // just return false.
!crawler.domainWhitelist.length) return false;
// Otherwise, scan through it.
return !!crawler.domainWhitelist.reduce(function(prev,cur,index,array) {
// If we already located the relevant domain in the whitelist...
if (prev) return prev;
// If the domain is just equal, return true.
if (host === cur) return true;
// If we're ignoring WWW subdomains, and both domains,
@@ -425,3 +441,3 @@ // less www. are the same, return true.
return true;
// Otherwise, sorry. No dice.
@@ -434,7 +450,7 @@ return false;
function isSubdomainOf(subdomain,host) {
// Comparisons must be case-insensitive
subdomain = subdomain.toLowerCase();
host = host.toLowerCase();
// If we're ignoring www, remove it from both
@@ -446,3 +462,3 @@ // (if www is the first domain component...)
}
// They should be the same flipped around!
@@ -500,3 +516,3 @@ return (
resources.forEach(function(url){ crawler.queueURL(url,queueItem); });
return this;
@@ -525,3 +541,3 @@ };
typeof(url) === "object" ? url : crawler.processURL(url,queueItem);
// URL Parser decided this URL was junky. Next please!
@@ -542,3 +558,3 @@ if (!parsedURL) {
}
// Check the domain is valid before adding it to the queue
@@ -568,3 +584,3 @@ if (crawler.domainValid(parsedURL.host)) {
}
return true;
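// Editor's illustrative sketch (not part of this diff): queueURL() can also be
// called by hand, for example from an event handler, to add URLs the crawler
// would not discover on its own. The second argument is the queue item that
// provides context for resolving relative URLs; `crawler` is an instance as
// constructed earlier.
crawler.on("fetchcomplete", function(queueItem) {
    var queued = crawler.queueURL("http://example.com/extra-page", queueItem);
    if (!queued) console.log("URL was rejected or already queued");
});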
@@ -577,5 +593,5 @@ };
It kicks off resource discovery and queues any new resources found.
queueItem - Queue item to be fetched.
Emits
@@ -602,6 +618,3 @@ fetchstart
crawler.openRequests ++;
// Emit fetchstart event
crawler.emit("fetchstart",queueItem);
// Variable declarations
@@ -611,10 +624,3 @@ var fetchData = false,
clientRequest,
timeCommenced,
timeHeadersReceived,
timeDataReceived,
parsedURL,
responseBuffer,
responseLength,
responseLengthReceived,
contentType;
timeCommenced;
@@ -647,5 +653,13 @@ // Mark as spooled
if(crawler.needsAuth) {
// Add cookie header from cookie jar if we're configured to
// send/accept cookies
if (crawler.acceptCookies && crawler.cookies.getAsHeader()) {
requestOptions.headers.cookie =
crawler.cookies.getAsHeader(requestHost,requestPath);
}
// Add auth headers if we need them
if (crawler.needsAuth) {
var auth = crawler.authUser + ":" + crawler.authPass;
// Generate auth header
@@ -656,189 +670,252 @@ auth = 'Basic ' + (new Buffer(auth).toString('base64'));
// Record what time we started this request
timeCommenced = (new Date().getTime());
// And if we've got any custom headers available
if (crawler.customHeaders) {
for (var header in crawler.customHeaders) {
if (!crawler.customHeaders.hasOwnProperty(header)) continue;
requestOptions.headers[header] = crawler.customHeaders[header];
}
}
// Emit fetchstart event - gives the user time to mangle the request options
// if required.
crawler.emit("fetchstart",queueItem,requestOptions);
// Get the resource!
clientRequest = client.get(requestOptions,function(response) {
var dataReceived = false,
stateData = queueItem.stateData;
process.nextTick(function() {
// Record what time we started this request
timeCommenced = Date.now();
responseLengthReceived = 0;
// Get the resource!
clientRequest =
client.get(requestOptions,function(response) {
crawler.handleResponse(queueItem,response,timeCommenced);
});
clientRequest.on("error",function(errorData) {
crawler.openRequests --;
// Emit 5xx / 4xx event
crawler.emit("fetchclienterror",queueItem,errorData);
queueItem.fetched = true;
queueItem.stateData.code = 599;
queueItem.status = "failed";
});
return crawler;
});
};
// Record what time we first received the header information
timeHeadersReceived = (new Date().getTime());
responseLength = parseInt(response.headers["content-length"],10);
responseLength = !isNaN(responseLength) ? responseLength : 0;
/*
Public: Given a queueItem and a matching response object, the crawler will
handle downloading the resource, queueing of linked items, etc.
Examples
// Passing in a response from `request`
request(queueItem.url,function(err,res,body) {
crawler.handleResponse(queueItem,res);
});
Returns the crawler object for chaining.
*/
// Save timing and some content header information into the queue
stateData.requestLatency = (timeHeadersReceived - timeCommenced);
stateData.requestTime = (timeHeadersReceived - timeCommenced);
stateData.contentLength = responseLength;
stateData.contentType = contentType = response.headers["content-type"];
stateData.code = response.statusCode;
// Save entire headers, in less scannable way
stateData.headers = response.headers;
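// Editor's illustrative sketch (not part of this diff), expanding the docblock
// example above: a response fetched outside the crawler can be handed to
// handleResponse(). It assumes the third-party `request` module is installed
// (it is not a dependency of this package) and that `crawler` and `queueItem`
// already exist.
var request = require("request");
request(queueItem.url, function(err, res, body) {
    if (!err) crawler.handleResponse(queueItem, res);
});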
Crawler.prototype.handleResponse = function(queueItem,response,timeCommenced) {
var crawler = this,
dataReceived = false,
timeHeadersReceived,
timeDataReceived,
parsedURL,
responseBuffer,
responseLength,
responseLengthReceived = 0,
contentType,
stateData = queueItem.stateData;
// Record what time we first received the header information
timeHeadersReceived = Date.now();
// If we weren't passed a time of commencement, assume Now()
timeCommenced = timeCommenced || Date.now();
// Emit header receive event
crawler.emit("fetchheaders",queueItem,response);
responseLength = parseInt(response.headers["content-length"],10);
responseLength = !isNaN(responseLength) ? responseLength : 0;
// Ensure response length is reasonable...
responseLength =
responseLength > 0 ? responseLength : crawler.maxResourceSize;
queueItem.stateData.contentLength = responseLength;
// Save timing and some content header information into the queue
stateData.requestLatency = (timeHeadersReceived - timeCommenced);
stateData.requestTime = (timeHeadersReceived - timeCommenced);
stateData.contentLength = responseLength;
stateData.contentType = contentType = response.headers["content-type"];
stateData.code = response.statusCode;
stateData.headers = response.headers;
// Do we need to save cookies? Were we sent any?
if (crawler.acceptCookies &&
response.headers.hasOwnProperty('set-cookie'))
crawler.cookies.addFromHeaders(response.headers["set-cookie"]);
// Emit header receive event
crawler.emit("fetchheaders",queueItem,response);
// Function for dealing with 200 responses
function processReceivedData() {
if (queueItem.fetched) return;
timeDataReceived = (new Date().getTime());
// Ensure response length is reasonable...
responseLength =
responseLength > 0 ? responseLength : crawler.maxResourceSize;
queueItem.fetched = true;
queueItem.status = "downloaded";
// Save state information
stateData.downloadTime = (timeDataReceived - timeHeadersReceived);
stateData.requestTime = (timeDataReceived - timeCommenced);
stateData.actualDataSize = responseBuffer.length;
stateData.sentIncorrectSize = responseBuffer.length !== responseLength;
queueItem.stateData.contentLength = responseLength;
crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
// Function for dealing with 200 responses
function processReceivedData() {
if (queueItem.fetched) return;
// First, save item to cache (if we're using a cache!)
if (crawler.cache !== null &&
crawler.cache.setCacheData instanceof Function) {
crawler.cache.setCacheData(queueItem,responseBuffer);
}
timeDataReceived = (new Date().getTime());
// We only process the item if it's of a valid mimetype
// and only if the crawler is set to discover its own resources
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
crawler.queueLinkedItems(responseBuffer,queueItem);
}
crawler.openRequests --;
queueItem.fetched = true;
queueItem.status = "downloaded";
// Save state information
stateData.downloadTime = (timeDataReceived - timeHeadersReceived);
stateData.requestTime = (timeDataReceived - timeCommenced);
stateData.actualDataSize = responseBuffer.length;
stateData.sentIncorrectSize = responseBuffer.length !== responseLength;
crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
// First, save item to cache (if we're using a cache!)
if (crawler.cache !== null &&
crawler.cache.setCacheData instanceof Function) {
crawler.cache.setCacheData(queueItem,responseBuffer);
}
function receiveData(chunk) {
if (chunk && chunk.length && !dataReceived) {
if (responseLengthReceived + chunk.length > responseBuffer.length) {
// Oh dear. We've been sent more data than we were initially told.
// This could be a mis-calculation, or a streaming resource.
// Let's increase the size of our buffer to match, as long as it isn't
// larger than our maximum resource size.
// We only process the item if it's of a valid mimetype
// and only if the crawler is set to discover its own resources
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
crawler.queueLinkedItems(responseBuffer,queueItem);
}
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) {
// Start by creating a new buffer, which will be our main buffer going forward...
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length);
crawler.openRequests --;
}
// Copy all our old data into it...
responseBuffer.copy(tmpNewBuffer,0,0,responseBuffer.length);
function receiveData(chunk) {
if (chunk && chunk.length && !dataReceived) {
if (responseLengthReceived + chunk.length > responseBuffer.length) {
// Oh dear. We've been sent more data than we were initially told.
// This could be a mis-calculation, or a streaming resource.
// Let's increase the size of our buffer to match, as long as it isn't
// larger than our maximum resource size.
// And now the new chunk
chunk.copy(tmpNewBuffer,responseBuffer.length,0,chunk.length);
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) {
// Start by creating a new buffer, which will be our main
// buffer from now on...
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length);
// And now make the response buffer our new buffer, leaving the original for GC
responseBuffer = tmpNewBuffer;
// Copy all our old data into it...
responseBuffer.copy(tmpNewBuffer,0,0,responseBuffer.length);
} else {
// Oh dear oh dear! The response is not only more data than we were initially told,
// but it also exceeds the maximum amount of data we're prepared to download per resource.
// Throw error event and ignore.
//
// We'll then deal with the data that we have.
// And now the new chunk
chunk.copy(tmpNewBuffer,responseBuffer.length,0,chunk.length);
crawler.emit("fetchdataerror",queueItem,response);
}
// And now make the response buffer our new buffer,
// leaving the original for GC
responseBuffer = tmpNewBuffer;
} else {
// Copy the chunk data into our main buffer
chunk.copy(responseBuffer,responseLengthReceived,0,chunk.length);
// Oh dear oh dear! The response is not only more data
// than we were initially told, but it also exceeds the
// maximum amount of data we're prepared to download per
// resource.
//
// Throw error event and ignore.
//
// We'll then deal with the data that we have.
crawler.emit("fetchdataerror",queueItem,response);
}
// Increment our data received counter
responseLengthReceived += chunk.length;
} else {
// Copy the chunk data into our main buffer
chunk.copy(responseBuffer,responseLengthReceived,0,chunk.length);
}
// Increment our data received counter
responseLengthReceived += chunk.length;
}
if ((responseLengthReceived >= responseLength || response.complete) && !dataReceived) {
// Slice the buffer to chop off any unused space
responseBuffer = responseBuffer.slice(0,responseLengthReceived);
dataReceived = true;
processReceivedData();
}
}
// If we should just go ahead and get the data
if (response.statusCode >= 200 && response.statusCode < 300 && responseLength <= crawler.maxResourceSize) {
queueItem.status = "headers";
if ((responseLengthReceived >= responseLength || response.complete) &&
!dataReceived) {
// Create a buffer with our response length
responseBuffer = new Buffer(responseLength);
// Slice the buffer to chop off any unused space
responseBuffer = responseBuffer.slice(0,responseLengthReceived);
response.on("data",receiveData);
response.on("end",receiveData);
dataReceived = true;
processReceivedData();
}
}
// We've got a not-modified response back
} else if (response.statusCode === 304) {
// If we should just go ahead and get the data
if (response.statusCode >= 200 && response.statusCode < 300 &&
responseLength <= crawler.maxResourceSize) {
queueItem.status = "headers";
if (crawler.cache !== null && crawler.cache.getCacheData) {
// We've got access to a cache
crawler.cache.getCacheData(queueItem,function(cacheObject) {
crawler.emit("notmodified",queueItem,response,cacheObject);
});
} else {
// Emit notmodified event. We don't have a cache available, so we don't send any data.
crawler.emit("notmodified",queueItem,response);
}
// Create a buffer with our response length
responseBuffer = new Buffer(responseLength);
// If we should queue a redirect
} else if (response.statusCode >= 300 && response.statusCode < 400 && response.headers.location) {
queueItem.fetched = true;
queueItem.status = "redirected";
// Parse the redirect URL ready for adding to the queue...
parsedURL = crawler.processURL(response.headers.location,queueItem);
response.on("data",receiveData);
response.on("end",receiveData);
// Emit redirect event
crawler.emit("fetchredirect",queueItem,parsedURL,response);
// We've got a not-modified response back
} else if (response.statusCode === 304) {
// Clean URL, add to queue...
crawler.queueURL(parsedURL,queueItem);
if (crawler.cache !== null && crawler.cache.getCacheData) {
// We've got access to a cache
crawler.cache.getCacheData(queueItem,function(cacheObject) {
crawler.emit("notmodified",queueItem,response,cacheObject);
});
} else {
// Emit notmodified event. We don't have a cache available, so
// we don't send any data.
crawler.emit("notmodified",queueItem,response);
}
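// Editor's illustrative sketch (not part of this diff): client code can react
// to the event emitted above; the cacheObject argument is only supplied when a
// cache backend is configured on the crawler.
crawler.on("notmodified", function(queueItem, response, cacheObject) {
    if (cacheObject) {
        console.log("Not modified, cached copy available for", queueItem.url);
    } else {
        console.log("Not modified:", queueItem.url);
    }
});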
crawler.openRequests --;
// If we should queue a redirect
} else if (response.statusCode >= 300 && response.statusCode < 400 &&
response.headers.location) {
queueItem.fetched = true;
queueItem.status = "redirected";
// Ignore this request, but record that we had a 404
} else if (response.statusCode === 404) {
queueItem.fetched = true;
queueItem.status = "notfound";
// Parse the redirect URL ready for adding to the queue...
parsedURL = crawler.processURL(response.headers.location,queueItem);
// Emit 404 event
crawler.emit("fetch404",queueItem,response);
// Emit redirect event
crawler.emit("fetchredirect",queueItem,parsedURL,response);
crawler.openRequests --;
// Clean URL, add to queue...
crawler.queueURL(parsedURL,queueItem);
// And oh dear. Handle this one as well. (other 400s, 500s, etc)
} else {
queueItem.fetched = true;
queueItem.status = "failed";
crawler.openRequests --;
// Emit 5xx / 4xx event
crawler.emit("fetcherror",queueItem,response);
// Ignore this request, but record that we had a 404
} else if (response.statusCode === 404) {
queueItem.fetched = true;
queueItem.status = "notfound";
crawler.openRequests --;
}
});
// Emit 404 event
crawler.emit("fetch404",queueItem,response);
clientRequest.on("error",function(errorData) {
crawler.openRequests --;
// Emit 5xx / 4xx event
crawler.emit("fetchclienterror",queueItem,errorData);
// And oh dear. Handle this one as well. (other 400s, 500s, etc)
} else {
queueItem.fetched = true;
queueItem.stateData.code = 599;
queueItem.status = "failed";
});
// Emit 5xx / 4xx event
crawler.emit("fetcherror",queueItem,response);
crawler.openRequests --;
}
@@ -853,3 +930,3 @@ return crawler;
to do so and there are unfetched items in the queue.
Examples
@@ -864,5 +941,5 @@
var crawler = this;
if (crawler.openRequests > crawler.maxConcurrency) return;
crawler.queue.oldestUnfetchedItem(function(err,queueItem) {
@@ -880,3 +957,3 @@ if (queueItem) {
});
return crawler;
@@ -941,3 +1018,3 @@ };
Returns true if the fetch condition was removed, and throws an error if it
could not be found.
@@ -949,3 +1026,3 @@
this.fetchConditions[index] instanceof Function) {
return !!this.fetchConditions.splice(index,1);
@@ -952,0 +1029,0 @@ } else {
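// Editor's illustrative sketch (not part of this diff): how the fetch-condition
// API fits together. addFetchCondition() is not shown in this diff; it is
// assumed here to return the index that removeFetchCondition() expects, and the
// parsedURL.path property is likewise an assumption.
var conditionIndex = crawler.addFetchCondition(function(parsedURL) {
    // Refuse to fetch anything that looks like a PDF
    return !parsedURL.path.match(/\.pdf$/i);
});
// ...later, stop filtering:
crawler.removeFetchCondition(conditionIndex);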
package.json (the only change between 0.1.7 and 0.2.0 is the version field):
{
  "name": "simplecrawler",
  "description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
  "version": "0.2.0",
  "homepage": "http://github.com/cgiffard/node-simplecrawler",
  "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
  "keywords": [
    "crawler",
    "spider",
    "cache",
    "queue",
    "simplecrawler",
    "eventemitter"
  ],
  "scripts": {
    "test": "mocha -R spec"
  },
  "bin": {
    "crawl": "./lib/cli.js"
  },
  "repository": {
    "type": "git",
    "url": "http://github.com/cgiffard/node-simplecrawler.git"
  },
  "bugs": {
    "url": "https://github.com/cgiffard/node-simplecrawler/issues"
  },
  "main": "./lib/index.js",
  "engines": {
    "node": ">=0.4.0"
  },
  "devDependencies": {
    "mocha": "~1.8.1",
    "jshint": "~0.7.x",
    "chai": "~1.2.0"
  },
  "dependencies": {
    "iconv": "~1.2.4",
    "URIjs": "~1.8.3"
  }
}
@@ -18,2 +18,3 @@ // Tests to ensure crawler code is well formed
"cli",
"cookies",
"crawler",
@@ -20,0 +21,0 @@ "index",