simplecrawler
Comparing version 0.5.3 to 0.5.4
lib/cache-backend-fs.js

/*
 * Simplecrawler - FS cache backend
 * https://github.com/cgiffard/node-simplecrawler
 *
 * Copyright (c) 2011-2015, Christopher Giffard
 *
 */

// Tries to ensure a local 'cache' of a website is as close as possible to a mirror of the website itself.
// The idea is that it is then possible to re-serve the website just using the cache.

var fs = require("fs"),
    crypto = require("crypto");

// Factory for FSBackend
var backend = function backend(loadParameter) {
    return new FSBackend(loadParameter);
};
var FSBackend = function FSBackend(loadParameter) {
    this.loaded = false;
    this.index = [];
    this.location = typeof loadParameter === "string" && loadParameter.length > 0 ? loadParameter : process.cwd() + "/cache/";
    this.location = this.location.substr(this.location.length - 1) === "/" ? this.location : this.location + "/";
};
function sanitisePath(path, queueObject) {
    // Remove first slash (as we set one later.)
    path = path.replace(/^\//, "");

    var pathStack = [];

    // Trim whitespace. If no path is present - assume index.html.
    var sanitisedPath = path.length ? path.replace(/\s*$/ig, "") : "index.html";
    var headers = queueObject.stateData.headers, sanitisedPathParts;

    if (sanitisedPath.match(/\?/)) {
        sanitisedPathParts = sanitisedPath.split(/\?/g);
        var resource = sanitisedPathParts.shift();
        var hashedQS = crypto.createHash("sha1").update(sanitisedPathParts.join("?")).digest("hex");
        sanitisedPath = resource + "?" + hashedQS;
    }

    pathStack = sanitisedPath.split(/\//g);
    pathStack = pathStack.map(function(pathChunk) {
        if (pathChunk.length >= 250) {
            return crypto.createHash("sha1").update(pathChunk).digest("hex");
        }

        return pathChunk;
    });

    sanitisedPath = pathStack.join("/");

    // Try to get a file extension for the file - for ease of identification
    // We run through this if we either:
    // 1) haven't got a file extension at all, or:
    // 2) have an HTML file without an HTML file extension (might be .php, .aspx, .do, or some other server-processed type)

    if (!sanitisedPath.match(/\.[a-z0-9]{1,6}$/i) || headers["content-type"] && headers["content-type"].match(/text\/html/i) && !sanitisedPath.match(/\.htm[l]?$/i)) {
        var subMimeType = "";
        var mimeParts = [];

        if (headers["content-type"] && headers["content-type"].match(/text\/html/i)) {
            if (sanitisedPath.match(/\/$/)) {
                sanitisedPath += "index.html";
            } else {
                sanitisedPath += ".html";
            }
        } else if (headers["content-type"] && (mimeParts = headers["content-type"].match(/(image|video|audio|application)\/([a-z0-9]+)/i))) {
            subMimeType = mimeParts[2];
            sanitisedPath += "." + subMimeType;
        }
    }

    return sanitisedPath;
}
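
// Illustrative sketch (not part of the module): how sanitisePath maps a fetched
// URL path onto a filesystem-safe path. The queue object below is a minimal
// hand-built stand-in for what the crawler normally supplies.
var exampleQueueObject = {
    stateData: {
        headers: { "content-type": "text/html; charset=utf-8" }
    }
};

// "/blog/post" has no file extension and an HTML content-type, so ".html" is
// appended; a querystring would be replaced by a SHA-1 hash of its contents.
console.log(sanitisePath("/blog/post", exampleQueueObject)); // -> "blog/post.html"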
FSBackend.prototype.fileExists = function(location) {
    try {
        fs.statSync(location);
        return true;
    } catch (er) {
        return false;
    }
};
FSBackend.prototype.isDirectory = function(location) {
    try {
        if (fs.statSync(location).isDirectory()) {
            return true;
        }

        return false;
    } catch (er) {
        return false;
    }
};
FSBackend.prototype.load = function() {
    var backend = this;

    if (!backend.fileExists(backend.location) && backend.isDirectory(backend.location)) {
        throw new Error("Unable to verify cache location exists.");
    }

    try {
        var fileData;
        if ((fileData = fs.readFileSync(backend.location + "cacheindex.json")) && fileData.length) {
            backend.index = JSON.parse(fileData.toString("utf8"));
            backend.loaded = true;
        }
    } catch (error) {
        if (error.code === "ENOENT") {
            // Cache index doesn't exist. Assume this is a new cache.
            // Just leave the memory index empty for now.
            backend.loaded = true;
        } else {
            throw error;
        }
    }

    // Flush store to disk when closing.
    process.on("exit", function() {
        backend.saveCache.apply(backend);
    });
};
FSBackend.prototype.saveCache = function(callback) {
    fs.writeFile(this.location + "cacheindex.json", JSON.stringify(this.index), callback);
};
FSBackend.prototype.setItem = function(queueObject, data, callback) {
    callback = callback instanceof Function ? callback : function() {};

    var backend = this;
    var pathStack = [queueObject.protocol, queueObject.host, queueObject.port];
    pathStack = pathStack.concat(sanitisePath(queueObject.path, queueObject).split(/\/+/g));

    var cacheItemExists = false;
    var firstInstanceIndex = NaN;
    if (backend.index.reduce(function(prev, current, index) {
        firstInstanceIndex = !isNaN(firstInstanceIndex) ? firstInstanceIndex : index;
        return prev || current.url === queueObject.url;
    }, false)) {
        cacheItemExists = true;
    }

    var writeFileData = function(currentPath, data) {
        fs.writeFile(currentPath, data, function(error) {
            if (error) {
                throw error;
            }
            fs.writeFile(currentPath + ".cacheData.json", JSON.stringify(queueObject), function(error) {
                if (error) {
                    throw error;
                }

                var cacheObject = {
                    url: queueObject.url,
                    etag: queueObject.stateData.headers.etag,
                    lastModified: queueObject.stateData.headers["last-modified"],
                    dataFile: currentPath,
                    metaFile: currentPath + ".cacheData.json"
                };

                if (cacheItemExists) {
                    backend.index[firstInstanceIndex] = cacheObject;
                } else {
                    backend.index.push(cacheObject);
                }

                callback(cacheObject);
            });
        });
    };

    pathStack.forEach(function(pathChunk, count) {
        var currentPath = backend.location + pathStack.slice(0, count + 1).join("/");
        if (backend.fileExists(backend.location + pathStack.slice(0, count + 1).join("/"))) {
            if (!backend.isDirectory(currentPath)) {
                if (count === pathStack.length - 1) {
                    // Just overwrite the file...
                    writeFileData(currentPath, data);
                } else {
                    throw new Error("Cache storage of resource (%s) blocked by file: %s", queueObject.url, currentPath);
                }
            }
        } else {
            if (count === pathStack.length - 1) {
                // Write the file data in
                writeFileData(currentPath, data);
            } else {
                fs.mkdirSync(currentPath);
            }
        }
    });
};
FSBackend.prototype.getItem = function(queueObject, callback) {
    var cacheItemResult = this.index.filter(function(item) {
        return item.url === queueObject.url;
    });

    if (cacheItemResult.length) {
        var cacheItem = cacheItemResult.shift();

        callback({
            url: cacheItem.url,
            etag: cacheItem.etag,
            lastModified: cacheItem.lastModified,
            getData: function(callback) {
                fs.readFile(cacheItem.dataFile, function(error, data) {
                    if (error) {
                        callback(error);
                        return false;
                    }

                    callback(null, data);
                });
            },
            getMetadata: function(callback) {
                fs.readFile(cacheItem.metaFile, function(error, data) {
                    if (error) {
                        callback(error);
                        return false;
                    }

                    callback(null, JSON.parse(data.toString("utf8")));
                });
            }
        });
    } else {
        callback(null);
    }

    return false;
};
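
// Usage sketch (illustrative only): driving the FS backend directly. It assumes
// this file exports the factory function above as module.exports (not shown in
// this excerpt) and that the cache directory already exists on disk.
var createFSBackend = require("./cache-backend-fs.js");
var store = createFSBackend("./cache/");

store.load(); // reads ./cache/cacheindex.json into memory, if present
store.getItem({ url: "http://example.com/" }, function(cacheItem) {
    // cacheItem is null on a miss; on a hit it exposes getData()/getMetadata()
    console.log(cacheItem);
});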
lib/cache.js

/*
 * Simplecrawler - cache module
 * https://github.com/cgiffard/node-simplecrawler
 *
 * Copyright (c) 2011-2015, Christopher Giffard
 *
 */

var fs = require("fs");
var EventEmitter = require("events").EventEmitter;
var FilesystemBackend = require("./cache-backend-fs.js");
// var RedisBackend = require("cache-backend-redis.js");
// var MongoBackend = require("cache-backend-mongo.js");

// Init cache wrapper for backend...
var Cache = function Cache(cacheLoadParameter, cacheBackend) {

    // Ensure parameters are how we want them...
    cacheBackend = typeof cacheBackend === "object" ? cacheBackend : FilesystemBackend;
    cacheLoadParameter = cacheLoadParameter instanceof Array ? cacheLoadParameter : [cacheLoadParameter];

    // Now we can just run the factory.
    this.datastore = cacheBackend.apply(cacheBackend, cacheLoadParameter);

    // Instruct the backend to load up.
    this.datastore.load();
};
// Set up data import and export functions
Cache.prototype.setCacheData = function(queueObject, data, callback) {
    this.datastore.setItem(queueObject, data, callback);
    this.emit("setcache", queueObject, data);
};

Cache.prototype.getCacheData = function(queueObject, callback) {
    this.datastore.getItem(queueObject, callback);
};

Cache.prototype.saveCache = function() {
    this.datastore.saveCache();
};
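
// Usage sketch (illustrative only): the wrapper defaults to the filesystem
// backend when no backend is supplied, so a single load parameter is enough.
// The queueObject below is a trimmed-down stand-in for a real queue item, and
// the emit() call above assumes Cache inherits EventEmitter elsewhere in the file.
var cache = new Cache("./cache/");

cache.setCacheData(
    { url: "http://example.com/", protocol: "http", host: "example.com", port: 80,
      path: "/", stateData: { headers: { "content-type": "text/html" } } },
    new Buffer("<html></html>"),
    function(cacheObject) {
        console.log("stored", cacheObject.dataFile);
    });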
lib/cookies.js

/*
 * Simplecrawler - Cookie Jar Functionality
 * https://github.com/cgiffard/node-simplecrawler
 *
 * Copyright (c) 2011-2015, Christopher Giffard
 *
 */

var EventEmitter = require("events").EventEmitter,
    util = require("util");

/*
    Public: Constructor for the cookie jar.

    Examples

        var cookieJar = new CookieJar();

    Returns the cookie jar object which has now been constructed.
*/
function CookieJar() {
    var cookies = [];
    this.__defineGetter__("cookies", function() {
        return cookies;
    });

    // Run the EventEmitter constructor
    EventEmitter.call(this);
}

util.inherits(CookieJar, EventEmitter);
/*
    Public: Adds a new cookie to the jar, either by creating a new Cookie() object
    from specific details such as name, value, etc., accepting a string from a
    Set-Cookie header, or by passing in an existing Cookie() object.

    name     - The name of the cookie to add. Alternately, set-cookie
               header as string, or an existing cookie object.
    value    - The value of the cookie.
    expiry   - Expiry timestamp in milliseconds.
    path     - Limit cookie to path (defaults to "/")
    domain   - Limit cookie to domain
    httponly - Boolean value specifying httponly
    cb       - Optional callback.

    Emits

        addcookie - Emitted with new cookie object as an argument.

    Examples

        cookieJar.add("mycookie","myValue",Date.now(),"/","test.com",false);

    Returns the cookie jar object for chaining.
*/
CookieJar.prototype.add = function(name, value, expiry, path, domain, httponly, cb) {

    var existingIndex = -1, newCookie;

    if (arguments.length > 1) {
        newCookie = new Cookie(name, value, expiry, path, domain, httponly);
    } else if (name instanceof Cookie) {
        newCookie = name;
    } else {
        newCookie = Cookie.fromString(name);
    }

    // Are we updating an existing cookie or adding a new one?
    this.cookies.forEach(function(cookie, index) {
        if (cookie.name === newCookie.name &&
            cookie.matchDomain(newCookie.domain)) {

            existingIndex = index;
        }
    });

    if (existingIndex < 0) {
        this.cookies.push(newCookie);
    } else {
        this.cookies[existingIndex] = newCookie;
    }

    this.emit("addcookie", newCookie);

    if (cb && cb instanceof Function) {
        cb(null, newCookie);
    }

    return this;
};
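
// Illustrative example: the three accepted call styles for add(). The header
// string form is what gets used when a raw Set-Cookie header is passed in.
var jar = new CookieJar();

jar.add("session", "abc123", Date.now() + 3600000, "/", "example.com", false);
jar.add("token=xyz; Path=/; Domain=example.com; Httponly");
jar.add(new Cookie("theme", "dark"));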
/*
    Public: Removes cookies from the cookie jar. If no domain and name are
    specified, all cookies in the jar are removed.

    name   - The name of the cookie(s) to remove
    domain - The domain from which to remove cookies.
    cb     - Optional callback.

    Emits

        removecookie - Emitted with array of removed cookies.

    Examples

        cookieJar.remove(null,"nytimes.com");

    Returns an array of removed cookies.
*/
CookieJar.prototype.remove = function(name, domain, cb) {
    var cookiesRemoved = [],
        jar = this;

    jar.cookies.forEach(function(cookie, index) {

        // If the names don't match, we're not removing this cookie
        if (!!name && cookie.name !== name) {
            return false;
        }

        // If the domains don't match, we're not removing this cookie
        if (!!domain && !cookie.matchDomain(domain)) {
            return false;
        }

        // Matched. Remove!
        cookiesRemoved.push(jar.cookies.splice(index, 1));
    });

    jar.emit("removecookie", cookiesRemoved);

    if (cb && cb instanceof Function) {
        cb(null, cookiesRemoved);
    }

    return cookiesRemoved;
};
/*
    Public: Gets an array of cookies based on name and domain.

    name   - The name of the cookie(s) to retrieve
    domain - The domain from which to retrieve cookies.
    cb     - Optional callback.

    Examples

        cookieJar.get(null,"nytimes.com");

    Returns an array of cookies.
*/
CookieJar.prototype.get = function(name, domain, cb) {

    var cookies = this.cookies.filter(function(cookie) {

        // If the names don't match, we're not returning this cookie
        if (!!name && cookie.name !== name) {
            return false;
        }

        // If the domains don't match, we're not returning this cookie
        if (!!domain && !cookie.matchDomain(domain)) {
            return false;
        }

        return true;
    });

    if (cb && cb instanceof Function) {
        cb(null, cookies);
    }

    return cookies;
};
/*
    Public: Generates an array of headers based on the value of the cookie jar.

    domain - The domain from which to generate cookies.
    path   - Filter headers to cookies applicable to this path.
    cb     - Optional callback.

    Examples

        cookieJar.getAsHeader("nytimes.com","/myaccount");

    Returns an array of cookie headers.
*/
CookieJar.prototype.getAsHeader = function(domain, path, cb) {

    var headers =
        this.cookies.filter(function(cookie) {
            if (cookie.isExpired()) {
                return false;
            }
            if (!domain && !path) {
                return true;
            }
            if (domain) {
                return cookie.matchDomain(domain);
            }
            if (path) {
                return cookie.matchPath(path);
            }
        })
        .map(function(cookie) {
            return cookie.toString();
        });

    if (cb && cb instanceof Function) {
        cb(null, headers);
    }

    return headers;
};
/*
    Public: Adds cookies to the cookie jar based on an array of 'set-cookie'
    headers provided by a webserver. Duplicate cookies are overwritten.

    headers - An array of 'set-cookie' headers
    cb      - Optional callback.

    Examples

        cookieJar.addFromHeaders(res.headers["set-cookie"]);

    Returns the cookie jar for chaining.
*/
CookieJar.prototype.addFromHeaders = function(headers, cb) {
    var jar = this;

    if (!(headers instanceof Array)) {
        headers = [headers];
    }

    headers.forEach(function(header) {
        jar.add(header);
    });

    if (cb && cb instanceof Function) {
        cb(jar);
    }

    return jar;
};
/*
    Public: Outputs a linefeed-separated list of set-cookie headers representing
    the entire contents of the cookie jar.

    Examples

        cookieJar.toString();

    Returns a list of headers in string form.
*/
CookieJar.prototype.toString = function() {
    return this.getAsHeader().join("\n");
};
/*
    Public: Constructor for the Cookie() object: create a new cookie.

    name     - The name of the cookie to add.
    value    - The value of the cookie.
    expires  - Expiry timestamp in milliseconds.
    path     - Limit cookie to path (defaults to "/")
    domain   - Limit cookie to domain
    httponly - Boolean value specifying httponly

    Examples

        var myCookie = new Cookie("mycookie","myValue",Date.now(),"/","test.com",false);

    Returns the newly created Cookie object.
*/
function Cookie(name, value, expires, path, domain, httponly) {

    if (!name) {
        throw new Error("A name is required to create a cookie.");
    }

    // Parse date to timestamp - consider it never expiring if timestamp is not
    // passed to the function
    if (expires) {

        if (typeof expires !== "number") {
            expires = (new Date(expires)).getTime();
        }

    } else {
        expires = -1;
    }

    this.name = name;
    this.value = value || "";
    this.expires = expires;
    this.path = path || "/";
    this.domain = domain || "*";
    this.httponly = !!httponly;
}
/*
    Public, Static: Returns a new Cookie() object based on a header string.

    string - A set-cookie header string

    Examples

        var myCookie = Cookie.fromString(response.headers["set-cookie"][0]);

    Returns the newly created Cookie object.
*/
Cookie.fromString = function(string) {

    if (!string || typeof string !== "string") {
        throw new Error("String must be supplied to generate a cookie.");
    }

    function parseKeyVal(input) {
        var key = input.split(/\=/).shift(),
            val = input.split(/\=/).slice(1).join("=");

        return [key, val];
    }

    string = string.replace(/^\s*set\-cookie\s*\:\s*/i, "");

    var parts = string.split(/\s*\;\s*/i),
        name = parseKeyVal(parts.shift()),
        keyValParts = {};

    keyValParts.name = name[0];
    keyValParts.value = name[1];

    parts
        .filter(function(input) {
            return !!input.replace(/\s+/ig, "").length;
        })
        .map(parseKeyVal)
        .forEach(function(keyval) {
            var key = String(keyval[0]).toLowerCase().replace(/[^a-z0-9]/ig, "");
            keyValParts[key] = keyval[1];
        });

    return new Cookie(
        keyValParts.name,
        keyValParts.value,
        keyValParts.expires || keyValParts.expiry,
        keyValParts.path,
        keyValParts.domain,
        keyValParts.hasOwnProperty("httponly")
    );
};
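
// Illustrative example: parsing a raw Set-Cookie header into a Cookie object.
// Attribute names are lower-cased and stripped of punctuation, so "Max-Age"
// would arrive as the key "maxage" (and is ignored by the constructor).
var parsed = Cookie.fromString("sid=31d4d96e407aad42; Path=/; Domain=example.com; HttpOnly");

console.log(parsed.name);     // "sid"
console.log(parsed.domain);   // "example.com"
console.log(parsed.httponly); // true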
/*
    Public: Outputs the cookie as a string, in the form of a set-cookie header.

    includeHeader - Boolean value specifying whether to include the
                    'Set-Cookie: ' header name at the beginning of the
                    string.

    Examples

        var header = myCookie.toString(true);

    Returns the header string.
*/
Cookie.prototype.toString = function(includeHeader) {
    var string = "";

    if (includeHeader) {
        string = "Set-Cookie: ";
    }

    string += this.name + "=" + this.value + "; ";

    if (this.expires > 0) {
        string += "Expires=" + (new Date(this.expires)).toGMTString() + "; ";
    }

    if (this.path) {
        string += "Path=" + this.path + "; ";
    }

    if (this.domain) {
        string += "Domain=" + this.domain + "; ";
    }

    if (this.httponly) {
        string += "Httponly; ";
    }

    return string;
};
/*
    Public: Determines whether a cookie has expired or not.

    Examples

        if (myCookie.isExpired()) { ... }

    Returns a boolean value specifying whether the cookie has expired (true) or
    whether it is still valid (false.)
*/
Cookie.prototype.isExpired = function() {
    if (this.expires < 0) {
        return false;
    }
    return this.expires < Date.now();
};
/*
    Public: Determines whether a cookie matches a given domain.

    Examples

        if (myCookie.matchDomain("example.com")) { ... }

    Returns a boolean value specifying whether the cookie matches (true) or
    doesn't match (false.)
*/
Cookie.prototype.matchDomain = function(domain) {
    var reverseDomain = this.domain.split("").reverse().join(""),
        reverseDomainComp = domain.split("").reverse().join("");

    return reverseDomain.indexOf(reverseDomainComp) === 0;
};
/*
    Public: Determines whether a cookie matches a given path.

    Examples

        if (myCookie.matchPath("/test/account")) { ... }

    Returns a boolean value specifying whether the cookie matches (true) or
    doesn't match (false.)
*/
Cookie.prototype.matchPath = function(path) {
    if (!this.path) {
        return true;
    }

    return path.indexOf(this.path) === 0;
};
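
// Illustrative example: matchDomain() compares the two domains reversed, so it
// returns true when the cookie's domain ends with the domain being queried,
// while matchPath() is a simple prefix match.
var scoped = new Cookie("pref", "1", null, "/account", "www.example.com");

console.log(scoped.matchDomain("example.com"));   // true  ("www.example.com" ends with "example.com")
console.log(scoped.matchDomain("another.com"));   // false
console.log(scoped.matchPath("/account/orders")); // true  ("/account" is a prefix)
console.log(scoped.matchPath("/"));               // false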
lib/crawler.js
/*
 * Simplecrawler
 * https://github.com/cgiffard/node-simplecrawler
 *
 * Copyright (c) 2011-2015, Christopher Giffard
 *
 */

// Queue Dependency
var FetchQueue = require("./queue.js"),
    CookieJar = require("./cookies.js"),
    MetaInfo = require("../package.json");

var http = require("http"),
    https = require("https"),
    EventEmitter = require("events").EventEmitter,
    uri = require("urijs"),
    zlib = require("zlib"),
    util = require("util");
var QUEUE_ITEM_INITIAL_DEPTH = 1;

/*
    Public: Constructor for the crawler.

    host        - Initial hostname/domain to begin crawling from. By
                  default, the crawl will be locked to this hostname.
    initialPath - Initial path to begin crawling from.
    initialPort - Port to begin crawling from.
    interval    - Request interval for the crawler. Defaults to 250ms.

    Examples

        var crawler = new Crawler("example.com","/",80,500);

        var crawler = new Crawler("example.com");

    Returns the crawler object which has now been constructed.
*/
var Crawler = function(host, initialPath, initialPort, interval) {
    var crawler = this;

    // Data integrity checks
    if (initialPort && isNaN(initialPort)) {
        throw new Error("Port must be a number!");
    }

    // SETTINGS TO STUFF WITH
    // (not here! Do it when you create a `new Crawler()`)

    // Domain to crawl
    crawler.host = host || "";

    // Gotta start crawling *somewhere*
    crawler.initialPath = initialPath || "/";
    crawler.initialPort = initialPort || 80;
    crawler.initialProtocol = "http";

    // Internal 'tick' interval for spawning new requests
    // (as long as concurrency is under cap)
    // One request will be spooled per tick, up to the concurrency threshold.
    crawler.interval = interval || 250;

    // Maximum request concurrency. Be sensible. Five ties in with node's
    // default maxSockets value.
    crawler.maxConcurrency = 5;

    // Maximum time we'll wait for headers
    crawler.timeout = 5 * 60 * 1000;

    // Maximum time we'll wait for async listeners.
    crawler.listenerTTL = 10 * 1000;

    // User Agent
    crawler.userAgent =
        "Node/" + MetaInfo.name + " " + MetaInfo.version +
        " (" + MetaInfo.repository.url + ")";

    // Queue for requests - FetchQueue gives us stats and other sugar
    // (but it's basically just an array)
    crawler.queue = new FetchQueue();

    // Do we filter by domain?
    // Unless you want to be crawling the entire internet, I would
    // recommend leaving this on!
    crawler.filterByDomain = true;

    // Do we scan subdomains?
    crawler.scanSubdomains = false;

    // Treat WWW subdomain the same as the main domain (and don't count
    // it as a separate subdomain)
    crawler.ignoreWWWDomain = true;

    // Or go even further and strip WWW subdomain from domains altogether!
    crawler.stripWWWDomain = false;

    // Internal cachestore
    crawler.cache = null;

    // Use an HTTP Proxy?
    crawler.useProxy = false;
    crawler.proxyHostname = "127.0.0.1";
    crawler.proxyPort = 8123;
    crawler.proxyUser = null;
    crawler.proxyPass = null;

    // Support for HTTP basic auth
    crawler.needsAuth = false;
    crawler.authUser = "";
    crawler.authPass = "";

    // Support for retaining cookies for parse duration
    crawler.acceptCookies = true;
    crawler.cookies = new CookieJar();

    // Support for custom headers...
    crawler.customHeaders = {};

    // Domain Whitelist
    // We allow domains to be whitelisted, so cross-domain requests can be made.
    crawler.domainWhitelist = [];

    // Supported Protocols
    crawler.allowedProtocols = [
        /^http(s)?$/i,               // HTTP & HTTPS
        /^(rss|atom|feed)(\+xml)?$/i // RSS / XML
    ];

    // Max file size to download/store
    crawler.maxResourceSize = 1024 * 1024 * 16; // 16mb

    // Supported MIME-types
    // Matching MIME-types will be scanned for links
    crawler.supportedMimeTypes = [
        /^text\//i,
        /^application\/(rss|html|xhtml)?[\+\/\-]?xml/i,
        /^application\/javascript/i,
        /^xml/i
    ];

    // Download linked, but unsupported files (binary - images, documents, etc)
    crawler.downloadUnsupported = true;

    // URL Encoding setting...
    crawler.urlEncoding = "unicode";

    // Strip Querystring Parameters from URL
    crawler.stripQuerystring = false;

    // Regular expressions for finding URL items in HTML and text
    crawler.discoverRegex = [
        /\s?(?:href|src)\s?=\s?(["']).*?\1/ig,
        /\s?(?:href|src)\s?=\s?[^"'][^\s>]+/ig,
        /\s?url\((["']).*?\1\)/ig,
        /\s?url\([^"'].*?\)/ig,

        // This could easily duplicate matches above, e.g. in the case of
        // href="http://example.com"
        /http(s)?\:\/\/[^?\s><\'\"]+/ig,

        // This might be a bit of a gamble... but get hard-coded
        // strings out of javacript: URLs. They're often popup-image
        // or preview windows, which would otherwise be unavailable to us.
        // Worst case scenario is we make some junky requests.
        /^javascript\:[a-z0-9\$\_\.]+\(['"][^'"\s]+/ig
    ];

    // Whether to parse inside HTML comments
    crawler.parseHTMLComments = true;

    // Whether to parse inside script tags
    crawler.parseScriptTags = true;

    // Max depth parameter
    crawler.maxDepth = 0;

    // Whether to allow 'resources' greater than the max depth to be downloaded
    crawler.fetchWhitelistedMimeTypesBelowMaxDepth = false;

    // Ignore invalid SSL certificates
    crawler.ignoreInvalidSSL = false;

    // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
    var hiddenProps = {
        _openRequests: 0,
        _fetchConditions: [],
        _openListeners: 0
    };

    // Run the EventEmitter constructor
    EventEmitter.call(crawler);

    // Apply all the hidden props
    Object.keys(hiddenProps).forEach(function(key) {
        Object.defineProperty(crawler, key, {
            writable: true,
            enumerable: false,
            value: hiddenProps[key]
        });
    });
};

util.inherits(Crawler, EventEmitter);
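
// Usage sketch (illustrative only): construct a crawler and adjust a few of the
// settings above before starting. All of these properties are plain fields on
// the instance, so they can be overridden freely after construction.
var crawler = new Crawler("example.com", "/", 80, 500);

crawler.maxConcurrency = 2;      // stay well under the default socket pool
crawler.maxDepth = 3;            // 0 means "no depth limit"
crawler.stripQuerystring = true; // treat /page?a=1 and /page?a=2 as one resource
crawler.domainWhitelist = ["cdn.example.com"];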
/*
    Public: Starts or resumes the crawl. If the queue is empty, it adds a new
    queue item from which to begin crawling based on the initial configuration
    of the crawler itself. The crawler waits for process.nextTick to begin, so
    handlers and other properties can be altered or addressed before the crawl
    commences.

    Examples

        crawler.start();

    Returns the crawler object, to enable chaining.
*/
Crawler.prototype.start = function() {
    var crawler = this;

    // only if we haven't already got stuff in our queue...
    crawler.queue.getLength(function(err, length) {
        if (err) {
            throw err;
        }

        if (!length) {

            // Initialise our queue by pushing the initial request data into it...
            crawler.queue.add(
                crawler.initialProtocol,
                crawler.host,
                crawler.initialPort,
                crawler.initialPath,
                QUEUE_ITEM_INITIAL_DEPTH,
                function(error) {
                    if (error) {
                        throw error;
                    }
                });
        }

        crawler.crawlIntervalID =
            setInterval(
                function() {
                    crawler.crawl(crawler);
                },
                crawler.interval);

        crawler.emit("crawlstart");
        crawler.running = true;

        // Now kick off the initial crawl
        process.nextTick(function() {
            crawler.crawl();
        });
    });

    return crawler;
};
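
// Usage sketch (illustrative only): start() defers the first crawl to
// process.nextTick, so listeners attached immediately after calling it are
// still registered in time. The "fetchcomplete" event name is assumed from
// simplecrawler's documented API; only "crawlstart" is emitted in the code above.
crawler.start();

crawler.on("crawlstart", function() {
    console.log("crawl started against", crawler.host);
});

crawler.on("fetchcomplete", function(queueItem, responseBuffer) {
    console.log("fetched", queueItem.url, responseBuffer.length, "bytes");
});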
/*
    Public: Determines whether the protocol is supported, given a URL.

    URL - URL with a protocol, for testing.

    Examples

        crawler.protocolSupported("http://google.com/") // true, by default
        crawler.protocolSupported("wss://google.com/") // false, by default

    Returns a boolean, true if the protocol is supported - false if not.
*/
Crawler.prototype.protocolSupported = function(URL) {
    var protocol,
        crawler = this;

    try {
        protocol = uri(URL).protocol();

        // Unspecified protocol. Assume http
        if (!protocol) {
            protocol = "http";
        }

    } catch (e) {
        // If URIjs died, we definitely /do not/ support the protocol.
        return false;
    }

    return crawler.allowedProtocols.reduce(function(prev, protocolCheck) {
        return prev || !!protocolCheck.exec(protocol);
    }, false);
};
/*
    Public: Determines whether the mimetype is supported, given a mimetype

    MIMEType - String containing MIME type to test

    Examples

        crawler.mimeTypeSupported("text/html") // true, by default
        crawler.mimeTypeSupported("application/octet-stream") // false, by default

    Returns a boolean, true if the MIME type is supported — false if not.
*/
Crawler.prototype.mimeTypeSupported = function(MIMEType) {
    var crawler = this;

    return crawler.supportedMimeTypes.reduce(function(prev, mimeCheck) {
        return prev || !!mimeCheck.exec(MIMEType);
    }, false);
};
/* | ||
Public: Determines whether the queueItem can be fetched from its depth | ||
Public: Determines whether the queueItem can be fetched from its depth | ||
In fact, the queueItem needs to be fetched before calling this (because we | ||
need its MIME type). This will just determine if we need to send an event | ||
for this item & if we need to fetch linked resources. | ||
In fact, the queueItem needs to be fetched before calling this (because we | ||
need its MIME type). This will just determine if we need to send an event | ||
for this item & if we need to fetch linked resources. | ||
If the queue item is a CSS or JS file, it will always be fetched (we need | ||
all images in CSS files, even if max depth is already reached). If it's an | ||
HTML page, we will check if max depth is reached or not. | ||
If the queue item is a CSS or JS file, it will always be fetched (we need | ||
all images in CSS files, even if max depth is already reached). If it's an | ||
HTML page, we will check if max depth is reached or not. | ||
queueItem - Queue item object to check | ||
queueItem - Queue item object to check | ||
Returns a boolean, true if the queue item can be fetched - false if not. | ||
Returns a boolean, true if the queue item can be fetched - false if not. | ||
*/ | ||
Crawler.prototype.depthAllowed = function(queueItem) { | ||
var crawler = this; | ||
var crawler = this; | ||
// Items matching this pattern will always be fetched, even if max depth | ||
// is reached | ||
var mimeTypesWhitelist = [ | ||
/^text\/(css|javascript|ecmascript)/i, | ||
/^application\/javascript/i, | ||
/^application\/x-font/i, | ||
/^application\/font/i, | ||
/^image\//i, | ||
/^font\//i | ||
]; | ||
// Items matching this pattern will always be fetched, even if max depth | ||
// is reached | ||
var mimeTypesWhitelist = [ | ||
/^text\/(css|javascript|ecmascript)/i, | ||
/^application\/javascript/i, | ||
/^application\/x-font/i, | ||
/^application\/font/i, | ||
/^image\//i, | ||
/^font\//i | ||
]; | ||
return ( | ||
crawler.maxDepth === 0 || | ||
queueItem.depth <= crawler.maxDepth || | ||
( | ||
crawler.fetchWhitelistedMimeTypesBelowMaxDepth && | ||
mimeTypesWhitelist.reduce(function(prev,mimeCheck) { | ||
return prev || !!mimeCheck.exec(queueItem.stateData.contentType); | ||
}, false) | ||
) | ||
); | ||
return crawler.maxDepth === 0 || | ||
queueItem.depth <= crawler.maxDepth || | ||
crawler.fetchWhitelistedMimeTypesBelowMaxDepth && | ||
mimeTypesWhitelist.reduce(function(prev, mimeCheck) { | ||
return prev || !!mimeCheck.exec(queueItem.stateData.contentType); | ||
}, false); | ||
}; | ||
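Because `depthAllowed` only reads `maxDepth`, `fetchWhitelistedMimeTypesBelowMaxDepth` and the item's `depth` and `stateData.contentType`, it can be sketched against a hand-built queue item (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.maxDepth = 2;
crawler.fetchWhitelistedMimeTypesBelowMaxDepth = true;

// An HTML page past maxDepth is rejected...
crawler.depthAllowed({ depth: 5, stateData: { contentType: "text/html" } });  // false
// ...but CSS at the same depth still passes, because it matches the
// whitelist of always-fetched MIME types above.
crawler.depthAllowed({ depth: 5, stateData: { contentType: "text/css" } });   // true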
/* | ||
Public: Extracts protocol, host, port and resource (path) given a URL string. | ||
Public: Extracts protocol, host, port and resource (path) given a URL string. | ||
URL - String containing URL to process | ||
URL - String containing URL to process | ||
Examples | ||
Examples | ||
var URLInfo = crawler.processURL("http://www.google.com/fish"); | ||
var URLInfo = crawler.processURL("http://www.google.com/fish"); | ||
Returns an object containing keys and values for "protocol", "host", "port", | ||
and "path". | ||
Returns an object containing keys and values for "protocol", "host", "port", | ||
and "path". | ||
*/ | ||
Crawler.prototype.processURL = function(URL,context) { | ||
var newURL, crawler = this; | ||
Crawler.prototype.processURL = function(URL, context) { | ||
var newURL, | ||
crawler = this; | ||
if (!context || typeof(context) !== "object") | ||
context = { | ||
url: ( | ||
crawler.initialProtocol + "://" + | ||
crawler.host + ":" + | ||
crawler.initialPort + "/" | ||
), | ||
depth: QUEUE_ITEM_INITIAL_DEPTH | ||
}; | ||
if (!context || typeof context !== "object") { | ||
context = { | ||
url: crawler.initialProtocol + "://" + | ||
crawler.host + ":" + | ||
crawler.initialPort + "/", | ||
depth: QUEUE_ITEM_INITIAL_DEPTH | ||
}; | ||
} | ||
// If the URL didn't contain anything, don't fetch it. | ||
if (!URL.replace(/\s+/ig,"").length) return false; | ||
// If the URL didn't contain anything, don't fetch it. | ||
if (!(URL && URL.replace(/\s+/ig, "").length)) { | ||
return false; | ||
} | ||
// Check if querystring should be ignored | ||
if (crawler.stripQuerystring === true) | ||
URL = crawler.removeQuerystring(URL); | ||
// Check if querystring should be ignored | ||
if (crawler.stripQuerystring === true) { | ||
URL = crawler.removeQuerystring(URL); | ||
} | ||
try { | ||
newURL = | ||
URI(URL) | ||
.absoluteTo(context.url) | ||
.normalize(); | ||
if (crawler.stripWWWDomain && URL.match(/https?\:\/\/(www\.).*/i)) { | ||
URL = URL.replace("www.", ""); | ||
} | ||
if (crawler.urlEncoding === "iso8859") { | ||
newURL = newURL.iso8859(); | ||
} | ||
try { | ||
newURL = | ||
uri(URL) | ||
.absoluteTo(context.url) | ||
.normalize(); | ||
} catch(e) { | ||
// Couldn't process the URL, since URIjs choked on it. | ||
return false; | ||
} | ||
if (crawler.urlEncoding === "iso8859") { | ||
newURL = newURL.iso8859(); | ||
} | ||
// simplecrawler uses slightly different terminology to URIjs. Sorry! | ||
return { | ||
"protocol": newURL.protocol() || "http", | ||
"host": newURL.hostname(), | ||
"port": newURL.port() || 80, | ||
"path": newURL.resource(), | ||
"uriPath": newURL.path(), | ||
"depth": context.depth + 1 | ||
}; | ||
} catch (e) { | ||
// Couldn't process the URL, since URIjs choked on it. | ||
return false; | ||
} | ||
// simplecrawler uses slightly different terminology to URIjs. Sorry! | ||
return { | ||
protocol: newURL.protocol() || "http", | ||
host: newURL.hostname(), | ||
port: newURL.port() || 80, | ||
path: newURL.resource(), | ||
uriPath: newURL.path(), | ||
depth: context.depth + 1 | ||
}; | ||
}; | ||
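A sketch of the shape `processURL` returns, resolving a relative link against a hand-built context object (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

var parsed = crawler.processURL("/fish?type=trout",
    { url: "http://example.com/", depth: 1 });

// Expected result, per the mapping above:
// { protocol: "http", host: "example.com", port: 80,
//   path: "/fish?type=trout", uriPath: "/fish", depth: 2 }

crawler.processURL("   ");   // false - blank URLs are rejected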
/* | ||
Public: Discovers linked resources in an HTML, XML or text document. | ||
Private: Perform string replace operations on a URL string. Eg. removes | ||
HTML attribute fluff around actual URL, replaces leading "//" with | ||
absolute protocol etc. | ||
resourceData - String containing document with linked resources. | ||
queueItem - Queue item corresponding to document being searched. | ||
queueItem - Queue item corresponding to where the resource was found | ||
URL - String to be cleaned up | ||
Examples | ||
Examples | ||
crawler.discoverResources("http://www.google.com") | ||
crawler.discoverResources("<a href='...'>test</a>") | ||
cleanURL({protocol: "http"}, "url('//example.com/about') ") | ||
Returns an array of the (string) resource URLs found in the document. If none | ||
were found, the array will be empty. | ||
Returns a string. | ||
*/ | ||
function cleanURL (queueItem, URL) { | ||
return URL | ||
.replace(/^(?:\s*href|\s*src)\s*=+\s*/i, "") | ||
.replace(/^\s*/, "") | ||
.replace(/^url\((.*)\)/i, "$1") | ||
.replace(/^javascript\:\s*[a-z0-9]+\((.*)/i, "$1") | ||
.replace(/^(['"])(.*)\1$/, "$2") | ||
.replace(/^\((.*)\)$/, "$1") | ||
.replace(/^\/\//, queueItem.protocol + "://") | ||
.replace(/\&amp;/gi, "&") | ||
.replace(/\&#38;/gi, "&") | ||
.replace(/\&#x00026;/gi, "&") | ||
.split("#") | ||
.shift() | ||
.trim(); | ||
} | ||
/* | ||
Public: Clean up a list of resources (normally provided by discoverResources). | ||
Also expands URL's that are relative to the current page. | ||
urlMatch - Array of string resources | ||
queueItem - Queue item corresponding to where the resources were retrieved from | ||
Examples | ||
crawler.cleanExpandResources(["http://www.google.com", "/about", "mailto: example@example.com"]) | ||
Returns an array of URL strings. | ||
*/ | ||
Crawler.prototype.discoverResources = function(resourceData,queueItem) { | ||
// Convert to UTF-8 | ||
// TODO: account for text-encoding. | ||
var resources = [], | ||
resourceText = resourceData.toString("utf8"), | ||
crawler = this; | ||
Crawler.prototype.cleanExpandResources = function (urlMatch, queueItem) { | ||
var crawler = this, | ||
resources = []; | ||
if (!queueItem) | ||
queueItem = {}; | ||
if (!urlMatch) { | ||
return []; | ||
} | ||
if (!queueItem.protocol) | ||
queueItem.protocol = "http"; | ||
return urlMatch | ||
.map(cleanURL.bind(this, queueItem)) | ||
.reduce(function(list, URL) { | ||
if (!crawler.parseHTMLComments) { | ||
resourceText = resourceText.replace(/<!--([\s\S]+?)-->/g, ""); | ||
} | ||
// Ensure URL is whole and complete | ||
try { | ||
URL = uri(URL) | ||
.absoluteTo(queueItem.url || "") | ||
.normalize() | ||
.toString(); | ||
} catch (e) { | ||
// But if URI.js couldn't parse it - nobody can! | ||
return list; | ||
} | ||
if (!crawler.parseScriptTags) { | ||
resourceText = resourceText.replace(/<script(.*?)>([\s\S]+?)<\/script>/gi, ""); | ||
} | ||
// If we hit an empty item, don't return it | ||
if (!URL.length) { | ||
return list; | ||
} | ||
function cleanURL(URL) { | ||
return URL | ||
.replace(/^(?:\s*href|\s*src)\s*=+\s*/i,"") | ||
.replace(/^\s*/,"") | ||
.replace(/^url\((.*)\)/i,"$1") | ||
.replace(/^javascript\:\s*[a-z0-9]+\((.*)/i,"$1") | ||
.replace(/^(['"])(.*)\1$/,"$2") | ||
.replace(/^\((.*)\)$/,"$1") | ||
.replace(/^\/\//, queueItem.protocol + "://") | ||
.replace(/\&amp;/gi,"&") | ||
.replace(/\&#38;/gi,"&") | ||
.replace(/\&#x00026;/gi,"&") | ||
.split("#") | ||
.shift(); | ||
} | ||
// If we don't support the protocol in question | ||
if (!crawler.protocolSupported(URL)) { | ||
return list; | ||
} | ||
// Clean links | ||
function cleanAndQueue(urlMatch) { | ||
if (!urlMatch) return []; | ||
// Does the item already exist in the list? | ||
if (resources.reduce(function(prev, current) { | ||
return prev || current === URL; | ||
}, false)) { | ||
return list; | ||
} | ||
return urlMatch | ||
.map(cleanURL) | ||
.reduce(function(list,URL) { | ||
var tmpURL; | ||
return list.concat(URL); | ||
}, []); | ||
}; | ||
// Ensure URL is whole and complete | ||
try { | ||
tmpURL = URI(URL); | ||
/* | ||
Public: Discovers linked resources in an HTML, XML or text document. | ||
if (queueItem.url) { | ||
URL = tmpURL | ||
.absoluteTo(queueItem.url) | ||
.normalize() | ||
.toString(); | ||
} else { | ||
URL = tmpURL | ||
.normalize() | ||
.toString(); | ||
} | ||
resourceData - String containing document with linked resources. | ||
queueItem - Queue item corresponding to document being searched. | ||
} catch(e) { | ||
// But if URI.js couldn't parse it - nobody can! | ||
return list; | ||
} | ||
Examples | ||
// If we hit an empty item, don't return it | ||
if (!URL.length) return list; | ||
crawler.discoverResources("http://www.google.com") | ||
crawler.discoverResources("<a href='...'>test</a>") | ||
// If we don't support the protocol in question | ||
if (!crawler.protocolSupported(URL)) return list; | ||
Returns an array of the (string) resource URLs found in the document. If none | ||
were found, the array will be empty. | ||
// Does the item already exist in the list? | ||
if (resources.reduce(function(prev,current) { | ||
return prev || current === URL; | ||
},false)) | ||
return list; | ||
*/ | ||
Crawler.prototype.discoverResources = function(resourceData, queueItem) { | ||
// Convert to UTF-8 | ||
// TODO: account for text-encoding. | ||
var resourceText = resourceData.toString("utf8"), | ||
crawler = this; | ||
return list.concat(URL); | ||
},[]); | ||
} | ||
if (!queueItem) { | ||
queueItem = {}; | ||
} | ||
// Rough scan for URLs | ||
return crawler.discoverRegex | ||
.reduce(function(list,regex) { | ||
return list.concat( | ||
cleanAndQueue( | ||
resourceText.match(regex))); | ||
},[]) | ||
.reduce(function(list,check) { | ||
if (list.indexOf(check) < 0) | ||
return list.concat([check]); | ||
if (!queueItem.protocol) { | ||
queueItem.protocol = "http"; | ||
} | ||
return list; | ||
},[]); | ||
if (!crawler.parseHTMLComments) { | ||
resourceText = resourceText.replace(/<!--([\s\S]+?)-->/g, ""); | ||
} | ||
if (!crawler.parseScriptTags) { | ||
resourceText = resourceText.replace(/<script(.*?)>([\s\S]*?)<\/script>/gi, ""); | ||
} | ||
// Rough scan for URLs | ||
return crawler.discoverRegex | ||
.reduce(function(list, regex) { | ||
return list.concat( | ||
crawler.cleanExpandResources( | ||
resourceText.match(regex), queueItem)); | ||
}, []) | ||
.reduce(function(list, check) { | ||
if (list.indexOf(check) < 0) { | ||
return list.concat([check]); | ||
} | ||
return list; | ||
}, []); | ||
}; | ||
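Taken together: `discoverResources` does the rough regex scan, and `cleanExpandResources`/`cleanURL` strip attribute fluff, absolutise and de-duplicate what it finds. A sketch with a hand-built queue item; exactly which matches come back depends on the crawler's `discoverRegex` patterns, which are not part of this hunk.

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

var page = { url: "http://example.com/docs/", protocol: "http" };

var found = crawler.discoverResources(
    "<a href='/about'>About</a> <img src=\"//cdn.example.com/logo.png\">",
    page);

// "found" should include the relative link resolved against page.url and the
// protocol-relative URL given page.protocol, with duplicates removed, e.g.:
// ["http://example.com/about", "http://cdn.example.com/logo.png"]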
/* | ||
Public: Determines based on crawler state whether a domain is valid for | ||
crawling. | ||
Public: Determines based on crawler state whether a domain is valid for | ||
crawling. | ||
host - String containing the hostname of the resource to be fetched. | ||
host - String containing the hostname of the resource to be fetched. | ||
Examples | ||
Examples | ||
crawler.domainValid("127.0.0.1"); | ||
crawler.domainValid("google.com"); | ||
crawler.domainValid("test.example.com"); | ||
crawler.domainValid("127.0.0.1"); | ||
crawler.domainValid("google.com"); | ||
crawler.domainValid("test.example.com"); | ||
Returns true if the domain is valid for crawling, false if not. | ||
Returns true if the domain is valid for crawling, false if not. | ||
*/ | ||
Crawler.prototype.domainValid = function(host) { | ||
var crawler = this, | ||
crawlerHost = crawler.host; | ||
var crawler = this; | ||
// If we're ignoring the WWW domain, remove the WWW for comparisons... | ||
if (crawler.ignoreWWWDomain) | ||
host = host.replace(/^www\./i,""); | ||
// If we're ignoring the WWW domain, remove the WWW for comparisons... | ||
if (crawler.ignoreWWWDomain) { | ||
host = host.replace(/^www\./i, ""); | ||
} | ||
function domainInWhitelist(host) { | ||
function domainInWhitelist(host) { | ||
// If there's no whitelist, or the whitelist is of zero length, | ||
// just return false. | ||
if (!crawler.domainWhitelist || | ||
!crawler.domainWhitelist.length) return false; | ||
// If there's no whitelist, or the whitelist is of zero length, | ||
// just return false. | ||
if (!crawler.domainWhitelist || !crawler.domainWhitelist.length) { | ||
return false; | ||
} | ||
// Otherwise, scan through it. | ||
return !!crawler.domainWhitelist.reduce(function(prev,cur,index,array) { | ||
// Otherwise, scan through it. | ||
return !!crawler.domainWhitelist.reduce(function(prev, cur) { | ||
// If we already located the relevant domain in the whitelist... | ||
if (prev) return prev; | ||
// If we already located the relevant domain in the whitelist... | ||
if (prev) { | ||
return prev; | ||
} | ||
// If the domain is just equal, return true. | ||
if (host === cur) return true; | ||
// If the domain is just equal, return true. | ||
if (host === cur) { | ||
return true; | ||
} | ||
// If we're ignoring WWW subdomains, and both domains, | ||
// less www. are the same, return true. | ||
if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i,"")) | ||
return true; | ||
// If we're ignoring WWW subdomains, and both domains, | ||
// less www. are the same, return true. | ||
if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i, "")) { | ||
return true; | ||
} | ||
// Otherwise, sorry. No dice. | ||
return false; | ||
},false); | ||
} | ||
// Otherwise, sorry. No dice. | ||
return false; | ||
}, false); | ||
} | ||
// Checks if the first domain is a subdomain of the second | ||
function isSubdomainOf(subdomain,host) { | ||
// Checks if the first domain is a subdomain of the second | ||
function isSubdomainOf(subdomain, host) { | ||
// Comparisons must be case-insensitive | ||
subdomain = subdomain.toLowerCase(); | ||
host = host.toLowerCase(); | ||
// Comparisons must be case-insensitive | ||
subdomain = subdomain.toLowerCase(); | ||
host = host.toLowerCase(); | ||
// If we're ignoring www, remove it from both | ||
// (if www is the first domain component...) | ||
if (crawler.ignoreWWWDomain) { | ||
subdomain = subdomain.replace(/^www./ig, ""); | ||
host = host.replace(/^www./ig, ""); | ||
} | ||
// If we're ignoring www, remove it from both | ||
// (if www is the first domain component...) | ||
if (crawler.ignoreWWWDomain) { | ||
subdomain = subdomain.replace(/^www./ig, ""); | ||
host = host.replace(/^www./ig, ""); | ||
} | ||
// They should be the same flipped around! | ||
return ( | ||
subdomain.split("").reverse().join("").substr(0,host.length) === | ||
host.split("").reverse().join("")); | ||
} | ||
// They should be the same flipped around! | ||
return subdomain.split("").reverse().join("").substr(0, host.length) === | ||
host.split("").reverse().join(""); | ||
} | ||
// If we're not filtering by domain, just return true. | ||
return (!crawler.filterByDomain || | ||
// Or if the domain is just the right one, return true. | ||
(host === crawler.host) || | ||
// Or if we're ignoring WWW subdomains, and both domains, | ||
// less www. are the same, return true. | ||
( | ||
crawler.ignoreWWWDomain && | ||
crawler.host.replace(/^www\./i,"") === | ||
host.replace(/^www\./i,"") | ||
) || | ||
// Or if the domain in question exists in the domain whitelist, | ||
// return true. | ||
domainInWhitelist(host) || | ||
// Or if we're scanning subdomains, and this domain is a subdomain | ||
// of the crawler's set domain, return true. | ||
(crawler.scanSubdomains && isSubdomainOf(host,crawler.host))); | ||
// If we're not filtering by domain, just return true. | ||
return !crawler.filterByDomain || | ||
// Or if the domain is just the right one, return true. | ||
host === crawler.host || | ||
// Or if we're ignoring WWW subdomains, and both domains, | ||
// less www. are the same, return true. | ||
crawler.ignoreWWWDomain && | ||
crawler.host.replace(/^www\./i, "") === | ||
host.replace(/^www\./i, "") || | ||
// Or if the domain in question exists in the domain whitelist, | ||
// return true. | ||
domainInWhitelist(host) || | ||
// Or if we're scanning subdomains, and this domain is a subdomain | ||
// of the crawler's set domain, return true. | ||
crawler.scanSubdomains && isSubdomainOf(host, crawler.host); | ||
}; | ||
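A sketch of how the host checks combine; the flags are set explicitly here rather than relying on defaults, and the constructor is assumed as above.

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.filterByDomain = true;
crawler.ignoreWWWDomain = true;

crawler.domainValid("example.com");        // true - the crawler's own host
crawler.domainValid("www.example.com");    // true - www. is ignored for comparison
crawler.domainValid("other.org");          // false - not whitelisted, not a subdomain

crawler.domainWhitelist = ["other.org"];
crawler.domainValid("other.org");          // now true

crawler.scanSubdomains = true;
crawler.domainValid("blog.example.com");   // true - subdomain of crawler.host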
/* | ||
Public: Given a text or HTML document, initiates discovery of linked | ||
resources in the text, and queues the resources if applicable. Emits | ||
"discoverycomplete". Not to be confused with `crawler.discoverResources`, | ||
despite the `discoverResources` function being the main component of this | ||
one, since this function queues the resources in addition to | ||
discovering them. | ||
Public: Given a text or HTML document, initiates discovery of linked | ||
resources in the text, and queues the resources if applicable. Emits | ||
"discoverycomplete". Not to be confused with `crawler.discoverResources`, | ||
despite the `discoverResources` function being the main component of this | ||
one, since this function queues the resources in addition to | ||
discovering them. | ||
resourceData - Text document containing linked resource URLs. | ||
queueItem - Queue item from which the resource document was derived. | ||
decompressed - Content is already decompressed (default: false) | ||
resourceData - Text document containing linked resource URLs. | ||
queueItem - Queue item from which the resource document was derived. | ||
decompressed - Content is already decompressed (default: false) | ||
Emits | ||
Emits | ||
gziperr | ||
discoverycomplete | ||
gziperr | ||
discoverycomplete | ||
Examples | ||
Examples | ||
crawler.queueLinkedItems("<a href='...'>test</a>",queueItem); | ||
crawler.queueLinkedItems("<a href='...'>test</a>",queueItem); | ||
Returns the crawler object for chaining. | ||
Returns the crawler object for chaining. | ||
*/ | ||
Crawler.prototype.queueLinkedItems = function(resourceData,queueItem,decompressed) { | ||
var crawler = this, | ||
resources = []; | ||
Crawler.prototype.queueLinkedItems = function(resourceData, queueItem, decompressed) { | ||
var crawler = this, | ||
resources = []; | ||
if (!decompressed && | ||
queueItem.stateData && | ||
queueItem.stateData.headers['content-encoding'] && ( | ||
queueItem.stateData.headers['content-encoding'].match(/gzip/) || | ||
queueItem.stateData.headers['content-encoding'].match(/deflate/))) { | ||
if (!decompressed && | ||
queueItem.stateData && | ||
queueItem.stateData.headers["content-encoding"] && ( | ||
queueItem.stateData.headers["content-encoding"].match(/gzip/) || | ||
queueItem.stateData.headers["content-encoding"].match(/deflate/))) { | ||
return zlib.unzip(resourceData,function(err,newData) { | ||
if (err) { | ||
return crawler.emit("gziperror", queueItem, err, resourceData); | ||
} | ||
return zlib.unzip(resourceData, function(err, newData) { | ||
if (err) { | ||
return crawler.emit("gziperror", queueItem, err, resourceData); | ||
} | ||
crawler.queueLinkedItems(newData,queueItem,true); | ||
}); | ||
} | ||
crawler.queueLinkedItems(newData, queueItem, true); | ||
}); | ||
} | ||
resources = crawler.discoverResources(resourceData,queueItem); | ||
resources = crawler.discoverResources(resourceData, queueItem); | ||
// Emit discovered resources. ie: might be useful in building a graph of | ||
// page relationships. | ||
crawler.emit("discoverycomplete",queueItem,resources); | ||
// Emit discovered resources. ie: might be useful in building a graph of | ||
// page relationships. | ||
crawler.emit("discoverycomplete", queueItem, resources); | ||
resources.forEach(function(url){ crawler.queueURL(url,queueItem); }); | ||
resources.forEach(function(url) { | ||
crawler.queueURL(url, queueItem); | ||
}); | ||
return crawler; | ||
return crawler; | ||
}; | ||
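Since `queueLinkedItems` both discovers and queues, the useful hook is the `discoverycomplete` event it emits; a small sketch (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.on("discoverycomplete", function(queueItem, resources) {
    // "resources" is the de-duplicated URL list from discoverResources;
    // each entry is then passed to queueURL against this queueItem.
    console.log("Found %d links on %s", resources.length, queueItem.url);
});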
/* | ||
Public: Given a single URL, this function cleans, validates, parses it and | ||
adds it to the queue. This is the best and simplest way to add an item to | ||
the queue. | ||
Public: Given a single URL, this function cleans, validates, parses it and | ||
adds it to the queue. This is the best and simplest way to add an item to | ||
the queue. | ||
url - URL to be queued. | ||
queueItem - Queue item from which the resource was linked. | ||
url - URL to be queued. | ||
queueItem - Queue item from which the resource was linked. | ||
Emits | ||
Emits | ||
queueduplicate | ||
queueerror | ||
queueadd | ||
queueduplicate | ||
queueerror | ||
queueadd | ||
Examples | ||
Examples | ||
crawler.queueURL("http://www.google.com/",queueItem); | ||
crawler.queueURL("http://www.google.com/",queueItem); | ||
Returns a boolean value indicating whether the URL was successfully queued | ||
or not. | ||
Returns a boolean value indicating whether the URL was successfully queued | ||
or not. | ||
*/ | ||
Crawler.prototype.queueURL = function(url,queueItem) { | ||
var crawler = this; | ||
var parsedURL = | ||
typeof(url) === "object" ? url : crawler.processURL(url,queueItem); | ||
Crawler.prototype.queueURL = function(url, queueItem) { | ||
var crawler = this, | ||
parsedURL = typeof url === "object" ? url : crawler.processURL(url, queueItem); | ||
// URL Parser decided this URL was junky. Next please! | ||
if (!parsedURL) { | ||
return false; | ||
} | ||
// URL Parser decided this URL was junky. Next please! | ||
if (!parsedURL) { | ||
return false; | ||
} | ||
// Pass this URL past fetch conditions to ensure the user thinks it's valid | ||
var fetchDenied = false; | ||
fetchDenied = crawler._fetchConditions.reduce(function(prev,callback) { | ||
return prev || !callback(parsedURL); | ||
},false); | ||
// Pass this URL past fetch conditions to ensure the user thinks it's valid | ||
var fetchDenied = false; | ||
fetchDenied = crawler._fetchConditions.reduce(function(prev, callback) { | ||
return prev || !callback(parsedURL); | ||
}, false); | ||
if (fetchDenied) { | ||
// Fetch Conditions conspired to block URL | ||
return false; | ||
} | ||
if (fetchDenied) { | ||
// Fetch Conditions conspired to block URL | ||
return false; | ||
} | ||
// Check the domain is valid before adding it to the queue | ||
if (crawler.domainValid(parsedURL.host)) { | ||
crawler.queue.add( | ||
parsedURL.protocol, | ||
parsedURL.host, | ||
parsedURL.port, | ||
parsedURL.path, | ||
parsedURL.depth, | ||
function queueAddCallback(error,newQueueItem) { | ||
if (error) { | ||
// We received an error condition when adding the callback | ||
if (error.code && error.code === "DUP") | ||
return crawler.emit("queueduplicate",parsedURL); | ||
// Check the domain is valid before adding it to the queue | ||
if (crawler.domainValid(parsedURL.host)) { | ||
crawler.queue.add( | ||
parsedURL.protocol, | ||
parsedURL.host, | ||
parsedURL.port, | ||
parsedURL.path, | ||
parsedURL.depth, | ||
function queueAddCallback(error, newQueueItem) { | ||
if (error) { | ||
// We received an error condition when adding the callback | ||
if (error.code && error.code === "DUP") { | ||
return crawler.emit("queueduplicate", parsedURL); | ||
} | ||
return crawler.emit("queueerror",error,parsedURL); | ||
} | ||
return crawler.emit("queueerror", error, parsedURL); | ||
} | ||
crawler.emit("queueadd",newQueueItem,parsedURL); | ||
newQueueItem.referrer = queueItem ? queueItem.url : null; | ||
} | ||
); | ||
} | ||
crawler.emit("queueadd", newQueueItem, parsedURL); | ||
newQueueItem.referrer = queueItem ? queueItem.url : null; | ||
} | ||
); | ||
} | ||
return true; | ||
return true; | ||
}; | ||
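A sketch of queueing a URL directly and watching the related events (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.on("queueadd", function(newQueueItem) {
    console.log("Queued:", newQueueItem.url);
});
crawler.on("queueduplicate", function(parsedURL) {
    console.log("Already queued:", parsedURL.path);
});

crawler.queueURL("http://example.com/about");   // true - parsed, allowed and added
crawler.queueURL("http://example.com/about");   // true again, but emits queueduplicate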
/* | ||
Public: The guts of the crawler: takes a queue item and spools a request for | ||
it, downloads, caches, and fires events based on the result of the request. | ||
It kicks off resource discovery and queues any new resources found. | ||
Public: The guts of the crawler: takes a queue item and spools a request for | ||
it, downloads, caches, and fires events based on the result of the request. | ||
It kicks off resource discovery and queues any new resources found. | ||
queueItem - Queue item to be fetched. | ||
queueItem - Queue item to be fetched. | ||
Emits | ||
fetchstart | ||
fetchheaders | ||
fetchcomplete | ||
fetchdataerror | ||
notmodified | ||
fetchredirect | ||
fetch404 | ||
fetcherror | ||
fetchclienterror | ||
Emits | ||
fetchstart | ||
fetchheaders | ||
fetchcomplete | ||
fetchdataerror | ||
notmodified | ||
fetchredirect | ||
fetch404 | ||
fetcherror | ||
fetchclienterror | ||
Examples | ||
Examples | ||
crawler.fetchQueueItem(queueItem); | ||
crawler.fetchQueueItem(queueItem); | ||
Returns the crawler object for chaining. | ||
Returns the crawler object for chaining. | ||
*/ | ||
Crawler.prototype.fetchQueueItem = function(queueItem) { | ||
var crawler = this; | ||
crawler._openRequests ++; | ||
var crawler = this; | ||
crawler._openRequests++; | ||
// Variable declarations | ||
var fetchData = false, | ||
requestOptions, | ||
clientRequest, | ||
timeCommenced; | ||
// Variable declarations | ||
var requestOptions, | ||
clientRequest, | ||
timeCommenced; | ||
// Mark as spooled | ||
queueItem.status = "spooled"; | ||
var client = (queueItem.protocol === "https" ? https : http); | ||
// Mark as spooled | ||
queueItem.status = "spooled"; | ||
var client = queueItem.protocol === "https" ? https : http; | ||
// Up the socket limit if required. | ||
if (client.globalAgent.maxSockets < crawler.maxConcurrency) { | ||
client.globalAgent.maxSockets = crawler.maxConcurrency; | ||
} | ||
// Up the socket limit if required. | ||
if (client.globalAgent.maxSockets < crawler.maxConcurrency) { | ||
client.globalAgent.maxSockets = crawler.maxConcurrency; | ||
} | ||
// Extract request options from queue; | ||
var requestHost = queueItem.host, | ||
requestPort = queueItem.port, | ||
requestPath = queueItem.path; | ||
// Extract request options from queue; | ||
var requestHost = queueItem.host, | ||
requestPort = queueItem.port, | ||
requestPath = queueItem.path; | ||
// Are we passing through an HTTP proxy? | ||
if (crawler.useProxy) { | ||
requestHost = crawler.proxyHostname; | ||
requestPort = crawler.proxyPort; | ||
requestPath = queueItem.url; | ||
} | ||
// Are we passing through an HTTP proxy? | ||
if (crawler.useProxy) { | ||
requestHost = crawler.proxyHostname; | ||
requestPort = crawler.proxyPort; | ||
requestPath = queueItem.url; | ||
} | ||
// Load in request options | ||
requestOptions = { | ||
method: "GET", | ||
host: requestHost, | ||
port: requestPort, | ||
path: requestPath, | ||
headers: { | ||
"User-Agent": crawler.userAgent, | ||
"Host": queueItem.host + ( | ||
queueItem.port !== 80 ? | ||
":" + queueItem.port : | ||
"" | ||
) | ||
} | ||
}; | ||
// Load in request options | ||
requestOptions = { | ||
method: "GET", | ||
host: requestHost, | ||
port: requestPort, | ||
path: requestPath, | ||
headers: { | ||
"User-Agent": crawler.userAgent, | ||
"Host": queueItem.host + ( | ||
queueItem.port !== 80 ? | ||
":" + queueItem.port : | ||
"" | ||
) | ||
} | ||
}; | ||
if (queueItem.referrer) { | ||
requestOptions.headers.Referer = queueItem.referrer; | ||
} | ||
if (queueItem.referrer) { | ||
requestOptions.headers.Referer = queueItem.referrer; | ||
} | ||
// If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts | ||
if (requestOptions.port === 80 || requestOptions.port === 443) { | ||
delete requestOptions.port; | ||
} | ||
// If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts | ||
if (requestOptions.port === 80 || requestOptions.port === 443) { | ||
delete requestOptions.port; | ||
} | ||
// Add cookie header from cookie jar if we're configured to | ||
// send/accept cookies | ||
if (crawler.acceptCookies && crawler.cookies.getAsHeader()) { | ||
requestOptions.headers.cookie = | ||
crawler.cookies.getAsHeader(queueItem.host,queueItem.path); | ||
} | ||
// Add cookie header from cookie jar if we're configured to | ||
// send/accept cookies | ||
if (crawler.acceptCookies && crawler.cookies.getAsHeader()) { | ||
requestOptions.headers.cookie = | ||
crawler.cookies.getAsHeader(queueItem.host, queueItem.path); | ||
} | ||
// Add auth headers if we need them | ||
if (crawler.needsAuth) { | ||
var auth = crawler.authUser + ":" + crawler.authPass; | ||
// Add auth headers if we need them | ||
if (crawler.needsAuth) { | ||
var auth = crawler.authUser + ":" + crawler.authPass; | ||
// Generate auth header | ||
auth = 'Basic ' + (new Buffer(auth).toString('base64')); | ||
requestOptions.headers.Authorization = auth; | ||
} | ||
// Generate auth header | ||
auth = "Basic " + new Buffer(auth).toString("base64"); | ||
requestOptions.headers.Authorization = auth; | ||
} | ||
// Add proxy auth if we need it | ||
if (crawler.proxyUser !== null && crawler.proxyPass !== null) { | ||
var proxyAuth = crawler.proxyUser + ":" + crawler.proxyPass; | ||
// Add proxy auth if we need it | ||
if (crawler.proxyUser !== null && crawler.proxyPass !== null) { | ||
var proxyAuth = crawler.proxyUser + ":" + crawler.proxyPass; | ||
// Generate auth header | ||
proxyAuth = 'Basic ' + (new Buffer(proxyAuth).toString('base64')); | ||
requestOptions.headers["Proxy-Authorization"] = proxyAuth; | ||
} | ||
// Generate auth header | ||
proxyAuth = "Basic " + new Buffer(proxyAuth).toString("base64"); | ||
requestOptions.headers["Proxy-Authorization"] = proxyAuth; | ||
} | ||
// And if we've got any custom headers available | ||
if (crawler.customHeaders) { | ||
for (var header in crawler.customHeaders) { | ||
if (!crawler.customHeaders.hasOwnProperty(header)) continue; | ||
// And if we've got any custom headers available | ||
if (crawler.customHeaders) { | ||
for (var header in crawler.customHeaders) { | ||
if (!crawler.customHeaders.hasOwnProperty(header)) { | ||
continue; | ||
} | ||
requestOptions.headers[header] = crawler.customHeaders[header]; | ||
} | ||
} | ||
requestOptions.headers[header] = crawler.customHeaders[header]; | ||
} | ||
} | ||
// Apply the ignoreInvalidSSL setting to https connections | ||
if(client === https && crawler.ignoreInvalidSSL === true) { | ||
client.rejectUnauthorized = false; | ||
client.strictSSL = false; | ||
} | ||
// Apply the ignoreInvalidSSL setting to https connections | ||
if (client === https && crawler.ignoreInvalidSSL === true) { | ||
client.rejectUnauthorized = false; | ||
client.strictSSL = false; | ||
} | ||
// Emit fetchstart event - gives the user time to mangle the request options | ||
// if required. | ||
crawler.emit("fetchstart", queueItem, requestOptions); | ||
// Emit fetchstart event - gives the user time to mangle the request options | ||
// if required. | ||
crawler.emit("fetchstart", queueItem, requestOptions); | ||
process.nextTick(function() { | ||
// Record what time we started this request | ||
timeCommenced = Date.now(); | ||
process.nextTick(function() { | ||
// Record what time we started this request | ||
timeCommenced = Date.now(); | ||
// Get the resource! | ||
clientRequest = | ||
client.request(requestOptions,function(response) { | ||
crawler.handleResponse(queueItem,response,timeCommenced); | ||
}); | ||
// Get the resource! | ||
clientRequest = | ||
client.request(requestOptions, function(response) { | ||
crawler.handleResponse(queueItem, response, timeCommenced); | ||
}); | ||
clientRequest.end(); | ||
clientRequest.end(); | ||
clientRequest.setTimeout(crawler.timeout, function () { | ||
if (crawler.running && !queueItem.fetched) { | ||
crawler._openRequests--; | ||
} | ||
clientRequest.setTimeout(crawler.timeout, function() { | ||
if (queueItem.fetched) { | ||
return; | ||
} | ||
queueItem.fetched = true; | ||
queueItem.status = "timeout"; | ||
crawler.emit("fetchtimeout", queueItem, crawler.timeout); | ||
clientRequest._crawlerHandled = true; | ||
clientRequest.abort(); | ||
}); | ||
if (crawler.running && !queueItem.fetched) { | ||
crawler._openRequests--; | ||
} | ||
clientRequest.on("error", function (errorData) { | ||
queueItem.fetched = true; | ||
queueItem.status = "timeout"; | ||
crawler.emit("fetchtimeout", queueItem, crawler.timeout); | ||
clientRequest._crawlerHandled = true; | ||
clientRequest.abort(); | ||
}); | ||
// This event will be thrown if we manually aborted the request, | ||
// but we don't want to do anything in that case. | ||
if (clientRequest._crawlerHandled) | ||
return; | ||
clientRequest.on("error", function(errorData) { | ||
if (crawler.running && !queueItem.fetched) { | ||
crawler._openRequests--; | ||
} | ||
// This event will be thrown if we manually aborted the request, | ||
// but we don't want to do anything in that case. | ||
if (clientRequest._crawlerHandled) { | ||
return; | ||
} | ||
// Emit 5xx / 4xx event | ||
queueItem.fetched = true; | ||
queueItem.stateData.code = 599; | ||
queueItem.status = "failed"; | ||
crawler.emit("fetchclienterror", queueItem, errorData); | ||
}); | ||
if (crawler.running && !queueItem.fetched) { | ||
crawler._openRequests--; | ||
} | ||
return crawler; | ||
}); | ||
// Emit 5xx / 4xx event | ||
queueItem.fetched = true; | ||
queueItem.stateData.code = 599; | ||
queueItem.status = "failed"; | ||
crawler.emit("fetchclienterror", queueItem, errorData); | ||
}); | ||
return crawler; | ||
}); | ||
}; | ||
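The `fetchstart` event above fires with the request options before the request is made (the request itself is only created inside `process.nextTick`), so listeners can adjust headers per item; a sketch (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.on("fetchstart", function(queueItem, requestOptions) {
    // Mutating requestOptions here is safe: all listeners run synchronously,
    // before the next tick in which client.request() is actually called.
    requestOptions.headers["Accept-Language"] = "en";
});
crawler.on("fetchtimeout", function(queueItem, timeout) {
    console.log("Timed out after %dms: %s", timeout, queueItem.url);
});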
@@ -912,405 +962,413 @@ | ||
/* | ||
Public: Given a queueItem and a matching response object, the crawler will | ||
handle downloading the resource, queueing of linked items, etc. | ||
Public: Given a queueItem and a matching response object, the crawler will | ||
handle downloading the resource, queueing of linked items, etc. | ||
Examples | ||
Examples | ||
// Passing in a response from `request` | ||
request(queueItem.url,function(err,res,body) { | ||
crawler.handleResponse(queueItem,res); | ||
}); | ||
// Passing in a response from `request` | ||
request(queueItem.url, function(err, res, body) { | ||
crawler.handleResponse(queueItem, res); | ||
}); | ||
Returns the crawler object for chaining. | ||
Returns the crawler object for chaining. | ||
*/ | ||
Crawler.prototype.handleResponse = function(queueItem,response,timeCommenced) { | ||
var crawler = this, | ||
dataReceived = false, | ||
timeHeadersReceived, | ||
timeDataReceived, | ||
parsedURL, | ||
responseBuffer, | ||
responseLength, | ||
responseLengthReceived = 0, | ||
contentType, | ||
stateData = queueItem.stateData; | ||
Crawler.prototype.handleResponse = function(queueItem, response, timeCommenced) { | ||
var crawler = this, | ||
dataReceived = false, | ||
timeHeadersReceived, | ||
timeDataReceived, | ||
parsedURL, | ||
responseBuffer, | ||
responseLength, | ||
responseLengthReceived = 0, | ||
contentType, | ||
stateData = queueItem.stateData; | ||
// Record what time we first received the header information | ||
timeHeadersReceived = Date.now(); | ||
// Record what time we first received the header information | ||
timeHeadersReceived = Date.now(); | ||
// If we weren't passed a time of commencement, assume Now() | ||
timeCommenced = timeCommenced || Date.now(); | ||
// If we weren't passed a time of commencement, assume Now() | ||
timeCommenced = timeCommenced || Date.now(); | ||
responseLength = parseInt(response.headers["content-length"],10); | ||
responseLength = !isNaN(responseLength) ? responseLength : 0; | ||
responseLength = parseInt(response.headers["content-length"], 10); | ||
responseLength = !isNaN(responseLength) ? responseLength : 0; | ||
// Save timing and content some header information into queue | ||
stateData.requestLatency = (timeHeadersReceived - timeCommenced); | ||
stateData.requestTime = (timeHeadersReceived - timeCommenced); | ||
stateData.contentLength = responseLength; | ||
stateData.contentType = contentType = response.headers["content-type"]; | ||
stateData.code = response.statusCode; | ||
stateData.headers = response.headers; | ||
// Save timing and content some header information into queue | ||
stateData.requestLatency = timeHeadersReceived - timeCommenced; | ||
stateData.requestTime = timeHeadersReceived - timeCommenced; | ||
stateData.contentLength = responseLength; | ||
stateData.contentType = contentType = response.headers["content-type"]; | ||
stateData.code = response.statusCode; | ||
stateData.headers = response.headers; | ||
// Do we need to save cookies? Were we sent any? | ||
if (crawler.acceptCookies && | ||
response.headers.hasOwnProperty('set-cookie')) | ||
crawler.cookies.addFromHeaders(response.headers["set-cookie"]); | ||
// Do we need to save cookies? Were we sent any? | ||
if (crawler.acceptCookies && response.headers.hasOwnProperty("set-cookie")) { | ||
crawler.cookies.addFromHeaders(response.headers["set-cookie"]); | ||
} | ||
// Emit header receive event | ||
crawler.emit("fetchheaders",queueItem,response); | ||
// Emit header receive event | ||
crawler.emit("fetchheaders", queueItem, response); | ||
// Ensure response length is reasonable... | ||
responseLength = | ||
responseLength > 0 ? responseLength : crawler.maxResourceSize; | ||
// Ensure response length is reasonable... | ||
responseLength = | ||
responseLength > 0 ? responseLength : crawler.maxResourceSize; | ||
queueItem.stateData.contentLength = responseLength; | ||
queueItem.stateData.contentLength = responseLength; | ||
// Function for dealing with 200 responses | ||
function processReceivedData() { | ||
if (queueItem.fetched) return; | ||
// Function for dealing with 200 responses | ||
function processReceivedData() { | ||
if (queueItem.fetched) { | ||
return; | ||
} | ||
timeDataReceived = (new Date().getTime()); | ||
timeDataReceived = new Date().getTime(); | ||
queueItem.fetched = true; | ||
queueItem.status = "downloaded"; | ||
queueItem.fetched = true; | ||
queueItem.status = "downloaded"; | ||
// Save state information | ||
stateData.downloadTime = (timeDataReceived - timeHeadersReceived); | ||
stateData.requestTime = (timeDataReceived - timeCommenced); | ||
stateData.actualDataSize = responseBuffer.length; | ||
stateData.sentIncorrectSize = responseBuffer.length !== responseLength; | ||
// Save state information | ||
stateData.downloadTime = timeDataReceived - timeHeadersReceived; | ||
stateData.requestTime = timeDataReceived - timeCommenced; | ||
stateData.actualDataSize = responseBuffer.length; | ||
stateData.sentIncorrectSize = responseBuffer.length !== responseLength; | ||
// First, save item to cache (if we're using a cache!) | ||
if (crawler.cache !== null && | ||
crawler.cache.setCacheData instanceof Function) { | ||
// First, save item to cache (if we're using a cache!) | ||
if (crawler.cache !== null && crawler.cache.setCacheData instanceof Function) { | ||
crawler.cache.setCacheData(queueItem, responseBuffer); | ||
} | ||
crawler.cache.setCacheData(queueItem,responseBuffer); | ||
} | ||
// Is the item allowed by depth conditions ? | ||
if (crawler.depthAllowed(queueItem)) { | ||
crawler.emit("fetchcomplete", queueItem, responseBuffer, response); | ||
// Is the item allowed by depth conditions ? | ||
if(crawler.depthAllowed(queueItem)) { | ||
crawler.emit("fetchcomplete",queueItem,responseBuffer,response); | ||
// We only process the item if it's of a valid mimetype | ||
// and only if the crawler is set to discover its own resources | ||
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) { | ||
crawler.queueLinkedItems(responseBuffer, queueItem); | ||
} | ||
} | ||
// We only process the item if it's of a valid mimetype | ||
// and only if the crawler is set to discover its own resources | ||
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) { | ||
crawler.queueLinkedItems(responseBuffer,queueItem); | ||
} | ||
} | ||
crawler._openRequests--; | ||
} | ||
crawler._openRequests --; | ||
} | ||
function receiveData(chunk) { | ||
if (chunk && chunk.length && !dataReceived) { | ||
if (responseLengthReceived + chunk.length > responseBuffer.length) { | ||
// Oh dear. We've been sent more data than we were initially told. | ||
// This could be a mis-calculation, or a streaming resource. | ||
// Let's increase the size of our buffer to match, as long as it isn't | ||
// larger than our maximum resource size. | ||
function receiveData(chunk) { | ||
if (chunk && chunk.length && !dataReceived) { | ||
if (responseLengthReceived + chunk.length > responseBuffer.length) { | ||
// Oh dear. We've been sent more data than we were initially told. | ||
// This could be a mis-calculation, or a streaming resource. | ||
// Let's increase the size of our buffer to match, as long as it isn't | ||
// larger than our maximum resource size. | ||
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) { | ||
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) { | ||
// Start by creating a new buffer, which will be our main | ||
// buffer from now on... | ||
// Start by creating a new buffer, which will be our main | ||
// buffer from now on... | ||
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length); | ||
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length); | ||
// Copy all our old data into it... | ||
responseBuffer.copy(tmpNewBuffer, 0, 0, responseBuffer.length); | ||
// Copy all our old data into it... | ||
responseBuffer.copy(tmpNewBuffer,0,0,responseBuffer.length); | ||
// And now the new chunk | ||
chunk.copy(tmpNewBuffer, responseBuffer.length, 0, chunk.length); | ||
// And now the new chunk | ||
chunk.copy(tmpNewBuffer,responseBuffer.length,0,chunk.length); | ||
// And now make the response buffer our new buffer, | ||
// leaving the original for GC | ||
responseBuffer = tmpNewBuffer; | ||
// And now make the response buffer our new buffer, | ||
// leaving the original for GC | ||
responseBuffer = tmpNewBuffer; | ||
} else { | ||
// Oh dear oh dear! The response is not only more data | ||
// than we were initially told, but it also exceeds the | ||
// maximum amount of data we're prepared to download per | ||
// resource. | ||
// | ||
// Throw error event and ignore. | ||
// | ||
// We'll then deal with the data that we have. | ||
} else { | ||
// Oh dear oh dear! The response is not only more data | ||
// than we were initially told, but it also exceeds the | ||
// maximum amount of data we're prepared to download per | ||
// resource. | ||
// | ||
// Throw error event and ignore. | ||
// | ||
// We'll then deal with the data that we have. | ||
crawler.emit("fetchdataerror", queueItem, response); | ||
} | ||
} else { | ||
// Copy the chunk data into our main buffer | ||
chunk.copy(responseBuffer, responseLengthReceived, 0, chunk.length); | ||
} | ||
crawler.emit("fetchdataerror",queueItem,response); | ||
} | ||
} else { | ||
// Copy the chunk data into our main buffer | ||
chunk.copy(responseBuffer,responseLengthReceived,0,chunk.length); | ||
} | ||
// Increment our data received counter | ||
responseLengthReceived += chunk.length; | ||
} | ||
// Increment our data received counter | ||
responseLengthReceived += chunk.length; | ||
} | ||
if ((responseLengthReceived >= responseLength || response.complete) && | ||
!dataReceived) { | ||
// Slice the buffer to chop off any unused space | ||
responseBuffer = responseBuffer.slice(0, responseLengthReceived); | ||
if ((responseLengthReceived >= responseLength || response.complete) && | ||
!dataReceived) { | ||
dataReceived = true; | ||
processReceivedData(); | ||
} | ||
} | ||
// Slice the buffer to chop off any unused space | ||
responseBuffer = responseBuffer.slice(0,responseLengthReceived); | ||
// If we should just go ahead and get the data | ||
if (response.statusCode >= 200 && response.statusCode < 300 && | ||
responseLength <= crawler.maxResourceSize) { | ||
dataReceived = true; | ||
processReceivedData(); | ||
} | ||
} | ||
queueItem.status = "headers"; | ||
// If we should just go ahead and get the data | ||
if (response.statusCode >= 200 && response.statusCode < 300 && | ||
responseLength <= crawler.maxResourceSize) { | ||
// Create a buffer with our response length | ||
responseBuffer = new Buffer(responseLength); | ||
queueItem.status = "headers"; | ||
// Only if we're prepared to download non-text resources... | ||
if (crawler.downloadUnsupported || | ||
crawler.mimeTypeSupported(contentType)) { | ||
// Create a buffer with our response length | ||
responseBuffer = new Buffer(responseLength); | ||
response.on("data", receiveData); | ||
response.on("end", receiveData); | ||
} else { | ||
queueItem.fetched = true; | ||
crawler._openRequests--; | ||
// Only if we're prepared to download non-text resources... | ||
if (crawler.downloadUnsupported || | ||
crawler.mimeTypeSupported(contentType)) { | ||
response.socket.end(); | ||
} | ||
response.on("data",receiveData); | ||
response.on("end",receiveData); | ||
} else { | ||
response.socket.end(); | ||
} | ||
// We've got a not-modified response back | ||
} else if (response.statusCode === 304) { | ||
// We've got a not-modified response back | ||
} else if (response.statusCode === 304) { | ||
if (crawler.cache !== null && crawler.cache.getCacheData) { | ||
// We've got access to a cache | ||
crawler.cache.getCacheData(queueItem, function(cacheObject) { | ||
crawler.emit("notmodified", queueItem, response, cacheObject); | ||
}); | ||
} else { | ||
// Emit notmodified event. We don't have a cache available, so | ||
// we don't send any data. | ||
crawler.emit("notmodified", queueItem, response); | ||
} | ||
if (crawler.cache !== null && crawler.cache.getCacheData) { | ||
// We've got access to a cache | ||
crawler.cache.getCacheData(queueItem,function(cacheObject) { | ||
crawler.emit("notmodified",queueItem,response,cacheObject); | ||
}); | ||
} else { | ||
// Emit notmodified event. We don't have a cache available, so | ||
// we don't send any data. | ||
crawler.emit("notmodified",queueItem,response); | ||
} | ||
// If we should queue a redirect | ||
} else if (response.statusCode >= 300 && response.statusCode < 400 && | ||
response.headers.location) { | ||
// If we should queue a redirect | ||
} else if (response.statusCode >= 300 && response.statusCode < 400 && | ||
response.headers.location) { | ||
queueItem.fetched = true; | ||
queueItem.status = "redirected"; | ||
queueItem.fetched = true; | ||
queueItem.status = "redirected"; | ||
// Parse the redirect URL ready for adding to the queue... | ||
parsedURL = crawler.processURL(response.headers.location, queueItem); | ||
// Parse the redirect URL ready for adding to the queue... | ||
parsedURL = crawler.processURL(response.headers.location,queueItem); | ||
// Emit redirect event | ||
crawler.emit("fetchredirect", queueItem, parsedURL, response); | ||
// Emit redirect event | ||
crawler.emit("fetchredirect",queueItem,parsedURL,response); | ||
// Clean URL, add to queue... | ||
crawler.queueURL(parsedURL, queueItem); | ||
response.socket.end(); | ||
// Clean URL, add to queue... | ||
crawler.queueURL(parsedURL,queueItem); | ||
response.socket.end(); | ||
crawler._openRequests--; | ||
crawler._openRequests --; | ||
// Ignore this request, but record that we had a 404 | ||
} else if (response.statusCode === 404 || response.statusCode === 410) { | ||
queueItem.fetched = true; | ||
queueItem.status = "notfound"; | ||
// Ignore this request, but record that we had a 404 | ||
} else if (response.statusCode === 404 || response.statusCode === 410) { | ||
queueItem.fetched = true; | ||
queueItem.status = "notfound"; | ||
// Emit 404 event | ||
crawler.emit("fetch404", queueItem, response); | ||
response.socket.end(); | ||
// Emit 404 event | ||
crawler.emit("fetch404",queueItem,response); | ||
response.socket.end(); | ||
crawler._openRequests--; | ||
crawler._openRequests --; | ||
// And oh dear. Handle this one as well. (other 400s, 500s, etc) | ||
} else { | ||
queueItem.fetched = true; | ||
queueItem.status = "failed"; | ||
// And oh dear. Handle this one as well. (other 400s, 500s, etc) | ||
} else { | ||
queueItem.fetched = true; | ||
queueItem.status = "failed"; | ||
// Emit 5xx / 4xx event | ||
crawler.emit("fetcherror", queueItem, response); | ||
response.socket.end(); | ||
// Emit 5xx / 4xx event | ||
crawler.emit("fetcherror",queueItem,response); | ||
response.socket.end(); | ||
crawler._openRequests--; | ||
} | ||
crawler._openRequests --; | ||
} | ||
return crawler; | ||
return crawler; | ||
}; | ||
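In practice the response handler is consumed through the events it emits; a sketch of the common listeners (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
    console.log(queueItem.url, queueItem.stateData.code, responseBuffer.length + " bytes");
});
crawler.on("fetchredirect", function(queueItem, parsedURL) {
    console.log("Redirect:", queueItem.url, "->", parsedURL.path);
});
crawler.on("fetch404", function(queueItem) {
    console.log("Not found:", queueItem.url);
});
crawler.on("fetcherror", function(queueItem, response) {
    console.log("Failed (" + response.statusCode + "):", queueItem.url);
});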
/* | ||
Public: The main crawler runloop. Fires at the interval specified in the | ||
crawler configuration, when the crawl is running. May be manually fired. | ||
This function initiates fetching of a queue item if there are enough workers | ||
to do so and there are unfetched items in the queue. | ||
Public: The main crawler runloop. Fires at the interval specified in the | ||
crawler configuration, when the crawl is running. May be manually fired. | ||
This function initiates fetching of a queue item if there are enough workers | ||
to do so and there are unfetched items in the queue. | ||
Examples | ||
Examples | ||
crawler.crawl(); | ||
crawler.crawl(); | ||
Returns the crawler object for chaining. | ||
Returns the crawler object for chaining. | ||
*/ | ||
Crawler.prototype.crawl = function() { | ||
var crawler = this; | ||
var crawler = this; | ||
if (crawler._openRequests > crawler.maxConcurrency) return; | ||
if (crawler._openRequests > crawler.maxConcurrency) { | ||
return []; | ||
} | ||
crawler.queue.oldestUnfetchedItem(function(err, queueItem) { | ||
crawler.queue.oldestUnfetchedItem(function(err, queueItem) { // eslint-disable-line | ||
if (queueItem) { | ||
crawler.fetchQueueItem(queueItem); | ||
if (queueItem) { | ||
crawler.fetchQueueItem(queueItem); | ||
} else if ( !crawler._openRequests && | ||
!crawler._openListeners) { | ||
} else if (!crawler._openRequests && !crawler._openListeners) { | ||
crawler.queue.complete(function(err, completeCount) { | ||
if (err) throw err; | ||
crawler.queue.complete(function(err, completeCount) { | ||
if (err) { | ||
throw err; | ||
} | ||
crawler.queue.getLength(function(err, length) { | ||
if (err) throw err; | ||
crawler.queue.getLength(function(err, length) { | ||
if (err) { | ||
throw err; | ||
} | ||
if (completeCount === length) { | ||
crawler.emit("complete"); | ||
crawler.stop(); | ||
} | ||
}); | ||
}); | ||
} | ||
}); | ||
if (completeCount === length) { | ||
crawler.emit("complete"); | ||
crawler.stop(); | ||
} | ||
}); | ||
}); | ||
} | ||
}); | ||
return crawler; | ||
return crawler; | ||
}; | ||
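The crawl runloop is normally driven by the interval timer rather than called by hand; the part worth wiring up is the `complete` event, emitted once every queue item is fetched and no requests or held listeners remain. A sketch - the `start()` method is assumed from elsewhere in the module, it is not part of this hunk:

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.on("complete", function() {
    console.log("Crawl finished - queue fully fetched.");
});
crawler.start();   // assumed: starts the interval that calls crawler.crawl()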
/* | ||
Public: Stops the crawler, terminating the crawl runloop. | ||
Public: Stops the crawler, terminating the crawl runloop. | ||
Examples | ||
Examples | ||
crawler.stop(); | ||
crawler.stop(); | ||
Returns the crawler object for chaining. | ||
Returns the crawler object for chaining. | ||
*/ | ||
Crawler.prototype.stop = function() { | ||
var crawler = this; | ||
clearInterval(crawler.crawlIntervalID); | ||
crawler.running = false; | ||
return crawler; | ||
var crawler = this; | ||
clearInterval(crawler.crawlIntervalID); | ||
crawler.running = false; | ||
return crawler; | ||
}; | ||
/* | ||
Public: Holds the crawler in a 'running' state, preventing the `complete` | ||
event from firing until the callback this function returns has been executed, | ||
or a predetermined timeout (as specified by `crawler.listenerTTL`) has | ||
elapsed. | ||
Public: Holds the crawler in a 'running' state, preventing the `complete` | ||
event from firing until the callback this function returns has been executed, | ||
or a predetermined timeout (as specified by `crawler.listenerTTL`) has | ||
elapsed. | ||
Examples | ||
Examples | ||
crawler.on("fetchcomplete",function(queueItem,data) { | ||
continue = this.wait(); | ||
doSomethingThatTakesAlongTime(function callback() { | ||
continue(); | ||
}); | ||
}); | ||
crawler.on("fetchcomplete",function(queueItem,data) { | ||
continue = this.wait(); | ||
doSomethingThatTakesAlongTime(function callback() { | ||
continue(); | ||
}); | ||
}); | ||
Returns a callback which will allow the crawler to continue. | ||
Returns a callback which will allow the crawler to continue. | ||
*/ | ||
Crawler.prototype.wait = function() { | ||
var crawler = this, | ||
cleared = false, | ||
timeout = | ||
setTimeout(function() { | ||
if (cleared) return; | ||
cleared = true; | ||
crawler._openListeners --; | ||
}, crawler.listenerTTL); | ||
var crawler = this, | ||
cleared = false, | ||
timeout = | ||
setTimeout(function() { | ||
if (cleared) { | ||
return; | ||
} | ||
cleared = true; | ||
crawler._openListeners--; | ||
}, crawler.listenerTTL); | ||
crawler._openListeners ++; | ||
crawler._openListeners++; | ||
return function() { | ||
if (cleared) return; | ||
cleared = true; | ||
crawler._openListeners --; | ||
clearTimeout(timeout); | ||
}; | ||
return function() { | ||
if (cleared) { | ||
return; | ||
} | ||
cleared = true; | ||
crawler._openListeners--; | ||
clearTimeout(timeout); | ||
}; | ||
}; | ||
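Note that the example in the comment above assigns to `continue`, which is a reserved word in JavaScript; a working sketch of the same pattern under a different name (the async work is a placeholder, and the constructor is assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

crawler.on("fetchcomplete", function(queueItem, responseBuffer) {
    var resume = this.wait();   // holds the "complete" event back

    // doSomethingThatTakesAlongTime is a placeholder for your own async work.
    doSomethingThatTakesAlongTime(responseBuffer, function callback() {
        resume();   // or let crawler.listenerTTL expire the hold
    });
});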
/* | ||
Public: Given a function, this method adds it to an internal list maintained | ||
by the crawler to be executed against each URL to determine whether it should | ||
be fetched or not. | ||
Public: Given a function, this method adds it to an internal list maintained | ||
by the crawler to be executed against each URL to determine whether it should | ||
be fetched or not. | ||
callback - Function to be called when evaluating a URL. This function is | ||
passed an object containing the protocol, hostname, port, and path | ||
of a resource to be fetched. It can determine whether it should | ||
be requested or not by returning a boolean - false for no, true | ||
for yes. | ||
callback - Function to be called when evaluating a URL. This function is | ||
passed an object containing the protocol, hostname, port, and path | ||
of a resource to be fetched. It can determine whether it should | ||
be requested or not by returning a boolean - false for no, true | ||
for yes. | ||
Examples | ||
Examples | ||
crawler.addFetchCondition(function(parsedURL) { | ||
return (parsedURL.host !== "evildomain.com"); | ||
}); | ||
crawler.addFetchCondition(function(parsedURL) { | ||
return (parsedURL.host !== "evildomain.com"); | ||
}); | ||
Returns the ID of the fetch condition - used for removing it from the crawler | ||
later. | ||
Returns the ID of the fetch condition - used for removing it from the crawler | ||
later. | ||
*/ | ||
Crawler.prototype.addFetchCondition = function(callback) { | ||
var crawler = this; | ||
if (callback instanceof Function) { | ||
crawler._fetchConditions.push(callback); | ||
return crawler._fetchConditions.length - 1; | ||
} else { | ||
throw new Error("Fetch Condition must be a function."); | ||
} | ||
var crawler = this; | ||
if (callback instanceof Function) { | ||
crawler._fetchConditions.push(callback); | ||
return crawler._fetchConditions.length - 1; | ||
} | ||
throw new Error("Fetch Condition must be a function."); | ||
}; | ||
/* | ||
Public: Given the ID of an existing fetch condition, this function removes | ||
it from the crawler's internal list of conditions. | ||
Public: Given the ID of an existing fetch condition, this function removes | ||
it from the crawler's internal list of conditions. | ||
index - ID of fetch condition to be removed. | ||
index - ID of fetch condition to be removed. | ||
Examples | ||
Examples | ||
crawler.removeFetchCondition(3); | ||
crawler.removeFetchCondition(3); | ||
Returns true if the fetch condition was removed, and throws an error if it | ||
could not be found. | ||
Returns true if the fetch condition was removed, and throws an error if it | ||
could not be found. | ||
*/ | ||
Crawler.prototype.removeFetchCondition = function(index) { | ||
var crawler = this; | ||
if (crawler._fetchConditions[index] && | ||
crawler._fetchConditions[index] instanceof Function) { | ||
var crawler = this; | ||
if (crawler._fetchConditions[index] && | ||
crawler._fetchConditions[index] instanceof Function) { | ||
return !!crawler._fetchConditions.splice(index,1); | ||
} else { | ||
throw new Error("Unable to find indexed Fetch Condition."); | ||
} | ||
return !!crawler._fetchConditions.splice(index, 1); | ||
} | ||
throw new Error("Unable to find indexed Fetch Condition."); | ||
}; | ||
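A sketch pairing the two: the index returned by `addFetchCondition` is what `removeFetchCondition` expects (constructor assumed as above):

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com");   // assumed constructor

var conditionID = crawler.addFetchCondition(function(parsedURL) {
    // Skip anything that looks like an image.
    return !parsedURL.path.match(/\.(png|jpe?g|gif)$/i);
});

// Later, when the condition is no longer wanted:
crawler.removeFetchCondition(conditionID);   // true, or throws if the ID is unknown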
/* | ||
Public: Given a URL it will remove the querystring if it exists. | ||
Public: Given a URL it will remove the querystring if it exists. | ||
url - URL from which to remove the querystring | ||
url - URL from which to remove the querystring | ||
Examples | ||
Examples | ||
crawler.removeQuerystring(url); | ||
crawler.removeQuerystring(url); | ||
Returns URL without querystring if it exists | ||
Returns URL without querystring if it exists | ||
*/ | ||
Crawler.prototype.removeQuerystring = function(url) { | ||
if (url.indexOf("?") > -1) { | ||
return url.substr(0,url.indexOf("?")); | ||
} else { | ||
return url; | ||
} | ||
if (url.indexOf("?") > -1) { | ||
return url.substr(0, url.indexOf("?")); | ||
} | ||
return url; | ||
}; | ||
module.exports = Crawler; |
@@ -1,7 +0,12 @@ | ||
// SimpleCrawler | ||
// Export interfaces | ||
/* | ||
* Simplecrawler - Export interfaces | ||
* https://github.com/cgiffard/node-simplecrawler | ||
* | ||
* Copyright (c) 2011-2015, Christopher Giffard | ||
* | ||
*/ | ||
module.exports = require("./crawler.js"); | ||
// Aliasing for compatibility with legacy code. | ||
// Aliasing for compatibility with legacy code | ||
module.exports.Crawler = module.exports; | ||
@@ -8,0 +13,0 @@ |
lib/queue.js
@@ -1,21 +0,23 @@ | ||
// Simplecrawler - queue module | ||
// Christopher Giffard, 2011 | ||
// | ||
// http://www.github.com/cgiffard/node-simplecrawler | ||
/* | ||
* Simplecrawler - queue module | ||
* https://github.com/cgiffard/node-simplecrawler | ||
* | ||
* Copyright (c) 2011-2015, Christopher Giffard | ||
* | ||
*/ | ||
var fs = require("fs"); | ||
var allowedStatistics = [ | ||
"requestTime", | ||
"requestLatency", | ||
"downloadTime", | ||
"contentLength", | ||
"actualDataSize" | ||
"requestTime", | ||
"requestLatency", | ||
"downloadTime", | ||
"contentLength", | ||
"actualDataSize" | ||
]; | ||
var FetchQueue = function(){ | ||
this.oldestUnfetchedIndex = 0; | ||
this.completeCache = 0; | ||
this.scanIndex = {}; | ||
var FetchQueue = function() { | ||
this.oldestUnfetchedIndex = 0; | ||
this.completeCache = 0; | ||
this.scanIndex = {}; | ||
}; | ||
@@ -28,47 +30,49 @@ | ||
// For legacy reasons | ||
if (depth instanceof Function) { | ||
callback = depth; | ||
depth = 1; | ||
} | ||
depth = depth || 1; | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var self = this; | ||
// For legacy reasons | ||
if (depth instanceof Function) { | ||
callback = depth; | ||
depth = 1; | ||
} | ||
// Ensure all variables conform to reasonable defaults | ||
protocol = protocol === "https" ? "https" : "http"; | ||
depth = depth || 1; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var self = this; | ||
if (isNaN(port) || !port) { | ||
return callback(new Error("Port must be numeric!")); | ||
} | ||
// Ensure all variables conform to reasonable defaults | ||
protocol = protocol === "https" ? "https" : "http"; | ||
var url = protocol + "://" + domain + (port !== 80 ? ":" + port : "") + path; | ||
if (isNaN(port) || !port) { | ||
return callback(new Error("Port must be numeric!")); | ||
} | ||
this.exists(protocol,domain,port,path, | ||
function(err,exists) { | ||
if (err) return callback(err); | ||
var url = protocol + "://" + domain + (port !== 80 ? ":" + port : "") + path; | ||
if (!exists) { | ||
var queueItem = { | ||
"url": url, | ||
"protocol": protocol, | ||
"host": domain, | ||
"port": port, | ||
"path": path, | ||
"depth": depth, | ||
"fetched": false, | ||
"status": "queued", | ||
"stateData": {} | ||
}; | ||
self.exists(protocol, domain, port, path, | ||
function(err, exists) { | ||
if (err) { | ||
return callback(err); | ||
} | ||
self.push(queueItem); | ||
callback(null, queueItem); | ||
} else { | ||
var error = new Error("Resource already exists in queue!"); | ||
error.code = "DUP"; | ||
if (!exists) { | ||
var queueItem = { | ||
url: url, | ||
protocol: protocol, | ||
host: domain, | ||
port: port, | ||
path: path, | ||
depth: depth, | ||
fetched: false, | ||
status: "queued", | ||
stateData: {} | ||
}; | ||
callback(error); | ||
} | ||
}); | ||
self.push(queueItem); | ||
callback(null, queueItem); | ||
} else { | ||
var error = new Error("Resource already exists in queue!"); | ||
error.code = "DUP"; | ||
callback(error); | ||
} | ||
}); | ||
}; | ||
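A hedged sketch of queueing a resource through the add() method refactored above; `crawler.queue` and the example host and path are assumptions. Passing the callback as the fifth argument relies on the legacy depth handling shown at the top of the hunk.

crawler.queue.add("http", "example.com", 80, "/about", function(err, queueItem) {
    if (err) {
        // err.code === "DUP" signals the resource was already in the queue.
        return console.error(err.message);
    }
    console.log("Queued:", queueItem.url);
});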
@@ -78,18 +82,17 @@ | ||
FetchQueue.prototype.exists = function(protocol, domain, port, path, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
port = (port !== 80 ? ":" + port : ""); | ||
port = port !== 80 ? ":" + port : ""; | ||
var url = | ||
(protocol + "://" + domain + port + path) | ||
.toLowerCase(); | ||
var url = (protocol + "://" + domain + port + path).toLowerCase(); | ||
if (!!this.scanIndex[url]) { | ||
callback(null, 1); | ||
return 1; | ||
} else { | ||
this.scanIndex[url] = true; | ||
callback(null, 0); | ||
return 0; | ||
} | ||
if (this.scanIndex[url]) { | ||
callback(null, 1); | ||
return 1; | ||
} | ||
this.scanIndex[url] = true; | ||
callback(null, 0); | ||
return 0; | ||
}; | ||
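Note that exists() both invokes the callback and returns 1 or 0 synchronously, and that it records an unseen URL in scanIndex as a side effect. A small sketch, with the URL components invented for illustration:

// Returns 1 if the URL was already indexed, 0 otherwise (and indexes it).
var seen = crawler.queue.exists("http", "example.com", 80, "/about");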
@@ -99,7 +102,9 @@ | ||
FetchQueue.prototype.last = function(callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var item, self = this; | ||
item = self[self.length-1]; | ||
callback(null, item); | ||
return item; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var item, | ||
self = this; | ||
item = self[self.length - 1]; | ||
callback(null, item); | ||
return item; | ||
}; | ||
@@ -109,10 +114,11 @@ | ||
FetchQueue.prototype.get = function(id, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var item, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var item, | ||
self = this; | ||
if (!isNaN(id) && self.length > id) { | ||
item = self[id]; | ||
callback(null, item); | ||
return item; | ||
} | ||
if (!isNaN(id) && self.length > id) { | ||
item = self[id]; | ||
callback(null, item); | ||
return item; | ||
} | ||
}; | ||
@@ -122,15 +128,16 @@ | ||
FetchQueue.prototype.oldestUnfetchedItem = function(callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var item, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var item, | ||
self = this; | ||
for (var itemIndex = self.oldestUnfetchedIndex; itemIndex < self.length; itemIndex ++) { | ||
if (self[itemIndex].status === "queued") { | ||
self.oldestUnfetchedIndex = itemIndex; | ||
item = self[itemIndex]; | ||
callback(null, item); | ||
return item; | ||
} | ||
} | ||
for (var itemIndex = self.oldestUnfetchedIndex; itemIndex < self.length; itemIndex++) { | ||
if (self[itemIndex].status === "queued") { | ||
self.oldestUnfetchedIndex = itemIndex; | ||
item = self[itemIndex]; | ||
callback(null, item); | ||
return item; | ||
} | ||
} | ||
callback(new Error("No unfetched items remain.")); | ||
callback(new Error("No unfetched items remain.")); | ||
}; | ||
@@ -140,18 +147,19 @@ | ||
FetchQueue.prototype.max = function(statisticName, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var maxStatisticValue = 0, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var maxStatisticValue = 0, | ||
self = this; | ||
if (allowedStatistics.join().indexOf(statisticName) === -1) { | ||
// Not a recognised statistic! | ||
return callback(new Error("Invalid statistic.")); | ||
} | ||
if (allowedStatistics.join().indexOf(statisticName) === -1) { | ||
// Not a recognised statistic! | ||
return callback(new Error("Invalid statistic.")); | ||
} | ||
self.forEach(function(item) { | ||
if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] > maxStatisticValue) { | ||
maxStatisticValue = item.stateData[statisticName]; | ||
} | ||
}); | ||
self.forEach(function(item) { | ||
if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] > maxStatisticValue) { | ||
maxStatisticValue = item.stateData[statisticName]; | ||
} | ||
}); | ||
callback(null, maxStatisticValue); | ||
return maxStatisticValue; | ||
callback(null, maxStatisticValue); | ||
return maxStatisticValue; | ||
}; | ||
@@ -161,19 +169,21 @@ | ||
FetchQueue.prototype.min = function(statisticName, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var minimum, minStatisticValue = Infinity, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var minimum, | ||
minStatisticValue = Infinity, | ||
self = this; | ||
if (allowedStatistics.join().indexOf(statisticName) === -1) { | ||
// Not a recognised statistic! | ||
return callback(new Error("Invalid statistic.")); | ||
} | ||
if (allowedStatistics.join().indexOf(statisticName) === -1) { | ||
// Not a recognised statistic! | ||
return callback(new Error("Invalid statistic.")); | ||
} | ||
self.forEach(function(item) { | ||
if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] < minStatisticValue) { | ||
minStatisticValue = item.stateData[statisticName]; | ||
} | ||
}); | ||
minimum = minStatisticValue === Infinity? 0 : minStatisticValue; | ||
callback(null, minimum); | ||
return minimum; | ||
self.forEach(function(item) { | ||
if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] < minStatisticValue) { | ||
minStatisticValue = item.stateData[statisticName]; | ||
} | ||
}); | ||
minimum = minStatisticValue === Infinity ? 0 : minStatisticValue; | ||
callback(null, minimum); | ||
return minimum; | ||
}; | ||
@@ -183,19 +193,22 @@ | ||
FetchQueue.prototype.avg = function(statisticName, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var average, NumberSum = 0, NumberCount = 0, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var average, | ||
NumberSum = 0, | ||
NumberCount = 0, | ||
self = this; | ||
if (allowedStatistics.join().indexOf(statisticName) === -1) { | ||
// Not a recognised statistic! | ||
return callback(new Error("Invalid statistic.")); | ||
} | ||
if (allowedStatistics.join().indexOf(statisticName) === -1) { | ||
// Not a recognised statistic! | ||
return callback(new Error("Invalid statistic.")); | ||
} | ||
self.forEach(function(item) { | ||
if (item.fetched && item.stateData[statisticName] !== null && !isNaN(item.stateData[statisticName])) { | ||
NumberSum += item.stateData[statisticName]; | ||
NumberCount ++; | ||
} | ||
}); | ||
average = NumberSum / NumberCount; | ||
callback(null, average); | ||
return average; | ||
self.forEach(function(item) { | ||
if (item.fetched && item.stateData[statisticName] !== null && !isNaN(item.stateData[statisticName])) { | ||
NumberSum += item.stateData[statisticName]; | ||
NumberCount++; | ||
} | ||
}); | ||
average = NumberSum / NumberCount; | ||
callback(null, average); | ||
return average; | ||
}; | ||
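The three statistics helpers above share a pattern; a sketch using statistic names taken from the allowedStatistics list earlier in this file, with `crawler.queue` assumed to be a populated FetchQueue:

crawler.queue.max("downloadTime", function(err, slowest) {
    console.log("Slowest download:", slowest);
});
crawler.queue.min("contentLength", function(err, smallest) {
    console.log("Smallest response:", smallest);
});
crawler.queue.avg("actualDataSize", function(err, average) {
    console.log("Average payload size:", average);
});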
@@ -205,13 +218,14 @@ | ||
FetchQueue.prototype.complete = function(callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var NumberComplete = 0, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var NumberComplete = 0, | ||
self = this; | ||
self.forEach(function(item) { | ||
if (item.fetched) { | ||
NumberComplete ++; | ||
} | ||
}); | ||
self.forEach(function(item) { | ||
if (item.fetched) { | ||
NumberComplete++; | ||
} | ||
}); | ||
callback(null, NumberComplete); | ||
return NumberComplete; | ||
callback(null, NumberComplete); | ||
return NumberComplete; | ||
}; | ||
@@ -221,13 +235,14 @@ | ||
FetchQueue.prototype.countWithStatus = function(status, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var queueItemsMatched = 0, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var queueItemsMatched = 0, | ||
self = this; | ||
self.forEach(function(item) { | ||
if (item.status === status) { | ||
queueItemsMatched ++; | ||
} | ||
}); | ||
self.forEach(function(item) { | ||
if (item.status === status) { | ||
queueItemsMatched++; | ||
} | ||
}); | ||
callback(null,queueItemsMatched); | ||
return queueItemsMatched; | ||
callback(null, queueItemsMatched); | ||
return queueItemsMatched; | ||
}; | ||
@@ -237,14 +252,15 @@ | ||
FetchQueue.prototype.getWithStatus = function(status, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var subqueue = [], self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var subqueue = [], | ||
self = this; | ||
self.forEach(function(item,index) { | ||
if (item.status === status) { | ||
subqueue.push(item); | ||
subqueue[subqueue.length-1].queueIndex = index; | ||
} | ||
}); | ||
self.forEach(function(item, index) { | ||
if (item.status === status) { | ||
subqueue.push(item); | ||
subqueue[subqueue.length - 1].queueIndex = index; | ||
} | ||
}); | ||
callback(null,subqueue); | ||
return subqueue; | ||
callback(null, subqueue); | ||
return subqueue; | ||
}; | ||
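countWithStatus() and getWithStatus() filter on the queue item status field; a sketch, with the "failed" status value taken from the errors() helper below:

// Each returned item gains a queueIndex property pointing back into the queue.
var failures = crawler.queue.getWithStatus("failed");
failures.forEach(function(item) {
    console.log(item.queueIndex, item.url);
});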
@@ -254,10 +270,13 @@ | ||
FetchQueue.prototype.errors = function(callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var total, failedCount, notFoundCount, self = this; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var total, | ||
failedCount, | ||
notFoundCount, | ||
self = this; | ||
failedCount = self.countWithStatus("failed"); | ||
notFoundCount = self.countWithStatus("notfound"); | ||
total = failedCount + notFoundCount; | ||
callback(null, total); | ||
return total; | ||
failedCount = self.countWithStatus("failed"); | ||
notFoundCount = self.countWithStatus("notfound"); | ||
total = failedCount + notFoundCount; | ||
callback(null, total); | ||
return total; | ||
}; | ||
@@ -267,20 +286,20 @@ | ||
FetchQueue.prototype.getLength = function(callback) { | ||
return callback(null, this.length); | ||
return callback(null, this.length); | ||
}; | ||
// Writes the queue to disk | ||
FetchQueue.prototype.freeze = function(filename,callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var self = this; | ||
FetchQueue.prototype.freeze = function(filename, callback) { | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var self = this; | ||
// Re-queue in-progress items before freezing... | ||
self.forEach(function(item) { | ||
if (item.fetched !== true) { | ||
item.status = "queued"; | ||
} | ||
}); | ||
// Re-queue in-progress items before freezing... | ||
self.forEach(function(item) { | ||
if (item.fetched !== true) { | ||
item.status = "queued"; | ||
} | ||
}); | ||
fs.writeFile(filename,JSON.stringify(self),function(err) { | ||
callback(err, self); | ||
}); | ||
fs.writeFile(filename, JSON.stringify(self), function(err) { | ||
callback(err, self); | ||
}); | ||
}; | ||
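A minimal freeze() sketch; the filename is illustrative:

crawler.queue.freeze("queue-backup.json", function(err) {
    if (err) throw err;
    console.log("Queue state written to disk");
});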
@@ -290,39 +309,44 @@ | ||
FetchQueue.prototype.defrost = function(filename, callback) { | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var fileData, self = this, defrostedQueue = []; | ||
callback = callback && callback instanceof Function ? callback : function() {}; | ||
var self = this, | ||
defrostedQueue = []; | ||
fs.readFile(filename,function(err,fileData) { | ||
if (err) return callback(err); | ||
fs.readFile(filename, function(err, fileData) { | ||
if (err) { | ||
return callback(err); | ||
} | ||
if (!fileData.toString("utf8").length) { | ||
return callback(new Error("Failed to defrost queue from zero-length JSON.")); | ||
} | ||
if (!fileData.toString("utf8").length) { | ||
return callback(new Error("Failed to defrost queue from zero-length JSON.")); | ||
} | ||
try { | ||
defrostedQueue = JSON.parse(fileData.toString("utf8")); | ||
} catch(error) { | ||
return callback(error); | ||
} | ||
try { | ||
defrostedQueue = JSON.parse(fileData.toString("utf8")); | ||
} catch (error) { | ||
return callback(error); | ||
} | ||
self.oldestUnfetchedIndex = Infinity; | ||
self.scanIndex = {}; | ||
self.oldestUnfetchedIndex = Infinity; | ||
self.scanIndex = {}; | ||
for (var index in defrostedQueue) { | ||
if (defrostedQueue.hasOwnProperty(index) && !isNaN(index)) { | ||
var queueItem = defrostedQueue[index]; | ||
self.push(queueItem); | ||
for (var index in defrostedQueue) { | ||
if (defrostedQueue.hasOwnProperty(index) && !isNaN(index)) { | ||
var queueItem = defrostedQueue[index]; | ||
self.push(queueItem); | ||
if (queueItem.status !== "downloaded") | ||
self.oldestUnfetchedIndex = Math.min( | ||
self.oldestUnfetchedIndex, index); | ||
if (queueItem.status !== "downloaded") { | ||
self.oldestUnfetchedIndex = Math.min( | ||
self.oldestUnfetchedIndex, index); | ||
} | ||
self.scanIndex[queueItem.url] = true; | ||
} | ||
} | ||
self.scanIndex[queueItem.url] = true; | ||
} | ||
} | ||
if (self.oldestUnfetchedIndex === Infinity) | ||
self.oldestUnfetchedIndex = 0; | ||
if (self.oldestUnfetchedIndex === Infinity) { | ||
self.oldestUnfetchedIndex = 0; | ||
} | ||
callback(null,self); | ||
}); | ||
callback(null, self); | ||
}); | ||
}; |
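And the matching defrost() call, restoring the same hypothetical file into the queue:

crawler.queue.defrost("queue-backup.json", function(err, queue) {
    if (err) throw err;
    console.log("Restored", queue.length, "queue items");
});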
@@ -1,73 +0,85 @@ | ||
var Crawler = require("./crawler.js"), | ||
URI = require("URIjs"); | ||
/* | ||
* Simplecrawler | ||
* https://github.com/cgiffard/node-simplecrawler | ||
* | ||
* Copyright (c) 2011-2015, Christopher Giffard | ||
* | ||
*/ | ||
var Crawler = require("./crawler.js"), | ||
uri = require("urijs"); | ||
/* | ||
Public: Convenience function for really quick, simple crawls. It generates | ||
a new crawler, parses the URL provided, and sets up the new crawler with | ||
the host and path information extracted from the URL. It returns the crawler | ||
object, so you can set up event handlers, and waits until `process.nextTick` | ||
before kicking off the crawl. | ||
Public: Convenience function for really quick, simple crawls. It generates | ||
a new crawler, parses the URL provided, and sets up the new crawler with | ||
the host and path information extracted from the URL. It returns the crawler | ||
object, so you can set up event handlers, and waits until `process.nextTick` | ||
before kicking off the crawl. | ||
url - URL to begin crawl from. | ||
successCallback - Optional function called once an item is completely | ||
downloaded. Functionally identical to a fetchcomplete | ||
event listener. | ||
failCallback - Optional function to be called if an item fails to | ||
download. Functionally identical to a fetcherror | ||
event listener. | ||
url - URL to begin crawl from. | ||
successCallback - Optional function called once an item is completely | ||
downloaded. Functionally identical to a fetchcomplete | ||
event listener. | ||
failCallback - Optional function to be called if an item fails to | ||
download. Functionally identical to a fetcherror | ||
event listener. | ||
Examples | ||
Examples | ||
Crawler.crawl( | ||
"http://example.com:3000/start", | ||
function(queueItem,data) { | ||
console.log("I got a new item!"); | ||
} | ||
); | ||
Crawler.crawl( | ||
"http://example.com:3000/start", | ||
function(queueItem,data) { | ||
console.log("I got a new item!"); | ||
} | ||
); | ||
Crawler | ||
.crawl("http://www.example.com/") | ||
.on("fetchstart",function(queueItem) { | ||
console.log("Beginning fetch for",queueItem.url); | ||
}); | ||
Crawler | ||
.crawl("http://www.example.com/") | ||
.on("fetchstart",function(queueItem) { | ||
console.log("Beginning fetch for",queueItem.url); | ||
}); | ||
Returns the crawler object which has now been constructed. | ||
Returns the crawler object which has now been constructed. | ||
*/ | ||
module.exports = function crawl(url,successCallback,failCallback) { | ||
// Parse the URL first | ||
url = URI(url); | ||
module.exports = function crawl(url, successCallback, failCallback) { | ||
// If either the protocol, path, or hostname are unset, we can't really | ||
// do much. Die with error. | ||
if (!url.protocol()) | ||
throw new Error("Can't crawl with unspecified protocol."); | ||
// Parse the URL first | ||
url = uri(url); | ||
if (!url.hostname()) | ||
throw new Error("Can't crawl with unspecified hostname."); | ||
// If either the protocol, path, or hostname are unset, | ||
// we can't really do much. Die with error. | ||
if (!url.protocol()) { | ||
throw new Error("Can't crawl with unspecified protocol."); | ||
} | ||
if (!url.path()) | ||
throw new Error("Can't crawl with unspecified path."); | ||
if (!url.hostname()) { | ||
throw new Error("Can't crawl with unspecified hostname."); | ||
} | ||
var tmpCrawler = | ||
new Crawler( | ||
url.hostname(), | ||
url.path(), | ||
url.port() || 80); | ||
if (!url.path()) { | ||
throw new Error("Can't crawl with unspecified path."); | ||
} | ||
// Attach callbacks if they were provided | ||
if (successCallback) tmpCrawler.on("fetchcomplete",successCallback); | ||
if (failCallback) tmpCrawler.on("fetcherror",failCallback); | ||
var tmpCrawler = new Crawler(url.hostname(), url.path(), url.port() || 80); | ||
// Start the crawler on the next runloop | ||
// This enables initial configuration options and event handlers to take | ||
// effect before the first resource is queued. | ||
process.nextTick(function() { | ||
tmpCrawler.start(); | ||
}); | ||
// Attach callbacks if they were provided | ||
if (successCallback) { | ||
tmpCrawler.on("fetchcomplete", successCallback); | ||
} | ||
if (failCallback) { | ||
tmpCrawler.on("fetcherror", failCallback); | ||
} | ||
// Return crawler | ||
return tmpCrawler; | ||
// Start the crawler on the next runloop | ||
// This enables initial configuration options and event handlers to take | ||
// effect before the first resource is queued. | ||
process.nextTick(function() { | ||
tmpCrawler.start(); | ||
}); | ||
// Return crawler | ||
return tmpCrawler; | ||
}; |
{ | ||
"name": "simplecrawler", | ||
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.", | ||
"version": "0.5.3", | ||
"description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.", | ||
"version": "0.5.4", | ||
"homepage": "https://github.com/cgiffard/node-simplecrawler", | ||
"author": "Christopher Giffard <christopher.giffard@cgiffard.com>", | ||
"license": "BSD-2-Clause", | ||
"repository": { | ||
"type": "git", | ||
"url": "https://github.com/cgiffard/node-simplecrawler.git" | ||
"url": "git+https://github.com/cgiffard/node-simplecrawler.git" | ||
}, | ||
@@ -24,3 +25,5 @@ "bugs": { | ||
"scripts": { | ||
"test": "mocha -R spec -t 5000" | ||
"lint": "eslint example/ lib/ test/", | ||
"mocha": "mocha -R spec -t 5000", | ||
"test": "npm run lint && npm run mocha" | ||
}, | ||
@@ -32,8 +35,8 @@ "bin": { | ||
"dependencies": { | ||
"URIjs": "^1.15.0" | ||
"urijs": "^1.16.1" | ||
}, | ||
"devDependencies": { | ||
"chai": "^2.2.0", | ||
"jshint": "^2.7.0", | ||
"mocha": "^2.2.4" | ||
"chai": "^3.2.0", | ||
"eslint": "^1.5.1", | ||
"mocha": "^2.3.2" | ||
}, | ||
@@ -40,0 +43,0 @@ "engines": { |
+ Added urijs@^1.16.1
+ Added urijs@1.19.11 (transitive)
- Removed URIjs@^1.15.0
- Removed URIjs@1.16.1 (transitive)