simplecrawler - npm Package Compare versions

Comparing version 0.5.3 to 0.5.4


lib/cache-backend-fs.js

@@ -1,11 +1,18 @@

// Simplecrawler - FS cache backend
/*
* Simplecrawler - FS cache backend
* https://github.com/cgiffard/node-simplecrawler
*
* Copyright (c) 2011-2015, Christopher Giffard
*
*/
// Tries to ensure a local 'cache' of a website is as close as possible to a mirror of the website itself.
// The idea is that it is then possible to re-serve the website just using the cache.
var fs = require("fs");
var crypto = require("crypto");
var fs = require("fs"),
crypto = require("crypto");
// Factory for FSBackend
var backend = function backend(loadParameter) {
return new FSBackend(loadParameter);
return new FSBackend(loadParameter);
};

@@ -17,6 +24,6 @@

var FSBackend = function FSBackend(loadParameter) {
this.loaded = false;
this.index = [];
this.location = typeof(loadParameter) === "string" && loadParameter.length > 0 ? loadParameter : process.cwd() + "/cache/";
this.location = this.location.substr(this.location.length-1) === "/" ? this.location : this.location + "/";
this.loaded = false;
this.index = [];
this.location = typeof loadParameter === "string" && loadParameter.length > 0 ? loadParameter : process.cwd() + "/cache/";
this.location = this.location.substr(this.location.length - 1) === "/" ? this.location : this.location + "/";
};

@@ -29,212 +36,215 @@

function sanitisePath(path,queueObject) {
// Remove first slash (as we set one later.)
path = path.replace(/^\//,"");
function sanitisePath(path, queueObject) {
// Remove first slash (as we set one later.)
path = path.replace(/^\//, "");
var pathStack = [];
var pathStack = [];
// Trim whitespace. If no path is present - assume index.html.
var sanitisedPath = path.length ? path.replace(/\s*$/ig,"") : "index.html";
var headers = queueObject.stateData.headers, sanitisedPathParts;
// Trim whitespace. If no path is present - assume index.html.
var sanitisedPath = path.length ? path.replace(/\s*$/ig, "") : "index.html";
var headers = queueObject.stateData.headers, sanitisedPathParts;
if (sanitisedPath.match(/\?/)) {
sanitisedPathParts = sanitisedPath.split(/\?/g);
var resource = sanitisedPathParts.shift();
var hashedQS = crypto.createHash("sha1").update(sanitisedPathParts.join("?")).digest("hex");
sanitisedPath = resource + "?" + hashedQS;
}
if (sanitisedPath.match(/\?/)) {
sanitisedPathParts = sanitisedPath.split(/\?/g);
var resource = sanitisedPathParts.shift();
var hashedQS = crypto.createHash("sha1").update(sanitisedPathParts.join("?")).digest("hex");
sanitisedPath = resource + "?" + hashedQS;
}
pathStack = sanitisedPath.split(/\//g);
pathStack = pathStack.map(function(pathChunk,count) {
if (pathChunk.length >= 250) {
return crypto.createHash("sha1").update(pathChunk).digest("hex");
}
pathStack = sanitisedPath.split(/\//g);
pathStack = pathStack.map(function(pathChunk) {
if (pathChunk.length >= 250) {
return crypto.createHash("sha1").update(pathChunk).digest("hex");
}
return pathChunk;
});
return pathChunk;
});
sanitisedPath = pathStack.join("/");
sanitisedPath = pathStack.join("/");
// Try to get a file extension for the file - for ease of identification
// We run through this if we either:
// 1) haven't got a file extension at all, or:
// 2) have an HTML file without an HTML file extension (might be .php, .aspx, .do, or some other server-processed type)
// Try to get a file extension for the file - for ease of identification
// We run through this if we either:
// 1) haven't got a file extension at all, or:
// 2) have an HTML file without an HTML file extension (might be .php, .aspx, .do, or some other server-processed type)
if (!sanitisedPath.match(/\.[a-z0-9]{1,6}$/i) || (headers["content-type"] && headers["content-type"].match(/text\/html/i) && !sanitisedPath.match(/\.htm[l]?$/i))) {
var subMimeType = "";
var mimeParts = [];
if (!sanitisedPath.match(/\.[a-z0-9]{1,6}$/i) || headers["content-type"] && headers["content-type"].match(/text\/html/i) && !sanitisedPath.match(/\.htm[l]?$/i)) {
var subMimeType = "";
var mimeParts = [];
if (headers["content-type"] && headers["content-type"].match(/text\/html/i)) {
if (sanitisedPath.match(/\/$/)) {
sanitisedPath += "index.html";
} else {
sanitisedPath += ".html";
}
if (headers["content-type"] && headers["content-type"].match(/text\/html/i)) {
if (sanitisedPath.match(/\/$/)) {
sanitisedPath += "index.html";
} else {
sanitisedPath += ".html";
}
} else if (headers["content-type"] && (mimeParts = headers["content-type"].match(/(image|video|audio|application)\/([a-z0-9]+)/i))) {
subMimeType = mimeParts[2];
sanitisedPath += "." + subMimeType;
}
}
} else if (headers["content-type"] && (mimeParts = headers["content-type"].match(/(image|video|audio|application)\/([a-z0-9]+)/i))) {
subMimeType = mimeParts[2];
sanitisedPath += "." + subMimeType;
}
}
return sanitisedPath;
return sanitisedPath;
}
FSBackend.prototype.fileExists = function(location) {
try {
fs.statSync(location);
return true;
} catch (er) {
return false;
}
try {
fs.statSync(location);
return true;
} catch (er) {
return false;
}
};
FSBackend.prototype.isDirectory = function(location) {
try {
if (fs.statSync(location).isDirectory()) {
return true;
}
try {
if (fs.statSync(location).isDirectory()) {
return true;
}
return false;
} catch (er) {
return false;
}
return false;
} catch (er) {
return false;
}
};
FSBackend.prototype.load = function() {
var backend = this;
var backend = this;
if (!this.fileExists(this.location) && this.isDirectory(this.location)) {
throw new Error("Unable to verify cache location exists.");
}
if (!backend.fileExists(backend.location) && backend.isDirectory(backend.location)) {
throw new Error("Unable to verify cache location exists.");
}
try {
var fileData;
if ((fileData = fs.readFileSync(this.location + "cacheindex.json")) && fileData.length) {
this.index = JSON.parse(fileData.toString("utf8"));
this.loaded = true;
}
} catch(error) {
if (error.code === "ENOENT") {
// Cache index doesn't exist. Assume this is a new cache.
// Just leave the memory index empty for now.
this.loaded = true;
} else {
throw error;
}
}
try {
var fileData;
if ((fileData = fs.readFileSync(backend.location + "cacheindex.json")) && fileData.length) {
backend.index = JSON.parse(fileData.toString("utf8"));
backend.loaded = true;
}
} catch (error) {
if (error.code === "ENOENT") {
// Cache index doesn't exist. Assume this is a new cache.
// Just leave the memory index empty for now.
backend.loaded = true;
} else {
throw error;
}
}
// Flush store to disk when closing.
process.on("exit",function() {
backend.saveCache.apply(backend);
});
// Flush store to disk when closing.
process.on("exit", function() {
backend.saveCache.apply(backend);
});
};
FSBackend.prototype.saveCache = function(callback) {
fs.writeFile(this.location + "cacheindex.json", JSON.stringify(this.index), callback);
fs.writeFile(this.location + "cacheindex.json", JSON.stringify(this.index), callback);
};
FSBackend.prototype.setItem = function(queueObject,data,callback) {
callback = callback instanceof Function ? callback : function(){};
FSBackend.prototype.setItem = function(queueObject, data, callback) {
callback = callback instanceof Function ? callback : function() {};
var backend = this;
var pathStack = [queueObject.protocol, queueObject.host, queueObject.port];
pathStack = pathStack.concat(sanitisePath(queueObject.path,queueObject).split(/\/+/g));
var backend = this;
var pathStack = [queueObject.protocol, queueObject.host, queueObject.port];
pathStack = pathStack.concat(sanitisePath(queueObject.path, queueObject).split(/\/+/g));
var cacheItemExists = false;
var firstInstanceIndex = NaN;
if (this.index.reduce(function(prev,current,index,array) {
firstInstanceIndex = !isNaN(firstInstanceIndex) ? firstInstanceIndex : index;
return prev || current.url === queueObject.url;
},false)) {
cacheItemExists = true;
}
var cacheItemExists = false;
var firstInstanceIndex = NaN;
if (backend.index.reduce(function(prev, current, index) {
firstInstanceIndex = !isNaN(firstInstanceIndex) ? firstInstanceIndex : index;
return prev || current.url === queueObject.url;
}, false)) {
cacheItemExists = true;
}
var writeFileData = function(currentPath,data) {
fs.writeFile(currentPath,data,function(error) {
if (error) throw error;
fs.writeFile(currentPath + ".cacheData.json",JSON.stringify(queueObject),function(error) {
if (error) throw error;
var writeFileData = function(currentPath, data) {
fs.writeFile(currentPath, data, function(error) {
if (error) {
throw error;
}
fs.writeFile(currentPath + ".cacheData.json", JSON.stringify(queueObject), function(error) {
if (error) {
throw error;
}
var cacheObject = {
url: queueObject.url,
etag: queueObject.stateData.headers.etag,
lastModified: queueObject.stateData.headers['last-modified'],
dataFile: currentPath,
metaFile: currentPath + ".cacheData.json"
};
var cacheObject = {
url: queueObject.url,
etag: queueObject.stateData.headers.etag,
lastModified: queueObject.stateData.headers["last-modified"],
dataFile: currentPath,
metaFile: currentPath + ".cacheData.json"
};
if (cacheItemExists) {
backend.index[firstInstanceIndex] = cacheObject;
} else {
backend.index.push(cacheObject);
}
if (cacheItemExists) {
backend.index[firstInstanceIndex] = cacheObject;
} else {
backend.index.push(cacheObject);
}
callback(cacheObject);
});
});
};
callback(cacheObject);
});
});
};
pathStack.forEach(function(pathChunk,count) {
var currentPath = backend.location + pathStack.slice(0,count+1).join("/");
if (backend.fileExists(backend.location + pathStack.slice(0,count+1).join("/"))) {
if (!backend.isDirectory(currentPath)) {
if (count === pathStack.length -1) {
// Just overwrite the file...
writeFileData(currentPath,data);
} else {
throw new Error("Cache storage of resource (%s) blocked by file: %s",queueObject.url,currentPath);
}
}
} else {
if (count === pathStack.length -1) {
// Write the file data in
writeFileData(currentPath,data);
} else {
fs.mkdirSync(currentPath);
}
}
});
pathStack.forEach(function(pathChunk, count) {
var currentPath = backend.location + pathStack.slice(0, count + 1).join("/");
if (backend.fileExists(backend.location + pathStack.slice(0, count + 1).join("/"))) {
if (!backend.isDirectory(currentPath)) {
if (count === pathStack.length - 1) {
// Just overwrite the file...
writeFileData(currentPath, data);
} else {
throw new Error("Cache storage of resource (%s) blocked by file: %s", queueObject.url, currentPath);
}
}
} else {
if (count === pathStack.length - 1) {
// Write the file data in
writeFileData(currentPath, data);
}
fs.mkdirSync(currentPath);
}
});
};
FSBackend.prototype.getItem = function(queueObject,callback) {
var cacheItemResult = this.index.filter(function(item) {
return item.url === queueObject.url;
});
FSBackend.prototype.getItem = function(queueObject, callback) {
var cacheItemResult = this.index.filter(function(item) {
return item.url === queueObject.url;
});
if (cacheItemResult.length) {
var cacheItem = cacheItemResult.shift();
if (cacheItemResult.length) {
var cacheItem = cacheItemResult.shift();
callback({
"url": cacheItem.url,
"etag": cacheItem.etag,
"lastModified": cacheItem.lastModified,
"getData": function(callback) {
fs.readFile(cacheItem.dataFile,function(error,data) {
if (error) {
callback(error);
return false;
}
callback({
url: cacheItem.url,
etag: cacheItem.etag,
lastModified: cacheItem.lastModified,
getData: function(callback) {
fs.readFile(cacheItem.dataFile, function(error, data) {
if (error) {
callback(error);
return false;
}
callback(null,data);
});
},
"getMetadata": function(callback) {
fs.readFile(cacheItem.metaFile,function(error,data) {
if (error) {
callback(error);
return false;
}
callback(null, data);
});
},
getMetadata: function(callback) {
fs.readFile(cacheItem.metaFile, function(error, data) {
if (error) {
callback(error);
return false;
}
callback(null,JSON.parse(data.toString("utf8")));
});
}
});
callback(null, JSON.parse(data.toString("utf8")));
});
}
});
} else {
callback(null);
}
} else {
callback(null);
}
return false;
return false;
};
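
For context, here is a minimal sketch of driving this backend directly, outside the crawler (not part of the diff). The require path, the "./cache/" directory and the queue-item literal are illustrative assumptions; only the fields that setItem and sanitisePath actually read are included.

    // Hypothetical standalone use of the factory exported by this file.
    var createBackend = require("simplecrawler/lib/cache-backend-fs.js");
    var backend = createBackend("./cache/"); // falls back to process.cwd() + "/cache/"
    backend.load(); // reads cacheindex.json, or starts with an empty index

    backend.setItem({
        protocol: "http",
        host: "example.com",
        port: 80,
        path: "/about",
        url: "http://example.com/about",
        stateData: { headers: { "content-type": "text/html" } }
    }, "<html></html>", function(cacheObject) {
        console.log("stored at", cacheObject.dataFile);
    });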

lib/cache.js

@@ -1,24 +1,24 @@

// Simplecrawler - cache module
// Christopher Giffard, 2011
//
// http://www.github.com/cgiffard/node-simplecrawler
/*
* Simplecrawler - cache module
* https://github.com/cgiffard/node-simplecrawler
*
* Copyright (c) 2011-2015, Christopher Giffard
*
*/
var fs = require("fs");
var EventEmitter = require('events').EventEmitter;
var EventEmitter = require("events").EventEmitter;
var FilesystemBackend = require("./cache-backend-fs.js");
// var RedisBackend = require("cache-backend-redis.js");
// var MongoBackend = require("cache-backend-mongo.js");
// Init cache wrapper for backend...
var Cache = function Cache(cacheLoadParameter,cacheBackend) {
var Cache = function Cache(cacheLoadParameter, cacheBackend) {
// Ensure parameters are how we want them...
cacheBackend = typeof cacheBackend === "object" ? cacheBackend : FilesystemBackend;
cacheLoadParameter = cacheLoadParameter instanceof Array ? cacheLoadParameter : [cacheLoadParameter];
// Ensure parameters are how we want them...
cacheBackend = typeof cacheBackend === "object" ? cacheBackend : FilesystemBackend;
cacheLoadParameter = cacheLoadParameter instanceof Array ? cacheLoadParameter : [cacheLoadParameter];
// Now we can just run the factory.
this.datastore = cacheBackend.apply(cacheBackend,cacheLoadParameter);
// Now we can just run the factory.
this.datastore = cacheBackend.apply(cacheBackend, cacheLoadParameter);
// Instruct the backend to load up.
this.datastore.load();
// Instruct the backend to load up.
this.datastore.load();
};

@@ -29,13 +29,13 @@

// Set up data import and export functions
Cache.prototype.setCacheData = function(queueObject,data,callback) {
this.datastore.setItem(queueObject,data,callback);
this.emit("setcache",queueObject,data);
Cache.prototype.setCacheData = function(queueObject, data, callback) {
this.datastore.setItem(queueObject, data, callback);
this.emit("setcache", queueObject, data);
};
Cache.prototype.getCacheData = function(queueObject,callback) {
this.datastore.getItem(queueObject,callback);
Cache.prototype.getCacheData = function(queueObject, callback) {
this.datastore.getItem(queueObject, callback);
};
Cache.prototype.saveCache = function() {
this.datastore.saveCache();
this.datastore.saveCache();
};
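
A usage sketch for this wrapper (not part of the diff): attach a filesystem-backed cache to a crawler instance. The require paths and the "mycache" directory are assumptions; the crawler.cache property itself appears later in this comparison as the crawler's internal cachestore.

    var Crawler = require("simplecrawler"),
        Cache = require("simplecrawler/lib/cache.js");

    var crawler = new Crawler("example.com");

    // No backend argument, so the FilesystemBackend shown above is used.
    crawler.cache = new Cache("mycache");

    // The wrapper then proxies to the backend:
    // crawler.cache.setCacheData(queueItem, responseBuffer);
    // crawler.cache.getCacheData(queueItem, function(cacheObject) { ... });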

@@ -42,0 +42,0 @@

@@ -1,244 +0,268 @@

// Cookie Jar Functionality
var EventEmitter = require("events").EventEmitter,
util = require("util");
/*
* Simplecrawler - Cookie Jar Functionality
* https://github.com/cgiffard/node-simplecrawler
*
* Copyright (c) 2011-2015, Christopher Giffard
*
*/
var EventEmitter = require("events").EventEmitter,
util = require("util");
/*
Public: Constructor for the cookie jar.
Public: Constructor for the cookie jar.
Examples
Examples
var cookieJar = new CookieJar();
var cookieJar = new CookieJar();
Returns the cookie jar object which has now been constructed.
Returns the cookie jar object which has now been constructed.
*/
function CookieJar() {
var cookies = [];
this.__defineGetter__("cookies",function() {
return cookies;
});
var cookies = [];
this.__defineGetter__("cookies", function() {
return cookies;
});
// Run the EventEmitter constructor
EventEmitter.call(this);
// Run the EventEmitter constructor
EventEmitter.call(this);
}
util.inherits(CookieJar,EventEmitter);
util.inherits(CookieJar, EventEmitter);
/*
Public: Adds a new cookie to the jar, either by creating a new Cookie() object
from specific details such as name, value, etc., accepting a string from a
Set-Cookie header, or by passing in an existing Cookie() object.
Public: Adds a new cookie to the jar, either by creating a new Cookie() object
from specific details such as name, value, etc., accepting a string from a
Set-Cookie header, or by passing in an existing Cookie() object.
name - The name of the cookie to add. Alternately, set-cookie
header as string, or an existing cookie object.
value - The value of the cookie.
expiry - Expiry timestamp in milliseconds.
path - Limit cookie to path (defaults to "/")
domain - Limit cookie to domain
httponly - Boolean value specifying httponly
cb - Optional callback.
name - The name of the cookie to add. Alternately, set-cookie
header as string, or an existing cookie object.
value - The value of the cookie.
expiry - Expiry timestamp in milliseconds.
path - Limit cookie to path (defaults to "/")
domain - Limit cookie to domain
httponly - Boolean value specifying httponly
cb - Optional callback.
Emits
Emits
addcookie - Emitted with new cookie object as an argument.
addcookie - Emitted with new cookie object as an argument.
Examples
Examples
cookieJar.add("mycookie","myValue",Date.now(),"/","test.com",false);
cookieJar.add("mycookie","myValue",Date.now(),"/","test.com",false);
Returns the cookie jar object for chaining.
Returns the cookie jar object for chaining.
*/
CookieJar.prototype.add = function(name,value,expiry,path,domain,httponly,cb) {
CookieJar.prototype.add = function(name, value, expiry, path, domain, httponly, cb) {
var existingIndex = -1, newCookie;
var existingIndex = -1, newCookie;
if (arguments.length > 1) {
newCookie = new Cookie(name,value,expiry,path,domain,httponly);
} else if (name instanceof Cookie) {
newCookie = name;
} else {
newCookie = Cookie.fromString(name);
}
if (arguments.length > 1) {
newCookie = new Cookie(name, value, expiry, path, domain, httponly);
} else if (name instanceof Cookie) {
newCookie = name;
} else {
newCookie = Cookie.fromString(name);
}
// Are we updating an existing cookie or adding a new one?
this.cookies.forEach(function(cookie,index) {
if (cookie.name === newCookie.name &&
cookie.matchDomain(newCookie.domain)) {
// Are we updating an existing cookie or adding a new one?
this.cookies.forEach(function(cookie, index) {
if (cookie.name === newCookie.name &&
cookie.matchDomain(newCookie.domain)) {
existingIndex = index;
}
});
existingIndex = index;
}
});
if (existingIndex < 0) {
this.cookies.push(newCookie);
} else {
this.cookies[existingIndex] = newCookie;
}
if (existingIndex < 0) {
this.cookies.push(newCookie);
} else {
this.cookies[existingIndex] = newCookie;
}
this.emit("addcookie",newCookie);
this.emit("addcookie", newCookie);
if (cb && cb instanceof Function)
cb(null,newCookie);
if (cb && cb instanceof Function) {
cb(null, newCookie);
}
return this;
return this;
};
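
The add() signature above accepts three input shapes; a short sketch (not part of the diff, values made up):

    var jar = new CookieJar();

    // 1. Explicit fields
    jar.add("sessionid", "abc123", Date.now() + 3600000, "/", "example.com", true);

    // 2. A raw set-cookie header string (routed through Cookie.fromString)
    jar.add("token=xyz; Path=/; Domain=example.com; Httponly");

    // 3. An existing Cookie object
    jar.add(new Cookie("theme", "dark"));

    // A cookie with the same name and a matching domain replaces the earlier entry.
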
/*
Public: Removes cookies from the cookie jar. If no domain and name are
specified, all cookies in the jar are removed.
Public: Removes cookies from the cookie jar. If no domain and name are
specified, all cookies in the jar are removed.
name - The name of the cookie(s) to remove
domain - The domain from which to remove cookies.
cb - Optional callback.
name - The name of the cookie(s) to remove
domain - The domain from which to remove cookies.
cb - Optional callback.
Emits
Emits
removecookie - Emitted with array of removed cookies.
removecookie - Emitted with array of removed cookies.
Examples
Examples
cookieJar.remove(null,"nytimes.com");
cookieJar.remove(null,"nytimes.com");
Returns an array of removed cookies.
Returns an array of removed cookies.
*/
CookieJar.prototype.remove = function(name,domain,cb) {
var cookiesRemoved = [], jar = this;
CookieJar.prototype.remove = function(name, domain, cb) {
var cookiesRemoved = [],
jar = this;
this.cookies.forEach(function(cookie,index) {
jar.cookies.forEach(function(cookie, index) {
// If the names don't match, we're not removing this cookie
if (!!name && cookie.name !== name)
return false;
// If the names don't match, we're not removing this cookie
if (!!name && cookie.name !== name) {
return false;
}
// If the domains don't match, we're not removing this cookie
if (!!domain && !cookie.matchDomain(domain))
return false;
// If the domains don't match, we're not removing this cookie
if (!!domain && !cookie.matchDomain(domain)) {
return false;
}
// Matched. Remove!
cookiesRemoved.push(jar.cookies.splice(index,1));
});
// Matched. Remove!
cookiesRemoved.push(jar.cookies.splice(index, 1));
});
jar.emit("removecookie",cookiesRemoved);
jar.emit("removecookie", cookiesRemoved);
if (cb && cb instanceof Function)
cb(null,cookiesRemoved);
if (cb && cb instanceof Function) {
cb(null, cookiesRemoved);
}
return cookiesRemoved;
return cookiesRemoved;
};
/*
Public: Gets an array of cookies based on name and domain.
Public: Gets an array of cookies based on name and domain.
name - The name of the cookie(s) to retrieve
domain - The domain from which to retrieve cookies.
cb - Optional callback.
name - The name of the cookie(s) to retrieve
domain - The domain from which to retrieve cookies.
cb - Optional callback.
Examples
Examples
cookieJar.get(null,"nytimes.com");
cookieJar.get(null,"nytimes.com");
Returns an array of cookies.
Returns an array of cookies.
*/
CookieJar.prototype.get = function(name,domain,cb) {
CookieJar.prototype.get = function(name, domain, cb) {
var cookies =
this.cookies.filter(function(cookie,index) {
var cookies = this.cookies.filter(function(cookie) {
// If the names don't match, we're not returning this cookie
if (!!name && cookie.name !== name)
return false;
// If the names don't match, we're not returning this cookie
if (!!name && cookie.name !== name) {
return false;
}
// If the domains don't match, we're not returning this cookie
if (!!domain && !cookie.matchDomain(domain))
return false;
// If the domains don't match, we're not returning this cookie
if (!!domain && !cookie.matchDomain(domain)) {
return false;
}
return true;
});
return true;
});
if (cb && cb instanceof Function)
cb(null,cookies);
if (cb && cb instanceof Function) {
cb(null, cookies);
}
return cookies;
return cookies;
};
/*
Public: Generates an array of headers based on the value of the cookie jar.
Public: Generates an array of headers based on the value of the cookie jar.
domain - The domain from which to generate cookies.
path - Filter headers to cookies applicable to this path.
cb - Optional callback.
domain - The domain from which to generate cookies.
path - Filter headers to cookies applicable to this path.
cb - Optional callback.
Examples
Examples
cookieJar.getAsHeader("nytimes.com","/myaccount");
cookieJar.getAsHeader("nytimes.com","/myaccount");
Returns an array of cookie headers.
Returns an array of cookie headers.
*/
CookieJar.prototype.getAsHeader = function(domain,path,cb) {
CookieJar.prototype.getAsHeader = function(domain, path, cb) {
var headers =
this.cookies
.filter(function(cookie) {
if (cookie.isExpired()) return false;
if (!domain && !path) return true;
if (domain) return cookie.matchDomain(domain);
if (path) return cookie.matchPath(path);
})
.map(function(cookie) {
return cookie.toString();
});
var headers =
this.cookies.filter(function(cookie) {
if (cookie.isExpired()) {
return false;
}
if (!domain && !path) {
return true;
}
if (domain) {
return cookie.matchDomain(domain);
}
if (path) {
return cookie.matchPath(path);
}
})
.map(function(cookie) {
return cookie.toString();
});
if (cb && cb instanceof Function)
cb(null,headers);
if (cb && cb instanceof Function) {
cb(null, headers);
}
return headers;
return headers;
};
/*
Public: Adds cookies to the cookie jar based on an array of 'set-cookie'
headers provided by a webserver. Duplicate cookies are overwritten.
Public: Adds cookies to the cookie jar based on an array of 'set-cookie'
headers provided by a webserver. Duplicate cookies are overwritten.
headers - An array of 'set-cookie' headers
cb - Optional callback.
headers - An array of 'set-cookie' headers
cb - Optional callback.
Examples
Examples
cookieJar.addFromHeaders(res.headers["set-cookie"]);
cookieJar.addFromHeaders(res.headers["set-cookie"]);
Returns the cookie jar for chaining.
Returns the cookie jar for chaining.
*/
CookieJar.prototype.addFromHeaders = function(headers,cb) {
var jar = this;
CookieJar.prototype.addFromHeaders = function(headers, cb) {
var jar = this;
if (!(headers instanceof Array))
headers = [headers];
if (!(headers instanceof Array)) {
headers = [headers];
}
headers.forEach(function(header) {
jar.add(header);
});
headers.forEach(function(header) {
jar.add(header);
});
if (cb && cb instanceof Function)
cb(jar);
if (cb && cb instanceof Function) {
cb(jar);
}
return jar;
return jar;
};
/*
Public: Outputs a linefeed-separated list of set-cookie headers representing
the entire contents of the cookie jar.
Public: Outputs a linefeed-separated list of set-cookie headers representing
the entire contents of the cookie jar.
Examples
Examples
cookieJar.toString();
cookieJar.toString();
Returns a list of headers in string form.
Returns a list of headers in string form.
*/
CookieJar.prototype.toString = function() {
return this.getAsHeader().join("\n");
return this.getAsHeader().join("\n");
};
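
Putting the jar methods above together, a typical round trip looks like this sketch (not part of the diff; header values are made up):

    var jar = new CookieJar();

    // Import set-cookie headers received from a server...
    jar.addFromHeaders([
        "sid=abc123; Path=/; Domain=example.com",
        "theme=dark; Path=/"
    ]);

    // ...and emit Cookie headers for a follow-up request to that domain.
    jar.getAsHeader("example.com", "/account"); // -> ["sid=abc123; Path=/; Domain=example.com; "]

    // Dump the whole jar as newline-separated strings.
    console.log(jar.toString());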

@@ -248,51 +272,54 @@

/*
Public: Constructor for the Cookie() object: create a new cookie.
Public: Constructor for the Cookie() object: create a new cookie.
name - The name of the cookie to add.
value - The value of the cookie.
expires - Expiry timestamp in milliseconds.
path - Limit cookie to path (defaults to "/")
domain - Limit cookie to domain
httponly - Boolean value specifying httponly
name - The name of the cookie to add.
value - The value of the cookie.
expires - Expiry timestamp in milliseconds.
path - Limit cookie to path (defaults to "/")
domain - Limit cookie to domain
httponly - Boolean value specifying httponly
Examples
Examples
var myCookie = new Cookie("mycookie","myValue",Date.now(),"/","test.com",false);
var myCookie = new Cookie("mycookie","myValue",Date.now(),"/","test.com",false);
Returns the newly created Cookie object.
Returns the newly created Cookie object.
*/
function Cookie(name,value,expires,path,domain,httponly) {
function Cookie(name, value, expires, path, domain, httponly) {
if (!name) throw new Error("A name is required to create a cookie.");
if (!name) {
throw new Error("A name is required to create a cookie.");
}
// Parse date to timestamp - consider it never expiring if timestamp is not
// passed to the function
if (expires) {
// Parse date to timestamp - consider it never expiring if timestamp is not
// passed to the function
if (expires) {
if (typeof expires !== "number")
expires = (new Date(expires)).getTime();
if (typeof expires !== "number") {
expires = (new Date(expires)).getTime();
}
} else {
expires = -1;
}
} else {
expires = -1;
}
this.name = name;
this.value = value || "";
this.expires = expires;
this.path = path || "/";
this.domain = domain || "*";
this.httponly = !!httponly;
this.name = name;
this.value = value || "";
this.expires = expires;
this.path = path || "/";
this.domain = domain || "*";
this.httponly = !!httponly;
}
/*
Public, Static: Returns a new Cookie() object based on a header string.
Public, Static: Returns a new Cookie() object based on a header string.
string - A set-cookie header string
string - A set-cookie header string
Examples
Examples
var myCookie = Cookie.fromString(response.headers["set-cookie"][0]);
var myCookie = Cookie.fromString(response.headers["set-cookie"][0]);
Returns the newly created Cookie object.
Returns the newly created Cookie object.

@@ -302,126 +329,137 @@ */

if (!string || typeof string !== "string")
throw new Error("String must be supplied to generate a cookie.");
if (!string || typeof string !== "string") {
throw new Error("String must be supplied to generate a cookie.");
}
function parseKeyVal(input) {
var key = input.split(/\=/).shift(),
val = input.split(/\=/).slice(1).join("=");
function parseKeyVal(input) {
var key = input.split(/\=/).shift(),
val = input.split(/\=/).slice(1).join("=");
return [key,val];
}
return [key, val];
}
string = string.replace(/^\s*set\-cookie\s*\:\s*/i,"");
string = string.replace(/^\s*set\-cookie\s*\:\s*/i, "");
var parts = string.split(/\s*\;\s*/i),
name = parseKeyVal(parts.shift()),
keyValParts = {};
var parts = string.split(/\s*\;\s*/i),
name = parseKeyVal(parts.shift()),
keyValParts = {};
keyValParts.name = name[0];
keyValParts.value = name[1];
keyValParts.name = name[0];
keyValParts.value = name[1];
parts
.filter(function(input) {
return !!input.replace(/\s+/ig,"").length;
})
.map(parseKeyVal)
.forEach(function(keyval) {
var key = String(keyval[0]).toLowerCase().replace(/[^a-z0-9]/ig,"");
keyValParts[key] = keyval[1];
});
parts
.filter(function(input) {
return !!input.replace(/\s+/ig, "").length;
})
.map(parseKeyVal)
.forEach(function(keyval) {
var key = String(keyval[0]).toLowerCase().replace(/[^a-z0-9]/ig, "");
keyValParts[key] = keyval[1];
});
return new Cookie(
keyValParts.name,
keyValParts.value,
keyValParts.expires || keyValParts.expiry,
keyValParts.path,
keyValParts.domain,
keyValParts.hasOwnProperty("httponly")
);
return new Cookie(
keyValParts.name,
keyValParts.value,
keyValParts.expires || keyValParts.expiry,
keyValParts.path,
keyValParts.domain,
keyValParts.hasOwnProperty("httponly")
);
};
/*
Public: Outputs the cookie as a string, in the form of a set-cookie header.
Public: Outputs the cookie as a string, in the form of a set-cookie header.
includeHeader - Boolean value specifying whether to include the
'Set-Cookie: ' header name at the beginning of the
string.
includeHeader - Boolean value specifying whether to include the
'Set-Cookie: ' header name at the beginning of the
string.
Examples
Examples
var header = myCookie.toString(true);
var header = myCookie.toString(true);
Returns the header string.
Returns the header string.
*/
Cookie.prototype.toString = function(includeHeader) {
var string = "";
var string = "";
if (includeHeader) string = "Set-Cookie: ";
if (includeHeader) {
string = "Set-Cookie: ";
}
string += this.name + "=" + this.value + "; ";
string += this.name + "=" + this.value + "; ";
if (this.expires > 0)
string += "Expires=" + (new Date(this.expires)).toGMTString() + "; ";
if (this.expires > 0) {
string += "Expires=" + (new Date(this.expires)).toGMTString() + "; ";
}
if (!!this.path)
string += "Path=" + this.path + "; ";
if (this.path) {
string += "Path=" + this.path + "; ";
}
if (!!this.domain)
string += "Domain=" + this.domain + "; ";
if (this.domain) {
string += "Domain=" + this.domain + "; ";
}
if (!!this.httponly)
string += "Httponly; ";
if (this.httponly) {
string += "Httponly; ";
}
return string;
return string;
};
/*
Public: Determines whether a cookie has expired or not.
Public: Determines whether a cookie has expired or not.
Examples
Examples
if (myCookie.isExpired()) { ... }
if (myCookie.isExpired()) { ... }
Returns a boolean value specifying whether the cookie has expired (true) or
whether it is still valid (false.)
Returns a boolean value specifying whether the cookie has expired (true) or
whether it is still valid (false.)
*/
Cookie.prototype.isExpired = function() {
if (this.expires < 0) return false;
return (this.expires < Date.now());
if (this.expires < 0) {
return false;
}
return this.expires < Date.now();
};
/*
Public: Determines whether a cookie matches a given domain.
Public: Determines whether a cookie matches a given domain.
Examples
Examples
if (myCookie.matchDomain("example.com")) { ... }
if (myCookie.matchDomain("example.com")) { ... }
Returns a boolean value specifying whether the cookie matches (true) or
doesn't match (false.)
Returns a boolean value specifying whether the cookie matches (true) or
doesn't match (false.)
*/
Cookie.prototype.matchDomain = function(domain) {
var reverseDomain = this.domain.split("").reverse().join(""),
reverseDomainComp = domain.split("").reverse().join("");
var reverseDomain = this.domain.split("").reverse().join(""),
reverseDomainComp = domain.split("").reverse().join("");
return reverseDomain.indexOf(reverseDomainComp) === 0;
return reverseDomain.indexOf(reverseDomainComp) === 0;
};
/*
Public: Determines whether a cookie matches a given path.
Public: Determines whether a cookie matches a given path.
Examples
Examples
if (myCookie.matchPath("/test/account")) { ... }
if (myCookie.matchPath("/test/account")) { ... }
Returns a boolean value specifying whether the cookie matches (true) or
doesn't match (false.)
Returns a boolean value specifying whether the cookie matches (true) or
doesn't match (false.)
*/
Cookie.prototype.matchPath = function(path) {
if (!this.path) return true;
if (!this.path) {
return true;
}
return path.indexOf(this.path) === 0;
return path.indexOf(this.path) === 0;
};
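
A small sketch exercising the Cookie helpers defined above (not part of the diff; the header value is made up):

    var cookie = Cookie.fromString(
        "Set-Cookie: sid=abc123; Expires=Wed, 01 Jan 2020 00:00:00 GMT; Path=/account; Domain=example.com; Httponly");

    cookie.isExpired();                    // true once the Expires timestamp has passed
    cookie.matchDomain("example.com");     // true - compared via the reversed-domain prefix check
    cookie.matchPath("/account/settings"); // true - the cookie path is a prefix of the request path
    cookie.toString(true);                 // "Set-Cookie: sid=abc123; Expires=...; Path=/account; Domain=example.com; Httponly; "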

@@ -428,0 +466,0 @@

lib/crawler.js

@@ -1,18 +1,20 @@

// Simplecrawler
// Christopher Giffard, 2011 - 2013+
//
// http://www.github.com/cgiffard/node-simplecrawler
/*
* Simplecrawler
* https://github.com/cgiffard/node-simplecrawler
*
* Copyright (c) 2011-2015, Christopher Giffard
*
*/
// Queue Dependency
var FetchQueue = require("./queue.js"),
Cache = require("./cache.js"),
CookieJar = require("./cookies.js"),
MetaInfo = require("../package.json");
var FetchQueue = require("./queue.js"),
CookieJar = require("./cookies.js"),
MetaInfo = require("../package.json");
var http = require("http"),
https = require("https"),
EventEmitter = require('events').EventEmitter,
URI = require("URIjs"),
zlib = require("zlib"),
util = require("util");
var http = require("http"),
https = require("https"),
EventEmitter = require("events").EventEmitter,
uri = require("urijs"),
zlib = require("zlib"),
util = require("util");

@@ -22,887 +24,935 @@ var QUEUE_ITEM_INITIAL_DEPTH = 1;

/*
Public: Constructor for the crawler.
Public: Constructor for the crawler.
host - Initial hostname/domain to begin crawling from. By
default, the crawl will be locked to this hostname.
initialPath - Initial path to begin crawling from.
initialPort - Port to begin crawling from.
interval - Request interval for the crawler. Defaults to 250ms.
host - Initial hostname/domain to begin crawling from. By
default, the crawl will be locked to this hostname.
initialPath - Initial path to begin crawling from.
initialPort - Port to begin crawling from.
interval - Request interval for the crawler. Defaults to 250ms.
Examples
Examples
var crawler = new Crawler("example.com","/",80,500);
var crawler = new Crawler("example.com","/",80,500);
var crawler = new Crawler("example.com");
var crawler = new Crawler("example.com");
Returns the crawler object which has now been constructed.
Returns the crawler object which has now been constructed.
*/
var Crawler = function(host,initialPath,initialPort,interval) {
var crawler = this;
var Crawler = function(host, initialPath, initialPort, interval) {
var crawler = this;
// Data integrity checks
if (initialPort && isNaN(initialPort))
throw new Error("Port must be a number!");
// Data integrity checks
if (initialPort && isNaN(initialPort)) {
throw new Error("Port must be a number!");
}
// SETTINGS TO STUFF WITH
// (not here! Do it when you create a `new Crawler()`)
// SETTINGS TO STUFF WITH
// (not here! Do it when you create a `new Crawler()`)
// Domain to crawl
crawler.host = host || "";
// Domain to crawl
crawler.host = host || "";
// Gotta start crawling *somewhere*
crawler.initialPath = initialPath || "/";
crawler.initialPort = initialPort || 80;
crawler.initialProtocol = "http";
// Gotta start crawling *somewhere*
crawler.initialPath = initialPath || "/";
crawler.initialPort = initialPort || 80;
crawler.initialProtocol = "http";
// Internal 'tick' interval for spawning new requests
// (as long as concurrency is under cap)
// One request will be spooled per tick, up to the concurrency threshold.
crawler.interval = interval || 250;
// Internal 'tick' interval for spawning new requests
// (as long as concurrency is under cap)
// One request will be spooled per tick, up to the concurrency threshold.
crawler.interval = interval || 250;
// Maximum request concurrency. Be sensible. Five ties in with node's
// default maxSockets value.
crawler.maxConcurrency = 5;
// Maximum request concurrency. Be sensible. Five ties in with node's
// default maxSockets value.
crawler.maxConcurrency = 5;
// Maximum time we'll wait for headers
crawler.timeout = 5 * 60 * 1000;
// Maximum time we'll wait for headers
crawler.timeout = 5 * 60 * 1000;
// Maximum time we'll wait for async listeners.
crawler.listenerTTL = 10 * 1000;
// Maximum time we'll wait for async listeners.
crawler.listenerTTL = 10 * 1000;
// User Agent
crawler.userAgent =
"Node/" + MetaInfo.name + " " + MetaInfo.version +
" (" + MetaInfo.repository.url + ")";
// User Agent
crawler.userAgent =
"Node/" + MetaInfo.name + " " + MetaInfo.version +
" (" + MetaInfo.repository.url + ")";
// Queue for requests - FetchQueue gives us stats and other sugar
// (but it's basically just an array)
crawler.queue = new FetchQueue();
// Queue for requests - FetchQueue gives us stats and other sugar
// (but it's basically just an array)
crawler.queue = new FetchQueue();
// Do we filter by domain?
// Unless you want to be crawling the entire internet, I would
// recommend leaving this on!
crawler.filterByDomain = true;
// Do we filter by domain?
// Unless you want to be crawling the entire internet, I would
// recommend leaving this on!
crawler.filterByDomain = true;
// Do we scan subdomains?
crawler.scanSubdomains = false;
// Do we scan subdomains?
crawler.scanSubdomains = false;
// Treat WWW subdomain the same as the main domain (and don't count
// it as a separate subdomain)
crawler.ignoreWWWDomain = true;
// Treat WWW subdomain the same as the main domain (and don't count
// it as a separate subdomain)
crawler.ignoreWWWDomain = true;
// Or go even further and strip WWW subdomain from domains altogether!
crawler.stripWWWDomain = false;
// Or go even further and strip WWW subdomain from domains altogether!
crawler.stripWWWDomain = false;
// Internal cachestore
crawler.cache = null;
// Internal cachestore
crawler.cache = null;
// Use an HTTP Proxy?
crawler.useProxy = false;
crawler.proxyHostname = "127.0.0.1";
crawler.proxyPort = 8123;
crawler.proxyUser = null;
crawler.proxyPass = null;
// Use an HTTP Proxy?
crawler.useProxy = false;
crawler.proxyHostname = "127.0.0.1";
crawler.proxyPort = 8123;
crawler.proxyUser = null;
crawler.proxyPass = null;
// Support for HTTP basic auth
crawler.needsAuth = false;
crawler.authUser = "";
crawler.authPass = "";
// Support for HTTP basic auth
crawler.needsAuth = false;
crawler.authUser = "";
crawler.authPass = "";
// Support for retaining cookies for parse duration
crawler.acceptCookies = true;
crawler.cookies = new CookieJar();
// Support for retaining cookies for parse duration
crawler.acceptCookies = true;
crawler.cookies = new CookieJar();
// Support for custom headers...
crawler.customHeaders = {};
// Support for custom headers...
crawler.customHeaders = {};
// Domain Whitelist
// We allow domains to be whitelisted, so cross-domain requests can be made.
crawler.domainWhitelist = [];
// Domain Whitelist
// We allow domains to be whitelisted, so cross-domain requests can be made.
crawler.domainWhitelist = [];
// Supported Protocols
crawler.allowedProtocols = [
/^http(s)?$/i, // HTTP & HTTPS
/^(rss|atom|feed)(\+xml)?$/i // RSS / XML
];
// Supported Protocols
crawler.allowedProtocols = [
/^http(s)?$/i, // HTTP & HTTPS
/^(rss|atom|feed)(\+xml)?$/i // RSS / XML
];
// Max file size to download/store
crawler.maxResourceSize = 1024 * 1024 * 16; // 16mb
// Max file size to download/store
crawler.maxResourceSize = 1024 * 1024 * 16; // 16mb
// Supported MIME-types
// Matching MIME-types will be scanned for links
crawler.supportedMimeTypes = [
/^text\//i,
/^application\/(rss|html|xhtml)?[\+\/\-]?xml/i,
/^application\/javascript/i,
/^xml/i
];
// Supported MIME-types
// Matching MIME-types will be scanned for links
crawler.supportedMimeTypes = [
/^text\//i,
/^application\/(rss|html|xhtml)?[\+\/\-]?xml/i,
/^application\/javascript/i,
/^xml/i
];
// Download linked, but unsupported files (binary - images, documents, etc)
crawler.downloadUnsupported = true;
// Download linked, but unsupported files (binary - images, documents, etc)
crawler.downloadUnsupported = true;
// URL Encoding setting...
crawler.urlEncoding = "unicode";
// URL Encoding setting...
crawler.urlEncoding = "unicode";
// Strip Querystring Parameters from URL
crawler.stripQuerystring = false;
// Strip Querystring Parameters from URL
crawler.stripQuerystring = false;
// Regular expressions for finding URL items in HTML and text
crawler.discoverRegex = [
/\s?(?:href|src)\s?=\s?(["']).*?\1/ig,
/\s?(?:href|src)\s?=\s?[^"'][^\s>]+/ig,
/\s?url\((["']).*?\1\)/ig,
/\s?url\([^"'].*?\)/ig,
// Regular expressions for finding URL items in HTML and text
crawler.discoverRegex = [
/\s?(?:href|src)\s?=\s?(["']).*?\1/ig,
/\s?(?:href|src)\s?=\s?[^"'][^\s>]+/ig,
/\s?url\((["']).*?\1\)/ig,
/\s?url\([^"'].*?\)/ig,
// This could easily duplicate matches above, e.g. in the case of
// href="http://example.com"
/http(s)?\:\/\/[^?\s><\'\"]+/ig,
// This could easily duplicate matches above, e.g. in the case of
// href="http://example.com"
/http(s)?\:\/\/[^?\s><\'\"]+/ig,
// This might be a bit of a gamble... but get hard-coded
// strings out of javacript: URLs. They're often popup-image
// or preview windows, which would otherwise be unavailable to us.
// Worst case scenario is we make some junky requests.
/^javascript\:[a-z0-9\$\_\.]+\(['"][^'"\s]+/ig
];
// This might be a bit of a gamble... but get hard-coded
// strings out of javacript: URLs. They're often popup-image
// or preview windows, which would otherwise be unavailable to us.
// Worst case scenario is we make some junky requests.
/^javascript\:[a-z0-9\$\_\.]+\(['"][^'"\s]+/ig
];
// Whether to parse inside HTML comments
crawler.parseHTMLComments = true;
// Whether to parse inside HTML comments
crawler.parseHTMLComments = true;
// Whether to parse inside script tags
crawler.parseScriptTags = true;
// Whether to parse inside script tags
crawler.parseScriptTags = true;
// Max depth parameter
crawler.maxDepth = 0;
// Max depth parameter
crawler.maxDepth = 0;
// Whether to allow 'resources' greater than the max depth to be downloaded
crawler.fetchWhitelistedMimeTypesBelowMaxDepth = false;
// Whether to allow 'resources' greater than the max depth to be downloaded
crawler.fetchWhitelistedMimeTypesBelowMaxDepth = false;
// Ignore invalid ssl certificates
crawler.ignoreInvalidSSL = false;
// Ignore invalid SSL certificates
crawler.ignoreInvalidSSL = false;
// STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
var hiddenProps = {
"_openRequests": 0,
"_fetchConditions": [],
"_openListeners": 0
};
// STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
var hiddenProps = {
_openRequests: 0,
_fetchConditions: [],
_openListeners: 0
};
// Run the EventEmitter constructor
EventEmitter.call(crawler);
// Run the EventEmitter constructor
EventEmitter.call(crawler);
// Apply all the hidden props
Object.keys(hiddenProps).forEach(function(key) {
Object.defineProperty(crawler, key, {
"writable": true,
"enumerable": false,
"value": hiddenProps[key]
});
});
// Apply all the hidden props
Object.keys(hiddenProps).forEach(function(key) {
Object.defineProperty(crawler, key, {
writable: true,
enumerable: false,
value: hiddenProps[key]
});
});
};
util.inherits(Crawler,EventEmitter);
util.inherits(Crawler, EventEmitter);
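
Because every option above is a plain property on the instance, configuration is simple assignment after construction. A sketch (not part of the diff; the host and values are illustrative):

    var Crawler = require("simplecrawler");

    var crawler = new Crawler("example.com", "/", 80, 250);

    // Adjust the defaults initialised in the constructor above.
    crawler.maxConcurrency = 2;
    crawler.maxDepth = 3;
    crawler.stripQuerystring = true;
    crawler.domainWhitelist = ["cdn.example.com"];
    crawler.customHeaders = { "Accept-Language": "en" };
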
/*
Public: Starts or resumes the crawl. If the queue is empty, it adds a new
queue item from which to begin crawling based on the initial configuration
of the crawler itself. The crawler waits for process.nextTick to begin, so
handlers and other properties can be altered or addressed before the crawl
commences.
Public: Starts or resumes the crawl. If the queue is empty, it adds a new
queue item from which to begin crawling based on the initial configuration
of the crawler itself. The crawler waits for process.nextTick to begin, so
handlers and other properties can be altered or addressed before the crawl
commences.
Examples
Examples
crawler.start();
crawler.start();
Returns the crawler object, to enable chaining.
Returns the crawler object, to enable chaining.
*/
Crawler.prototype.start = function() {
var crawler = this;
var crawler = this;
// only if we haven't already got stuff in our queue...
crawler.queue.getLength(function(err, length) {
if (err) throw err;
// only if we haven't already got stuff in our queue...
crawler.queue.getLength(function(err, length) {
if (err) {
throw err;
}
if (!length) {
if (!length) {
// Initialise our queue by pushing the initial request data into it...
crawler.queue.add(
crawler.initialProtocol,
crawler.host,
crawler.initialPort,
crawler.initialPath,
QUEUE_ITEM_INITIAL_DEPTH,
function(error) {
if (error) throw error;
});
}
// Initialise our queue by pushing the initial request data into it...
crawler.queue.add(
crawler.initialProtocol,
crawler.host,
crawler.initialPort,
crawler.initialPath,
QUEUE_ITEM_INITIAL_DEPTH,
function(error) {
if (error) {
throw error;
}
});
}
crawler.crawlIntervalID =
setInterval(
function() {
crawler.crawl.call(crawler);
},
crawler.interval);
crawler.crawlIntervalID =
setInterval(
function() {
crawler.crawl(crawler);
},
crawler.interval);
crawler.emit("crawlstart");
crawler.running = true;
crawler.emit("crawlstart");
crawler.running = true;
// Now kick off the initial crawl
process.nextTick(function() {
crawler.crawl();
});
});
// Now kick off the initial crawl
process.nextTick(function() {
crawler.crawl();
});
});
return crawler;
return crawler;
};
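
A minimal sketch of starting a crawl (not part of the diff). The crawlstart event is emitted in start() above; the fetchcomplete listener and its signature reflect simplecrawler's documented event API and are an assumption here, since that emission isn't shown in this hunk.

    var crawler = new Crawler("example.com");

    crawler.on("crawlstart", function() {
        console.log("crawl started");
    });

    // Assumed signature: queue item, response body buffer, http.IncomingMessage.
    crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
        console.log("fetched %s (%d bytes)", queueItem.url, responseBuffer.length);
    });

    // start() defers to process.nextTick, so listeners attached here are
    // registered before the first request is spooled.
    crawler.start();
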
/*
Public: Determines whether the protocol is supported, given a URL.
Public: Determines whether the protocol is supported, given a URL.
URL - URL with a protocol, for testing.
URL - URL with a protocol, for testing.
Examples
Examples
crawler.protocolSupported("http://google.com/") // true, by default
crawler.protocolSupported("wss://google.com/") // false, by default
crawler.protocolSupported("http://google.com/") // true, by default
crawler.protocolSupported("wss://google.com/") // false, by default
Returns a boolean, true if the protocol is supported - false if not.
Returns a boolean, true if the protocol is supported - false if not.
*/
Crawler.prototype.protocolSupported = function(URL) {
var protocol, crawler = this;
var protocol,
crawler = this;
try {
protocol = URI(URL).protocol();
try {
protocol = uri(URL).protocol();
// Unspecified protocol. Assume http
if (!protocol)
protocol = "http";
// Unspecified protocol. Assume http
if (!protocol) {
protocol = "http";
}
} catch(e) {
// If URIjs died, we definitely /do not/ support the protocol.
return false;
}
} catch (e) {
// If URIjs died, we definitely /do not/ support the protocol.
return false;
}
return crawler.allowedProtocols.reduce(function(prev,protocolCheck) {
return prev || !!protocolCheck.exec(protocol);
},false);
return crawler.allowedProtocols.reduce(function(prev, protocolCheck) {
return prev || !!protocolCheck.exec(protocol);
}, false);
};
/*
Public: Determines whether the mimetype is supported, given a mimetype
Public: Determines whether the mimetype is supported, given a mimetype
MIMEType - String containing MIME type to test
MIMEType - String containing MIME type to test
Examples
Examples
crawler.mimeTypeSupported("text/html") // true, by default
crawler.mimeTypeSupported("application/octet-stream") // false, by default
crawler.mimeTypeSupported("text/html") // true, by default
crawler.mimeTypeSupported("application/octet-stream") // false, by default
Returns a boolean, true if the MIME type is supported — false if not.
Returns a boolean, true if the MIME type is supported — false if not.
*/
Crawler.prototype.mimeTypeSupported = function(MIMEType) {
var crawler = this;
var crawler = this;
return (
crawler.supportedMimeTypes.reduce(function(prev,mimeCheck) {
return prev || !!mimeCheck.exec(MIMEType);
},false)
);
return crawler.supportedMimeTypes.reduce(function(prev, mimeCheck) {
return prev || !!mimeCheck.exec(MIMEType);
}, false);
};
/*
Public: Determines whether the queueItem can be fetched from its depth
Public: Determines whether the queueItem can be fetched from its depth
In fact, the queueItem needs to be fetched before calling this (because we
need its MIME type). This will just determine if we need to send an event
for this item & if we need to fetch linked resources.
In fact, the queueItem needs to be fetched before calling this (because we
need its MIME type). This will just determine if we need to send an event
for this item & if we need to fetch linked resources.
If the queue item is a CSS or JS file, it will always be fetched (we need
all images in CSS files, even if max depth is already reached). If it's an
HTML page, we will check if max depth is reached or not.
If the queue item is a CSS or JS file, it will always be fetched (we need
all images in CSS files, even if max depth is already reached). If it's an
HTML page, we will check if max depth is reached or not.
queueItem - Queue item object to check
queueItem - Queue item object to check
Returns a boolean, true if the queue item can be fetched - false if not.
Returns a boolean, true if the queue item can be fetched - false if not.
*/
Crawler.prototype.depthAllowed = function(queueItem) {
var crawler = this;
var crawler = this;
// Items matching this pattern will always be fetched, even if max depth
// is reached
var mimeTypesWhitelist = [
/^text\/(css|javascript|ecmascript)/i,
/^application\/javascript/i,
/^application\/x-font/i,
/^application\/font/i,
/^image\//i,
/^font\//i
];
// Items matching this pattern will always be fetched, even if max depth
// is reached
var mimeTypesWhitelist = [
/^text\/(css|javascript|ecmascript)/i,
/^application\/javascript/i,
/^application\/x-font/i,
/^application\/font/i,
/^image\//i,
/^font\//i
];
return (
crawler.maxDepth === 0 ||
queueItem.depth <= crawler.maxDepth ||
(
crawler.fetchWhitelistedMimeTypesBelowMaxDepth &&
mimeTypesWhitelist.reduce(function(prev,mimeCheck) {
return prev || !!mimeCheck.exec(queueItem.stateData.contentType);
}, false)
)
);
return crawler.maxDepth === 0 ||
queueItem.depth <= crawler.maxDepth ||
crawler.fetchWhitelistedMimeTypesBelowMaxDepth &&
mimeTypesWhitelist.reduce(function(prev, mimeCheck) {
return prev || !!mimeCheck.exec(queueItem.stateData.contentType);
}, false);
};
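
For example (a sketch, not part of the diff): with the settings below, an HTML page three links deep is skipped, while a CSS file at the same depth is still fetched because its MIME type matches the whitelist above.

    crawler.maxDepth = 2;
    crawler.fetchWhitelistedMimeTypesBelowMaxDepth = true;

    crawler.depthAllowed({ depth: 3, stateData: { contentType: "text/html" } }); // false
    crawler.depthAllowed({ depth: 3, stateData: { contentType: "text/css" } });  // true
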
/*
Public: Extracts protocol, host, port and resource (path) given a URL string.
Public: Extracts protocol, host, port and resource (path) given a URL string.
URL - String containing URL to process
URL - String containing URL to process
Examples
Examples
var URLInfo = crawler.processURL("http://www.google.com/fish");
var URLInfo = crawler.processURL("http://www.google.com/fish");
Returns an object containing keys and values for "protocol", "host", "port",
and "path".
Returns an object containing keys and values for "protocol", "host", "port",
and "path".
*/
Crawler.prototype.processURL = function(URL,context) {
var newURL, crawler = this;
Crawler.prototype.processURL = function(URL, context) {
var newURL,
crawler = this;
if (!context || typeof(context) !== "object")
context = {
url: (
crawler.initialProtocol + "://" +
crawler.host + ":" +
crawler.initialPort + "/"
),
depth: QUEUE_ITEM_INITIAL_DEPTH
};
if (!context || typeof context !== "object") {
context = {
url: crawler.initialProtocol + "://" +
crawler.host + ":" +
crawler.initialPort + "/",
depth: QUEUE_ITEM_INITIAL_DEPTH
};
}
// If the URL didn't contain anything, don't fetch it.
if (!URL.replace(/\s+/ig,"").length) return false;
// If the URL didn't contain anything, don't fetch it.
if (!(URL && URL.replace(/\s+/ig, "").length)) {
return false;
}
// Check if querystring should be ignored
if (crawler.stripQuerystring === true)
URL = crawler.removeQuerystring(URL);
// Check if querystring should be ignored
if (crawler.stripQuerystring === true) {
URL = crawler.removeQuerystring(URL);
}
try {
newURL =
URI(URL)
.absoluteTo(context.url)
.normalize();
if (crawler.stripWWWDomain && URL.match(/https?\:\/\/(www\.).*/i)) {
URL = URL.replace("www.", "");
}
if (crawler.urlEncoding === "iso8859") {
newURL = newURL.iso8859();
}
try {
newURL =
uri(URL)
.absoluteTo(context.url)
.normalize();
} catch(e) {
// Couldn't process the URL, since URIjs choked on it.
return false;
}
if (crawler.urlEncoding === "iso8859") {
newURL = newURL.iso8859();
}
// simplecrawler uses slightly different terminology to URIjs. Sorry!
return {
"protocol": newURL.protocol() || "http",
"host": newURL.hostname(),
"port": newURL.port() || 80,
"path": newURL.resource(),
"uriPath": newURL.path(),
"depth": context.depth + 1
};
} catch (e) {
// Couldn't process the URL, since URIjs choked on it.
return false;
}
// simplecrawler uses slightly different terminology to URIjs. Sorry!
return {
protocol: newURL.protocol() || "http",
host: newURL.hostname(),
port: newURL.port() || 80,
path: newURL.resource(),
uriPath: newURL.path(),
depth: context.depth + 1
};
};
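
A quick sketch of processURL in use (not part of the diff); the context object mirrors the queue-item fields read above, and the values are illustrative.

    var context = { url: "http://example.com/blog/", depth: 2 };

    crawler.processURL("../images/logo.png", context);
    // -> { protocol: "http", host: "example.com", port: 80,
    //      path: "/images/logo.png", uriPath: "/images/logo.png", depth: 3 }

    crawler.processURL("   ", context); // -> false (blank URLs are never fetched)
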
/*
Public: Discovers linked resources in an HTML, XML or text document.
Private: Perform string replace operations on a URL string. Eg. removes
HTML attribute fluff around actual URL, replaces leading "//" with
absolute protocol etc.
resourceData - String containing document with linked resources.
queueItem - Queue item corresponding to document being searched.
queueItem - Queue item corresponding to where the resource was found
URL - String to be cleaned up
Examples
Examples
crawler.discoverResources("http://www.google.com")
crawler.discoverResources("<a href='...'>test</a>")
cleanURL({protocol: "http"}, "url('//example.com/about') ")
Returns an array of the (string) resource URLs found in the document. If none
were found, the array will be empty.
Returns a string.
*/
function cleanURL (queueItem, URL) {
return URL
.replace(/^(?:\s*href|\s*src)\s*=+\s*/i, "")
.replace(/^\s*/, "")
.replace(/^url\((.*)\)/i, "$1")
.replace(/^javascript\:\s*[a-z0-9]+\((.*)/i, "$1")
.replace(/^(['"])(.*)\1$/, "$2")
.replace(/^\((.*)\)$/, "$1")
.replace(/^\/\//, queueItem.protocol + "://")
.replace(/\&amp;/gi, "&")
.replace(/\&#38;/gi, "&")
.replace(/\&#x00026;/gi, "&")
.split("#")
.shift()
.trim();
}
/*
Public: Clean up a list of resources (normally provided by discoverResources).
Also expands URL's that are relative to the current page.
urlMatch - Array of string resources
queueItem - Queue item corresponding to where the resources were retrieved from
Examples
crawler.cleanExpandResources(["http://www.google.com", "/about", "mailto: example@example.com"])
Returns an array of URL strings.
*/
Crawler.prototype.discoverResources = function(resourceData,queueItem) {
// Convert to UTF-8
// TODO: account for text-encoding.
var resources = [],
resourceText = resourceData.toString("utf8"),
crawler = this;
Crawler.prototype.cleanExpandResources = function (urlMatch, queueItem) {
var crawler = this,
resources = [];
if (!queueItem)
queueItem = {};
if (!urlMatch) {
return [];
}
if (!queueItem.protocol)
queueItem.protocol = "http";
return urlMatch
.map(cleanURL.bind(this, queueItem))
.reduce(function(list, URL) {
if (!crawler.parseHTMLComments) {
resourceText = resourceText.replace(/<!--([\s\S]+?)-->/g, "");
}
// Ensure URL is whole and complete
try {
URL = uri(URL)
.absoluteTo(queueItem.url || "")
.normalize()
.toString();
} catch (e) {
// But if URI.js couldn't parse it - nobody can!
return list;
}
if (!crawler.parseScriptTags) {
resourceText = resourceText.replace(/<script(.*?)>([\s\S]+?)<\/script>/gi, "");
}
// If we hit an empty item, don't return it
if (!URL.length) {
return list;
}
function cleanURL(URL) {
return URL
.replace(/^(?:\s*href|\s*src)\s*=+\s*/i,"")
.replace(/^\s*/,"")
.replace(/^url\((.*)\)/i,"$1")
.replace(/^javascript\:\s*[a-z0-9]+\((.*)/i,"$1")
.replace(/^(['"])(.*)\1$/,"$2")
.replace(/^\((.*)\)$/,"$1")
.replace(/^\/\//, queueItem.protocol + "://")
.replace(/\&amp;/gi,"&")
.replace(/\&#38;/gi,"&")
.replace(/\&#x00026;/gi,"&")
.split("#")
.shift();
}
// If we don't support the protocol in question
if (!crawler.protocolSupported(URL)) {
return list;
}
// Clean links
function cleanAndQueue(urlMatch) {
if (!urlMatch) return [];
// Does the item already exist in the list?
if (resources.reduce(function(prev, current) {
return prev || current === URL;
}, false)) {
return list;
}
return urlMatch
.map(cleanURL)
.reduce(function(list,URL) {
var tmpURL;
return list.concat(URL);
}, []);
};
// Ensure URL is whole and complete
try {
tmpURL = URI(URL);
/*
Public: Discovers linked resources in an HTML, XML or text document.
if (queueItem.url) {
URL = tmpURL
.absoluteTo(queueItem.url)
.normalize()
.toString();
} else {
URL = tmpURL
.normalize()
.toString();
}
resourceData - String containing document with linked resources.
queueItem - Queue item corresponding to document being searched.
} catch(e) {
// But if URI.js couldn't parse it - nobody can!
return list;
}
Examples
// If we hit an empty item, don't add return it
if (!URL.length) return list;
crawler.discoverResources("http://www.google.com")
crawler.discoverResources("<a href='...'>test</a>")
// If we don't support the protocol in question
if (!crawler.protocolSupported(URL)) return list;
Returns an array of the (string) resource URLs found in the document. If none
were found, the array will be empty.
// Does the item already exist in the list?
if (resources.reduce(function(prev,current) {
return prev || current === URL;
},false))
return list;
*/
Crawler.prototype.discoverResources = function(resourceData, queueItem) {
// Convert to UTF-8
// TODO: account for text-encoding.
var resourceText = resourceData.toString("utf8"),
crawler = this;
return list.concat(URL);
},[]);
}
if (!queueItem) {
queueItem = {};
}
// Rough scan for URLs
return crawler.discoverRegex
.reduce(function(list,regex) {
return list.concat(
cleanAndQueue(
resourceText.match(regex)));
},[])
.reduce(function(list,check) {
if (list.indexOf(check) < 0)
return list.concat([check]);
if (!queueItem.protocol) {
queueItem.protocol = "http";
}
return list;
},[]);
if (!crawler.parseHTMLComments) {
resourceText = resourceText.replace(/<!--([\s\S]+?)-->/g, "");
}
if (!crawler.parseScriptTags) {
resourceText = resourceText.replace(/<script(.*?)>([\s\S]*?)<\/script>/gi, "");
}
// Rough scan for URLs
return crawler.discoverRegex
.reduce(function(list, regex) {
return list.concat(
crawler.cleanExpandResources(
resourceText.match(regex), queueItem));
}, [])
.reduce(function(list, check) {
if (list.indexOf(check) < 0) {
return list.concat([check]);
}
return list;
}, []);
};
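// A minimal usage sketch of the two methods above (not part of the library
// source): the hostname, markup and queue item below are invented for
// illustration, and the resulting array is only indicative.
var discoveryExample = new Crawler("example.com", "/", 80);

var found = discoveryExample.discoverResources(
    "<a href='/about'>About</a> <img src='//cdn.example.com/logo.png'/>",
    { url: "http://example.com/", protocol: "http" });

// `found` should now be a de-duplicated list of absolute URL strings, along
// the lines of: [ "http://example.com/about", "http://cdn.example.com/logo.png" ]

// cleanExpandResources() can also be fed a raw match list directly:
discoveryExample.cleanExpandResources(
    ["/about", "../styles/site.css"],
    { url: "http://example.com/blog/post", protocol: "http" });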
/*
	Public: Determines based on crawler state whether a domain is valid for
	crawling.

	host - String containing the hostname of the resource to be fetched.

	Examples

		crawler.domainValid("127.0.0.1");
		crawler.domainValid("google.com");
		crawler.domainValid("test.example.com");

	Returns true if the domain is valid for crawling, false if not.
*/
Crawler.prototype.domainValid = function(host) {
var crawler = this,
crawlerHost = crawler.host;
var crawler = this;
// If we're ignoring the WWW domain, remove the WWW for comparisons...
if (crawler.ignoreWWWDomain)
host = host.replace(/^www\./i,"");
// If we're ignoring the WWW domain, remove the WWW for comparisons...
if (crawler.ignoreWWWDomain) {
host = host.replace(/^www\./i, "");
}
function domainInWhitelist(host) {
function domainInWhitelist(host) {
// If there's no whitelist, or the whitelist is of zero length,
// just return false.
if (!crawler.domainWhitelist ||
!crawler.domainWhitelist.length) return false;
// If there's no whitelist, or the whitelist is of zero length,
// just return false.
if (!crawler.domainWhitelist || !crawler.domainWhitelist.length) {
return false;
}
// Otherwise, scan through it.
return !!crawler.domainWhitelist.reduce(function(prev,cur,index,array) {
// Otherwise, scan through it.
return !!crawler.domainWhitelist.reduce(function(prev, cur) {
// If we already located the relevant domain in the whitelist...
if (prev) return prev;
// If we already located the relevant domain in the whitelist...
if (prev) {
return prev;
}
// If the domain is just equal, return true.
if (host === cur) return true;
// If the domain is just equal, return true.
if (host === cur) {
return true;
}
// If we're ignoring WWW subdomains, and both domains,
// less www. are the same, return true.
if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i,""))
return true;
// If we're ignoring WWW subdomains, and both domains,
// less www. are the same, return true.
if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i, "")) {
return true;
}
// Otherwise, sorry. No dice.
return false;
},false);
}
// Otherwise, sorry. No dice.
return false;
}, false);
}
// Checks if the first domain is a subdomain of the second
function isSubdomainOf(subdomain,host) {
// Checks if the first domain is a subdomain of the second
function isSubdomainOf(subdomain, host) {
// Comparisons must be case-insensitive
subdomain = subdomain.toLowerCase();
host = host.toLowerCase();
// Comparisons must be case-insensitive
subdomain = subdomain.toLowerCase();
host = host.toLowerCase();
// If we're ignoring www, remove it from both
// (if www is the first domain component...)
if (crawler.ignoreWWWDomain) {
subdomain = subdomain.replace(/^www./ig, "");
host = host.replace(/^www./ig, "");
}
// If we're ignoring www, remove it from both
// (if www is the first domain component...)
if (crawler.ignoreWWWDomain) {
subdomain = subdomain.replace(/^www./ig, "");
host = host.replace(/^www./ig, "");
}
// They should be the same flipped around!
return (
subdomain.split("").reverse().join("").substr(0,host.length) ===
host.split("").reverse().join(""));
}
// They should be the same flipped around!
return subdomain.split("").reverse().join("").substr(0, host.length) ===
host.split("").reverse().join("");
}
// If we're not filtering by domain, just return true.
return (!crawler.filterByDomain ||
// Or if the domain is just the right one, return true.
(host === crawler.host) ||
// Or if we're ignoring WWW subdomains, and both domains,
// less www. are the same, return true.
(
crawler.ignoreWWWDomain &&
crawler.host.replace(/^www\./i,"") ===
host.replace(/^www\./i,"")
) ||
// Or if the domain in question exists in the domain whitelist,
// return true.
domainInWhitelist(host) ||
// Or if we're scanning subdomains, and this domain is a subdomain
// of the crawler's set domain, return true.
(crawler.scanSubdomains && isSubdomainOf(host,crawler.host)));
// If we're not filtering by domain, just return true.
return !crawler.filterByDomain ||
// Or if the domain is just the right one, return true.
host === crawler.host ||
// Or if we're ignoring WWW subdomains, and both domains,
// less www. are the same, return true.
crawler.ignoreWWWDomain &&
crawler.host.replace(/^www\./i, "") ===
host.replace(/^www\./i, "") ||
// Or if the domain in question exists in the domain whitelist,
// return true.
domainInWhitelist(host) ||
// Or if we're scanning subdomains, and this domain is a subdomain
// of the crawler's set domain, return true.
crawler.scanSubdomains && isSubdomainOf(host, crawler.host);
};
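// A hypothetical configuration sketch showing how the branches above combine
// (assuming the default filterByDomain === true). The hostnames and whitelist
// entry are invented for illustration.
var domainExample = new Crawler("example.com", "/", 80);

domainExample.ignoreWWWDomain = true;                 // "www." is stripped before comparison
domainExample.scanSubdomains = true;                  // subdomains pass via isSubdomainOf()
domainExample.domainWhitelist = ["cdn.example.net"];  // explicitly allowed foreign host

domainExample.domainValid("www.example.com");   // true - equal to crawler.host once www. is removed
domainExample.domainValid("blog.example.com");  // true - subdomain of crawler.host
domainExample.domainValid("cdn.example.net");   // true - found in the whitelist
domainExample.domainValid("elsewhere.org");     // false - fails every branch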
/*
	Public: Given a text or HTML document, initiates discovery of linked
	resources in the text, and queues the resources if applicable. Emits
	"discoverycomplete". Not to be confused with `crawler.discoverResources`:
	that function only finds the linked resources, whereas this one uses
	`discoverResources` to find them and then also queues them.

	resourceData - Text document containing linked resource URLs.
	queueItem    - Queue item from which the resource document was derived.
	decompressed - Content is already decompressed (default: false)

	Emits

		gziperror
		discoverycomplete

	Examples

		crawler.queueLinkedItems("<a href='...'>test</a>",queueItem);

	Returns the crawler object for chaining.
*/
Crawler.prototype.queueLinkedItems = function(resourceData,queueItem,decompressed) {
var crawler = this,
resources = [];
Crawler.prototype.queueLinkedItems = function(resourceData, queueItem, decompressed) {
var crawler = this,
resources = [];
if (!decompressed &&
queueItem.stateData &&
queueItem.stateData.headers['content-encoding'] && (
queueItem.stateData.headers['content-encoding'].match(/gzip/) ||
queueItem.stateData.headers['content-encoding'].match(/deflate/))) {
if (!decompressed &&
queueItem.stateData &&
queueItem.stateData.headers["content-encoding"] && (
queueItem.stateData.headers["content-encoding"].match(/gzip/) ||
queueItem.stateData.headers["content-encoding"].match(/deflate/))) {
return zlib.unzip(resourceData,function(err,newData) {
if (err) {
return crawler.emit("gziperror", queueItem, err, resourceData);
}
return zlib.unzip(resourceData, function(err, newData) {
if (err) {
return crawler.emit("gziperror", queueItem, err, resourceData);
}
crawler.queueLinkedItems(newData,queueItem,true);
});
}
crawler.queueLinkedItems(newData, queueItem, true);
});
}
resources = crawler.discoverResources(resourceData,queueItem);
resources = crawler.discoverResources(resourceData, queueItem);
// Emit discovered resources. ie: might be useful in building a graph of
// page relationships.
crawler.emit("discoverycomplete",queueItem,resources);
// Emit discovered resources. ie: might be useful in building a graph of
// page relationships.
crawler.emit("discoverycomplete", queueItem, resources);
resources.forEach(function(url){ crawler.queueURL(url,queueItem); });
resources.forEach(function(url) {
crawler.queueURL(url, queueItem);
});
return crawler;
return crawler;
};
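// A sketch of how queueLinkedItems() is typically observed from user code. It is
// normally called internally by handleResponse(), but the "discoverycomplete"
// event it emits is public. The hostname, markup and the minimal stand-in queue
// item below are invented for illustration.
var linkExample = new Crawler("example.com", "/", 80);

linkExample.on("discoverycomplete", function(queueItem, resources) {
    // `resources` is the array produced by discoverResources() for this document
    console.log("Found", resources.length, "resources in", queueItem.url);
});

var standInItem = { url: "http://example.com/", protocol: "http", depth: 1, stateData: {} };
linkExample.queueLinkedItems("<a href='/next'>next</a>", standInItem, true);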
/*
	Public: Given a single URL, this function cleans, validates, parses it and
	adds it to the queue. This is the best and simplest way to add an item to
	the queue.

	url       - URL to be queued.
	queueItem - Queue item from which the resource was linked.

	Emits

		queueduplicate
		queueerror
		queueadd

	Examples

		crawler.queueURL("http://www.google.com/",queueItem);

	Returns a boolean value indicating whether the URL was successfully queued
	or not.
*/
Crawler.prototype.queueURL = function(url,queueItem) {
var crawler = this;
var parsedURL =
typeof(url) === "object" ? url : crawler.processURL(url,queueItem);
Crawler.prototype.queueURL = function(url, queueItem) {
var crawler = this,
parsedURL = typeof url === "object" ? url : crawler.processURL(url, queueItem);
// URL Parser decided this URL was junky. Next please!
if (!parsedURL) {
return false;
}
// URL Parser decided this URL was junky. Next please!
if (!parsedURL) {
return false;
}
// Pass this URL past fetch conditions to ensure the user thinks it's valid
var fetchDenied = false;
fetchDenied = crawler._fetchConditions.reduce(function(prev,callback) {
return prev || !callback(parsedURL);
},false);
// Pass this URL past fetch conditions to ensure the user thinks it's valid
var fetchDenied = false;
fetchDenied = crawler._fetchConditions.reduce(function(prev, callback) {
return prev || !callback(parsedURL);
}, false);
if (fetchDenied) {
// Fetch Conditions conspired to block URL
return false;
}
if (fetchDenied) {
// Fetch Conditions conspired to block URL
return false;
}
// Check the domain is valid before adding it to the queue
if (crawler.domainValid(parsedURL.host)) {
crawler.queue.add(
parsedURL.protocol,
parsedURL.host,
parsedURL.port,
parsedURL.path,
parsedURL.depth,
function queueAddCallback(error,newQueueItem) {
if (error) {
// We received an error condition when adding the callback
if (error.code && error.code === "DUP")
return crawler.emit("queueduplicate",parsedURL);
// Check the domain is valid before adding it to the queue
if (crawler.domainValid(parsedURL.host)) {
crawler.queue.add(
parsedURL.protocol,
parsedURL.host,
parsedURL.port,
parsedURL.path,
parsedURL.depth,
function queueAddCallback(error, newQueueItem) {
if (error) {
// We received an error condition when adding the callback
if (error.code && error.code === "DUP") {
return crawler.emit("queueduplicate", parsedURL);
}
return crawler.emit("queueerror",error,parsedURL);
}
return crawler.emit("queueerror", error, parsedURL);
}
crawler.emit("queueadd",newQueueItem,parsedURL);
newQueueItem.referrer = queueItem ? queueItem.url : null;
}
);
}
crawler.emit("queueadd", newQueueItem, parsedURL);
newQueueItem.referrer = queueItem ? queueItem.url : null;
}
);
}
return true;
return true;
};
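// A small sketch of seeding the queue by hand and listening to the queue events.
// URLs are hypothetical. Note that queueURL() returns false only when parsing or
// a fetch condition rejects the URL; an off-domain URL still returns true but is
// silently skipped by the domainValid() check, so no "queueadd" fires for it.
var seedExample = new Crawler("example.com", "/", 80);

seedExample.on("queueadd", function(newQueueItem) {
    console.log("queued:", newQueueItem.url);
});
seedExample.on("queueduplicate", function(parsedURL) {
    console.log("already queued:", parsedURL.path);
});

seedExample.queueURL("http://example.com/contact");   // "queueadd" fires
seedExample.queueURL("http://elsewhere.org/page");    // no event - filtered by domainValid()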
/*
	Public: The guts of the crawler: takes a queue item and spools a request for
	it, downloads, caches, and fires events based on the result of the request.
	It kicks off resource discovery and queues any new resources found.

	queueItem - Queue item to be fetched.

	Emits

		fetchstart
		fetchheaders
		fetchcomplete
		fetchdataerror
		notmodified
		fetchredirect
		fetch404
		fetcherror
		fetchclienterror

	Examples

		crawler.fetchQueueItem(queueItem);

	Returns the crawler object for chaining.
*/
Crawler.prototype.fetchQueueItem = function(queueItem) {
var crawler = this;
crawler._openRequests ++;
var crawler = this;
crawler._openRequests++;
// Variable declarations
var fetchData = false,
requestOptions,
clientRequest,
timeCommenced;
// Variable declarations
var requestOptions,
clientRequest,
timeCommenced;
// Mark as spooled
queueItem.status = "spooled";
var client = (queueItem.protocol === "https" ? https : http);
// Mark as spooled
queueItem.status = "spooled";
var client = queueItem.protocol === "https" ? https : http;
// Up the socket limit if required.
if (client.globalAgent.maxSockets < crawler.maxConcurrency) {
client.globalAgent.maxSockets = crawler.maxConcurrency;
}
// Up the socket limit if required.
if (client.globalAgent.maxSockets < crawler.maxConcurrency) {
client.globalAgent.maxSockets = crawler.maxConcurrency;
}
// Extract request options from queue;
var requestHost = queueItem.host,
requestPort = queueItem.port,
requestPath = queueItem.path;
// Extract request options from queue;
var requestHost = queueItem.host,
requestPort = queueItem.port,
requestPath = queueItem.path;
// Are we passing through an HTTP proxy?
if (crawler.useProxy) {
requestHost = crawler.proxyHostname;
requestPort = crawler.proxyPort;
requestPath = queueItem.url;
}
// Are we passing through an HTTP proxy?
if (crawler.useProxy) {
requestHost = crawler.proxyHostname;
requestPort = crawler.proxyPort;
requestPath = queueItem.url;
}
// Load in request options
requestOptions = {
method: "GET",
host: requestHost,
port: requestPort,
path: requestPath,
headers: {
"User-Agent": crawler.userAgent,
"Host": queueItem.host + (
queueItem.port !== 80 ?
":" + queueItem.port :
""
)
}
};
// Load in request options
requestOptions = {
method: "GET",
host: requestHost,
port: requestPort,
path: requestPath,
headers: {
"User-Agent": crawler.userAgent,
"Host": queueItem.host + (
queueItem.port !== 80 ?
":" + queueItem.port :
""
)
}
};
if (queueItem.referrer) {
requestOptions.headers.Referer = queueItem.referrer;
}
if (queueItem.referrer) {
requestOptions.headers.Referer = queueItem.referrer;
}
// If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts
if (requestOptions.port === 80 || requestOptions.port === 443) {
delete requestOptions.port;
}
// If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts
if (requestOptions.port === 80 || requestOptions.port === 443) {
delete requestOptions.port;
}
// Add cookie header from cookie jar if we're configured to
// send/accept cookies
if (crawler.acceptCookies && crawler.cookies.getAsHeader()) {
requestOptions.headers.cookie =
crawler.cookies.getAsHeader(queueItem.host,queueItem.path);
}
// Add cookie header from cookie jar if we're configured to
// send/accept cookies
if (crawler.acceptCookies && crawler.cookies.getAsHeader()) {
requestOptions.headers.cookie =
crawler.cookies.getAsHeader(queueItem.host, queueItem.path);
}
// Add auth headers if we need them
if (crawler.needsAuth) {
var auth = crawler.authUser + ":" + crawler.authPass;
// Add auth headers if we need them
if (crawler.needsAuth) {
var auth = crawler.authUser + ":" + crawler.authPass;
// Generate auth header
auth = 'Basic ' + (new Buffer(auth).toString('base64'));
requestOptions.headers.Authorization = auth;
}
// Generate auth header
auth = "Basic " + new Buffer(auth).toString("base64");
requestOptions.headers.Authorization = auth;
}
// Add proxy auth if we need it
if (crawler.proxyUser !== null && crawler.proxyPass !== null) {
var proxyAuth = crawler.proxyUser + ":" + crawler.proxyPass;
// Add proxy auth if we need it
if (crawler.proxyUser !== null && crawler.proxyPass !== null) {
var proxyAuth = crawler.proxyUser + ":" + crawler.proxyPass;
// Generate auth header
proxyAuth = 'Basic ' + (new Buffer(proxyAuth).toString('base64'));
requestOptions.headers["Proxy-Authorization"] = proxyAuth;
}
// Generate auth header
proxyAuth = "Basic " + new Buffer(proxyAuth).toString("base64");
requestOptions.headers["Proxy-Authorization"] = proxyAuth;
}
// And if we've got any custom headers available
if (crawler.customHeaders) {
for (var header in crawler.customHeaders) {
if (!crawler.customHeaders.hasOwnProperty(header)) continue;
// And if we've got any custom headers available
if (crawler.customHeaders) {
for (var header in crawler.customHeaders) {
if (!crawler.customHeaders.hasOwnProperty(header)) {
continue;
}
requestOptions.headers[header] = crawler.customHeaders[header];
}
}
requestOptions.headers[header] = crawler.customHeaders[header];
}
}
// Apply the ignoreInvalidSSL setting to https connections
if(client === https && crawler.ignoreInvalidSSL === true) {
client.rejectUnauthorized = false;
client.strictSSL = false;
}
// Apply the ignoreInvalidSSL setting to https connections
if (client === https && crawler.ignoreInvalidSSL === true) {
client.rejectUnauthorized = false;
client.strictSSL = false;
}
// Emit fetchstart event - gives the user time to mangle the request options
// if required.
crawler.emit("fetchstart", queueItem, requestOptions);
// Emit fetchstart event - gives the user time to mangle the request options
// if required.
crawler.emit("fetchstart", queueItem, requestOptions);
process.nextTick(function() {
// Record what time we started this request
timeCommenced = Date.now();
process.nextTick(function() {
// Record what time we started this request
timeCommenced = Date.now();
// Get the resource!
clientRequest =
client.request(requestOptions,function(response) {
crawler.handleResponse(queueItem,response,timeCommenced);
});
// Get the resource!
clientRequest =
client.request(requestOptions, function(response) {
crawler.handleResponse(queueItem, response, timeCommenced);
});
clientRequest.end();
clientRequest.end();
        clientRequest.setTimeout(crawler.timeout, function () {
            if (crawler.running && !queueItem.fetched) {
                crawler._openRequests--;
            }

            queueItem.fetched = true;
            queueItem.status = "timeout";
            crawler.emit("fetchtimeout", queueItem, crawler.timeout);
            clientRequest._crawlerHandled = true;
            clientRequest.abort();
        });

        clientRequest.on("error", function (errorData) {
            // This event will be thrown if we manually aborted the request,
            // but we don't want to do anything in that case.
            if (clientRequest._crawlerHandled)
                return;

            if (crawler.running && !queueItem.fetched) {
                crawler._openRequests--;
            }

            // Emit 5xx / 4xx event
            queueItem.fetched = true;
            queueItem.stateData.code = 599;
            queueItem.status = "failed";
            crawler.emit("fetchclienterror", queueItem, errorData);
        });

        return crawler;
    });

        clientRequest.setTimeout(crawler.timeout, function() {
            if (queueItem.fetched) {
                return;
            }

            if (crawler.running && !queueItem.fetched) {
                crawler._openRequests--;
            }

            queueItem.fetched = true;
            queueItem.status = "timeout";
            crawler.emit("fetchtimeout", queueItem, crawler.timeout);
            clientRequest._crawlerHandled = true;
            clientRequest.abort();
        });

        clientRequest.on("error", function(errorData) {
            if (crawler.running && !queueItem.fetched) {
                crawler._openRequests--;
            }

            // This event will be thrown if we manually aborted the request,
            // but we don't want to do anything in that case.
            if (clientRequest._crawlerHandled) {
                return;
            }

            // Emit 5xx / 4xx event
            queueItem.fetched = true;
            queueItem.stateData.code = 599;
            queueItem.status = "failed";
            crawler.emit("fetchclienterror", queueItem, errorData);
        });

        return crawler;
    });
};
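// A sketch of the hooks fetchQueueItem() exposes. "fetchstart" fires with the raw
// request options before the request is issued on the next tick, so headers can
// still be adjusted here. The credentials and hostname are invented.
var fetchExample = new Crawler("example.com", "/", 80);

fetchExample.needsAuth = true;
fetchExample.authUser = "user";
fetchExample.authPass = "secret";   // sent as a Basic Authorization header

fetchExample.on("fetchstart", function(queueItem, requestOptions) {
    // Mutating requestOptions here still affects the outgoing request
    requestOptions.headers["Accept-Language"] = "en";
});

fetchExample.on("fetchtimeout", function(queueItem, timeout) {
    console.log(queueItem.url, "timed out after", timeout, "ms");
});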

@@ -912,405 +962,413 @@

/*
	Public: Given a queueItem and a matching response object, the crawler will
	handle downloading the resource, queueing of linked items, etc.

	Examples

		// Passing in a response from `request`
		request(queueItem.url, function(err, res, body) {
			crawler.handleResponse(queueItem, res);
		});

	Returns the crawler object for chaining.
*/
Crawler.prototype.handleResponse = function(queueItem,response,timeCommenced) {
var crawler = this,
dataReceived = false,
timeHeadersReceived,
timeDataReceived,
parsedURL,
responseBuffer,
responseLength,
responseLengthReceived = 0,
contentType,
stateData = queueItem.stateData;
Crawler.prototype.handleResponse = function(queueItem, response, timeCommenced) {
var crawler = this,
dataReceived = false,
timeHeadersReceived,
timeDataReceived,
parsedURL,
responseBuffer,
responseLength,
responseLengthReceived = 0,
contentType,
stateData = queueItem.stateData;
// Record what time we first received the header information
timeHeadersReceived = Date.now();
// Record what time we first received the header information
timeHeadersReceived = Date.now();
// If we weren't passed a time of commencement, assume Now()
timeCommenced = timeCommenced || Date.now();
// If we weren't passed a time of commencement, assume Now()
timeCommenced = timeCommenced || Date.now();
responseLength = parseInt(response.headers["content-length"],10);
responseLength = !isNaN(responseLength) ? responseLength : 0;
responseLength = parseInt(response.headers["content-length"], 10);
responseLength = !isNaN(responseLength) ? responseLength : 0;
// Save timing and content some header information into queue
stateData.requestLatency = (timeHeadersReceived - timeCommenced);
stateData.requestTime = (timeHeadersReceived - timeCommenced);
stateData.contentLength = responseLength;
stateData.contentType = contentType = response.headers["content-type"];
stateData.code = response.statusCode;
stateData.headers = response.headers;
// Save timing and content some header information into queue
stateData.requestLatency = timeHeadersReceived - timeCommenced;
stateData.requestTime = timeHeadersReceived - timeCommenced;
stateData.contentLength = responseLength;
stateData.contentType = contentType = response.headers["content-type"];
stateData.code = response.statusCode;
stateData.headers = response.headers;
// Do we need to save cookies? Were we sent any?
if (crawler.acceptCookies &&
response.headers.hasOwnProperty('set-cookie'))
crawler.cookies.addFromHeaders(response.headers["set-cookie"]);
// Do we need to save cookies? Were we sent any?
if (crawler.acceptCookies && response.headers.hasOwnProperty("set-cookie")) {
crawler.cookies.addFromHeaders(response.headers["set-cookie"]);
}
// Emit header receive event
crawler.emit("fetchheaders",queueItem,response);
// Emit header receive event
crawler.emit("fetchheaders", queueItem, response);
// Ensure response length is reasonable...
responseLength =
responseLength > 0 ? responseLength : crawler.maxResourceSize;
// Ensure response length is reasonable...
responseLength =
responseLength > 0 ? responseLength : crawler.maxResourceSize;
queueItem.stateData.contentLength = responseLength;
queueItem.stateData.contentLength = responseLength;
// Function for dealing with 200 responses
function processReceivedData() {
if (queueItem.fetched) return;
// Function for dealing with 200 responses
function processReceivedData() {
if (queueItem.fetched) {
return;
}
timeDataReceived = (new Date().getTime());
timeDataReceived = new Date().getTime();
queueItem.fetched = true;
queueItem.status = "downloaded";
queueItem.fetched = true;
queueItem.status = "downloaded";
// Save state information
stateData.downloadTime = (timeDataReceived - timeHeadersReceived);
stateData.requestTime = (timeDataReceived - timeCommenced);
stateData.actualDataSize = responseBuffer.length;
stateData.sentIncorrectSize = responseBuffer.length !== responseLength;
// Save state information
stateData.downloadTime = timeDataReceived - timeHeadersReceived;
stateData.requestTime = timeDataReceived - timeCommenced;
stateData.actualDataSize = responseBuffer.length;
stateData.sentIncorrectSize = responseBuffer.length !== responseLength;
// First, save item to cache (if we're using a cache!)
if (crawler.cache !== null &&
crawler.cache.setCacheData instanceof Function) {
// First, save item to cache (if we're using a cache!)
if (crawler.cache !== null && crawler.cache.setCacheData instanceof Function) {
crawler.cache.setCacheData(queueItem, responseBuffer);
}
crawler.cache.setCacheData(queueItem,responseBuffer);
}
// Is the item allowed by depth conditions ?
if (crawler.depthAllowed(queueItem)) {
crawler.emit("fetchcomplete", queueItem, responseBuffer, response);
// Is the item allowed by depth conditions ?
if(crawler.depthAllowed(queueItem)) {
crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
// We only process the item if it's of a valid mimetype
// and only if the crawler is set to discover its own resources
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
crawler.queueLinkedItems(responseBuffer, queueItem);
}
}
// We only process the item if it's of a valid mimetype
// and only if the crawler is set to discover its own resources
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
crawler.queueLinkedItems(responseBuffer,queueItem);
}
}
crawler._openRequests--;
}
crawler._openRequests --;
}
function receiveData(chunk) {
if (chunk && chunk.length && !dataReceived) {
if (responseLengthReceived + chunk.length > responseBuffer.length) {
// Oh dear. We've been sent more data than we were initially told.
// This could be a mis-calculation, or a streaming resource.
// Let's increase the size of our buffer to match, as long as it isn't
// larger than our maximum resource size.
function receiveData(chunk) {
if (chunk && chunk.length && !dataReceived) {
if (responseLengthReceived + chunk.length > responseBuffer.length) {
// Oh dear. We've been sent more data than we were initially told.
// This could be a mis-calculation, or a streaming resource.
// Let's increase the size of our buffer to match, as long as it isn't
// larger than our maximum resource size.
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) {
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) {
// Start by creating a new buffer, which will be our main
// buffer from now on...
// Start by creating a new buffer, which will be our main
// buffer from now on...
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length);
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length);
// Copy all our old data into it...
responseBuffer.copy(tmpNewBuffer, 0, 0, responseBuffer.length);
// Copy all our old data into it...
responseBuffer.copy(tmpNewBuffer,0,0,responseBuffer.length);
// And now the new chunk
chunk.copy(tmpNewBuffer, responseBuffer.length, 0, chunk.length);
// And now the new chunk
chunk.copy(tmpNewBuffer,responseBuffer.length,0,chunk.length);
// And now make the response buffer our new buffer,
// leaving the original for GC
responseBuffer = tmpNewBuffer;
// And now make the response buffer our new buffer,
// leaving the original for GC
responseBuffer = tmpNewBuffer;
} else {
// Oh dear oh dear! The response is not only more data
// than we were initially told, but it also exceeds the
// maximum amount of data we're prepared to download per
// resource.
//
// Throw error event and ignore.
//
// We'll then deal with the data that we have.
} else {
// Oh dear oh dear! The response is not only more data
// than we were initially told, but it also exceeds the
// maximum amount of data we're prepared to download per
// resource.
//
// Throw error event and ignore.
//
// We'll then deal with the data that we have.
crawler.emit("fetchdataerror", queueItem, response);
}
} else {
// Copy the chunk data into our main buffer
chunk.copy(responseBuffer, responseLengthReceived, 0, chunk.length);
}
crawler.emit("fetchdataerror",queueItem,response);
}
} else {
// Copy the chunk data into our main buffer
chunk.copy(responseBuffer,responseLengthReceived,0,chunk.length);
}
// Increment our data received counter
responseLengthReceived += chunk.length;
}
// Increment our data received counter
responseLengthReceived += chunk.length;
}
        if ((responseLengthReceived >= responseLength || response.complete) &&
            !dataReceived) {

            // Slice the buffer to chop off any unused space
            responseBuffer = responseBuffer.slice(0,responseLengthReceived);

            dataReceived = true;
            processReceivedData();
        }
    }

    queueItem.status = "headers";

    // If we should just go ahead and get the data
    if (response.statusCode >= 200 && response.statusCode < 300 &&
        responseLength <= crawler.maxResourceSize) {

        // Create a buffer with our response length
        responseBuffer = new Buffer(responseLength);

        // Only if we're prepared to download non-text resources...
        if (crawler.downloadUnsupported ||
            crawler.mimeTypeSupported(contentType)) {

            response.on("data",receiveData);
            response.on("end",receiveData);
        } else {
            response.socket.end();
        }

        if ((responseLengthReceived >= responseLength || response.complete) &&
            !dataReceived) {

            // Slice the buffer to chop off any unused space
            responseBuffer = responseBuffer.slice(0, responseLengthReceived);

            dataReceived = true;
            processReceivedData();
        }
    }

    queueItem.status = "headers";

    // If we should just go ahead and get the data
    if (response.statusCode >= 200 && response.statusCode < 300 &&
        responseLength <= crawler.maxResourceSize) {

        // Create a buffer with our response length
        responseBuffer = new Buffer(responseLength);

        // Only if we're prepared to download non-text resources...
        if (crawler.downloadUnsupported ||
            crawler.mimeTypeSupported(contentType)) {

            response.on("data", receiveData);
            response.on("end", receiveData);
        } else {
            queueItem.fetched = true;
            crawler._openRequests--;

            response.socket.end();
        }
// We've got a not-modified response back
} else if (response.statusCode === 304) {
// We've got a not-modified response back
} else if (response.statusCode === 304) {
if (crawler.cache !== null && crawler.cache.getCacheData) {
// We've got access to a cache
crawler.cache.getCacheData(queueItem, function(cacheObject) {
crawler.emit("notmodified", queueItem, response, cacheObject);
});
} else {
// Emit notmodified event. We don't have a cache available, so
// we don't send any data.
crawler.emit("notmodified", queueItem, response);
}
if (crawler.cache !== null && crawler.cache.getCacheData) {
// We've got access to a cache
crawler.cache.getCacheData(queueItem,function(cacheObject) {
crawler.emit("notmodified",queueItem,response,cacheObject);
});
} else {
// Emit notmodified event. We don't have a cache available, so
// we don't send any data.
crawler.emit("notmodified",queueItem,response);
}
// If we should queue a redirect
} else if (response.statusCode >= 300 && response.statusCode < 400 &&
response.headers.location) {
// If we should queue a redirect
} else if (response.statusCode >= 300 && response.statusCode < 400 &&
response.headers.location) {
queueItem.fetched = true;
queueItem.status = "redirected";
queueItem.fetched = true;
queueItem.status = "redirected";
// Parse the redirect URL ready for adding to the queue...
parsedURL = crawler.processURL(response.headers.location, queueItem);
// Parse the redirect URL ready for adding to the queue...
parsedURL = crawler.processURL(response.headers.location,queueItem);
// Emit redirect event
crawler.emit("fetchredirect", queueItem, parsedURL, response);
// Emit redirect event
crawler.emit("fetchredirect",queueItem,parsedURL,response);
// Clean URL, add to queue...
crawler.queueURL(parsedURL, queueItem);
response.socket.end();
// Clean URL, add to queue...
crawler.queueURL(parsedURL,queueItem);
response.socket.end();
crawler._openRequests--;
crawler._openRequests --;
// Ignore this request, but record that we had a 404
} else if (response.statusCode === 404 || response.statusCode === 410) {
queueItem.fetched = true;
queueItem.status = "notfound";
// Ignore this request, but record that we had a 404
} else if (response.statusCode === 404 || response.statusCode === 410) {
queueItem.fetched = true;
queueItem.status = "notfound";
// Emit 404 event
crawler.emit("fetch404", queueItem, response);
response.socket.end();
// Emit 404 event
crawler.emit("fetch404",queueItem,response);
response.socket.end();
crawler._openRequests--;
crawler._openRequests --;
// And oh dear. Handle this one as well. (other 400s, 500s, etc)
} else {
queueItem.fetched = true;
queueItem.status = "failed";
// And oh dear. Handle this one as well. (other 400s, 500s, etc)
} else {
queueItem.fetched = true;
queueItem.status = "failed";
// Emit 5xx / 4xx event
crawler.emit("fetcherror", queueItem, response);
response.socket.end();
// Emit 5xx / 4xx event
crawler.emit("fetcherror",queueItem,response);
response.socket.end();
crawler._openRequests--;
}
crawler._openRequests --;
}
return crawler;
return crawler;
};
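// A sketch of the events handleResponse() drives, as seen from user code.
// The hostname is hypothetical; the listener signatures match the emit calls above.
var responseExample = new Crawler("example.com", "/", 80);

responseExample.on("fetchcomplete", function(queueItem, responseBuffer, response) {
    console.log("downloaded", queueItem.url, "-", responseBuffer.length, "bytes");
});
responseExample.on("fetchredirect", function(queueItem, parsedURL, response) {
    console.log(queueItem.url, "redirected towards", parsedURL.path);
});
responseExample.on("fetch404", function(queueItem, response) {
    console.log("missing:", queueItem.url);
});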
/*
	Public: The main crawler runloop. Fires at the interval specified in the
	crawler configuration, when the crawl is running. May be manually fired.
	This function initiates fetching of a queue item if there are enough workers
	to do so and there are unfetched items in the queue.

	Examples

		crawler.crawl();

	Returns the crawler object for chaining.
*/
Crawler.prototype.crawl = function() {
var crawler = this;
var crawler = this;
if (crawler._openRequests > crawler.maxConcurrency) return;
if (crawler._openRequests > crawler.maxConcurrency) {
return [];
}
crawler.queue.oldestUnfetchedItem(function(err, queueItem) {
crawler.queue.oldestUnfetchedItem(function(err, queueItem) { // eslint-disable-line
if (queueItem) {
crawler.fetchQueueItem(queueItem);
if (queueItem) {
crawler.fetchQueueItem(queueItem);
} else if ( !crawler._openRequests &&
!crawler._openListeners) {
} else if (!crawler._openRequests && !crawler._openListeners) {
crawler.queue.complete(function(err, completeCount) {
if (err) throw err;
crawler.queue.complete(function(err, completeCount) {
if (err) {
throw err;
}
crawler.queue.getLength(function(err, length) {
if (err) throw err;
crawler.queue.getLength(function(err, length) {
if (err) {
throw err;
}
if (completeCount === length) {
crawler.emit("complete");
crawler.stop();
}
});
});
}
});
if (completeCount === length) {
crawler.emit("complete");
crawler.stop();
}
});
});
}
});
return crawler;
return crawler;
};
/*
	Public: Stops the crawler, terminating the crawl runloop.

	Examples

		crawler.stop();

	Returns the crawler object for chaining.
*/
Crawler.prototype.stop = function() {
var crawler = this;
clearInterval(crawler.crawlIntervalID);
crawler.running = false;
return crawler;
var crawler = this;
clearInterval(crawler.crawlIntervalID);
crawler.running = false;
return crawler;
};
/*
	Public: Holds the crawler in a 'running' state, preventing the `complete`
	event from firing until the callback this function returns has been executed,
	or a predetermined timeout (as specified by `crawler.listenerTTL`) has
	elapsed.

	Examples

		crawler.on("fetchcomplete", function(queueItem, data) {
			var resume = this.wait();
			doSomethingThatTakesAlongTime(function callback() {
				resume();
			});
		});

	Returns callback which will allow the crawler to continue.
*/
Crawler.prototype.wait = function() {
var crawler = this,
cleared = false,
timeout =
setTimeout(function() {
if (cleared) return;
cleared = true;
crawler._openListeners --;
}, crawler.listenerTTL);
var crawler = this,
cleared = false,
timeout =
setTimeout(function() {
if (cleared) {
return;
}
cleared = true;
crawler._openListeners--;
}, crawler.listenerTTL);
crawler._openListeners ++;
crawler._openListeners++;
return function() {
if (cleared) return;
cleared = true;
crawler._openListeners --;
clearTimeout(timeout);
};
return function() {
if (cleared) {
return;
}
cleared = true;
crawler._openListeners--;
clearTimeout(timeout);
};
};
/*
	Public: Given a function, this method adds it to an internal list maintained
	by the crawler to be executed against each URL to determine whether it should
	be fetched or not.

	callback - Function to be called when evaluating a URL. This function is
	           passed an object containing the protocol, hostname, port, and path
	           of a resource to be fetched. It can determine whether it should
	           be requested or not by returning a boolean - false for no, true
	           for yes.

	Examples

		crawler.addFetchCondition(function(parsedURL) {
			return (parsedURL.host !== "evildomain.com");
		});

	Returns the ID of the fetch condition - used for removing it from the crawler
	later.
*/
Crawler.prototype.addFetchCondition = function(callback) {
var crawler = this;
if (callback instanceof Function) {
crawler._fetchConditions.push(callback);
return crawler._fetchConditions.length - 1;
} else {
throw new Error("Fetch Condition must be a function.");
}
var crawler = this;
if (callback instanceof Function) {
crawler._fetchConditions.push(callback);
return crawler._fetchConditions.length - 1;
}
throw new Error("Fetch Condition must be a function.");
};
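// A short sketch of registering and later removing a fetch condition. The
// PDF-skipping rule below is purely illustrative.
var conditionExample = new Crawler("example.com", "/", 80);

var conditionID = conditionExample.addFetchCondition(function(parsedURL) {
    return !parsedURL.path.match(/\.pdf$/i);   // returning false skips the URL
});

// ...later, once PDFs should be crawled after all:
conditionExample.removeFetchCondition(conditionID);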
/*
	Public: Given the ID of an existing fetch condition, this function removes
	it from the crawler's internal list of conditions.

	index - ID of fetch condition to be removed.

	Examples

		crawler.removeFetchCondition(3);

	Returns true if the fetch condition was removed, and throws an error if it
	could not be found.
*/
Crawler.prototype.removeFetchCondition = function(index) {
var crawler = this;
if (crawler._fetchConditions[index] &&
crawler._fetchConditions[index] instanceof Function) {
var crawler = this;
if (crawler._fetchConditions[index] &&
crawler._fetchConditions[index] instanceof Function) {
return !!crawler._fetchConditions.splice(index,1);
} else {
throw new Error("Unable to find indexed Fetch Condition.");
}
return !!crawler._fetchConditions.splice(index, 1);
}
throw new Error("Unable to find indexed Fetch Condition.");
};
/*
	Public: Given a URL, this function removes its querystring if one is present.

	url - URL from which to remove the querystring

	Examples

		crawler.removeQuerystring(url);

	Returns the URL without its querystring, or the unchanged URL if it had none.
*/
Crawler.prototype.removeQuerystring = function(url) {
if (url.indexOf("?") > -1) {
return url.substr(0,url.indexOf("?"));
} else {
return url;
}
if (url.indexOf("?") > -1) {
return url.substr(0, url.indexOf("?"));
}
return url;
};
module.exports = Crawler;

@@ -1,7 +0,12 @@

// SimpleCrawler
// Export interfaces
/*
* Simplecrawler - Export interfaces
* https://github.com/cgiffard/node-simplecrawler
*
* Copyright (c) 2011-2015, Christopher Giffard
*
*/
module.exports = require("./crawler.js");
// Aliasing for compatibility with legacy code.
// Aliasing for compatibility with legacy code
module.exports.Crawler = module.exports;

@@ -8,0 +13,0 @@

@@ -1,21 +0,23 @@

// Simplecrawler - queue module
// Christopher Giffard, 2011
//
// http://www.github.com/cgiffard/node-simplecrawler
/*
* Simplecrawler - queue module
* https://github.com/cgiffard/node-simplecrawler
*
* Copyright (c) 2011-2015, Christopher Giffard
*
*/
var fs = require("fs");
var allowedStatistics = [
"requestTime",
"requestLatency",
"downloadTime",
"contentLength",
"actualDataSize"
"requestTime",
"requestLatency",
"downloadTime",
"contentLength",
"actualDataSize"
];
var FetchQueue = function(){
this.oldestUnfetchedIndex = 0;
this.completeCache = 0;
this.scanIndex = {};
var FetchQueue = function() {
this.oldestUnfetchedIndex = 0;
this.completeCache = 0;
this.scanIndex = {};
};

@@ -28,47 +30,49 @@

    // For legacy reasons
    if (depth instanceof Function) {
        callback = depth;
        depth = 1;
    }
    depth = depth || 1;
    callback = callback && callback instanceof Function ? callback : function(){};
    var self = this;

    // Ensure all variables conform to reasonable defaults
    protocol = protocol === "https" ? "https" : "http";

    if (isNaN(port) || !port) {
        return callback(new Error("Port must be numeric!"));
    }

    var url = protocol + "://" + domain + (port !== 80 ? ":" + port : "") + path;

    this.exists(protocol,domain,port,path,
        function(err,exists) {
            if (err) return callback(err);

            if (!exists) {
                var queueItem = {
                    "url": url,
                    "protocol": protocol,
                    "host": domain,
                    "port": port,
                    "path": path,
                    "depth": depth,
                    "fetched": false,
                    "status": "queued",
                    "stateData": {}
                };

                self.push(queueItem);
                callback(null, queueItem);
            } else {
                var error = new Error("Resource already exists in queue!");
                error.code = "DUP";
                callback(error);
            }
        });

    // For legacy reasons
    if (depth instanceof Function) {
        callback = depth;
        depth = 1;
    }

    // Ensure all variables conform to reasonable defaults
    protocol = protocol === "https" ? "https" : "http";
    depth = depth || 1;
    callback = callback && callback instanceof Function ? callback : function() {};
    var self = this;

    if (isNaN(port) || !port) {
        return callback(new Error("Port must be numeric!"));
    }

    var url = protocol + "://" + domain + (port !== 80 ? ":" + port : "") + path;

    self.exists(protocol, domain, port, path,
        function(err, exists) {
            if (err) {
                return callback(err);
            }

            if (!exists) {
                var queueItem = {
                    url: url,
                    protocol: protocol,
                    host: domain,
                    port: port,
                    path: path,
                    depth: depth,
                    fetched: false,
                    status: "queued",
                    stateData: {}
                };

                self.push(queueItem);
                callback(null, queueItem);
            } else {
                var error = new Error("Resource already exists in queue!");
                error.code = "DUP";
                callback(error);
            }
        });
};
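// A hypothetical direct use of the queue (it is normally driven through
// Crawler.prototype.queueURL). The second call shows the legacy signature where
// the depth argument is omitted, and the "DUP" error raised for a repeated URL.
var queueAddExample = new FetchQueue();

queueAddExample.add("http", "example.com", 80, "/index.html", 1, function(error, queueItem) {
    if (error) {
        return console.log("not queued:", error.message);
    }
    console.log("queued:", queueItem.url);   // "http://example.com/index.html"
});

queueAddExample.add("http", "example.com", 80, "/index.html", function(error) {
    console.log(error.code);                 // "DUP" - already in the queue
});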

@@ -78,18 +82,17 @@

FetchQueue.prototype.exists = function(protocol, domain, port, path, callback) {
callback = callback && callback instanceof Function ? callback : function(){};
callback = callback && callback instanceof Function ? callback : function() {};
port = (port !== 80 ? ":" + port : "");
port = port !== 80 ? ":" + port : "";
var url =
(protocol + "://" + domain + port + path)
.toLowerCase();
var url = (protocol + "://" + domain + port + path).toLowerCase();
if (!!this.scanIndex[url]) {
callback(null, 1);
return 1;
} else {
this.scanIndex[url] = true;
callback(null, 0);
return 0;
}
if (this.scanIndex[url]) {
callback(null, 1);
return 1;
}
this.scanIndex[url] = true;
callback(null, 0);
return 0;
};

@@ -99,7 +102,9 @@

FetchQueue.prototype.last = function(callback) {
callback = callback && callback instanceof Function ? callback : function(){};
var item, self = this;
item = self[self.length-1];
callback(null, item);
return item;
callback = callback && callback instanceof Function ? callback : function() {};
var item,
self = this;
item = self[self.length - 1];
callback(null, item);
return item;
};

@@ -109,10 +114,11 @@

FetchQueue.prototype.get = function(id, callback) {
callback = callback && callback instanceof Function ? callback : function(){};
var item, self = this;
callback = callback && callback instanceof Function ? callback : function() {};
var item,
self = this;
if (!isNaN(id) && self.length > id) {
item = self[id];
callback(null, item);
return item;
}
if (!isNaN(id) && self.length > id) {
item = self[id];
callback(null, item);
return item;
}
};

@@ -122,15 +128,16 @@

FetchQueue.prototype.oldestUnfetchedItem = function(callback) {
callback = callback && callback instanceof Function ? callback : function(){};
var item, self = this;
callback = callback && callback instanceof Function ? callback : function() {};
var item,
self = this;
for (var itemIndex = self.oldestUnfetchedIndex; itemIndex < self.length; itemIndex ++) {
if (self[itemIndex].status === "queued") {
self.oldestUnfetchedIndex = itemIndex;
item = self[itemIndex];
callback(null, item);
return item;
}
}
for (var itemIndex = self.oldestUnfetchedIndex; itemIndex < self.length; itemIndex++) {
if (self[itemIndex].status === "queued") {
self.oldestUnfetchedIndex = itemIndex;
item = self[itemIndex];
callback(null, item);
return item;
}
}
callback(new Error("No unfetched items remain."));
callback(new Error("No unfetched items remain."));
};

@@ -140,18 +147,19 @@

FetchQueue.prototype.max = function(statisticName, callback) {
callback = callback && callback instanceof Function ? callback : function(){};
var maxStatisticValue = 0, self = this;
callback = callback && callback instanceof Function ? callback : function() {};
var maxStatisticValue = 0,
self = this;
if (allowedStatistics.join().indexOf(statisticName) === -1) {
// Not a recognised statistic!
return callback(new Error("Invalid statistic."));
}
if (allowedStatistics.join().indexOf(statisticName) === -1) {
// Not a recognised statistic!
return callback(new Error("Invalid statistic."));
}
self.forEach(function(item) {
if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] > maxStatisticValue) {
maxStatisticValue = item.stateData[statisticName];
}
});
self.forEach(function(item) {
if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] > maxStatisticValue) {
maxStatisticValue = item.stateData[statisticName];
}
});
callback(null, maxStatisticValue);
return maxStatisticValue;
callback(null, maxStatisticValue);
return maxStatisticValue;
};

@@ -161,19 +169,21 @@

FetchQueue.prototype.min = function(statisticName, callback) {
callback = callback && callback instanceof Function ? callback : function(){};
var minimum, minStatisticValue = Infinity, self = this;
callback = callback && callback instanceof Function ? callback : function() {};
var minimum,
minStatisticValue = Infinity,
self = this;
if (allowedStatistics.join().indexOf(statisticName) === -1) {
// Not a recognised statistic!
return callback(new Error("Invalid statistic."));
}
if (allowedStatistics.join().indexOf(statisticName) === -1) {
// Not a recognised statistic!
return callback(new Error("Invalid statistic."));
}
self.forEach(function(item) {
if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] < minStatisticValue) {
minStatisticValue = item.stateData[statisticName];
}
});
minimum = minStatisticValue === Infinity? 0 : minStatisticValue;
callback(null, minimum);
return minimum;
self.forEach(function(item) {
if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] < minStatisticValue) {
minStatisticValue = item.stateData[statisticName];
}
});
minimum = minStatisticValue === Infinity ? 0 : minStatisticValue;
callback(null, minimum);
return minimum;
};

@@ -183,19 +193,22 @@

FetchQueue.prototype.avg = function(statisticName, callback) {
callback = callback && callback instanceof Function ? callback : function(){};
var average, NumberSum = 0, NumberCount = 0, self = this;
callback = callback && callback instanceof Function ? callback : function() {};
var average,
NumberSum = 0,
NumberCount = 0,
self = this;
if (allowedStatistics.join().indexOf(statisticName) === -1) {
// Not a recognised statistic!
return callback(new Error("Invalid statistic."));
}
if (allowedStatistics.join().indexOf(statisticName) === -1) {
// Not a recognised statistic!
return callback(new Error("Invalid statistic."));
}
self.forEach(function(item) {
if (item.fetched && item.stateData[statisticName] !== null && !isNaN(item.stateData[statisticName])) {
NumberSum += item.stateData[statisticName];
NumberCount ++;
}
});
average = NumberSum / NumberCount;
callback(null, average);
return average;
self.forEach(function(item) {
if (item.fetched && item.stateData[statisticName] !== null && !isNaN(item.stateData[statisticName])) {
NumberSum += item.stateData[statisticName];
NumberCount++;
}
});
average = NumberSum / NumberCount;
callback(null, average);
return average;
};
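// A sketch of the statistics helpers above. Only the whitelisted names
// ("requestTime", "requestLatency", "downloadTime", "contentLength",
// "actualDataSize") are accepted; anything else yields an "Invalid statistic."
// error. `crawler` is assumed to be a Crawler instance whose crawl has already
// run, so crawler.queue holds fetched items.
crawler.queue.max("downloadTime", function(err, slowest) {
    console.log("slowest fetch took", slowest, "ms");
});

crawler.queue.min("requestLatency", function(err, fastest) {
    console.log("fastest response arrived after", fastest, "ms");
});

crawler.queue.avg("contentLength", function(err, averageBytes) {
    console.log("average resource size:", averageBytes, "bytes");
});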

@@ -205,13 +218,14 @@

FetchQueue.prototype.complete = function(callback) {
callback = callback && callback instanceof Function ? callback : function(){};
var NumberComplete = 0, self = this;
callback = callback && callback instanceof Function ? callback : function() {};
var NumberComplete = 0,
self = this;
self.forEach(function(item) {
if (item.fetched) {
NumberComplete ++;
}
});
self.forEach(function(item) {
if (item.fetched) {
NumberComplete++;
}
});
callback(null, NumberComplete);
return NumberComplete;
callback(null, NumberComplete);
return NumberComplete;
};

@@ -221,13 +235,14 @@

FetchQueue.prototype.countWithStatus = function(status, callback) {
callback = callback && callback instanceof Function ? callback : function(){};
var queueItemsMatched = 0, self = this;
callback = callback && callback instanceof Function ? callback : function() {};
var queueItemsMatched = 0,
self = this;
self.forEach(function(item) {
if (item.status === status) {
queueItemsMatched ++;
}
});
self.forEach(function(item) {
if (item.status === status) {
queueItemsMatched++;
}
});
callback(null,queueItemsMatched);
return queueItemsMatched;
callback(null, queueItemsMatched);
return queueItemsMatched;
};

@@ -237,14 +252,15 @@

FetchQueue.prototype.getWithStatus = function(status, callback) {
callback = callback && callback instanceof Function ? callback : function(){};
var subqueue = [], self = this;
callback = callback && callback instanceof Function ? callback : function() {};
var subqueue = [],
self = this;
self.forEach(function(item,index) {
if (item.status === status) {
subqueue.push(item);
subqueue[subqueue.length-1].queueIndex = index;
}
});
self.forEach(function(item, index) {
if (item.status === status) {
subqueue.push(item);
subqueue[subqueue.length - 1].queueIndex = index;
}
});
callback(null,subqueue);
return subqueue;
callback(null, subqueue);
return subqueue;
};
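// A sketch of interrogating the queue by status once a crawl has run. The
// statuses assigned by the crawler include "queued", "spooled", "headers",
// "downloaded", "redirected", "notfound", "failed" and "timeout". `crawler` is
// again an assumed, already-run Crawler instance.
crawler.queue.countWithStatus("notfound", function(err, missing) {
    console.log(missing, "resources returned 404/410");
});

crawler.queue.getWithStatus("failed", function(err, failedItems) {
    failedItems.forEach(function(item) {
        console.log("failed:", item.url, "(queue index " + item.queueIndex + ")");
    });
});

crawler.queue.errors(function(err, total) {
    console.log(total, "errors in total (failed + notfound)");
});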

@@ -254,10 +270,13 @@

FetchQueue.prototype.errors = function(callback) {
callback = callback && callback instanceof Function ? callback : function(){};
var total, failedCount, notFoundCount, self = this;
callback = callback && callback instanceof Function ? callback : function() {};
var total,
failedCount,
notFoundCount,
self = this;
failedCount = self.countWithStatus("failed");
notFoundCount = self.countWithStatus("notfound");
total = failedCount + notFoundCount;
callback(null, total);
return total;
failedCount = self.countWithStatus("failed");
notFoundCount = self.countWithStatus("notfound");
total = failedCount + notFoundCount;
callback(null, total);
return total;
};

@@ -267,20 +286,20 @@

FetchQueue.prototype.getLength = function(callback) {
return callback(null, this.length);
return callback(null, this.length);
};
// Writes the queue to disk
FetchQueue.prototype.freeze = function(filename,callback) {
callback = callback && callback instanceof Function ? callback : function(){};
var self = this;
FetchQueue.prototype.freeze = function(filename, callback) {
callback = callback && callback instanceof Function ? callback : function() {};
var self = this;
// Re-queue in-progress items before freezing...
self.forEach(function(item) {
if (item.fetched !== true) {
item.status = "queued";
}
});
// Re-queue in-progress items before freezing...
self.forEach(function(item) {
if (item.fetched !== true) {
item.status = "queued";
}
});
fs.writeFile(filename,JSON.stringify(self),function(err) {
callback(err, self);
});
fs.writeFile(filename, JSON.stringify(self), function(err) {
callback(err, self);
});
};

@@ -290,39 +309,44 @@

FetchQueue.prototype.defrost = function(filename, callback) {
callback = callback && callback instanceof Function ? callback : function(){};
var fileData, self = this, defrostedQueue = [];
callback = callback && callback instanceof Function ? callback : function() {};
var self = this,
defrostedQueue = [];
fs.readFile(filename,function(err,fileData) {
if (err) return callback(err);
fs.readFile(filename, function(err, fileData) {
if (err) {
return callback(err);
}
if (!fileData.toString("utf8").length) {
return callback(new Error("Failed to defrost queue from zero-length JSON."));
}
if (!fileData.toString("utf8").length) {
return callback(new Error("Failed to defrost queue from zero-length JSON."));
}
try {
defrostedQueue = JSON.parse(fileData.toString("utf8"));
} catch(error) {
return callback(error);
}
try {
defrostedQueue = JSON.parse(fileData.toString("utf8"));
} catch (error) {
return callback(error);
}
self.oldestUnfetchedIndex = Infinity;
self.scanIndex = {};
self.oldestUnfetchedIndex = Infinity;
self.scanIndex = {};
for (var index in defrostedQueue) {
if (defrostedQueue.hasOwnProperty(index) && !isNaN(index)) {
var queueItem = defrostedQueue[index];
self.push(queueItem);
for (var index in defrostedQueue) {
if (defrostedQueue.hasOwnProperty(index) && !isNaN(index)) {
var queueItem = defrostedQueue[index];
self.push(queueItem);
if (queueItem.status !== "downloaded")
self.oldestUnfetchedIndex = Math.min(
self.oldestUnfetchedIndex, index);
if (queueItem.status !== "downloaded") {
self.oldestUnfetchedIndex = Math.min(
self.oldestUnfetchedIndex, index);
}
self.scanIndex[queueItem.url] = true;
}
}
self.scanIndex[queueItem.url] = true;
}
}
if (self.oldestUnfetchedIndex === Infinity)
self.oldestUnfetchedIndex = 0;
if (self.oldestUnfetchedIndex === Infinity) {
self.oldestUnfetchedIndex = 0;
}
callback(null,self);
});
callback(null, self);
});
};
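// A sketch of persisting and restoring crawl state with freeze() and defrost().
// The snapshot path is arbitrary; defrost() re-queues anything not yet downloaded.
var thawedQueue = new FetchQueue();

thawedQueue.defrost("./queue-snapshot.json", function(err, queue) {
    if (err) {
        return console.log("could not restore queue:", err.message);
    }
    console.log("restored", queue.length, "queue items");
});

// ...and at shutdown time, on a live crawler's queue (crawler assumed to exist):
// crawler.queue.freeze("./queue-snapshot.json", function(err) {
//     if (!err) console.log("queue frozen to disk");
// });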

@@ -1,73 +0,85 @@

var Crawler = require("./crawler.js"),
URI = require("URIjs");
/*
* Simplecrawler
* https://github.com/cgiffard/node-simplecrawler
*
* Copyright (c) 2011-2015, Christopher Giffard
*
*/
var Crawler = require("./crawler.js"),
uri = require("urijs");
/*
	Public: Convenience function for really quick, simple crawls. It generates
	a new crawler, parses the URL provided, and sets up the new crawler with
	the host and path information extracted from the URL. It returns the crawler
	object, so you can set up event handlers, and waits until `process.nextTick`
	before kicking off the crawl.

	url             - URL to begin crawl from.
	successCallback - Optional function called once an item is completely
	                  downloaded. Functionally identical to a fetchcomplete
	                  event listener.
	failCallback    - Optional function to be called if an item fails to
	                  download. Functionally identical to a fetcherror
	                  event listener.

	Examples

		Crawler.crawl(
			"http://example.com:3000/start",
			function(queueItem, data) {
				console.log("I got a new item!");
			}
		);

		Crawler
			.crawl("http://www.example.com/")
			.on("fetchstart", function(queueItem) {
				console.log("Beginning fetch for", queueItem.url);
			});

	Returns the crawler object which has now been constructed.
*/
module.exports = function crawl(url,successCallback,failCallback) {
// Parse the URL first
url = URI(url);
module.exports = function crawl(url, successCallback, failCallback) {
// If either the protocol, path, or hostname are unset, we can't really
// do much. Die with error.
if (!url.protocol())
throw new Error("Can't crawl with unspecified protocol.");
// Parse the URL first
url = uri(url);
if (!url.hostname())
throw new Error("Can't crawl with unspecified hostname.");
// If either the protocol, path, or hostname are unset,
// we can't really do much. Die with error.
if (!url.protocol()) {
throw new Error("Can't crawl with unspecified protocol.");
}
if (!url.path())
throw new Error("Can't crawl with unspecified path.");
if (!url.hostname()) {
throw new Error("Can't crawl with unspecified hostname.");
}
var tmpCrawler =
new Crawler(
url.hostname(),
url.path(),
url.port() || 80);
if (!url.path()) {
throw new Error("Can't crawl with unspecified path.");
}
// Attach callbacks if they were provided
if (successCallback) tmpCrawler.on("fetchcomplete",successCallback);
if (failCallback) tmpCrawler.on("fetcherror",failCallback);
var tmpCrawler = new Crawler(url.hostname(), url.path(), url.port() || 80);
// Start the crawler on the next runloop
// This enables initial configuration options and event handlers to take
// effect before the first resource is queued.
process.nextTick(function() {
tmpCrawler.start();
});
// Attach callbacks if they were provided
if (successCallback) {
tmpCrawler.on("fetchcomplete", successCallback);
}
if (failCallback) {
tmpCrawler.on("fetcherror", failCallback);
}
// Return crawler
return tmpCrawler;
// Start the crawler on the next runloop
// This enables initial configuration options and event handlers to take
// effect before the first resource is queued.
process.nextTick(function() {
tmpCrawler.start();
});
// Return crawler
return tmpCrawler;
};
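Putting the Examples above together, a minimal usage sketch (assuming `Crawler` is the object on which this crawl() helper is exposed, and using example.com as a stand-in host). Because start() is deferred to process.nextTick, listeners attached synchronously after the call still see the very first fetch:

// Sketch only: quick crawl with listeners attached before the first request.
var crawler = Crawler.crawl("http://www.example.com/");

crawler.on("fetchcomplete", function(queueItem, data) {
    console.log("Downloaded", queueItem.url, "(" + data.length + " bytes)");
});

crawler.on("fetcherror", function(queueItem) {
    console.log("Failed to fetch", queueItem.url);
});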
{
"name": "simplecrawler",
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
"version": "0.5.3",
"description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
"version": "0.5.4",
"homepage": "https://github.com/cgiffard/node-simplecrawler",
"author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
"license": "BSD-2-Clause",
"repository": {
"type": "git",
"url": "https://github.com/cgiffard/node-simplecrawler.git"
"url": "git+https://github.com/cgiffard/node-simplecrawler.git"
},

@@ -24,3 +25,5 @@ "bugs": {

"scripts": {
"test": "mocha -R spec -t 5000"
"lint": "eslint example/ lib/ test/",
"mocha": "mocha -R spec -t 5000",
"test": "npm run lint && npm run mocha"
},

@@ -32,8 +35,8 @@ "bin": {

"dependencies": {
"URIjs": "^1.15.0"
"urijs": "^1.16.1"
},
"devDependencies": {
"chai": "^2.2.0",
"jshint": "^2.7.0",
"mocha": "^2.2.4"
"chai": "^3.2.0",
"eslint": "^1.5.1",
"mocha": "^2.3.2"
},

@@ -40,0 +43,0 @@ "engines": {

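The only runtime dependency change is the move from the "URIjs" package name to its renamed successor "urijs", matching the require() change in lib/index.js above. A minimal sketch of the parsing calls the crawl() helper relies on (package name taken from the diff; the URL is a stand-in):

// Sketch only: urijs exposes the accessors used by crawl() above.
var uri = require("urijs");

var parsed = uri("http://example.com:3000/start");
console.log(parsed.protocol());  // "http"
console.log(parsed.hostname());  // "example.com"
console.log(parsed.path());      // "/start"
console.log(parsed.port());      // "3000"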