simplecrawler - npm package version comparison

Comparing version 0.1.7 to 0.2.0

example/savetodisk.js

lib/crawler.js

@@ -9,2 +9,3 @@ // Simplecrawler

Cache = require("./cache.js"),
CookieJar = require("./cookies.js"),
MetaInfo = require("../package.json");

@@ -19,3 +20,3 @@

Public: Constructor for the crawler.
host - Initial hostname/domain to begin crawling from. By

@@ -26,7 +27,7 @@ default, the crawl will be locked to this hostname.

interval - Request interval for the crawler. Defaults to 250ms.
Examples
var crawler = new Crawler("example.com","/",80,500);
var crawler = new Crawler("example.com");
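
The constructor examples above show the signature shrinking from (host, path, port, interval) to a single host argument; per the "SETTINGS TO STUFF WITH" comment below, everything else is now configured on the instance after construction. A minimal sketch of the newer style (the require form and the .Crawler export are assumptions not shown in this diff):

var Crawler = require("simplecrawler").Crawler; // export shape assumed; adjust to match lib/index.js
var crawler = new Crawler("example.com");       // new single-argument form shown above
crawler.interval = 500;                         // request interval in ms; the diff documents a 250ms default
crawler.maxConcurrency = 5;                     // "Be sensible. Five ties in with node's default maxSockets value."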

@@ -41,6 +42,6 @@

throw new Error("Port must be a number!");
// SETTINGS TO STUFF WITH
// (not here! Do it when you create a `new Crawler()`)
// Domain to crawl

@@ -58,3 +59,3 @@ this.host = host || "";

this.interval = interval || 250;
// Maximum request concurrency. Be sensible. Five ties in with node's

@@ -75,3 +76,3 @@ // default maxSockets value.

this.queue = new FetchQueue();
// Do we filter by domain?

@@ -99,8 +100,15 @@ // Unless you want to be crawling the entire internet, I would

this.proxyPort = 8123;
// Support for HTTP basic auth
this.needsAuth = false;
this.authUser = "";
this.authPass = "";
// Support for retaining cookies for parse duration
this.acceptCookies = true;
this.cookies = new CookieJar();
// Support for custom headers...
this.customHeaders = {};
// Domain Whitelist
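
This hunk introduces per-crawler support for HTTP basic auth, a cookie jar, and arbitrary custom headers. A hedged sketch of turning these on (the credential values and the header below are placeholders):

crawler.needsAuth = true;
crawler.authUser = "username";    // placeholder credentials
crawler.authPass = "password";
crawler.acceptCookies = true;     // already the default in this diff
crawler.customHeaders = {
    "accept-language": "en"       // every entry here is merged into each request's headers
};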

@@ -130,2 +138,5 @@ // We allow domains to be whitelisted, so cross-domain requests can be made.

this.downloadUnsupported = true;
// URL Encoding setting...
this.urlEncoding = "unicode";
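
0.2.0 also adds a urlEncoding setting; later in this diff, processURL re-encodes resolved URLs with URIjs's iso8859() when it is set to "iso8859". A one-line sketch:

crawler.urlEncoding = "iso8859"; // default is "unicode" per the line above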

@@ -145,3 +156,3 @@ // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH

commences.
Examples

@@ -156,6 +167,6 @@

var crawler = this;
// only if we haven't already got stuff in our queue...
if (!this.queue.length) {
// Initialise our queue by pushing the initial request data into it...

@@ -171,10 +182,10 @@ this.queue.add(

}
this.crawlIntervalID = setInterval(function() {
crawler.crawl.call(crawler);
},this.interval);
this.emit("crawlstart");
this.running = true;
// Now kick off the initial crawl

@@ -184,3 +195,3 @@ process.nextTick(function() {

});
return this;
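
start() now emits a crawlstart event, kicks off the crawl interval, and returns the crawler for chaining. A short usage sketch:

crawler.on("crawlstart", function() {
    console.log("Crawl of " + crawler.host + " started");
});
crawler.start();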

@@ -193,3 +204,3 @@ };

URL - URL with a protocol, for testing.
Examples

@@ -205,6 +216,6 @@

var protocol;
try {
protocol = URI(URL).protocol();
} catch(e) {

@@ -214,3 +225,3 @@ // If URIjs died, we definitely /do not/ support the protocol.

}
return this.allowedProtocols.reduce(function(prev,protocolCheck) {

@@ -235,3 +246,3 @@ return prev || !!protocolCheck.exec(protocol);

Crawler.prototype.mimeTypeSupported = function(MIMEType) {
return (
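
protocolSupported and mimeTypeSupported are simple predicates that reduce over allowedProtocols and supportedMimeTypes respectively. A sketch (the default contents of those two lists are not shown in this diff):

crawler.protocolSupported("https://example.com/page"); // true if a pattern in allowedProtocols matches the protocol
crawler.mimeTypeSupported("text/html");                // true if a pattern in supportedMimeTypes matches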

@@ -258,12 +269,17 @@ this.supportedMimeTypes.reduce(function(prev,mimeCheck) {

Crawler.prototype.processURL = function(URL,context) {
var newURL;
var newURL, crawler = this;
// If the URL didn't contain anything, don't fetch it.
if (!URL.replace(/\s+/ig,"").length) return false;
try {
newURL =
URI(URL)
.absoluteTo(context.url)
.normalize();
if (crawler.urlEncoding === "iso8859") {
newURL = newURL.iso8859();
}
} catch(e) {

@@ -273,3 +289,3 @@ // Couldn't process the URL, since URIjs choked on it.

}
// simplecrawler uses slightly different terminology to URIjs. Sorry!
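
processURL resolves a discovered URL against the queue item it was found on, normalises it, applies the urlEncoding setting, and returns false when URIjs cannot parse it; otherwise it returns a plain object whose field names, as the comment notes, differ slightly from URIjs. A hedged sketch, assuming queueItem is an existing item with a url property:

var parsed = crawler.processURL("../images/logo.png", queueItem);
if (parsed) {
    // queueURL accepts either a raw string or an object already produced by processURL
    crawler.queueURL(parsed, queueItem);
}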

@@ -304,3 +320,3 @@ return {

crawler = this;
// Regular expressions for finding URL items in HTML and text

@@ -311,3 +327,3 @@ var discoverRegex = [

/url\([^)]+/ig,
// This might be a bit of a gamble... but get hard-coded

@@ -319,3 +335,3 @@ // strings out of javacript: URLs. They're often popup-image

];
function cleanURL(URL) {

@@ -337,7 +353,7 @@ return URL

if (!urlMatch) return [];
return urlMatch
.map(cleanURL)
.reduce(function(list,URL) {
// Ensure URL is whole and complete

@@ -353,9 +369,9 @@ try {

}
// If we hit an empty item, don't add return it
if (!URL.length) return list;
// If we don't support the protocol in question
if (!crawler.protocolSupported(URL)) return list;
// Does the item already exist in the list?

@@ -398,9 +414,9 @@ if (resources.reduce(function(prev,current) {

crawlerHost = crawler.host;
// If we're ignoring the WWW domain, remove the WWW for comparisons...
if (crawler.ignoreWWWDomain)
host = host.replace(/^www\./i,"");
function domainInWhitelist(host) {
// If there's no whitelist, or the whitelist is of zero length,

@@ -410,12 +426,12 @@ // just return false.

!crawler.domainWhitelist.length) return false;
// Otherwise, scan through it.
return !!crawler.domainWhitelist.reduce(function(prev,cur,index,array) {
// If we already located the relevant domain in the whitelist...
if (prev) return prev;
// If the domain is just equal, return true.
if (host === cur) return true;
// If we're ignoring WWW subdomains, and both domains,

@@ -425,3 +441,3 @@ // less www. are the same, return true.

return true;
// Otherwise, sorry. No dice.
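
domainValid strips a leading "www." when ignoreWWWDomain is set, compares the host against the crawl host, and falls back to the domain whitelist for cross-domain requests. A sketch of the two settings involved (their defaults are not shown in this diff):

crawler.ignoreWWWDomain = true;                // treat www.example.com and example.com as the same host
crawler.domainWhitelist = ["cdn.example.com"]; // extra hosts allowed despite domain filtering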

@@ -434,7 +450,7 @@ return false;

function isSubdomainOf(subdomain,host) {
// Comparisons must be case-insensitive
subdomain = subdomain.toLowerCase();
host = host.toLowerCase();
// If we're ignoring www, remove it from both

@@ -446,3 +462,3 @@ // (if www is the first domain component...)

}
// They should be the same flipped around!

@@ -500,3 +516,3 @@ return (

resources.forEach(function(url){ crawler.queueURL(url,queueItem); });
return this;

@@ -525,3 +541,3 @@ };

typeof(url) === "object" ? url : crawler.processURL(url,queueItem);
// URL Parser decided this URL was junky. Next please!

@@ -542,3 +558,3 @@ if (!parsedURL) {

}
// Check the domain is valid before adding it to the queue

@@ -568,3 +584,3 @@ if (crawler.domainValid(parsedURL.host)) {

}
return true;
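
queueURL runs a string through processURL (unless it is already a parsed object), checks domainValid, and returns true once the item has been added to the queue. A sketch:

var queued = crawler.queueURL("http://example.com/about", queueItem);
// the comments above imply a falsy result for unparseable URLs or rejected domains; true once queued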

@@ -577,5 +593,5 @@ };

It kicks off resource discovery and queues any new resources found.
queueItem - Queue item to be fetched.
Emits

@@ -602,6 +618,3 @@ fetchstart

crawler.openRequests ++;
// Emit fetchstart event
crawler.emit("fetchstart",queueItem);
// Variable declarations

@@ -611,10 +624,3 @@ var fetchData = false,

clientRequest,
timeCommenced,
timeHeadersReceived,
timeDataReceived,
parsedURL,
responseBuffer,
responseLength,
responseLengthReceived,
contentType;
timeCommenced;

@@ -647,5 +653,13 @@ // Mark as spooled

if(crawler.needsAuth) {
// Add cookie header from cookie jar if we're configured to
// send/accept cookies
if (crawler.acceptCookies && crawler.cookies.getAsHeader()) {
requestOptions.headers.cookie =
crawler.cookies.getAsHeader(requestHost,requestPath);
}
// Add auth headers if we need them
if (crawler.needsAuth) {
var auth = crawler.authUser + ":" + crawler.authPass;
// Generate auth header

@@ -656,189 +670,252 @@ auth = 'Basic ' + (new Buffer(auth).toString('base64'));

// Record what time we started this request
timeCommenced = (new Date().getTime());
// And if we've got any custom headers available
if (crawler.customHeaders) {
for (var header in crawler.customHeaders) {
if (!crawler.customHeaders.hasOwnProperty(header)) continue;
requestOptions.headers[header] = crawler.customHeaders[header];
}
}
// Emit fetchstart event - gives the user time to mangle the request options
// if required.
crawler.emit("fetchstart",queueItem,requestOptions);
// Get the resource!
clientRequest = client.get(requestOptions,function(response) {
var dataReceived = false,
stateData = queueItem.stateData;
process.nextTick(function() {
// Record what time we started this request
timeCommenced = Date.now();
responseLengthReceived = 0;
// Get the resource!
clientRequest =
client.get(requestOptions,function(response) {
crawler.handleResponse(queueItem,response,timeCommenced);
});
clientRequest.on("error",function(errorData) {
crawler.openRequests --;
// Emit 5xx / 4xx event
crawler.emit("fetchclienterror",queueItem,errorData);
queueItem.fetched = true;
queueItem.stateData.code = 599;
queueItem.status = "failed";
});
return crawler;
});
};
// Record what time we first received the header information
timeHeadersReceived = (new Date().getTime());
responseLength = parseInt(response.headers["content-length"],10);
responseLength = !isNaN(responseLength) ? responseLength : 0;
/*
Public: Given a queueItem and a matching response object, the crawler will
handle downloading the resource, queueing of linked items, etc.
Examples
// Passing in a response from `request`
request(queueItem.url,function(err,res,body) {
crawler.handleResponse(queueItem,res);
});
// Save timing and content some header information into queue
stateData.requestLatency = (timeHeadersReceived - timeCommenced);
stateData.requestTime = (timeHeadersReceived - timeCommenced);
stateData.contentLength = responseLength;
stateData.contentType = contentType = response.headers["content-type"];
stateData.code = response.statusCode;
Returns the crawler object for chaining.
// Save entire headers, in less scannable way
stateData.headers = response.headers;
*/
Crawler.prototype.handleResponse = function(queueItem,response,timeCommenced) {
var crawler = this,
dataReceived = false,
timeHeadersReceived,
timeDataReceived,
parsedURL,
responseBuffer,
responseLength,
responseLengthReceived = 0,
contentType,
stateData = queueItem.stateData;
// Record what time we first received the header information
timeHeadersReceived = Date.now();
// If we weren't passed a time of commencement, assume Now()
timeCommenced = timeCommenced || Date.now();
// Emit header receive event
crawler.emit("fetchheaders",queueItem,response);
responseLength = parseInt(response.headers["content-length"],10);
responseLength = !isNaN(responseLength) ? responseLength : 0;
// Ensure response length is reasonable...
responseLength =
responseLength > 0 ? responseLength : crawler.maxResourceSize;
queueItem.stateData.contentLength = responseLength;
// Save timing and content some header information into queue
stateData.requestLatency = (timeHeadersReceived - timeCommenced);
stateData.requestTime = (timeHeadersReceived - timeCommenced);
stateData.contentLength = responseLength;
stateData.contentType = contentType = response.headers["content-type"];
stateData.code = response.statusCode;
stateData.headers = response.headers;
// Do we need to save cookies? Were we sent any?
if (crawler.acceptCookies &&
response.headers.hasOwnProperty('set-cookie'))
crawler.cookies.addFromHeaders(response.headers["set-cookie"]);
// Emit header receive event
crawler.emit("fetchheaders",queueItem,response);
// Function for dealing with 200 responses
function processReceivedData() {
if (queueItem.fetched) return;
timeDataReceived = (new Date().getTime());
// Ensure response length is reasonable...
responseLength =
responseLength > 0 ? responseLength : crawler.maxResourceSize;
queueItem.fetched = true;
queueItem.status = "downloaded";
// Save state information
stateData.downloadTime = (timeDataReceived - timeHeadersReceived);
stateData.requestTime = (timeDataReceived - timeCommenced);
stateData.actualDataSize = responseBuffer.length;
stateData.sentIncorrectSize = responseBuffer.length !== responseLength;
queueItem.stateData.contentLength = responseLength;
crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
// Function for dealing with 200 responses
function processReceivedData() {
if (queueItem.fetched) return;
// First, save item to cache (if we're using a cache!)
if (crawler.cache !== null &&
crawler.cache.setCacheData instanceof Function) {
crawler.cache.setCacheData(queueItem,responseBuffer);
}
timeDataReceived = (new Date().getTime());
// We only process the item if it's of a valid mimetype
// and only if the crawler is set to discover its own resources
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
crawler.queueLinkedItems(responseBuffer,queueItem);
}
crawler.openRequests --;
queueItem.fetched = true;
queueItem.status = "downloaded";
// Save state information
stateData.downloadTime = (timeDataReceived - timeHeadersReceived);
stateData.requestTime = (timeDataReceived - timeCommenced);
stateData.actualDataSize = responseBuffer.length;
stateData.sentIncorrectSize = responseBuffer.length !== responseLength;
crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
// First, save item to cache (if we're using a cache!)
if (crawler.cache !== null &&
crawler.cache.setCacheData instanceof Function) {
crawler.cache.setCacheData(queueItem,responseBuffer);
}
function receiveData(chunk) {
if (chunk && chunk.length && !dataReceived) {
if (responseLengthReceived + chunk.length > responseBuffer.length) {
// Oh dear. We've been sent more data than we were initially told.
// This could be a mis-calculation, or a streaming resource.
// Let's increase the size of our buffer to match, as long as it isn't
// larger than our maximum resource size.
// We only process the item if it's of a valid mimetype
// and only if the crawler is set to discover its own resources
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
crawler.queueLinkedItems(responseBuffer,queueItem);
}
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) {
// Start by creating a new buffer, which will be our main buffer going forward...
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length);
crawler.openRequests --;
}
// Copy all our old data into it...
responseBuffer.copy(tmpNewBuffer,0,0,responseBuffer.length);
function receiveData(chunk) {
if (chunk && chunk.length && !dataReceived) {
if (responseLengthReceived + chunk.length > responseBuffer.length) {
// Oh dear. We've been sent more data than we were initially told.
// This could be a mis-calculation, or a streaming resource.
// Let's increase the size of our buffer to match, as long as it isn't
// larger than our maximum resource size.
// And now the new chunk
chunk.copy(tmpNewBuffer,responseBuffer.length,0,chunk.length);
if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) {
// Start by creating a new buffer, which will be our main
// buffer from now on...
var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length);
// And now make the response buffer our new buffer, leaving the original for GC
responseBuffer = tmpNewBuffer;
// Copy all our old data into it...
responseBuffer.copy(tmpNewBuffer,0,0,responseBuffer.length);
} else {
// Oh dear oh dear! The response is not only more data than we were initially told,
// but it also exceeds the maximum amount of data we're prepared to download per resource.
// Throw error event and ignore.
//
// We'll then deal with the data that we have.
// And now the new chunk
chunk.copy(tmpNewBuffer,responseBuffer.length,0,chunk.length);
crawler.emit("fetchdataerror",queueItem,response);
}
// And now make the response buffer our new buffer,
// leaving the original for GC
responseBuffer = tmpNewBuffer;
} else {
// Copy the chunk data into our main buffer
chunk.copy(responseBuffer,responseLengthReceived,0,chunk.length);
// Oh dear oh dear! The response is not only more data
// than we were initially told, but it also exceeds the
// maximum amount of data we're prepared to download per
// resource.
//
// Throw error event and ignore.
//
// We'll then deal with the data that we have.
crawler.emit("fetchdataerror",queueItem,response);
}
// Increment our data received counter
responseLengthReceived += chunk.length;
} else {
// Copy the chunk data into our main buffer
chunk.copy(responseBuffer,responseLengthReceived,0,chunk.length);
}
// Increment our data received counter
responseLengthReceived += chunk.length;
}
if ((responseLengthReceived >= responseLength || response.complete) && !dataReceived) {
// Slice the buffer to chop off any unused space
responseBuffer = responseBuffer.slice(0,responseLengthReceived);
dataReceived = true;
processReceivedData();
}
}
// If we should just go ahead and get the data
if (response.statusCode >= 200 && response.statusCode < 300 && responseLength <= crawler.maxResourceSize) {
queueItem.status = "headers";
if ((responseLengthReceived >= responseLength || response.complete) &&
!dataReceived) {
// Create a buffer with our response length
responseBuffer = new Buffer(responseLength);
// Slice the buffer to chop off any unused space
responseBuffer = responseBuffer.slice(0,responseLengthReceived);
response.on("data",receiveData);
response.on("end",receiveData);
dataReceived = true;
processReceivedData();
}
}
// We've got a not-modified response back
} else if (response.statusCode === 304) {
// If we should just go ahead and get the data
if (response.statusCode >= 200 && response.statusCode < 300 &&
responseLength <= crawler.maxResourceSize) {
queueItem.status = "headers";
if (crawler.cache !== null && crawler.cache.getCacheData) {
// We've got access to a cache
crawler.cache.getCacheData(queueItem,function(cacheObject) {
crawler.emit("notmodified",queueItem,response,cacheObject);
});
} else {
// Emit notmodified event. We don't have a cache available, so we don't send any data.
crawler.emit("notmodified",queueItem,response);
}
// Create a buffer with our response length
responseBuffer = new Buffer(responseLength);
// If we should queue a redirect
} else if (response.statusCode >= 300 && response.statusCode < 400 && response.headers.location) {
queueItem.fetched = true;
queueItem.status = "redirected";
// Parse the redirect URL ready for adding to the queue...
parsedURL = crawler.processURL(response.headers.location,queueItem);
response.on("data",receiveData);
response.on("end",receiveData);
// Emit redirect event
crawler.emit("fetchredirect",queueItem,parsedURL,response);
// We've got a not-modified response back
} else if (response.statusCode === 304) {
// Clean URL, add to queue...
crawler.queueURL(parsedURL,queueItem);
if (crawler.cache !== null && crawler.cache.getCacheData) {
// We've got access to a cache
crawler.cache.getCacheData(queueItem,function(cacheObject) {
crawler.emit("notmodified",queueItem,response,cacheObject);
});
} else {
// Emit notmodified event. We don't have a cache available, so
// we don't send any data.
crawler.emit("notmodified",queueItem,response);
}
crawler.openRequests --;
// If we should queue a redirect
} else if (response.statusCode >= 300 && response.statusCode < 400 &&
response.headers.location) {
queueItem.fetched = true;
queueItem.status = "redirected";
// Ignore this request, but record that we had a 404
} else if (response.statusCode === 404) {
queueItem.fetched = true;
queueItem.status = "notfound";
// Parse the redirect URL ready for adding to the queue...
parsedURL = crawler.processURL(response.headers.location,queueItem);
// Emit 404 event
crawler.emit("fetch404",queueItem,response);
// Emit redirect event
crawler.emit("fetchredirect",queueItem,parsedURL,response);
crawler.openRequests --;
// Clean URL, add to queue...
crawler.queueURL(parsedURL,queueItem);
// And oh dear. Handle this one as well. (other 400s, 500s, etc)
} else {
queueItem.fetched = true;
queueItem.status = "failed";
crawler.openRequests --;
// Emit 5xx / 4xx event
crawler.emit("fetcherror",queueItem,response);
// Ignore this request, but record that we had a 404
} else if (response.statusCode === 404) {
queueItem.fetched = true;
queueItem.status = "notfound";
crawler.openRequests --;
}
});
// Emit 404 event
crawler.emit("fetch404",queueItem,response);
clientRequest.on("error",function(errorData) {
crawler.openRequests --;
// Emit 5xx / 4xx event
crawler.emit("fetchclienterror",queueItem,errorData);
// And oh dear. Handle this one as well. (other 400s, 500s, etc)
} else {
queueItem.fetched = true;
queueItem.stateData.code = 599;
queueItem.status = "failed";
});
// Emit 5xx / 4xx event
crawler.emit("fetcherror",queueItem,response);
crawler.openRequests --;
}
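
The rewritten fetch path moves response handling into the public handleResponse method and reports progress through events; fetchstart now also receives the request options so they can be adjusted before the request is made. A hedged sketch of listeners for the events emitted in this hunk (the header tweak is illustrative only):

crawler.on("fetchstart", function(queueItem, requestOptions) {
    requestOptions.headers["accept-language"] = "en"; // adjust the outgoing request here if needed
});
crawler.on("fetchheaders", function(queueItem, response) {
    console.log(response.statusCode, queueItem.url);
});
crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
    console.log("Downloaded", responseBuffer.length, "bytes from", queueItem.url);
});
crawler.on("fetchredirect", function(queueItem, parsedURL, response) { /* 3xx with a location header */ });
crawler.on("fetch404", function(queueItem, response) { /* resource not found */ });
crawler.on("fetcherror", function(queueItem, response) { /* other 4xx/5xx responses */ });
crawler.on("fetchdataerror", function(queueItem, response) { /* resource exceeded maxResourceSize */ });
crawler.on("fetchclienterror", function(queueItem, errorData) { /* network failure; stateData.code is set to 599 */ });
crawler.on("notmodified", function(queueItem, response, cacheObject) {
    // 304; cacheObject is only supplied when a cache with getCacheData is configured
});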

@@ -853,3 +930,3 @@ return crawler;

to do so and there are unfetched items in the queue.
Examples

@@ -864,5 +941,5 @@

var crawler = this;
if (crawler.openRequests > crawler.maxConcurrency) return;
crawler.queue.oldestUnfetchedItem(function(err,queueItem) {

@@ -880,3 +957,3 @@ if (queueItem) {

});
return crawler;

@@ -941,3 +1018,3 @@ };

Returns true if the fetch condition was removed, and throws an error if it
could not be found.

@@ -949,3 +1026,3 @@

this.fetchConditions[index] instanceof Function) {
return !!this.fetchConditions.splice(index,1);
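
removeFetchCondition checks that fetchConditions[index] is a function, splices it out, and returns true, throwing if the condition cannot be found. A hedged sketch; the addFetchCondition pairing and the shape of the parsedURL argument are assumptions based on the surrounding API rather than lines shown in this diff:

// Assumed: addFetchCondition stores the function and returns its index
var index = crawler.addFetchCondition(function(parsedURL) {
    return !/\.pdf$/i.test(parsedURL.path); // skip PDFs; the `path` field is assumed from processURL's output
});
// ...later, when the condition is no longer wanted...
crawler.removeFetchCondition(index); // true on success; throws if the condition could not be found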

@@ -952,0 +1029,0 @@ } else {

package.json

{
"name": "simplecrawler",
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
"version": "0.1.7",
"homepage": "http://github.com/cgiffard/node-simplecrawler",
"author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
"keywords": [
"crawler",
"spider",
"cache",
"queue",
"simplecrawler",
"eventemitter"
],
"scripts": {
"test": "mocha -R spec"
},
"bin": {
"crawl": "./lib/cli.js"
},
"repository": {
"type": "git",
"url": "http://github.com/cgiffard/node-simplecrawler.git"
},
"bugs": {
"url": "https://github.com/cgiffard/node-simplecrawler/issues"
},
"main": "./lib/index.js",
"engines": {
"node": ">=0.4.0"
},
"devDependencies": {
"mocha": "~1.8.1",
"jshint": "~0.7.x",
"chai": "~1.2.0"
},
"dependencies": {
"iconv": "~1.2.4",
"URIjs": "~1.8.3"
}
}

{
"name": "simplecrawler",
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
"version": "0.2.0",
"homepage": "http://github.com/cgiffard/node-simplecrawler",
"author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
"keywords": [
"crawler",
"spider",
"cache",
"queue",
"simplecrawler",
"eventemitter"
],
"scripts": {
"test": "mocha -R spec"
},
"bin": {
"crawl": "./lib/cli.js"
},
"repository": {
"type": "git",
"url": "http://github.com/cgiffard/node-simplecrawler.git"
},
"bugs": {
"url": "https://github.com/cgiffard/node-simplecrawler/issues"
},
"main": "./lib/index.js",
"engines": {
"node": ">=0.4.0"
},
"devDependencies": {
"mocha": "~1.8.1",
"jshint": "~0.7.x",
"chai": "~1.2.0"
},
"dependencies": {
"iconv": "~1.2.4",
"URIjs": "~1.8.3"
}
}

@@ -18,2 +18,3 @@ // Tests to ensure crawler code is well formed

"cli",
"cookies",
"crawler",

@@ -20,0 +21,0 @@ "index",

