
simplecrawler - npm package version comparison

Comparing version 0.2.5 to 0.2.6


lib/crawler.js

@@ -37,2 +37,4 @@ // Simplecrawler

  var Crawler = function(host,initialPath,initialPort,interval) {
+ var crawler = this;
  // Data integrity checks

@@ -46,8 +48,8 @@ if (initialPort && isNaN(initialPort))

  // Domain to crawl
- this.host = host || "";
+ crawler.host = host || "";
  // Gotta start crawling *somewhere*
- this.initialPath = initialPath || "/";
- this.initialPort = initialPort || 80;
- this.initialProtocol = "http";
+ crawler.initialPath = initialPath || "/";
+ crawler.initialPort = initialPort || 80;
+ crawler.initialProtocol = "http";

@@ -57,13 +59,13 @@ // Internal 'tick' interval for spawning new requests

  // One request will be spooled per tick, up to the concurrency threshold.
- this.interval = interval || 250;
+ crawler.interval = interval || 250;
  // Maximum request concurrency. Be sensible. Five ties in with node's
  // default maxSockets value.
- this.maxConcurrency = 5;
+ crawler.maxConcurrency = 5;
  // Maximum time we'll wait for headers
- this.timeout = 5 * 60 * 1000;
+ crawler.timeout = 5 * 60 * 1000;
  // User Agent
- this.userAgent =
+ crawler.userAgent =
  "Node/" + MetaInfo.name + " " + MetaInfo.version +

@@ -74,3 +76,3 @@ " (" + MetaInfo.repository.url + ")";

  // (but it's basically just an array)
- this.queue = new FetchQueue();
+ crawler.queue = new FetchQueue();

@@ -80,40 +82,40 @@ // Do we filter by domain?

  // recommend leaving this on!
- this.filterByDomain = true;
+ crawler.filterByDomain = true;
  // Do we scan subdomains?
- this.scanSubdomains = false;
+ crawler.scanSubdomains = false;
  // Treat WWW subdomain the same as the main domain (and don't count
  // it as a separate subdomain)
- this.ignoreWWWDomain = true;
+ crawler.ignoreWWWDomain = true;
  // Or go even further and strip WWW subdomain from domains altogether!
- this.stripWWWDomain = false;
+ crawler.stripWWWDomain = false;
  // Internal cachestore
- this.cache = null;
+ crawler.cache = null;
  // Use an HTTP Proxy?
- this.useProxy = false;
- this.proxyHostname = "127.0.0.1";
- this.proxyPort = 8123;
+ crawler.useProxy = false;
+ crawler.proxyHostname = "127.0.0.1";
+ crawler.proxyPort = 8123;
  // Support for HTTP basic auth
- this.needsAuth = false;
- this.authUser = "";
- this.authPass = "";
+ crawler.needsAuth = false;
+ crawler.authUser = "";
+ crawler.authPass = "";
  // Support for retaining cookies for parse duration
- this.acceptCookies = true;
- this.cookies = new CookieJar();
+ crawler.acceptCookies = true;
+ crawler.cookies = new CookieJar();
  // Support for custom headers...
- this.customHeaders = {};
+ crawler.customHeaders = {};
  // Domain Whitelist
  // We allow domains to be whitelisted, so cross-domain requests can be made.
- this.domainWhitelist = [];
+ crawler.domainWhitelist = [];
  // Supported Protocols
- this.allowedProtocols = [
+ crawler.allowedProtocols = [
  /^http(s)?$/i, // HTTP & HTTPS

@@ -124,7 +126,7 @@ /^(rss|atom|feed)(\+xml)?$/i // RSS / XML

  // Max file size to download/store
- this.maxResourceSize = 1024 * 1024 * 16; // 16mb
+ crawler.maxResourceSize = 1024 * 1024 * 16; // 16mb
  // Supported MIME-types
  // Matching MIME-types will be scanned for links
- this.supportedMimeTypes = [
+ crawler.supportedMimeTypes = [
  /^text\//i,

@@ -137,13 +139,36 @@ /^application\/(rss|html|xhtml)?[\+\/\-]?xml/i,

  // Download linked, but unsupported files (binary - images, documents, etc)
- this.downloadUnsupported = true;
+ crawler.downloadUnsupported = true;
  // URL Encoding setting...
- this.urlEncoding = "unicode";
+ crawler.urlEncoding = "unicode";
  // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
- this.openRequests = 0;
- this.fetchConditions = [];
+ crawler._openRequests = 0;
+ crawler._fetchConditions = [];
+ crawler._openListeners = 0;
+ crawler._listenerMap = {};
  // Run the EventEmitter constructor
- EventEmitter.call(this);
+ EventEmitter.call(crawler);
+ crawler._emitSpecial = function() {
+ var args = Array.prototype.slice.call(arguments,0),
+ event = args[0],
+ eventArgsLen = args.length-1,
+ asyncListenerCount = 0;
+ crawler.listeners(event).forEach(function(listener) {
+ if (listener.length > eventArgsLen)
+ asyncListenerCount++;
+ });
+ crawler._openListeners += asyncListenerCount|0;
+ crawler.emit.apply(crawler,args.concat([
+ function listenerComplete() {
+ if (crawler._openListeners > 0)
+ crawler._openListeners --;
+ }
+ ]));
+ };
  };
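The `_emitSpecial` helper added above is what enables asynchronous event listeners: when an event fires, any listener declared with one more parameter than the event supplies is counted in `_openListeners` and receives a `listenerComplete` callback as its final argument, and (per the crawl-loop change further down) the crawl is not considered complete until every such callback has fired. A minimal usage sketch, assuming the package's main export is the Crawler constructor:

    var Crawler = require("simplecrawler"); // assumption: the main export is the constructor
    var crawler = new Crawler("example.com", "/", 80);

    // Because this handler declares a fourth parameter (evtDone), _emitSpecial
    // counts it as an open listener and appends a completion callback.
    crawler.on("fetchcomplete", function(queueItem, responseBuffer, response, evtDone) {
        // Simulate slow asynchronous processing, as the new test below does with setTimeout.
        setTimeout(function() {
            console.log("Processed " + queueItem.url + " (" + responseBuffer.length + " bytes)");
            evtDone(); // signal that this listener has finished
        }, 250);
    });

    crawler.start();

A listener whose parameter count does not exceed the event's argument count behaves exactly as before and needs no callback.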

@@ -171,10 +196,10 @@

  // only if we haven't already got stuff in our queue...
- if (!this.queue.length) {
+ if (!crawler.queue.length) {
  // Initialise our queue by pushing the initial request data into it...
- this.queue.add(
- this.initialProtocol,
- this.host,
- this.initialPort,
- this.initialPath,
+ crawler.queue.add(
+ crawler.initialProtocol,
+ crawler.host,
+ crawler.initialPort,
+ crawler.initialPath,
  function(error) {

@@ -185,8 +210,8 @@ if (error) throw error;

- this.crawlIntervalID = setInterval(function() {
+ crawler.crawlIntervalID = setInterval(function() {
  crawler.crawl.call(crawler);
- },this.interval);
+ },crawler.interval);
- this.emit("crawlstart");
- this.running = true;
+ crawler._emitSpecial("crawlstart");
+ crawler.running = true;

@@ -198,3 +223,3 @@ // Now kick off the initial crawl

- return this;
+ return crawler;
  };
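As before, `start()` seeds an empty queue with the crawler's initial protocol, host, port and path, starts the per-interval crawl timer, and returns the crawler instance; the behavioural change is that `crawlstart` is now routed through `_emitSpecial`. A short sketch mirroring the bundled tests, which run against a fixture server on 127.0.0.1:3000:

    var Crawler = require("simplecrawler"); // same export assumption as above
    var localCrawler = new Crawler("127.0.0.1", "/", 3000);

    localCrawler.on("crawlstart", function() {
        console.log("Crawl started");
    });

    localCrawler.on("complete", function() {
        console.log("Crawl finished");
    });

    localCrawler.start();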

@@ -216,3 +241,3 @@

  Crawler.prototype.protocolSupported = function(URL) {
- var protocol;
+ var protocol, crawler = this;

@@ -227,3 +252,3 @@ try {

- return this.allowedProtocols.reduce(function(prev,protocolCheck) {
+ return crawler.allowedProtocols.reduce(function(prev,protocolCheck) {
  return prev || !!protocolCheck.exec(protocol);

@@ -247,5 +272,6 @@ },false);

  Crawler.prototype.mimeTypeSupported = function(MIMEType) {
+ var crawler = this;
  return (
- this.supportedMimeTypes.reduce(function(prev,mimeCheck) {
+ crawler.supportedMimeTypes.reduce(function(prev,mimeCheck) {
  return prev || !!mimeCheck.exec(MIMEType);

@@ -271,3 +297,12 @@ },false)

  var newURL, crawler = this;
+ if (!context || typeof(context) !== "object")
+ context = {
+ url: (
+ crawler.initialProtocol + "://" +
+ crawler.host + ":" +
+ crawler.initialPort + "/"
+ )
+ };
  // If the URL didn't contain anything, don't fetch it.
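This defaulting is what allows URLs to be queued by hand, as the new test at the bottom of this diff does: when no context object is supplied, the URL is resolved against the crawler's own `initialProtocol`, host and `initialPort` instead of requiring a referring queue item. A brief sketch, reusing the test fixture address and the export assumption above:

    var Crawler = require("simplecrawler"); // same export assumption as above
    var crawler = new Crawler("127.0.0.1", "/", 3000);

    // An absolute URL, queued with no referring queue item.
    crawler.queueURL("http://127.0.0.1:3000/async-stage1");

    // With no context argument, a relative URL is presumably resolved against
    // http://127.0.0.1:3000/ (the crawler's initialProtocol, host and initialPort).
    crawler.queueURL("/async-stage2");

    crawler.start();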

@@ -499,12 +534,12 @@ if (!URL.replace(/\s+/ig,"").length) return false;

  Crawler.prototype.queueLinkedItems = function(resourceData,queueItem) {
- var resources = this.discoverResources(resourceData,queueItem),
- crawler = this;
+ var crawler = this,
+ resources = crawler.discoverResources(resourceData,queueItem);
  // Emit discovered resources. ie: might be useful in building a graph of
  // page relationships.
- this.emit("discoverycomplete",queueItem,resources);
+ crawler._emitSpecial("discoverycomplete",queueItem,resources);
  resources.forEach(function(url){ crawler.queueURL(url,queueItem); });
- return this;
+ return crawler;
  };

@@ -537,9 +572,9 @@

  }
  // Pass this URL past fetch conditions to ensure the user thinks it's valid
  var fetchDenied = false;
- fetchDenied = crawler.fetchConditions.reduce(function(prev,callback) {
+ fetchDenied = crawler._fetchConditions.reduce(function(prev,callback) {
  return fetchDenied || !callback(parsedURL);
  },false);
  if (fetchDenied) {

@@ -549,3 +584,3 @@ // Fetch Conditions conspired to block URL

}
// Check the domain is valid before adding it to the queue

@@ -562,6 +597,6 @@ if (crawler.domainValid(parsedURL.host)) {

  // We received an error condition when adding the callback
- crawler.emit("queueerror",error,parsedURL);
+ crawler._emitSpecial("queueerror",error,parsedURL);
  } else {
- crawler.emit("queueadd",newQueueItem,parsedURL);
- newQueueItem.referrer = queueItem.url;
+ crawler._emitSpecial("queueadd",newQueueItem,parsedURL);
+ newQueueItem.referrer = queueItem ? queueItem.url : null;
  }
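Both queue events now flow through `_emitSpecial`, and the new referrer guard means a manually queued URL, which has no referring queue item, records `referrer` as null instead of throwing on `queueItem.url`. A small sketch of observing these events on any crawler instance (signatures taken from the emit calls above):

    crawler.on("queueadd", function(queueItem, parsedURL) {
        console.log("Queued: " + queueItem.url);
    });

    crawler.on("queueerror", function(error, parsedURL) {
        console.error("Failed to queue a URL:", error);
    });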

@@ -572,3 +607,3 @@ }

  // If we caught an error, emit queueerror
- crawler.emit("queueerror",error,parsedURL);
+ crawler._emitSpecial("queueerror",error,parsedURL);
  return false;

@@ -608,3 +643,3 @@ }

  var crawler = this;
- crawler.openRequests ++;
+ crawler._openRequests ++;

@@ -670,3 +705,3 @@ // Variable declarations

  // if required.
- crawler.emit("fetchstart",queueItem,requestOptions);
+ crawler._emitSpecial("fetchstart",queueItem,requestOptions);

@@ -684,6 +719,6 @@ process.nextTick(function() {

  clientRequest.on("error",function(errorData) {
- crawler.openRequests --;
+ crawler._openRequests --;
  // Emit 5xx / 4xx event
- crawler.emit("fetchclienterror",queueItem,errorData);
+ crawler._emitSpecial("fetchclienterror",queueItem,errorData);
  queueItem.fetched = true;

@@ -748,3 +783,3 @@ queueItem.stateData.code = 599;

  // Emit header receive event
- crawler.emit("fetchheaders",queueItem,response);
+ crawler._emitSpecial("fetchheaders",queueItem,response);

@@ -772,3 +807,3 @@ // Ensure response length is reasonable...

- crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
+ crawler._emitSpecial("fetchcomplete",queueItem,responseBuffer,response);

@@ -788,3 +823,3 @@ // First, save item to cache (if we're using a cache!)

- crawler.openRequests --;
+ crawler._openRequests --;
  }

@@ -827,3 +862,3 @@

- crawler.emit("fetchdataerror",queueItem,response);
+ crawler._emitSpecial("fetchdataerror",queueItem,response);
  }

@@ -869,3 +904,3 @@ } else {

  crawler.cache.getCacheData(queueItem,function(cacheObject) {
- crawler.emit("notmodified",queueItem,response,cacheObject);
+ crawler._emitSpecial("notmodified",queueItem,response,cacheObject);
  });

@@ -875,3 +910,3 @@ } else {

  // we don't send any data.
- crawler.emit("notmodified",queueItem,response);
+ crawler._emitSpecial("notmodified",queueItem,response);
  }

@@ -890,3 +925,3 @@

  // Emit redirect event
- crawler.emit("fetchredirect",queueItem,parsedURL,response);
+ crawler._emitSpecial("fetchredirect",queueItem,parsedURL,response);

@@ -896,3 +931,3 @@ // Clean URL, add to queue...

- crawler.openRequests --;
+ crawler._openRequests --;

@@ -905,5 +940,5 @@ // Ignore this request, but record that we had a 404

  // Emit 404 event
- crawler.emit("fetch404",queueItem,response);
+ crawler._emitSpecial("fetch404",queueItem,response);
- crawler.openRequests --;
+ crawler._openRequests --;

@@ -916,5 +951,5 @@ // And oh dear. Handle this one as well. (other 400s, 500s, etc)

  // Emit 5xx / 4xx event
- crawler.emit("fetcherror",queueItem,response);
+ crawler._emitSpecial("fetcherror",queueItem,response);
- crawler.openRequests --;
+ crawler._openRequests --;
  }

@@ -941,3 +976,3 @@

- if (crawler.openRequests > crawler.maxConcurrency) return;
+ if (crawler._openRequests > crawler.maxConcurrency) return;

@@ -947,3 +982,6 @@ crawler.queue.oldestUnfetchedItem(function(err,queueItem) {

  crawler.fetchQueueItem(queueItem);
- } else if (crawler.openRequests === 0) {
+ } else if ( !crawler._openRequests &&
+ !crawler._openListeners) {
  crawler.queue.complete(function(err,completeCount) {

@@ -972,5 +1010,6 @@ if (completeCount === crawler.queue.length) {

  Crawler.prototype.stop = function() {
- clearInterval(this.crawlIntervalID);
- this.running = false;
- return this;
+ var crawler = this;
+ clearInterval(crawler.crawlIntervalID);
+ crawler.running = false;
+ return crawler;
  };

@@ -1000,5 +1039,6 @@

  Crawler.prototype.addFetchCondition = function(callback) {
+ var crawler = this;
  if (callback instanceof Function) {
- this.fetchConditions.push(callback);
- return this.fetchConditions.length - 1;
+ crawler._fetchConditions.push(callback);
+ return crawler._fetchConditions.length - 1;
  } else {

@@ -1024,6 +1064,7 @@ throw new Error("Fetch Condition must be a function.");

  Crawler.prototype.removeFetchCondition = function(index) {
- if (this.fetchConditions[index] &&
- this.fetchConditions[index] instanceof Function) {
+ var crawler = this;
+ if (crawler._fetchConditions[index] &&
+ crawler._fetchConditions[index] instanceof Function) {
- return !!this.fetchConditions.splice(index,1);
+ return !!crawler._fetchConditions.splice(index,1);
  } else {

@@ -1030,0 +1071,0 @@ throw new Error("Unable to find indexed Fetch Condition.");
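From the caller's side the fetch-condition API is unchanged; only the backing array moved to the underscored `_fetchConditions`. A usage sketch: `addFetchCondition` returns the condition's index, which `removeFetchCondition` accepts later. The `/private/` rule is purely illustrative, and `parsedURL.path` is assumed to carry the resource path, as elsewhere in crawler.js:

    var Crawler = require("simplecrawler"); // same export assumption as above
    var crawler = new Crawler("example.com", "/", 80);

    // Every candidate URL is passed through each condition; returning false blocks it.
    var conditionIndex = crawler.addFetchCondition(function(parsedURL) {
        return !/^\/private\//.test(parsedURL.path); // skip anything under /private/
    });

    // The returned index removes that same condition again.
    crawler.removeFetchCondition(conditionIndex);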

package.json

  {
  "name": "simplecrawler",
  "description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
- "version": "0.2.5",
+ "version": "0.2.6",
  "homepage": "http://github.com/cgiffard/node-simplecrawler",

@@ -16,3 +16,3 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",

  "scripts": {
- "test": "mocha -R spec"
+ "test": "mocha -R spec -t 4000"
  },

@@ -19,0 +19,0 @@ "bin": {

@@ -27,3 +27,15 @@ // Routes for testing server

  write(200,"Crawl complete!");
+ },
+ "/async-stage1": function(write) {
+ write(200,"http://127.0.0.1:3000/async-stage2");
+ },
+ "/async-stage2": function(write) {
+ write(200,"http://127.0.0.1:3000/async-stage3");
+ },
+ "/async-stage3": function(write) {
+ write(200,"Complete!");
+ }
  };

@@ -14,9 +14,13 @@ // Runs a very simple crawl on an HTTP server

  // Create a new crawler to crawl this server
- var localCrawler = new Crawler("127.0.0.1","/",3000);
+ var localCrawler = new Crawler("127.0.0.1","/",3000),
+ asyncCrawler = new Crawler("127.0.0.1","/",3000);
  var linksDiscovered = 0;
  it("should be able to be started",function(done) {
- localCrawler.on("crawlstart",done);
+ localCrawler.on("crawlstart",function() { done() });
+ localCrawler.on("discoverycomplete",function() {
+ linksDiscovered ++;
+ });
  localCrawler.start();

@@ -33,6 +37,2 @@ localCrawler.running.should.be.truthy;

- localCrawler.on("discoverycomplete",function() {
- linksDiscovered ++;
- });
  localCrawler.on("complete",function() {

@@ -43,4 +43,31 @@ linksDiscovered.should.equal(5);

  });
+ it("should support async event listeners for manual discovery",function(done) {
+ // Use a different crawler this time
+ asyncCrawler.discoverResources = false;
+ asyncCrawler.queueURL("http://127.0.0.1:3000/async-stage1");
+ asyncCrawler.start();
+ asyncCrawler.on("fetchcomplete",function(queueItem,data,res,evtDone) {
+ setTimeout(function(){
+ linksDiscovered ++;
+ if (String(data).match(/complete/i))
+ return evtDone();
+ // Taking advantage of the fact that for these, the sum total
+ // of the body data is a URL.
+ asyncCrawler.queueURL(String(data)).should.be.true;
+ evtDone();
+ },250);
+ });
+ asyncCrawler.on("complete",function() {
+ linksDiscovered.should.equal(8);
+ done();
+ });
+ });
  // TODO

@@ -47,0 +74,0 @@

