New Case Study: See how Anthropic automated 95% of dependency reviews with Socket. Learn More
Socket
Sign in · Demo · Install
Socket

simplecrawler

Package Overview
Dependencies
Maintainers
2
Versions
70
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

simplecrawler - npm Package Compare versions

Comparing version 0.2.10 to 0.3.0

112

lib/crawler.js

@@ -65,2 +65,5 @@ // Simplecrawler

crawler.timeout = 5 * 60 * 1000;
// Maximum time we'll wait for async listeners.
crawler.listenerTTL = 10 * 1000;

@@ -143,6 +146,7 @@ // User Agent

// STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
crawler._openRequests = 0;
crawler._fetchConditions = [];
crawler._openListeners = 0;
crawler._listenerMap = {};
var hiddenProps = {
"_openRequests": 0,
"_fetchConditions": [],
"_openListeners": 0
};

@@ -152,22 +156,10 @@ // Run the EventEmitter constructor

crawler._emitSpecial = function() {
var args = Array.prototype.slice.call(arguments,0),
event = args[0],
eventArgsLen = args.length-1,
asyncListenerCount = 0;
crawler.listeners(event).forEach(function(listener) {
if (listener.length > eventArgsLen)
asyncListenerCount++;
// Apply all the hidden props
Object.keys(hiddenProps).forEach(function(key) {
Object.defineProperty(crawler, key, {
"writable": true,
"enumerable": false,
"value": hiddenProps[key]
});
crawler._openListeners += asyncListenerCount|0;
crawler.emit.apply(crawler,args.concat([
function listenerComplete() {
if (crawler._openListeners > 0)
crawler._openListeners --;
}
]));
};
});
};

@@ -212,3 +204,3 @@

crawler._emitSpecial("crawlstart");
crawler.emit("crawlstart");
crawler.running = true;

@@ -553,3 +545,3 @@

// page relationships.
crawler._emitSpecial("discoverycomplete",queueItem,resources);
crawler.emit("discoverycomplete",queueItem,resources);

@@ -615,8 +607,8 @@ resources.forEach(function(url){ crawler.queueURL(url,queueItem); });

if (error.code && error.code === "DUP")
return crawler._emitSpecial("queueduplicate",parsedURL);
return crawler.emit("queueduplicate",parsedURL);
return crawler._emitSpecial("queueerror",error,parsedURL);
return crawler.emit("queueerror",error,parsedURL);
}
crawler._emitSpecial("queueadd",newQueueItem,parsedURL);
crawler.emit("queueadd",newQueueItem,parsedURL);
newQueueItem.referrer = queueItem ? queueItem.url : null;

@@ -723,3 +715,3 @@ }

// if required.
crawler._emitSpecial("fetchstart",queueItem,requestOptions);
crawler.emit("fetchstart",queueItem,requestOptions);

@@ -738,3 +730,3 @@ process.nextTick(function() {

clientRequest.abort();
crawler._emitSpecial("fetchtimeout",queueItem,crawler.timeout);
crawler.emit("fetchtimeout",queueItem,crawler.timeout);
});

@@ -746,3 +738,3 @@

// Emit 5xx / 4xx event
crawler._emitSpecial("fetchclienterror",queueItem,errorData);
crawler.emit("fetchclienterror",queueItem,errorData);
queueItem.fetched = true;

@@ -807,3 +799,3 @@ queueItem.stateData.code = 599;

// Emit header receive event
crawler._emitSpecial("fetchheaders",queueItem,response);
crawler.emit("fetchheaders",queueItem,response);

@@ -831,3 +823,3 @@ // Ensure response length is reasonable...

crawler._emitSpecial("fetchcomplete",queueItem,responseBuffer,response);
crawler.emit("fetchcomplete",queueItem,responseBuffer,response);

@@ -885,3 +877,3 @@ // First, save item to cache (if we're using a cache!)

crawler._emitSpecial("fetchdataerror",queueItem,response);
crawler.emit("fetchdataerror",queueItem,response);
}

@@ -927,3 +919,3 @@ } else {

crawler.cache.getCacheData(queueItem,function(cacheObject) {
crawler._emitSpecial("notmodified",queueItem,response,cacheObject);
crawler.emit("notmodified",queueItem,response,cacheObject);
});

@@ -933,3 +925,3 @@ } else {

// we don't send any data.
crawler._emitSpecial("notmodified",queueItem,response);
crawler.emit("notmodified",queueItem,response);
}

@@ -948,3 +940,3 @@

// Emit redirect event
crawler._emitSpecial("fetchredirect",queueItem,parsedURL,response);
crawler.emit("fetchredirect",queueItem,parsedURL,response);

@@ -962,3 +954,3 @@ // Clean URL, add to queue...

// Emit 404 event
crawler._emitSpecial("fetch404",queueItem,response);
crawler.emit("fetch404",queueItem,response);

@@ -973,3 +965,3 @@ crawler._openRequests --;

// Emit 5xx / 4xx event
crawler._emitSpecial("fetcherror",queueItem,response);
crawler.emit("fetcherror",queueItem,response);

@@ -1037,2 +1029,40 @@ crawler._openRequests --;

/*
	Public: Holds the crawler in a 'running' state, preventing the `complete`
	event from firing until the callback this function returns has been executed,
	or a predetermined timeout (as specified by `crawler.listenerTTL`) has
	elapsed.

	Examples

		crawler.on("fetchcomplete", function(queueItem, data) {
			var resume = this.wait();
			doSomethingThatTakesALongTime(function callback() {
				resume();
			});
		});

	Returns callback which will allow the crawler to continue.
*/
Crawler.prototype.wait = function() {
	var self = this;
	var released = false;

	// Safety valve: if the listener never calls the returned callback,
	// release the hold automatically once listenerTTL has elapsed.
	var ttlTimer = setTimeout(function() {
		if (released) return;
		released = true;
		self._openListeners --;
	}, self.listenerTTL);

	// Register this listener as open so `complete` is deferred.
	self._openListeners ++;

	// Callback handed to the listener; releases the hold exactly once
	// and cancels the pending TTL timer.
	return function release() {
		if (released) return;
		released = true;
		self._openListeners --;
		clearTimeout(ttlTimer);
	};
};
/*
Public: Given a function, this method adds it to an internal list maintained

@@ -1072,3 +1102,3 @@ by the crawler to be executed against each URL to determine whether it should

url - ID of fetch condition to be removed.
index - ID of fetch condition to be removed.

@@ -1097,3 +1127,3 @@ Examples

url
url - URL from which to remove the querystring

@@ -1100,0 +1130,0 @@ Examples

{
"name": "simplecrawler",
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
"version": "0.2.10",
"version": "0.3.0",
"homepage": "http://github.com/cgiffard/node-simplecrawler",

@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",

@@ -42,2 +42,4 @@ // Tests to ensure crawler code is well formed

"clearInterval": true,
"setTimeout": true,
"clearTimeout": true,
"Buffer": true

@@ -44,0 +46,0 @@ });

@@ -54,3 +54,5 @@ // Runs a very simple crawl on an HTTP server

asyncCrawler.on("fetchcomplete",function(queueItem,data,res,evtDone) {
asyncCrawler.on("fetchcomplete",function(queueItem,data,res) {
evtDone = this.wait();
setTimeout(function(){

@@ -57,0 +59,0 @@ linksDiscovered ++;

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc