
simplecrawler

Package overview: 3 dependencies, 2 maintainers, 70 published versions.

Comparing version 0.5.4 to 0.6.0

README.md


lib/cache-backend-fs.js

@@ -198,9 +198,7 @@ /*

}
} else if (count === pathStack.length - 1) {
// Write the file data in
writeFileData(currentPath, data);
} else {
if (count === pathStack.length - 1) {
// Write the file data in
writeFileData(currentPath, data);
}
fs.mkdirSync(currentPath);
}

@@ -207,0 +205,0 @@ });

@@ -0,0 +0,0 @@ /*

// CLI module for crawling.
// Not yet built.

@@ -440,2 +440,6 @@ /*

Cookie.prototype.matchDomain = function(domain) {
if (this.domain === "*") {
return true;
}
var reverseDomain = this.domain.split("").reverse().join(""),

@@ -442,0 +446,0 @@ reverseDomainComp = domain.split("").reverse().join("");
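
The added check short-circuits domain matching for wildcard cookies: a cookie whose domain is "*" now matches every host before the reversed-domain comparison runs. The sketch below illustrates the logic on its own rather than through the library's Cookie object; the final indexOf comparison is an assumption about how the reversed strings are used, based on the surrounding lines.

// Standalone sketch of the wildcard short-circuit (not the library code itself)
function matchDomain(cookieDomain, requestDomain) {
    if (cookieDomain === "*") {
        return true; // a "*" cookie domain matches any host
    }
    // Reverse both strings so that "example.com" also matches
    // subdomains such as "www.example.com"
    var reverseDomain = cookieDomain.split("").reverse().join(""),
        reverseDomainComp = requestDomain.split("").reverse().join("");
    return reverseDomainComp.indexOf(reverseDomain) === 0; // assumed comparison
}

matchDomain("*", "anything.example");          // true
matchDomain("example.com", "www.example.com"); // true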

@@ -19,3 +19,4 @@ /*

zlib = require("zlib"),
util = require("util");
util = require("util"),
iconv = require("iconv-lite");

@@ -85,2 +86,6 @@ var QUEUE_ITEM_INITIAL_DEPTH = 1;

// Decode HTTP responses based on their Content-Type header or any
// inline charset definition
crawler.decodeResponses = false;
// Do we filter by domain?

@@ -156,4 +161,4 @@ // Unless you want to be crawling the entire internet, I would

crawler.discoverRegex = [
/\s?(?:href|src)\s?=\s?(["']).*?\1/ig,
/\s?(?:href|src)\s?=\s?[^"'][^\s>]+/ig,
/\s(?:href|src)\s?=\s?(["']).*?\1/ig,
/\s(?:href|src)\s?=\s?[^"'\s][^\s>]+/ig,
/\s?url\((["']).*?\1\)/ig,

@@ -188,2 +193,6 @@ /\s?url\([^"'].*?\)/ig,

// The HTTP / HTTPS agent used to crawl
crawler.httpAgent = http.globalAgent;
crawler.httpsAgent = https.globalAgent;
// STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
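
Exposing the agents as crawler.httpAgent and crawler.httpsAgent (defaulting to Node's global agents) means a custom agent can be swapped in per crawler instead of mutating the globals; the crawler still raises the agent's maxSockets to crawler.maxConcurrency when it is lower. A minimal sketch, assuming the usual Crawler(host, path) constructor:

var http = require("http"),
    Crawler = require("simplecrawler"),
    crawler = new Crawler("example.com", "/");

// Use a keep-alive agent with its own socket pool for plain HTTP requests;
// an https crawl would set crawler.httpsAgent with an https.Agent instead
crawler.httpAgent = new http.Agent({ keepAlive: true, maxSockets: 20 });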

@@ -250,16 +259,16 @@ var hiddenProps = {

crawler.crawlIntervalID =
setInterval(
function() {
crawler.crawl(crawler);
},
crawler.interval);
process.nextTick(function() {
crawler.crawlIntervalID =
setInterval(
function() {
crawler.crawl(crawler);
},
crawler.interval);
crawler.emit("crawlstart");
crawler.running = true;
// Now kick off the initial crawl
process.nextTick(function() {
// Now kick off the initial crawl
crawler.crawl();
});
crawler.running = true;
crawler.emit("crawlstart");
});
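
Moving the interval setup and the "crawlstart" emit into process.nextTick means a listener attached synchronously after start() is still registered before the event fires. A short sketch, again assuming the usual constructor:

var Crawler = require("simplecrawler"),
    crawler = new Crawler("example.com", "/");

crawler.start();

// Because the emit is deferred to the next tick, this listener still
// receives the event even though it was added after start()
crawler.on("crawlstart", function() {
    console.log("crawl started");
});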

@@ -448,4 +457,5 @@

.replace(/^\s*/, "")
.replace(/^(['"])(.*)\1$/, "$2")
.replace(/^url\((.*)\)/i, "$1")
.replace(/^javascript\:\s*[a-z0-9]+\((.*)/i, "$1")
.replace(/^javascript\:\s*([a-z0-9]*\(["'](.*)["']\))*.*/i, "$2")
.replace(/^(['"])(.*)\1$/, "$2")

@@ -476,4 +486,3 @@ .replace(/^\((.*)\)$/, "$1")

Crawler.prototype.cleanExpandResources = function (urlMatch, queueItem) {
var crawler = this,
resources = [];
var crawler = this;

@@ -510,3 +519,3 @@ if (!urlMatch) {

// Does the item already exist in the list?
if (resources.reduce(function(prev, current) {
if (list.reduce(function(prev, current) {
return prev || current === URL;

@@ -827,6 +836,7 @@ }, false)) {

var client = queueItem.protocol === "https" ? https : http;
var agent = queueItem.protocol === "https" ? crawler.httpsAgent : crawler.httpAgent;
// Up the socket limit if required.
if (client.globalAgent.maxSockets < crawler.maxConcurrency) {
client.globalAgent.maxSockets = crawler.maxConcurrency;
if (agent.maxSockets < crawler.maxConcurrency) {
agent.maxSockets = crawler.maxConcurrency;
}

@@ -852,2 +862,3 @@

path: requestPath,
agent: agent,
headers: {

@@ -918,51 +929,49 @@ "User-Agent": crawler.userAgent,

process.nextTick(function() {
// Record what time we started this request
timeCommenced = Date.now();
// Record what time we started this request
timeCommenced = Date.now();
// Get the resource!
clientRequest =
client.request(requestOptions, function(response) {
crawler.handleResponse(queueItem, response, timeCommenced);
});
// Get the resource!
clientRequest =
client.request(requestOptions, function(response) {
crawler.handleResponse(queueItem, response, timeCommenced);
});
clientRequest.end();
clientRequest.end();
clientRequest.setTimeout(crawler.timeout, function() {
if (queueItem.fetched) {
return;
}
clientRequest.setTimeout(crawler.timeout, function() {
if (queueItem.fetched) {
return;
}
if (crawler.running && !queueItem.fetched) {
crawler._openRequests--;
}
if (crawler.running && !queueItem.fetched) {
crawler._openRequests--;
}
queueItem.fetched = true;
queueItem.status = "timeout";
crawler.emit("fetchtimeout", queueItem, crawler.timeout);
clientRequest._crawlerHandled = true;
clientRequest.abort();
});
queueItem.fetched = true;
queueItem.status = "timeout";
crawler.emit("fetchtimeout", queueItem, crawler.timeout);
clientRequest._crawlerHandled = true;
clientRequest.abort();
});
clientRequest.on("error", function(errorData) {
clientRequest.on("error", function(errorData) {
// This event will be thrown if we manually aborted the request,
// but we don't want to do anything in that case.
if (clientRequest._crawlerHandled) {
return;
}
// This event will be thrown if we manually aborted the request,
// but we don't want to do anything in that case.
if (clientRequest._crawlerHandled) {
return;
}
if (crawler.running && !queueItem.fetched) {
crawler._openRequests--;
}
if (crawler.running && !queueItem.fetched) {
crawler._openRequests--;
}
// Emit 5xx / 4xx event
queueItem.fetched = true;
queueItem.stateData.code = 599;
queueItem.status = "failed";
crawler.emit("fetchclienterror", queueItem, errorData);
});
// Emit 5xx / 4xx event
queueItem.fetched = true;
queueItem.stateData.code = 599;
queueItem.status = "failed";
crawler.emit("fetchclienterror", queueItem, errorData);
});
return crawler;
});
return crawler;
};
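
On timeout the queue item is marked fetched with status "timeout" and "fetchtimeout" fires with the configured timeout value; on a client error the item gets a synthetic 599 code, status "failed", and "fetchclienterror" fires with the error object. A sketch of handling both events (constructor arguments assumed as above):

var Crawler = require("simplecrawler"),
    crawler = new Crawler("example.com", "/");

crawler.on("fetchtimeout", function(queueItem, timeout) {
    console.log("Timed out after " + timeout + "ms: " + queueItem.url);
});

crawler.on("fetchclienterror", function(queueItem, errorData) {
    // queueItem.stateData.code has been set to 599, status to "failed"
    console.log("Client error for " + queueItem.url, errorData);
});

crawler.start();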

@@ -972,2 +981,27 @@

/*
Decode string buffer based on a complete Content-Type header. Will also look
for an embedded <meta> tag with a charset definition, but the Content-Type
header is prioritized, see:
https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#attr-charset
Examples
crawler.decodeBuffer(responseBuffer, "text/html; charset=tis-620");
Returns the decoded buffer.
*/
Crawler.prototype.decodeBuffer = function (buffer, contentTypeHeader) {
contentTypeHeader = contentTypeHeader || "";
var embeddedEncoding = /<meta.*charset=["']{0,1}([^"'>]*)["']{0,1}\s*\/{0,1}>/i.exec(buffer.toString(undefined, 0, 512)) || [],
encoding = contentTypeHeader.split("charset=")[1] || embeddedEncoding[1] || contentTypeHeader;
encoding = iconv.encodingExists(encoding) ? encoding : "utf-8";
return iconv.decode(buffer, encoding);
};
/*
Public: Given a queueItem and a matching response object, the crawler will

@@ -1053,4 +1087,7 @@ handle downloading the resource, queueing of linked items, etc.

if (crawler.depthAllowed(queueItem)) {
crawler.emit("fetchcomplete", queueItem, responseBuffer, response);
var responseBody =
crawler.decodeResponses ? crawler.decodeBuffer(responseBuffer, stateData.contentType) : responseBuffer;
crawler.emit("fetchcomplete", queueItem, responseBody, response);
// We only process the item if it's of a valid mimetype
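
With crawler.decodeResponses enabled, the buffer passed to "fetchcomplete" is first run through decodeBuffer, which takes the charset from the Content-Type header (or from a <meta> charset tag in the first 512 bytes) and falls back to UTF-8 when iconv-lite does not recognise the encoding. A usage sketch, constructor arguments assumed as above:

var Crawler = require("simplecrawler"),
    crawler = new Crawler("example.com", "/");

// Hand decoded strings to fetchcomplete instead of raw Buffers
crawler.decodeResponses = true;

crawler.on("fetchcomplete", function(queueItem, responseBody, response) {
    // responseBody is a decoded string, e.g. for a page served with
    // "Content-Type: text/html; charset=tis-620"
    console.log(typeof responseBody); // "string"
});

crawler.start();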

@@ -1142,3 +1179,3 @@ // and only if the crawler is set to discover its own resources

response.socket.end();
response.socket.destroy();
}

@@ -1197,3 +1234,3 @@

crawler.emit("fetcherror", queueItem, response);
response.socket.end();
response.socket.destroy();

@@ -1200,0 +1237,0 @@ crawler._openRequests--;

@@ -0,0 +0,0 @@ /*

@@ -0,0 +0,0 @@ /*

@@ -66,3 +66,3 @@ /*

var tmpCrawler = new Crawler(url.hostname(), url.path(), url.port() || 80);
var tmpCrawler = new Crawler(url.hostname(), url.path(), url.port() || (url.protocol() === "https" ? 443 : 80));
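
The one-line change means a crawl started from an https URL now defaults to port 443 rather than 80 when the URL carries no explicit port. The port selection can be reproduced on its own with urijs (the parser the surrounding code already uses); the URL here is just an example:

var URI = require("urijs");

var url = URI("https://example.com/start");

// Mirrors the fixed default-port logic
var port = url.port() || (url.protocol() === "https" ? 443 : 80);
console.log(port); // 443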

@@ -69,0 +69,0 @@ // Attach callbacks if they were provided

{
"name": "simplecrawler",
"description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
"version": "0.5.4",
"version": "0.6.0",
"homepage": "https://github.com/cgiffard/node-simplecrawler",

@@ -34,2 +34,3 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",

"dependencies": {
"iconv-lite": "^0.4.13",
"urijs": "^1.16.1"

@@ -36,0 +37,0 @@ },
