simplecrawler
Comparing version 0.5.4 to 0.6.0
@@ -198,9 +198,7 @@ /*
         }
+    } else if (count === pathStack.length - 1) {
+        // Write the file data in
+        writeFileData(currentPath, data);
     } else {
-        if (count === pathStack.length - 1) {
-            // Write the file data in
-            writeFileData(currentPath, data);
-        }
         fs.mkdirSync(currentPath);
     }
@@ -207,0 +205,0 @@ });
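Note: the old else-branch wrote the cached file and then still fell through to fs.mkdirSync on the same path, which throws once the file exists; the else-if keeps the two cases exclusive. A minimal standalone sketch of that failure mode (paths are illustrative):

    var fs = require("fs");

    var dir = "/tmp/simplecrawler-demo";
    var filePath = dir + "/cache-entry";

    if (!fs.existsSync(dir)) fs.mkdirSync(dir);
    fs.writeFileSync(filePath, "response body");

    try {
        fs.mkdirSync(filePath); // what the 0.5.4 else-branch went on to do
    } catch (e) {
        console.log(e.code); // EEXIST — the file already occupies that path
    }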
@@ -0,0 +0,0 @@ /*
 // CLI module for crawling.
 // Not yet built.
@@ -440,2 +440,6 @@ /*
 Cookie.prototype.matchDomain = function(domain) {
+    if (this.domain === "*") {
+        return true;
+    }
+
     var reverseDomain = this.domain.split("").reverse().join(""),
@@ -442,0 +446,0 @@ reverseDomainComp = domain.split("").reverse().join("");
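The wildcard short-circuit means a cookie whose domain is "*" now matches every host before the reversed-string comparison runs. A rough sketch, assuming the Cookie class is reachable via lib/cookies and that the domain field can be assigned directly:

    var Cookie = require("simplecrawler/lib/cookies").Cookie;

    var cookie = new Cookie("session", "abc123");
    cookie.domain = "*";

    console.log(cookie.matchDomain("example.com"));     // true
    console.log(cookie.matchDomain("sub.example.com")); // true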
@@ -19,3 +19,4 @@ /*
     zlib = require("zlib"),
-    util = require("util");
+    util = require("util"),
+    iconv = require("iconv-lite");
@@ -85,2 +86,6 @@ var QUEUE_ITEM_INITIAL_DEPTH = 1;
+    // Decode HTTP responses based on their Content-Type header or any
+    // inline charset definition
+    crawler.decodeResponses = false;
+
     // Do we filter by domain?
@@ -156,4 +161,4 @@ // Unless you want to be crawling the entire internet, I would
     crawler.discoverRegex = [
-        /\s?(?:href|src)\s?=\s?(["']).*?\1/ig,
-        /\s?(?:href|src)\s?=\s?[^"'][^\s>]+/ig,
+        /\s(?:href|src)\s?=\s?(["']).*?\1/ig,
+        /\s(?:href|src)\s?=\s?[^"'\s][^\s>]+/ig,
         /\s?url\((["']).*?\1\)/ig,
@@ -188,2 +193,6 @@ /\s?url\([^"'].*?\)/ig,
+    // The HTTP / HTTPS agent used to crawl
+    crawler.httpAgent = http.globalAgent;
+    crawler.httpsAgent = https.globalAgent;
+
     // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
@@ -250,16 +259,16 @@ var hiddenProps = {
-        crawler.crawlIntervalID =
-            setInterval(
-                function() {
-                    crawler.crawl(crawler);
-                },
-                crawler.interval);
+        process.nextTick(function() {
+            crawler.crawlIntervalID =
+                setInterval(
+                    function() {
+                        crawler.crawl(crawler);
+                    },
+                    crawler.interval);
-        crawler.emit("crawlstart");
-        crawler.running = true;
-        // Now kick off the initial crawl
-        process.nextTick(function() {
+            // Now kick off the initial crawl
             crawler.crawl();
         });
+        crawler.running = true;
+        crawler.emit("crawlstart");
     });
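Worth noting: besides deferring the interval and the first crawl to the next tick, the rewrite swaps the order of running = true and the crawlstart emission, so a crawlstart listener that inspects crawler.running now sees true. A sketch under that reading of the hunk (host/path constructor arguments assumed):

    var Crawler = require("simplecrawler");

    var crawler = new Crawler("example.com", "/");

    crawler.on("crawlstart", function() {
        // 0.5.4 emitted before setting the flag, so this logged false
        console.log("running:", crawler.running); // true in 0.6.0
    });

    crawler.start();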
@@ -448,4 +457,5 @@
         .replace(/^\s*/, "")
+        .replace(/^(['"])(.*)\1$/, "$2")
         .replace(/^url\((.*)\)/i, "$1")
-        .replace(/^javascript\:\s*[a-z0-9]+\((.*)/i, "$1")
+        .replace(/^javascript\:\s*([a-z0-9]*\(["'](.*)["']\))*.*/i, "$2")
         .replace(/^(['"])(.*)\1$/, "$2")
@@ -476,4 +486,3 @@ .replace(/^\((.*)\)$/, "$1")
 Crawler.prototype.cleanExpandResources = function (urlMatch, queueItem) {
-    var crawler = this,
-        resources = [];
+    var crawler = this;
@@ -510,3 +519,3 @@ if (!urlMatch) {
         // Does the item already exist in the list?
-        if (resources.reduce(function(prev, current) {
+        if (list.reduce(function(prev, current) {
             return prev || current === URL;
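This is a straight bug fix: resources was initialised once and never appended to, so the duplicate check always passed; list is the array the function actually accumulates into. The check itself, replayed standalone:

    var list = ["http://example.com/", "http://example.com/about"];
    var URL = "http://example.com/";

    var alreadyListed = list.reduce(function(prev, current) {
        return prev || current === URL;
    }, false);

    console.log(alreadyListed); // true, so URL is not pushed again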
@@ -827,6 +836,7 @@ }, false)) {
     var client = queueItem.protocol === "https" ? https : http;
+    var agent = queueItem.protocol === "https" ? crawler.httpsAgent : crawler.httpAgent;
     // Up the socket limit if required.
-    if (client.globalAgent.maxSockets < crawler.maxConcurrency) {
-        client.globalAgent.maxSockets = crawler.maxConcurrency;
+    if (agent.maxSockets < crawler.maxConcurrency) {
+        agent.maxSockets = crawler.maxConcurrency;
     }
@@ -852,2 +862,3 @@
         path: requestPath,
+        agent: agent,
         headers: {
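Since requests now go through crawler.httpAgent / crawler.httpsAgent rather than mutating the shared globalAgent, a caller can supply tuned agents without side effects on other code in the process. A hedged sketch using Node's stock Agent options:

    var http = require("http");
    var https = require("https");
    var Crawler = require("simplecrawler");

    var crawler = new Crawler("example.com", "/");

    // Keep-alive agents with an explicit socket cap; per the hunk above,
    // maxSockets is still raised to crawler.maxConcurrency if lower.
    crawler.httpAgent = new http.Agent({ keepAlive: true, maxSockets: 10 });
    crawler.httpsAgent = new https.Agent({ keepAlive: true, maxSockets: 10 });

    crawler.start();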
@@ -918,51 +929,49 @@ "User-Agent": crawler.userAgent,
-    process.nextTick(function() {
-        // Record what time we started this request
-        timeCommenced = Date.now();
+    // Record what time we started this request
+    timeCommenced = Date.now();
-        // Get the resource!
-        clientRequest =
-            client.request(requestOptions, function(response) {
-                crawler.handleResponse(queueItem, response, timeCommenced);
-            });
+    // Get the resource!
+    clientRequest =
+        client.request(requestOptions, function(response) {
+            crawler.handleResponse(queueItem, response, timeCommenced);
+        });
-        clientRequest.end();
+    clientRequest.end();
-        clientRequest.setTimeout(crawler.timeout, function() {
-            if (queueItem.fetched) {
-                return;
-            }
+    clientRequest.setTimeout(crawler.timeout, function() {
+        if (queueItem.fetched) {
+            return;
+        }
-            if (crawler.running && !queueItem.fetched) {
-                crawler._openRequests--;
-            }
+        if (crawler.running && !queueItem.fetched) {
+            crawler._openRequests--;
+        }
-            queueItem.fetched = true;
-            queueItem.status = "timeout";
-            crawler.emit("fetchtimeout", queueItem, crawler.timeout);
-            clientRequest._crawlerHandled = true;
-            clientRequest.abort();
-        });
+        queueItem.fetched = true;
+        queueItem.status = "timeout";
+        crawler.emit("fetchtimeout", queueItem, crawler.timeout);
+        clientRequest._crawlerHandled = true;
+        clientRequest.abort();
+    });
-        clientRequest.on("error", function(errorData) {
+    clientRequest.on("error", function(errorData) {
-            // This event will be thrown if we manually aborted the request,
-            // but we don't want to do anything in that case.
-            if (clientRequest._crawlerHandled) {
-                return;
-            }
+        // This event will be thrown if we manually aborted the request,
+        // but we don't want to do anything in that case.
+        if (clientRequest._crawlerHandled) {
+            return;
+        }
-            if (crawler.running && !queueItem.fetched) {
-                crawler._openRequests--;
-            }
+        if (crawler.running && !queueItem.fetched) {
+            crawler._openRequests--;
+        }
-            // Emit 5xx / 4xx event
-            queueItem.fetched = true;
-            queueItem.stateData.code = 599;
-            queueItem.status = "failed";
-            crawler.emit("fetchclienterror", queueItem, errorData);
-        });
+        // Emit 5xx / 4xx event
+        queueItem.fetched = true;
+        queueItem.stateData.code = 599;
+        queueItem.status = "failed";
+        crawler.emit("fetchclienterror", queueItem, errorData);
+    });
-        return crawler;
-    });
+    return crawler;
 };
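For consumers, the observable surface of this block is the pair of events it emits. A minimal listener sketch (event names and payloads as shown in the hunk):

    var Crawler = require("simplecrawler");

    var crawler = new Crawler("example.com", "/");
    crawler.timeout = 10000; // ms before the abort above kicks in

    crawler.on("fetchtimeout", function(queueItem, timeout) {
        console.log("timed out after " + timeout + "ms: " + queueItem.url);
    });

    crawler.on("fetchclienterror", function(queueItem, errorData) {
        // queueItem.stateData.code is the synthetic 599 set above
        console.log("client error on " + queueItem.url, errorData);
    });

    crawler.start();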
@@ -972,2 +981,27 @@
+/*
+    Decode string buffer based on a complete Content-Type header. Will also look
+    for an embedded <meta> tag with a charset definition, but the Content-Type
+    header is prioritized, see:
+    https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#attr-charset
+
+    Examples
+
+        crawler.decodeBuffer(responseBuffer, "text/html; charset=tis-620");
+
+    Returns the decoded buffer.
+*/
+Crawler.prototype.decodeBuffer = function (buffer, contentTypeHeader) {
+    contentTypeHeader = contentTypeHeader || "";
+
+    var embeddedEncoding = /<meta.*charset=["']{0,1}([^"'>]*)["']{0,1}\s*\/{0,1}>/i.exec(buffer.toString(undefined, 0, 512)) || [],
+        encoding = contentTypeHeader.split("charset=")[1] || embeddedEncoding[1] || contentTypeHeader;
+
+    encoding = iconv.encodingExists(encoding) ? encoding : "utf-8";
+
+    return iconv.decode(buffer, encoding);
+};
+
 /*
 Public: Given a queueItem and a matching response object, the crawler will
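Tying the new pieces together: with decodeResponses enabled, handleResponse (next hunk) routes the raw buffer through decodeBuffer before emitting fetchcomplete, so listeners get a decoded string instead of a Buffer. A sketch of the intended usage:

    var Crawler = require("simplecrawler");

    var crawler = new Crawler("example.com", "/");
    crawler.decodeResponses = true;

    crawler.on("fetchcomplete", function(queueItem, responseBody) {
        // responseBody is now a string decoded per Content-Type or an
        // embedded <meta charset>, rather than a raw Buffer
        console.log(queueItem.url, typeof responseBody);
    });

    crawler.start();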
@@ -1053,4 +1087,7 @@ handle downloading the resource, queueing of linked items, etc.
     if (crawler.depthAllowed(queueItem)) {
-        crawler.emit("fetchcomplete", queueItem, responseBuffer, response);
+        var responseBody =
+            crawler.decodeResponses ? crawler.decodeBuffer(responseBuffer, stateData.contentType) : responseBuffer;
+
+        crawler.emit("fetchcomplete", queueItem, responseBody, response);
         // We only process the item if it's of a valid mimetype
@@ -1142,3 +1179,3 @@ // and only if the crawler is set to discover its own resources
-        response.socket.end();
+        response.socket.destroy();
     }
@@ -1197,3 +1234,3 @@
     crawler.emit("fetcherror", queueItem, response);
-    response.socket.end();
+    response.socket.destroy();
@@ -1200,0 +1237,0 @@ crawler._openRequests--;
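end() only begins a graceful shutdown and leaves the descriptor open until the peer closes its side; destroy() tears the socket down immediately, which is the right call for responses the crawler has finished with. Illustrated on a bare TCP pair:

    var net = require("net");

    var server = net.createServer(function(socket) {
        socket.end("hello");
    });

    server.listen(0, function() {
        var client = net.connect(server.address().port, function() {
            client.destroy(); // immediate teardown, no wait for a FIN round-trip
            server.close();
        });
    });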
@@ -0,0 +0,0 @@ /*
@@ -66,3 +66,3 @@ /*
-    var tmpCrawler = new Crawler(url.hostname(), url.path(), url.port() || 80);
+    var tmpCrawler = new Crawler(url.hostname(), url.path(), url.port() || (url.protocol() === "https" ? 443 : 80));
@@ -69,0 +69,0 @@ // Attach callbacks if they were provided
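With this change the one-shot helper infers port 443 for https URLs instead of always assuming 80. A sketch of the call path, assuming the helper accepts a completion callback as the "Attach callbacks" context suggests:

    var Crawler = require("simplecrawler");

    // 0.5.4 built this crawler against port 80 even for https URLs;
    // 0.6.0 falls back to 443 when the URL carries no explicit port.
    Crawler.crawl("https://example.com/", function(queueItem) {
        console.log("fetched " + queueItem.url);
    });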
 {
     "name": "simplecrawler",
     "description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
-    "version": "0.5.4",
+    "version": "0.6.0",
     "homepage": "https://github.com/cgiffard/node-simplecrawler",
@@ -34,2 +34,3 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
     "dependencies": {
+        "iconv-lite": "^0.4.13",
         "urijs": "^1.16.1"
@@ -36,0 +37,0 @@ },
New author
Supply chain risk: a new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but they do indicate a change to the security surface area of a package.
Found 1 instance in 1 package.
+ Added iconv-lite@^0.4.13
+ Added iconv-lite@0.4.24 (transitive)
+ Added safer-buffer@2.1.2 (transitive)