New Case Study: See how Anthropic automated 95% of dependency reviews with Socket. Learn More
Socket
Sign inDemoInstall
Socket

simplecrawler

Package Overview
Dependencies
Maintainers
2
Versions
70
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

simplecrawler - npm Package Compare versions

Comparing version 0.3.10 to 0.3.11

test/depth.js

108

lib/crawler.js

@@ -19,2 +19,4 @@ // Simplecrawler

var QUEUE_ITEM_INITIAL_DEPTH = 1;
/*

@@ -162,2 +164,11 @@ Public: Constructor for the crawler.

// Whether to parse inside HTML comments
crawler.parseHTMLComments = true;
// Whether to parse inside script tags
crawler.parseScriptTags = true;
// Max depth parameter
crawler.maxDepth = 0;
// STATE (AND OTHER) VARIABLES NOT TO STUFF WITH

@@ -214,2 +225,3 @@ var hiddenProps = {

crawler.initialPath,
QUEUE_ITEM_INITIAL_DEPTH,
function(error) {

@@ -296,2 +308,40 @@ if (error) throw error;

/*
	Public: Decides whether a queue item may be processed given the crawler's
	maxDepth setting. The item must already have been fetched before this is
	called, because the decision depends on its MIMEType: asset resources
	(CSS, JS, fonts, images) are always allowed through even past maxDepth,
	since pages within the depth limit may legitimately reference them. HTML
	pages are only allowed while their depth is within the configured limit.

	queueItem - Queue item object to check (reads .depth and
	            .stateData.contentType).

	Returns a boolean: true if the queue item passes the depth check.
*/
Crawler.prototype.depthAllowed = function(queueItem) {
	var crawler = this;

	// Content types fetched regardless of depth — assets that pages or
	// stylesheets inside the limit may depend on.
	var alwaysAllowedTypes = [
		/^text\/(css|javascript|ecmascript)/i,
		/^application\/javascript/i,
		/^application\/x-font/i,
		/^application\/font/i,
		/^image\//i,
		/^font\//i
	];

	// A maxDepth of zero disables depth limiting entirely.
	if (crawler.maxDepth === 0) {
		return true;
	}

	// Items at or below the configured depth always pass.
	if (queueItem.depth <= crawler.maxDepth) {
		return true;
	}

	// Past the limit, only whitelisted asset types are allowed.
	return alwaysAllowedTypes.some(function(pattern) {
		return pattern.test(queueItem.stateData.contentType);
	});
};
/*
Public: Extracts protocol, host, port and resource (path) given a URL string.

@@ -318,3 +368,4 @@

crawler.initialPort + "/"
)
),
depth: QUEUE_ITEM_INITIAL_DEPTH
};

@@ -347,6 +398,7 @@

"protocol": newURL.protocol() || "http",
"host": newURL.hostname(),
"port": newURL.port() || 80,
"path": newURL.resource(),
"uriPath": newURL.path()
"host": newURL.hostname(),
"port": newURL.port() || 80,
"path": newURL.resource(),
"uriPath": newURL.path(),
"depth": context.depth + 1
};

@@ -383,2 +435,10 @@ };

if (!crawler.parseHTMLComments) {
resourceText = resourceText.replace(/<!--([\s\S]+?)-->/g, "");
}
if (!crawler.parseScriptTags) {
resourceText = resourceText.replace(/<script(.*?)>([\s\S]+?)<\/script>/gi, "");
}
function cleanURL(URL) {

@@ -404,11 +464,20 @@ return URL

.reduce(function(list,URL) {
var tmpURL;
// Ensure URL is whole and complete
try {
URL = URI(URL)
.absoluteTo(queueItem.url)
tmpURL = URI(URL);
if (queueItem.url) {
URL = tmpURL
.absoluteTo(queueItem.url)
.normalize()
.toString();
} else {
URL = tmpURL
.normalize()
.toString();
}
} catch(e) {
// But if URI.js couldn't parse it - nobody can!

@@ -636,2 +705,3 @@ return list;

parsedURL.path,
parsedURL.depth,
function queueAddCallback(error,newQueueItem) {

@@ -723,7 +793,10 @@ if (error) {

""
),
"Referer": queueItem.referrer
)
}
};
if (queueItem.referrer) {
requestOptions.headers.Referer = queueItem.referrer;
}
// If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts

@@ -876,4 +949,2 @@ if (requestOptions.port === 80 || requestOptions.port === 443) {

crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
// First, save item to cache (if we're using a cache!)

@@ -886,6 +957,11 @@ if (crawler.cache !== null &&

// We only process the item if it's of a valid mimetype
// and only if the crawler is set to discover its own resources
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
crawler.queueLinkedItems(responseBuffer,queueItem);
// Is the item allowed by depth conditions ?
if(crawler.depthAllowed(queueItem)) {
crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
// We only process the item if it's of a valid mimetype
// and only if the crawler is set to discover its own resources
if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
crawler.queueLinkedItems(responseBuffer,queueItem);
}
}

@@ -892,0 +968,0 @@

@@ -26,3 +26,11 @@ // Simplecrawler - queue module

FetchQueue.prototype = [];
FetchQueue.prototype.add = function(protocol, domain, port, path, callback) {
FetchQueue.prototype.add = function(protocol, domain, port, path, depth, callback) {
// For legacy reasons
if (depth instanceof Function) {
callback = depth;
depth = 1;
}
depth = depth || 1;
callback = callback && callback instanceof Function ? callback : function(){};

@@ -51,2 +59,3 @@ var self = this;

"path": path,
"depth": depth,
"fetched": false,

@@ -53,0 +62,0 @@ "status": "queued",

10

package.json
{
"name": "simplecrawler",
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
"version": "0.3.10",
"version": "0.3.11",
"homepage": "http://github.com/cgiffard/node-simplecrawler",

@@ -34,9 +34,9 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",

"devDependencies": {
"mocha": "~1.8.2",
"jshint": "~0.7.x",
"chai": "~1.2.0"
"chai": "^2.2.0",
"jshint": "^2.7.0",
"mocha": "^2.2.4"
},
"dependencies": {
"URIjs": "~1.10.2"
"URIjs": "^1.15.0"
}
}

@@ -11,4 +11,4 @@ // Ensures that cookie support is functional and reliable across

"Set-cookie: adxcs=-; path=/; domain=.nytimes.com",
"Set-Cookie: PREF=ID=8c63f2522e22574d:FF=0:TM=1366349569:LM=1366349569:S=p1Urbmfwfs-R573P; expires=Sun, 19-Apr-2015 05:32:49 GMT; path=/; domain=.google.com",
"Set-Cookie: NID=67=DhLO04YPAMlhETrTIe2oFPqWZfypQXLZfCIPItOvf70zhtUEMEItYfdVh6aROEzRHqtd9jHT6HJ7Oo93eqP3cjYNp8GgShfa6r0WVbsmQQRUvutbjBOPwzo7ybwYcWdB; expires=Sat, 19-Oct-2015 05:32:49 GMT; path=/; domain=.google.com; HttpOnly",
"Set-Cookie: PREF=ID=8c63f2522e22574d:FF=0:TM=1366349569:LM=1366349569:S=p1Urbmfwfs-R573P; expires=Sun, 19-Apr-2020 05:32:49 GMT; path=/; domain=.google.com",
"Set-Cookie: NID=67=DhLO04YPAMlhETrTIe2oFPqWZfypQXLZfCIPItOvf70zhtUEMEItYfdVh6aROEzRHqtd9jHT6HJ7Oo93eqP3cjYNp8GgShfa6r0WVbsmQQRUvutbjBOPwzo7ybwYcWdB; expires=Sat, 19-Oct-2020 05:32:49 GMT; path=/; domain=.google.com; HttpOnly",
"Set-Cookie: fpc=d=Yq1z8hbA9WextmPFlb7suMTfMRgtSc2FyzAB7now1ExfUZ.eW7s4QSwSKlB6ZB0juN8OLZxWf_XXEIcspYaQmVVD0mD0xJ.xpXBCSw5Dl_Ql6n.RLoM.7CnTbNSsiSr2fkNiCN47tRUB4j8iWevNwQdFDn1hB8z8t1xwWt76n.sLIRY9p2_jTBhukfSD4SBpBkJhI1o-&v=2; expires=Sat, 19-Apr-2020 05:48:42 GMT; path=/; domain=www.yahoo.com",

@@ -15,0 +15,0 @@ "Set-Cookie: test=test; path=/test; domain=test.com"

@@ -8,5 +8,11 @@ // Runs a very simple crawl on an HTTP server

var Crawler = require("../"),
crawler = new Crawler(),
var Crawler = null,
crawler = null,
discover = null;
beforeEach(function() {
Crawler = require("../");
crawler = new Crawler();
discover = crawler.discoverResources.bind(crawler);
});

@@ -64,2 +70,36 @@ it("should discover http/s prefixed URLs in the document",function() {

});
// Verifies that with parseHTMLComments disabled, URLs appearing inside
// HTML comments (both single-line and multi-line) are stripped before
// resource discovery, while URLs outside comments are still found.
it("should ignore HTML comments with parseHTMLComments = false",function() {
crawler.parseHTMLComments = false;
// NOTE(review): the input string relies on backslash line-continuations;
// the continuation lines' leading whitespace is part of the string.
var links =
discover(" <!-- http://example.com/oneline_comment --> \
<a href=google.com> \
<!-- \
http://example.com/resource \
<a href=example.com> \
-->");
// Only the anchor outside the comments should survive discovery.
links.should.be.an("array");
links.length.should.equal(1);
links[0].should.equal("google.com");
});
// Verifies that with parseScriptTags disabled, URLs inside <script> bodies
// (including a closing tag in different case, </SCRIPT>) are stripped before
// resource discovery, while URLs outside script tags are still found.
it("should ignore script tags with parseScriptTags = false",function() {
crawler.parseScriptTags = false;
// NOTE(review): the input string relies on backslash line-continuations;
// the continuation lines' leading whitespace is part of the string.
var links =
discover(" <script>var a = \"<a href='http://example.com/oneline_script'></a>\";</script> \
<a href=google.com> \
<script type='text/javascript'> \
http://example.com/resource \
<a href=example.com> \
</SCRIPT>");
// Only the anchor outside the script tags should survive discovery.
links.should.be.an("array");
links.length.should.equal(1);
links[0].should.equal("google.com");
});
});

@@ -43,3 +43,48 @@ // Routes for testing server

// We want to trigger a timeout. Never respond.
},
// Routes for depth tests
"/depth/1": function(write) {
write(200,"<link rel='stylesheet' href='/css'> Home. <a href='/depth/2'>depth2</a>");
},
"/depth/2": function(write) {
write(200,"Depth 2. http://127.0.0.1:3000/depth/3");
},
"/depth/3": function(write) {
write(200,"Depth 3. <link rel='stylesheet' href='/css/2'> <link rel='stylesheet' href='/css/4'>");
},
"/css": function(write) {
write(200,"/* CSS 1 */ @import url('/css/2'); @font-face { url(/font/1) format('woff'); }", "text/css");
},
"/css/2": function(write) {
write(200,"/* CSS 2 */ @import url('/css/3'); .img1 { background-image:url('/img/1'); }", "text/css");
},
"/css/3": function(write) {
write(200,"/* CSS 3 */", "text/css");
},
"/css/4": function(write) {
write(200,"/* CSS 4 */ .img1 { background-image:url('/img/2'); } @font-face { url(/font/2) format('woff'); }", "text/css");
},
"/img/1": function(write) {
write(200,"", "image/png");
},
"/img/2": function(write) {
write(200,"", "image/png");
},
"/font/1": function(write) {
write(200,"", "font/woff");
},
"/font/2": function(write) {
write(200,"", "application/font-woff");
}
};

@@ -79,2 +79,9 @@ // Runs a very simple crawl on an HTTP server

// Regression test: a crawl whose queue items have no referrer must not
// throw when building request headers (the Referer header is only set
// when queueItem.referrer exists). Uses the /depth/1 route of the test
// server defined above; completion of the crawl signals success.
it('should not throw an error if header Referer is undefined', function (done) {
var crawler = new Crawler("127.0.0.1","/depth/1",3000);
// Limit the crawl so it terminates quickly for the test.
crawler.maxDepth = 1;
crawler.start();
// NOTE(review): no 'fetcherror' handler — an uncaught throw inside the
// crawler is what this test is designed to surface via mocha's timeout.
crawler.on('complete', function () { done(); });
});
// TODO

@@ -81,0 +88,0 @@

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc