New Case Study: See how Anthropic automated 95% of dependency reviews with Socket. Learn More
Socket
Sign inDemoInstall
Socket

simplecrawler

Package Overview
Dependencies
Maintainers
2
Versions
70
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

simplecrawler - npm Package Compare versions

Comparing version 0.2.9 to 0.2.10

test/discovery.js

53

lib/crawler.js

@@ -138,2 +138,5 @@ // Simplecrawler

// Strip Querystring Parameters from URL
crawler.stripQuerystring = false;
// STATE (AND OTHER) VARIABLES NOT TO STUFF WITH

@@ -237,2 +240,6 @@ crawler._openRequests = 0;

// Unspecified protocol. Assume http
if (!protocol)
protocol = "http";
} catch(e) {

@@ -299,2 +306,6 @@ // If URIjs died, we definitely /do not/ support the protocol.

// Check if querystring should be ignored
if (crawler.stripQuerystring === true)
URL = crawler.removeQuerystring(URL);
try {

@@ -344,8 +355,15 @@ newURL =

crawler = this;
if (!queueItem)
queueItem = {};
if (!queueItem.protocol)
queueItem.protocol = "http";
// Regular expressions for finding URL items in HTML and text
var discoverRegex = [
/(\shref\s?=\s?|\ssrc\s?=\s?|url\()['"]?([^"'\s>\)]+)/ig,
/(\shref\s?=\s?|\ssrc\s?=\s?|url\()([^\"\'\s>\)]+)/ig,
/(\shref\s?=\s?|\ssrc\s?=\s?|url\()['"]([^"']+)/ig,
/http(s)?\:\/\/[^?\s><\'\"]+/ig,
/url\([^)]+/ig,
/url\([^\)]+/ig,

@@ -368,4 +386,2 @@ // This might be a bit of a gamble... but get hard-coded

.replace(/\&amp;/i,"&")
.split(/\s+/g)
.shift()
.split("#")

@@ -390,2 +406,3 @@ .shift();

} catch(e) {
// But if URI.js couldn't parse it - nobody can!

@@ -417,2 +434,8 @@ return list;

resourceText.match(regex)));
},[])
.reduce(function(list,check) {
if (list.indexOf(check) < 0)
return list.concat([check]);
return list;
},[]);

@@ -476,3 +499,3 @@ };

host = host.toLowerCase();
// If we're ignoring www, remove it from both

@@ -1060,2 +1083,22 @@ // (if www is the first domain component...)

/*
    Public: Strips the querystring from a URL, if it has one.

    url     -   URL string to strip the querystring from.

    Examples

        crawler.removeQuerystring(url);

    Returns everything before the first "?" in the URL, or the URL unchanged
    when it contains no "?".

*/
Crawler.prototype.removeQuerystring = function(url) {
    var queryStart = url.indexOf("?");

    // No "?" anywhere in the URL: there is nothing to strip.
    if (queryStart < 0)
        return url;

    // Keep only the part that precedes the first "?".
    return url.substring(0, queryStart);
};
module.exports = Crawler;

2

package.json
{
"name": "simplecrawler",
"description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
"version": "0.2.9",
"version": "0.2.10",
"homepage": "http://github.com/cgiffard/node-simplecrawler",

@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc