simplecrawler
Advanced tools
Comparing version 0.2.9 to 0.2.10
@@ -138,2 +138,5 @@ // Simplecrawler | ||
// Strip Querystring Parameters from URL | ||
crawler.stripQuerystring = false; | ||
// STATE (AND OTHER) VARIABLES NOT TO STUFF WITH | ||
@@ -237,2 +240,6 @@ crawler._openRequests = 0; | ||
// Unspecified protocol. Assume http | ||
if (!protocol) | ||
protocol = "http"; | ||
} catch(e) { | ||
@@ -299,2 +306,6 @@ // If URIjs died, we definitely /do not/ support the protocol. | ||
// Check if querystring should be ignored | ||
if (crawler.stripQuerystring === true) | ||
URL = crawler.removeQuerystring(URL); | ||
try { | ||
@@ -344,8 +355,15 @@ newURL = | ||
crawler = this; | ||
if (!queueItem) | ||
queueItem = {}; | ||
if (!queueItem.protocol) | ||
queueItem.protocol = "http"; | ||
// Regular expressions for finding URL items in HTML and text | ||
var discoverRegex = [ | ||
/(\shref\s?=\s?|\ssrc\s?=\s?|url\()['"]?([^"'\s>\)]+)/ig, | ||
/(\shref\s?=\s?|\ssrc\s?=\s?|url\()([^\"\'\s>\)]+)/ig, | ||
/(\shref\s?=\s?|\ssrc\s?=\s?|url\()['"]([^"']+)/ig, | ||
/http(s)?\:\/\/[^?\s><\'\"]+/ig, | ||
/url\([^)]+/ig, | ||
/url\([^\)]+/ig, | ||
@@ -368,4 +386,2 @@ // This might be a bit of a gamble... but get hard-coded | ||
.replace(/\&/i,"&") | ||
.split(/\s+/g) | ||
.shift() | ||
.split("#") | ||
@@ -390,2 +406,3 @@ .shift(); | ||
} catch(e) { | ||
// But if URI.js couldn't parse it - nobody can! | ||
@@ -417,2 +434,8 @@ return list; | ||
resourceText.match(regex))); | ||
},[]) | ||
.reduce(function(list,check) { | ||
if (list.indexOf(check) < 0) | ||
return list.concat([check]); | ||
return list; | ||
},[]); | ||
@@ -476,3 +499,3 @@ }; | ||
host = host.toLowerCase(); | ||
// If we're ignoring www, remove it from both | ||
@@ -1060,2 +1083,22 @@ // (if www is the first domain component...) | ||
/*
	Public: Given a URL, removes the querystring if one is present.

	url	-	URL string to strip. Everything from the first "?" onward
			is discarded, which includes any "#fragment" that follows
			the querystring.

	Examples

		crawler.removeQuerystring("http://example.com/page?foo=bar");
		// => "http://example.com/page"

	Returns the URL without its querystring. A URL containing no "?" is
	returned unchanged, as is a null or undefined argument.

*/
Crawler.prototype.removeQuerystring = function(url) {
	// There is nothing to strip from a missing URL - hand it back
	// untouched instead of throwing a TypeError on the property access.
	if (url === null || url === undefined)
		return url;

	// Locate the start of the querystring once and reuse the position.
	var queryStart = url.indexOf("?");

	if (queryStart < 0)
		return url;

	// slice() is used in place of the deprecated substr().
	return url.slice(0, queryStart);
};
module.exports = Crawler; |
{ | ||
"name": "simplecrawler", | ||
"description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.", | ||
"version": "0.2.9", | ||
"version": "0.2.10", | ||
"homepage": "http://github.com/cgiffard/node-simplecrawler", | ||
@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>", |
Sorry, the diff of this file is not supported yet
107513
24
2281
569