simplecrawler
Advanced tools
Comparing version 0.0.5 to 0.0.6
@@ -252,3 +252,3 @@ // Simplecrawler | ||
urlMatch.forEach(function(URL) { | ||
URL = URL.replace(/^(href|src)=['"]?/i,"").replace(/^\s*/,""); | ||
URL = URL.replace(/^(\s?href|\s?src)=['"]?/i,"").replace(/^\s*/,""); | ||
URL = URL.replace(/^url\(['"]*/i,""); | ||
@@ -279,3 +279,3 @@ URL = URL.replace(/^javascript\:[a-z0-9]+\(['"]/i,""); | ||
// Rough scan for URLs | ||
cleanAndQueue(resourceText.match(/(href\s?=\s?|src\s?=\s?|url\()['"]?([^"'\s>\)]+)/ig)); | ||
cleanAndQueue(resourceText.match(/(\shref\s?=\s?|\ssrc\s?=\s?|url\()['"]?([^"'\s>\)]+)/ig)); | ||
cleanAndQueue(resourceText.match(/http(s)?\:\/\/[^?\s><\'\"]+/ig)); | ||
@@ -282,0 +282,0 @@ cleanAndQueue(resourceText.match(/url\([^)]+/ig)); |
{ | ||
"name": "simplecrawler", | ||
"description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.", | ||
"version": "0.0.5", | ||
"version": "0.0.6", | ||
"homepage": "http://github.com/cgiffard/node-simplecrawler", | ||
@@ -6,0 +6,0 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>", |
18
queue.js
@@ -20,2 +20,3 @@ // Simplecrawler - queue module | ||
this.completeCache = 0; | ||
this.scanIndex = {}; | ||
}; | ||
@@ -64,14 +65,11 @@ | ||
callback = callback && callback instanceof Function ? callback : function(){}; | ||
var self = this; | ||
var count = self.filter(function(item) { | ||
if (String(item.protocol).toLowerCase() === String(protocol).toLowerCase() && | ||
String(item.domain).toLowerCase() === String(domain).toLowerCase() && | ||
parseInt(item.port,10) === parseInt(port,10) && | ||
item.path === path) return true; | ||
return false; | ||
}).length; | ||
var url = (protocol + "://" + domain + (port !== 80 ? ":" + port : "") + path).toLowerCase(); | ||
callback(null,count); | ||
if (!!this.scanIndex[url]) { | ||
callback(null,1); | ||
} else { | ||
this.scanIndex[url] = true; | ||
callback(null,0); | ||
} | ||
}; | ||
@@ -78,0 +76,0 @@ |
Sorry, the diff of this file is not supported yet
53077
981