simplecrawler
Comparing version 0.3.10 to 0.3.11
@@ -19,2 +19,4 @@ // Simplecrawler
+var QUEUE_ITEM_INITIAL_DEPTH = 1;
+
 /*
@@ -162,2 +164,11 @@ Public: Constructor for the crawler.
+    // Whether to parse inside HTML comments
+    crawler.parseHTMLComments = true;
+
+    // Whether to parse inside script tags
+    crawler.parseScriptTags = true;
+
+    // Max depth parameter
+    crawler.maxDepth = 0;
+
     // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
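The three new options above are plain properties on the crawler instance. A minimal usage sketch (the host and path are made up for illustration; maxDepth = 0, the default, leaves depth unlimited per the depthAllowed check further down):

    var Crawler = require("simplecrawler");

    var crawler = new Crawler("example.com", "/", 80);
    crawler.parseHTMLComments = false; // skip URLs found inside <!-- ... -->
    crawler.parseScriptTags = false;   // skip URLs found inside <script> ... </script>
    crawler.maxDepth = 3;              // stop discovering HTML pages past depth 3
    crawler.start();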
@@ -214,2 +225,3 @@ var hiddenProps = {
         crawler.initialPath,
+        QUEUE_ITEM_INITIAL_DEPTH,
         function(error) {
@@ -296,2 +308,40 @@ if (error) throw error;
+/*
+    Public: Determines whether a queueItem may be processed, given its depth.
+    In fact, the queueItem needs to have been fetched before calling this
+    (because we need its MIMEType). This just determines whether we should
+    emit an event for the item and whether we should fetch its linked
+    resources.
+
+    If the queue item is a CSS or JS file, it is always processed (we need
+    all the images referenced in CSS files, even if the max depth has
+    already been reached). If it's an HTML page, we check whether the max
+    depth has been reached.
+
+    queueItem - Queue item object to check
+
+    Returns a boolean: true if the queue item may be processed, false if not.
+*/
+Crawler.prototype.depthAllowed = function(queueItem) {
+    var crawler = this;
+
+    // Items matching one of these patterns are always fetched, even if the
+    // max depth has been reached
+    var mimeTypesWhitelist = [
+        /^text\/(css|javascript|ecmascript)/i,
+        /^application\/javascript/i,
+        /^application\/x-font/i,
+        /^application\/font/i,
+        /^image\//i,
+        /^font\//i
+    ];
+
+    return (
+        crawler.maxDepth === 0 ||
+        queueItem.depth <= crawler.maxDepth ||
+        mimeTypesWhitelist.reduce(function(prev, mimeCheck) {
+            return prev || !!mimeCheck.exec(queueItem.stateData.contentType);
+        }, false)
+    );
+};
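To illustrate the whitelist logic (an editor's sketch, not part of the diff): with a depth limit set, HTML past the limit is refused, while whitelisted MIME types pass at any depth.

    var crawler = new Crawler("example.com", "/", 80);
    crawler.maxDepth = 2;

    crawler.depthAllowed({ depth: 3, stateData: { contentType: "text/html" } }); // false: past maxDepth, not whitelisted
    crawler.depthAllowed({ depth: 3, stateData: { contentType: "text/css" } });  // true: CSS is always allowed
    crawler.depthAllowed({ depth: 3, stateData: { contentType: "image/png" } }); // true: images are always allowed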
 /*
     Public: Extracts protocol, host, port and resource (path) given a URL string.
@@ -318,3 +368,4 @@
             crawler.initialPort + "/"
-        )
+        ),
+        depth: QUEUE_ITEM_INITIAL_DEPTH
     };
@@ -347,6 +398,7 @@
         "protocol": newURL.protocol() || "http",
-        "host": newURL.hostname(),
-        "port": newURL.port() || 80,
-        "path": newURL.resource(),
-        "uriPath": newURL.path()
+        "host": newURL.hostname(),
+        "port": newURL.port() || 80,
+        "path": newURL.resource(),
+        "uriPath": newURL.path(),
+        "depth": context.depth + 1
     };
@@ -383,2 +435,10 @@ };
+    if (!crawler.parseHTMLComments) {
+        resourceText = resourceText.replace(/<!--([\s\S]+?)-->/g, "");
+    }
+
+    if (!crawler.parseScriptTags) {
+        resourceText = resourceText.replace(/<script(.*?)>([\s\S]+?)<\/script>/gi, "");
+    }
+
     function cleanURL(URL) {
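A quick sketch of what the two replacements above do to a document before link discovery (illustrative input):

    var resourceText =
        "<!-- <a href='/commented'> --> <script>var a = '/scripted';</script> <a href='/real'>";

    resourceText = resourceText
        .replace(/<!--([\s\S]+?)-->/g, "")
        .replace(/<script(.*?)>([\s\S]+?)<\/script>/gi, "");

    // Only <a href='/real'> survives, so /commented and /scripted are never queued.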
@@ -404,11 +464,20 @@ return URL
     .reduce(function(list, URL) {
+        var tmpURL;
+
         // Ensure URL is whole and complete
         try {
-            URL = URI(URL)
-                .absoluteTo(queueItem.url)
-                .normalize()
-                .toString();
+            tmpURL = URI(URL);
+            if (queueItem.url) {
+                URL = tmpURL
+                    .absoluteTo(queueItem.url)
+                    .normalize()
+                    .toString();
+            } else {
+                URL = tmpURL
+                    .normalize()
+                    .toString();
+            }
         } catch(e) {
             // But if URI.js couldn't parse it - nobody can!
@@ -636,2 +705,3 @@ return list;
         parsedURL.path,
+        parsedURL.depth,
         function queueAddCallback(error, newQueueItem) {
@@ -723,7 +793,10 @@ if (error) {
                 ""
-            ),
-            "Referer": queueItem.referrer
+            )
         }
     };
+
+    if (queueItem.referrer) {
+        requestOptions.headers.Referer = queueItem.referrer;
+    }
+
     // If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts
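Background for the guard above: the first item in a crawl has no referrer, and an undefined header value is rejected by Node's HTTP client (the exact failure mode varies by Node version), so the Referer header is now set only when a referrer exists. A sketch of the failure the old code could hit:

    var http = require("http");

    // With queueItem.referrer undefined, the old code built headers like
    // this, which can make http.request throw an invalid-header-value error:
    var req = http.request({
        host: "example.com",
        path: "/",
        headers: { "Referer": undefined }
    });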
@@ -876,4 +949,2 @@ if (requestOptions.port === 80 || requestOptions.port === 443) {
-    crawler.emit("fetchcomplete", queueItem, responseBuffer, response);
-
     // First, save item to cache (if we're using a cache!)
@@ -886,6 +957,11 @@ if (crawler.cache !== null &&
-    // We only process the item if it's of a valid mimetype
-    // and only if the crawler is set to discover its own resources
-    if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
-        crawler.queueLinkedItems(responseBuffer, queueItem);
-    }
+    // Is the item allowed by the depth conditions?
+    if (crawler.depthAllowed(queueItem)) {
+        crawler.emit("fetchcomplete", queueItem, responseBuffer, response);
+
+        // We only process the item if it's of a valid mimetype
+        // and only if the crawler is set to discover its own resources
+        if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
+            crawler.queueLinkedItems(responseBuffer, queueItem);
+        }
+    }
@@ -26,3 +26,11 @@ // Simplecrawler - queue module
 FetchQueue.prototype = [];

-FetchQueue.prototype.add = function(protocol, domain, port, path, callback) {
+FetchQueue.prototype.add = function(protocol, domain, port, path, depth, callback) {
+
+    // For legacy reasons, accept the old signature with the callback in place of depth
+    if (depth instanceof Function) {
+        callback = depth;
+        depth = 1;
+    }
+
+    depth = depth || 1;
     callback = callback && callback instanceof Function ? callback : function(){};
@@ -51,2 +59,3 @@ var self = this;
         "path": path,
+        "depth": depth,
         "fetched": false,
         "status": "queued",
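A sketch of the two call styles the shim above accepts (the queue variable and values are illustrative; the callback signature matches queueAddCallback earlier in the diff):

    // New six-argument form: depth is stored on the queue item
    queue.add("http", "example.com", 80, "/page", 2, function(error, newQueueItem) {
        console.log(newQueueItem.depth); // 2
    });

    // Legacy five-argument form still works: depth defaults to 1
    queue.add("http", "example.com", 80, "/page", function(error, newQueueItem) {
        console.log(newQueueItem.depth); // 1
    });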
 {
     "name": "simplecrawler",
     "description": "Very straightforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
-    "version": "0.3.10",
+    "version": "0.3.11",
     "homepage": "http://github.com/cgiffard/node-simplecrawler",
@@ -34,9 +34,9 @@ "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
     "devDependencies": {
-        "mocha": "~1.8.2",
-        "jshint": "~0.7.x",
-        "chai": "~1.2.0"
+        "chai": "^2.2.0",
+        "jshint": "^2.7.0",
+        "mocha": "^2.2.4"
     },
     "dependencies": {
-        "URIjs": "~1.10.2"
+        "URIjs": "^1.15.0"
     }
 }
@@ -11,4 +11,4 @@ // Ensures that cookie support is functional and reliable across
 "Set-cookie: adxcs=-; path=/; domain=.nytimes.com",
-"Set-Cookie: PREF=ID=8c63f2522e22574d:FF=0:TM=1366349569:LM=1366349569:S=p1Urbmfwfs-R573P; expires=Sun, 19-Apr-2015 05:32:49 GMT; path=/; domain=.google.com",
-"Set-Cookie: NID=67=DhLO04YPAMlhETrTIe2oFPqWZfypQXLZfCIPItOvf70zhtUEMEItYfdVh6aROEzRHqtd9jHT6HJ7Oo93eqP3cjYNp8GgShfa6r0WVbsmQQRUvutbjBOPwzo7ybwYcWdB; expires=Sat, 19-Oct-2015 05:32:49 GMT; path=/; domain=.google.com; HttpOnly",
+"Set-Cookie: PREF=ID=8c63f2522e22574d:FF=0:TM=1366349569:LM=1366349569:S=p1Urbmfwfs-R573P; expires=Sun, 19-Apr-2020 05:32:49 GMT; path=/; domain=.google.com",
+"Set-Cookie: NID=67=DhLO04YPAMlhETrTIe2oFPqWZfypQXLZfCIPItOvf70zhtUEMEItYfdVh6aROEzRHqtd9jHT6HJ7Oo93eqP3cjYNp8GgShfa6r0WVbsmQQRUvutbjBOPwzo7ybwYcWdB; expires=Sat, 19-Oct-2020 05:32:49 GMT; path=/; domain=.google.com; HttpOnly",
 "Set-Cookie: fpc=d=Yq1z8hbA9WextmPFlb7suMTfMRgtSc2FyzAB7now1ExfUZ.eW7s4QSwSKlB6ZB0juN8OLZxWf_XXEIcspYaQmVVD0mD0xJ.xpXBCSw5Dl_Ql6n.RLoM.7CnTbNSsiSr2fkNiCN47tRUB4j8iWevNwQdFDn1hB8z8t1xwWt76n.sLIRY9p2_jTBhukfSD4SBpBkJhI1o-&v=2; expires=Sat, 19-Apr-2020 05:48:42 GMT; path=/; domain=www.yahoo.com",
 "Set-Cookie: test=test; path=/test; domain=test.com"
@@ -8,5 +8,11 @@ // Runs a very simple crawl on an HTTP server
-var Crawler = require("../"),
-    crawler = new Crawler(),
+var Crawler = null,
+    crawler = null,
+    discover = null;
+
+beforeEach(function() {
+    Crawler = require("../");
+    crawler = new Crawler();
+    discover = crawler.discoverResources.bind(crawler);
+});
@@ -64,2 +70,36 @@ it("should discover http/s prefixed URLs in the document",function() {
     });
+
+    it("should ignore HTML comments with parseHTMLComments = false", function() {
+        crawler.parseHTMLComments = false;
+        var links =
+            discover("<!-- http://example.com/oneline_comment --> \
+                <a href=google.com> \
+                <!-- \
+                http://example.com/resource \
+                <a href=example.com> \
+                -->");
+
+        links.should.be.an("array");
+        links.length.should.equal(1);
+        links[0].should.equal("google.com");
+    });
+
+    it("should ignore script tags with parseScriptTags = false", function() {
+        crawler.parseScriptTags = false;
+        var links =
+            discover("<script>var a = \"<a href='http://example.com/oneline_script'></a>\";</script> \
+                <a href=google.com> \
+                <script type='text/javascript'> \
+                http://example.com/resource \
+                <a href=example.com> \
+                </SCRIPT>");
+
+        links.should.be.an("array");
+        links.length.should.equal(1);
+        links[0].should.equal("google.com");
+    });
 });
@@ -43,3 +43,48 @@ // Routes for testing server
         // We want to trigger a timeout. Never respond.
     },
+
+    // Routes for depth tests
+    "/depth/1": function(write) {
+        write(200, "<link rel='stylesheet' href='/css'> Home. <a href='/depth/2'>depth2</a>");
+    },
+    "/depth/2": function(write) {
+        write(200, "Depth 2. http://127.0.0.1:3000/depth/3");
+    },
+    "/depth/3": function(write) {
+        write(200, "Depth 3. <link rel='stylesheet' href='/css/2'> <link rel='stylesheet' href='/css/4'>");
+    },
+    "/css": function(write) {
+        write(200, "/* CSS 1 */ @import url('/css/2'); @font-face { url(/font/1) format('woff'); }", "text/css");
+    },
+    "/css/2": function(write) {
+        write(200, "/* CSS 2 */ @import url('/css/3'); .img1 { background-image:url('/img/1'); }", "text/css");
+    },
+    "/css/3": function(write) {
+        write(200, "/* CSS 3 */", "text/css");
+    },
+    "/css/4": function(write) {
+        write(200, "/* CSS 4 */ .img1 { background-image:url('/img/2'); } @font-face { url(/font/2) format('woff'); }", "text/css");
+    },
+    "/img/1": function(write) {
+        write(200, "", "image/png");
+    },
+    "/img/2": function(write) {
+        write(200, "", "image/png");
+    },
+    "/font/1": function(write) {
+        write(200, "", "font/woff");
+    },
+    "/font/2": function(write) {
+        write(200, "", "application/font-woff");
+    }
 };
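These routes form a small graph for the depth tests: /depth/1 links to /depth/2, which links to /depth/3, with stylesheets, images and fonts hanging off each level. A sketch of a crawl against the fixture (an editor's illustration of the expected behaviour, not part of the diff):

    var Crawler = require("simplecrawler");

    var crawler = new Crawler("127.0.0.1", "/depth/1", 3000);
    crawler.maxDepth = 2;

    crawler.on("fetchcomplete", function(queueItem) {
        // /depth/3 sits at depth 3, so no fetchcomplete fires for it and its
        // links are never queued; /css/2 (also depth 3) still completes
        // because text/css is on the MIME whitelist.
        console.log(queueItem.depth, queueItem.path);
    });

    crawler.start();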
@@ -79,2 +79,9 @@ // Runs a very simple crawl on an HTTP server
+    it('should not throw an error if header Referer is undefined', function(done) {
+        var crawler = new Crawler("127.0.0.1", "/depth/1", 3000);
+        crawler.maxDepth = 1;
+
+        crawler.start();
+        crawler.on('complete', function() { done(); });
+    });
+
     // TODO
Dependency changes:

+ Added: URIjs@1.16.1 (transitive)
- Removed: URIjs@1.10.2 (transitive)
Updated: URIjs@^1.15.0