Comparing version 0.2.1 to 0.2.2
@@ -1,2 +0,1 @@
var http = require('http'),
@@ -10,2 +9,3 @@ path = require('path'),
jsdom = require('jsdom'),
fs = require("fs"),
Pool = require('generic-pool').Pool;
@@ -22,8 +22,9 @@
jQuery: true,
jQueryUrl: path.resolve(__dirname,"../vendor/jquery-1.8.1.min.js"),
jQueryUrl: path.resolve(__dirname,"../vendor/jquery-1.8.3.min.js"),
maxConnections: 10,
priorityRange: 10,
priority: 5,
priority: 5,
retries: 3,
forceUTF8: false,
autoWindowClose: true,
retryTimeout: 10000,
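The hunk above only shows the changed defaults. As a rough usage sketch (assuming the `require('crawler').Crawler` constructor documented in the README further down; option values here are illustrative, not from the diff), these defaults are overridden when constructing a crawler:

```js
// Illustrative sketch, not part of the diff: overriding the defaults shown above.
var Crawler = require('crawler').Crawler;

var c = new Crawler({
    maxConnections: 10,     // parallel requests, as in the defaults above
    retries: 3,
    retryTimeout: 10000,
    forceUTF8: false,
    autoWindowClose: true,  // new in 0.2.2: close the jsdom window automatically
    callback: function (error, result, $) {
        if (error) return console.error(error);
        console.log($('title').text());
    }
});

c.queue('http://example.com/');
```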
@@ -66,7 +67,6 @@ method: "GET",
if (queuedCount+plannedQueueCallsCount === 0) {
if (self.options.onDrain) self.options.onDrain();
}
}
};
@@ -92,3 +92,3 @@ self.onDrain = function() {};
// Make sure we actually have cached data, and not just a note
// Make sure we actually have cached data, and not just a note
// that the page was already crawled
@@ -123,3 +123,7 @@ if (_.isArray(cacheData)) {
request(ropts, function(error,response,body) {
var requestArgs = ["uri","url","qs","method","headers","body","form","json","multipart","followRedirect","followAllRedirects",
"maxRedirects","encoding","pool","timeout","proxy","oauth","strictSSL","jar","aws"];
request(_.pick.apply(this,[ropts].concat(requestArgs)), function(error,response,body) {
if (error) return self.onContent(error, opts);
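The change above replaces a direct pass-through of `ropts` with a whitelist: only option names that `request` understands are forwarded, so crawler-specific options never leak into the HTTP layer. In isolation, the `_.pick.apply` pattern works roughly like this (a standalone sketch; the option values are illustrative):

```js
// Illustrative sketch of the whitelist technique used above (not part of the diff).
var _ = require('underscore');

var ropts = { uri: 'http://example.com/', timeout: 10000, jQuery: true, cache: false };
var requestArgs = ['uri', 'method', 'headers', 'timeout', 'proxy'];

// _.pick(obj, key1, key2, ...) copies only the listed keys, so crawler-specific
// options such as jQuery or cache never reach request(). apply() spreads the array.
var picked = _.pick.apply(this, [ropts].concat(requestArgs));
// picked -> { uri: 'http://example.com/', timeout: 10000 }
```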
@@ -129,2 +133,3 @@
self.onContent(error,opts,response,false);
});
@@ -188,3 +193,3 @@ };
//If we don't cache but still want to skip duplicates we have to maintain a list of fetched URLs.
} else if (toQueue.skipDuplicates) {
} else if (toQueue.skipDuplicates) {
self.cache[toQueue.uri] = true;
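For context, `skipDuplicates` is the option that drives this branch: with caching off, the crawler still records fetched URIs in `self.cache` so the same URL is not fetched twice. A hedged usage sketch (the option name comes from the crawler README; the URL and callback are illustrative):

```js
// Illustrative sketch: skipDuplicates keeps a plain object of visited URIs
// (self.cache[uri] = true) even though full response caching is disabled.
var Crawler = require('crawler').Crawler;

var c = new Crawler({
    skipDuplicates: true,   // remember visited URIs without caching bodies
    callback: function (error, result, $) {
        if (!error) console.log(result.uri);
    }
});

c.queue('http://example.com/');
c.queue('http://example.com/');  // the same URI will not be fetched twice
```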
@@ -200,15 +205,48 @@ }
// TODO support for non-HTML content
// TODO support for non-HTML content
// https://github.com/joshfire/node-crawler/issues/9
try {
jsdom.env(response.body,[toQueue.jQueryUrl],function(errors,window) {
if (errors) {
toQueue.callback(errors);
} else {
response.window = window;
toQueue.callback(null,response,window.jQuery);
}
var jsd = function(src) {
var env = jsdom.env({
"url":toQueue.uri,
"html":response.body,
"src":src,
"done":function(errors,window) {
release(toQueue);
});
if (errors) {
toQueue.callback(errors);
} else {
response.window = window;
toQueue.callback(null,response,window.jQuery);
}
// Free jsdom memory
if (toQueue.autoWindowClose) {
try {
window.close();
} catch (err) {}
}
release(toQueue);
}
});
};
// jsdom doesn't support adding local scripts,
// We have to read jQuery from the local fs
if (toQueue.jQueryUrl.match(/^(file\:\/\/|\/)/)) {
// TODO cache this
fs.readFile(toQueue.jQueryUrl.replace(/^file\:\/\//,""),"utf-8",function(err,jq) {
if (err) {
toQueue.callback(err);
release(toQueue);
} else {
jsd([jq]);
}
});
} else {
jsd([toQueue.jQueryUrl]);
}
} catch (e) {
@@ -223,3 +261,3 @@ toQueue.callback(e);
release(toQueue);
}
}
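The new code wraps `jsdom.env` in a `jsd(src)` helper so jQuery can be injected either from the local filesystem or from a URL, and passes `url` so relative links resolve against the crawled page. Taken on its own, the jsdom 0.2.x call introduced above looks roughly like this (the HTML, page URL, and jQuery path are illustrative):

```js
// Illustrative sketch of the jsdom.env() usage introduced above (not the crawler itself).
var jsdom = require('jsdom');
var fs = require('fs');

var html = '<html><body><a href="/next">next</a></body></html>';  // stand-in page body
var jq = fs.readFileSync('vendor/jquery-1.8.3.min.js', 'utf-8');   // local jQuery, read from disk

jsdom.env({
    url: 'http://example.com/page',   // base URL, so a.href comes back absolute
    html: html,
    src: [jq],                        // script source strings to run in the window
    done: function (errors, window) {
        if (errors) return console.error(errors);
        var $ = window.jQuery;
        console.log($('a').get(0).href);  // absolute, e.g. http://example.com/next
        window.close();                   // free jsdom memory, as autoWindowClose now does
    }
});
```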
@@ -292,5 +330,5 @@ };
},toQueue.priority);
}
};
};
{
"name": "crawler",
"version": "0.2.1",
"version": "0.2.2",
"description": "Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously.",
@@ -38,5 +38,5 @@ "keywords": [
"dependencies": {
"request": "2.11.1",
"request": "2.12.0",
"jsdom": "0.2.19",
"generic-pool": "2.0.1",
"generic-pool": "2.0.2",
"htmlparser": "1.7.6",
@@ -49,3 +49,4 @@ "underscore": "1.3.3",
"qunit": "0.5.8",
"express": "2.5.x"
"express": "2.5.x",
"memwatch": "0.1.5"
},
@@ -52,0 +53,0 @@ "scripts": {
@@ -36,3 +36,3 @@ node-crawler
// $ is a jQuery instance scoped to the server-side DOM of the page
$("#content a:link").each(function(a) {
$("#content a").each(function(a) {
c.queue(a.href);
@@ -101,2 +101,3 @@ });
* jQueryUrl: String, path to the jQuery file you want to insert (Defaults to bundled jquery-1.8.1.min.js)
* autoWindowClose: Boolean, if false you will have to close the window yourself with result.window.close(). Useful when your callback needs to continue having the window open until some async code has run. (Default true)
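When a callback needs the window after it returns (for example to finish async work), the new option can be turned off as described above. A hedged sketch (URI and timing are illustrative, not from the diff):

```js
// Illustrative sketch of autoWindowClose: false.
var Crawler = require('crawler').Crawler;

var c = new Crawler({
    autoWindowClose: false,  // keep the jsdom window alive after the callback returns
    callback: function (error, result, $) {
        if (error) return console.error(error);
        setTimeout(function () {
            console.log($('title').text());  // window (and $) still usable here
            result.window.close();           // now release jsdom memory ourselves
        }, 1000);
    }
});

c.queue('http://example.com/');
```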
@@ -113,3 +114,10 @@ Charset encoding:
Memory leaks
------------
When using timeouts, you should use Node > 0.8.14 to avoid triggering [Node #3076](https://github.com/joyent/node/pull/3076).
There is now a complete memory leak test for node-crawler :)
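The new test/units/leaks.js itself is not shown in this diff. As a rough idea of what a memwatch-based check can look like (memwatch 0.1.5 is the devDependency added above; the crawl loop, counts, and use of the mock server's /bigpage route are assumptions, not the actual test):

```js
// Illustrative sketch of a memory-leak check with memwatch (not the actual leaks.js).
var memwatch = require('memwatch');
var Crawler = require('crawler').Crawler;

memwatch.on('leak', function (info) {
    // memwatch emits 'leak' when the heap keeps growing across consecutive GCs
    console.error('possible leak:', info);
});

var hd = new memwatch.HeapDiff();
var c = new Crawler({
    callback: function (error, result, $) {
        // autoWindowClose (default true) should keep jsdom windows from piling up
    },
    onDrain: function () {
        console.log(hd.end());  // heap growth between the two snapshots
    }
});

for (var i = 0; i < 100; i++) {
    c.queue('http://localhost:8080/bigpage');  // the ~100k mock page added in the test server
}
```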
How to test
@@ -138,2 +146,11 @@ -----------
0.2.2
- Fix relative link bug, all a.href should be absolute when crawling a remote URL
- Updated default jQuery to 1.8.3, request to 2.12.0, generic-pool to 2.0.2
- Fixed memory leak by adding the autoWindowClose option
- Added memory leak test
0.2.1
- Updated jsdom to 0.2.19
0.2.0
@@ -140,0 +157,0 @@ - Updated code & dependencies for node 0.6/0.8, cleaned package.json
@@ -28,2 +28,17 @@ var express = require('express');
exports.app = app;
//100k page
var bigpage = new Array(100).join(new Array(100).join("1234567890"));
app.get('/bigpage', function(req, res){
res.send("<html><body>"+bigpage+"</body></html>");
});
app.get('/mockfiles/*', function(req, res){
res.sendfile("test/mockfiles/"+req.param(0));
});
exports.app = app;
if (require.main === module) {
app.listen(8080);
}
@@ -19,7 +19,13 @@ #!/usr/bin/env node
tests: [
path + "/test/units/links.js",
path + "/test/units/forceutf8.js",
path + "/test/units/simple.js",
path + "/test/units/errors.js"
path + "/test/units/errors.js",
path + "/test/units/leaks.js"
]
}
}
],function() {
@@ -26,0 +32,0 @@ console.log("Stopping mockserver...");
New author
Supply chain risk: A new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but do indicate a change to the security surface area of a package.
Found 1 instance in 1 package
Non-existent author
Supply chain risk: The package was published by an npm account that no longer exists.
Found 1 instance in 1 package
+ Added generic-pool@2.0.2 (transitive)
+ Added request@2.12.0 (transitive)
- Removed generic-pool@2.0.1 (transitive)
- Removed request@2.11.1 (transitive)
Updated generic-pool@2.0.2
Updated request@2.12.0