crawler - npm Package Compare versions

Comparing version 0.2.1 to 0.2.2

test/mockfiles/links1.html (78 lines changed; file contents not shown in this view)

lib/crawler.js

@@ -1,2 +0,1 @@
 var http = require('http'),

@@ -10,2 +9,3 @@ path = require('path'),
 jsdom = require('jsdom'),
+fs = require("fs"),
 Pool = require('generic-pool').Pool;

@@ -22,8 +22,9 @@
 jQuery: true,
-jQueryUrl: path.resolve(__dirname,"../vendor/jquery-1.8.1.min.js"),
+jQueryUrl: path.resolve(__dirname,"../vendor/jquery-1.8.3.min.js"),
 maxConnections: 10,
 priorityRange: 10,
 priority: 5,
 retries: 3,
 forceUTF8: false,
+autoWindowClose:true,
 retryTimeout: 10000,

@@ -66,7 +67,6 @@ method: "GET",
 if (queuedCount+plannedQueueCallsCount === 0) {
 if (self.options.onDrain) self.options.onDrain();
 }
-}
 };
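The drain check above fires the user-supplied onDrain hook once nothing is queued or planned. As a hedged sketch of how a caller would use it (constructor style as in this package's README; the mock URL is made up):

    // Sketch only: exit once the crawl queue is fully drained.
    var Crawler = require("crawler").Crawler;

    var c = new Crawler({
        maxConnections: 10,
        callback: function (error, result, $) {
            if (!error) console.log($("title").text());
        },
        // Invoked when queuedCount + plannedQueueCallsCount reaches zero:
        onDrain: function () {
            console.log("All queued pages crawled.");
            process.exit(0);
        }
    });

    c.queue("http://localhost:8080/mockfiles/links1.html");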

@@ -92,3 +92,3 @@ self.onDrain = function() {};
-// Make sure we actually have cached data, and not just a note
+// Make sure we actually have cached data, and not just a note
 // that the page was already crawled

@@ -123,3 +123,7 @@ if (_.isArray(cacheData)) {
-request(ropts, function(error,response,body) {
+var requestArgs = ["uri","url","qs","method","headers","body","form","json","multipart","followRedirect","followAllRedirects",
+"maxRedirects","encoding","pool","timeout","proxy","oauth","strictSSL","jar","aws"];
+request(_.pick.apply(this,[ropts].concat(requestArgs)), function(error,response,body) {
 if (error) return self.onContent(error, opts);
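The change above whitelists which options are forwarded to request(), so crawler-only keys (callback, jQuery, cache, ...) are no longer passed into the HTTP layer. In the underscore version pinned here, _.pick takes its keys as varargs, hence the .apply trick. A small illustration with made-up values:

    var _ = require("underscore");

    var ropts = {
        uri: "http://example.com/",   // request() option: kept
        timeout: 10000,               // request() option: kept
        jQuery: true,                 // crawler-only option: dropped
        callback: function () {}      // crawler-only option: dropped
    };
    var requestArgs = ["uri", "timeout" /* , ... */];

    // _.pick(obj, key1, key2, ...) wants keys as arguments, so spread the array:
    var picked = _.pick.apply(this, [ropts].concat(requestArgs));
    // picked => { uri: "http://example.com/", timeout: 10000 }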

@@ -129,2 +133,3 @@
 self.onContent(error,opts,response,false);
 });

@@ -188,3 +193,3 @@ };
 //If we don't cache but still want to skip duplicates we have to maintain a list of fetched URLs.
-} else if (toQueue.skipDuplicates) {
+} else if (toQueue.skipDuplicates) {
 self.cache[toQueue.uri] = true;
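As the comment says, skipDuplicates reuses the cache map as a set of already-fetched URLs even when response caching is off. Hypothetical usage, with option names taken from this diff:

    // Sketch only: the second queue() call for the same URI is skipped.
    var c = new Crawler({
        cache: false,
        skipDuplicates: true,
        callback: function (error, result, $) { /* ... */ }
    });

    c.queue("http://example.com/page.html");
    c.queue("http://example.com/page.html"); // found in self.cache, not refetched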

@@ -200,15 +205,48 @@ }
-// TODO support for non-HTML content
+// TODO support for non-HTML content
 // https://github.com/joshfire/node-crawler/issues/9
 try {
-    jsdom.env(response.body,[toQueue.jQueryUrl],function(errors,window) {
-        if (errors) {
-            toQueue.callback(errors);
-        } else {
-            response.window = window;
-            toQueue.callback(null,response,window.jQuery);
-        }
-        release(toQueue);
-    });
+    var jsd = function(src) {
+        var env = jsdom.env({
+            "url":toQueue.uri,
+            "html":response.body,
+            "src":src,
+            "done":function(errors,window) {
+                if (errors) {
+                    toQueue.callback(errors);
+                } else {
+                    response.window = window;
+                    toQueue.callback(null,response,window.jQuery);
+                }
+                // Free jsdom memory
+                if (toQueue.autoWindowClose) {
+                    try {
+                        window.close();
+                    } catch (err) {}
+                }
+                release(toQueue);
+            }
+        });
+    };
+    // jsdom doesn't support adding local scripts,
+    // we have to read jQuery from the local fs
+    if (toQueue.jQueryUrl.match(/^(file\:\/\/|\/)/)) {
+        // TODO cache this
+        fs.readFile(toQueue.jQueryUrl.replace(/^file\:\/\//,""),"utf-8",function(err,jq) {
+            if (err) {
+                toQueue.callback(err);
+                release(toQueue);
+            } else {
+                jsd([jq]);
+            }
+        });
+    } else {
+        jsd([toQueue.jQueryUrl]);
+    }
 } catch (e) {
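This rework is what fixes the relative-link bug listed in the changelog below: passing the crawled page's uri as url to jsdom.env makes the DOM resolve every a.href against that base. A standalone sketch of the behavior, using the same jsdom ~0.2.x object-style API as above:

    var jsdom = require("jsdom");

    jsdom.env({
        url: "http://example.com/deep/page.html",
        html: '<body><a href="../other.html">link</a></body>',
        done: function (errors, window) {
            // Resolved against the url option, so this logs an absolute URL:
            // "http://example.com/other.html"
            console.log(window.document.getElementsByTagName("a")[0].href);
            window.close(); // what autoWindowClose:true now does for you
        }
    });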

@@ -223,3 +261,3 @@ toQueue.callback(e);
 release(toQueue);
-}
+}

@@ -292,5 +330,5 @@ };
 },toQueue.priority);
 }
-};
+};
package.json

 {
 "name": "crawler",
-"version": "0.2.1",
+"version": "0.2.2",
 "description": "Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously.",

@@ -38,5 +38,5 @@ "keywords": [
 "dependencies": {
-"request": "2.11.1",
+"request": "2.12.0",
 "jsdom": "0.2.19",
-"generic-pool": "2.0.1",
+"generic-pool": "2.0.2",
 "htmlparser": "1.7.6",

@@ -49,3 +49,4 @@ "underscore": "1.3.3",
 "qunit": "0.5.8",
-"express": "2.5.x"
+"express": "2.5.x",
+"memwatch": "0.1.5"
 },

@@ -52,0 +53,0 @@ "scripts": {

README.md

@@ -36,3 +36,3 @@ node-crawler
 // $ is a jQuery instance scoped to the server-side DOM of the page
-$("#content a:link").each(function(a) {
+$("#content a").each(function(a) {
 c.queue(a.href);

@@ -101,2 +101,3 @@ });
 * jQueryUrl: String, path to the jQuery file you want to insert (Defaults to bundled jquery-1.8.1.min.js)
+* autoWindowClose: Boolean, if false you will have to close the window yourself with result.window.close(). Useful when your callback needs to keep the window open until some async code has run. (Default true)
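A hedged example of the new option: keep the window alive across async work in the callback, then release the jsdom memory manually, as the bullet above describes:

    var c = new Crawler({
        autoWindowClose: false,
        callback: function (error, result, $) {
            // The window must survive until this async work finishes:
            setTimeout(function () {
                console.log($("a").length + " links");
                result.window.close(); // free jsdom memory yourself
            }, 1000);
        }
    });

    c.queue("http://example.com/");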

@@ -113,3 +114,10 @@ Charset encoding:
+Memory leaks
+------------
+When using timeouts, you should run Node > 0.8.14 to avoid triggering [Node #3076](https://github.com/joyent/node/pull/3076).
+There is now a complete memory leak test for node-crawler :)
 How to test
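memwatch 0.1.5 enters the devDependencies above for exactly this purpose. The real test/units/leaks.js is not shown in this diff, so here is only a hedged sketch of how such a check can be built with memwatch's documented HeapDiff API (the /bigpage route is the one added to the mock server below):

    var memwatch = require("memwatch");

    memwatch.on("leak", function (info) {      // fired on sustained heap growth
        console.error("possible leak:", info);
    });

    var hd = new memwatch.HeapDiff();          // heap snapshot before crawling

    // ... crawl http://localhost:8080/bigpage many times here ...

    var diff = hd.end();                       // snapshot again and compare
    console.log(diff.change.size_bytes + " bytes retained");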

@@ -138,2 +146,11 @@ -----------
+0.2.2
+ - Fix relative link bug: all a.href should now be absolute when crawling a remote URL
+ - Updated default jQuery to 1.8.3, request to 2.12.0, generic-pool to 2.0.2
+ - Fixed memory leak by adding the autoWindowClose option
+ - Added memory leak test
 0.2.1
 - Updated jsdom to 0.2.19
 0.2.0

@@ -140,0 +157,0 @@ - Updated code & dependencies for node 0.6/0.8, cleaned package.json

@@ -28,2 +28,17 @@ var express = require('express');
-exports.app = app;
+//100k page
+var bigpage = new Array(100).join(new Array(100).join("1234567890"));
+app.get('/bigpage', function(req, res){
+    res.send("<html><body>"+bigpage+"</body></html>");
+});
+app.get('/mockfiles/*', function(req, res){
+    res.sendfile("test/mockfiles/"+req.param(0));
+});
+exports.app = app;
+if (require.main === module) {
+    app.listen(8080);
+}

@@ -19,7 +19,13 @@ #!/usr/bin/env node
 tests: [
 path + "/test/units/links.js",
 path + "/test/units/forceutf8.js",
 path + "/test/units/simple.js",
-path + "/test/units/errors.js"
+path + "/test/units/errors.js",
+path + "/test/units/leaks.js"
 ]
 }
 }
 ],function() {

@@ -26,0 +32,0 @@ console.log("Stopping mockserver...");
