Socket
Socket
Sign inDemoInstall

x-ray-crawler

Package Overview
Dependencies
121
Maintainers
1
Versions
7
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 1.0.1 to 2.0.0

5

History.md
2.0.0 / 2015-03-25
==================
* sparkling new (functional) api
1.0.1 / 2015-03-20

@@ -3,0 +8,0 @@ ==================

26

lib/http-driver.js

@@ -5,3 +5,3 @@ /**

var superagent = require('superagent');
var superagent = require('superagent')

@@ -12,3 +12,3 @@ /**

module.exports = driver;
module.exports = driver

@@ -23,3 +23,3 @@ /**

function driver(opts) {
var agent = superagent.agent(opts || {});
var agent = superagent.agent(opts || {})

@@ -31,12 +31,10 @@ return function http_driver(ctx, fn) {

.end(function(err, res) {
if (err) return fn(err);
if (err) return fn(err)
ctx.status = res.status;
ctx.set(res.headers);
ctx.status = res.status
ctx.set(res.headers)
if ('application/json' == ctx.type) {
ctx.body = res.body;
} else {
ctx.body = res.text;
}
ctx.body = 'application/json' == ctx.type
? res.body
: res.text

@@ -46,7 +44,7 @@ // update the URL if there were redirects

? res.redirects.pop()
: ctx.url;
: ctx.url
return fn(null, ctx);
});
return fn(null, ctx)
})
}
}
/**
* Export `Crawler`
* Export `Create`
*/
module.exports = Crawler;
module.exports = Crawler

@@ -11,29 +11,21 @@ /**

var Emitter = require('emitter-component');
var context = require('http-context');
var delegate = require('delegates');
var parse = require('url').parse;
var cheerio = require('cheerio');
var selectn = require('selectn');
var enqueue = require('enqueue');
var yieldly = require('yieldly');
var wrapfn = require('wrap-fn');
var isArray = Array.isArray;
var ms = require('ms');
var context = require('http-context')
var enqueue = require('enqueue')
var wrapfn = require('wrap-fn')
var noop = function(){}
var ms = require('ms')
/**
* Debug
* Locals
*/
var debug = require('debug')('x-ray-crawler');
var rate_limit = require('./rate-limit')
var http = require('./http-driver')
var range = require('./range')
/**
* Locals
* Debug
*/
var absolutes = require('./absolute-path');
var rate_limit = require('./rate-limit');
var driver = require('./http-driver');
var select = require('./select');
var range = require('./range');
var debug = require('debug')('x-ray-crawler')

@@ -43,319 +35,209 @@ /**

*
* @param {String} url
* @return {Crawler}
* @param {Function} driver (options)
* @return {Function} crawler(url, fn)
* @api public
*/
function Crawler(url) {
if (!(this instanceof Crawler)) return new Crawler(url);
function Crawler(driver) {
driver = driver || http()
// default state
this.state = {
response: function(){},
paginate: function(){},
throttle: rate_limit(),
request: function(){},
driver: driver(),
delay_to: false,
limit: Infinity,
delay: range(),
timeout: false,
throws: false,
url: url
};
}
// defaults
var throttle = rate_limit()
var concurrency = Infinity
var limit = Infinity
var request = noop
var timeout = false
var response = noop
var delay = range()
var driver = driver
var queue = false
/**
* Mixin `Emitter`
*/
/**
* Make a request
*/
Emitter(Crawler.prototype);
function crawler(url, fn) {
// co support
if (1 == arguments.length) {
return function _crawler(fn) {
return crawler(url, fn)
}
}
/**
* Delegates
*/
if (!queue) {
var options = {
concurrency: concurrency,
timeout: timeout,
limit: limit
}
delegate(Crawler.prototype, 'state')
.fluent('concurrency')
.fluent('response')
.fluent('request')
.fluent('driver')
.fluent('throws')
.fluent('limit')
.getter('url');
/**
* Throttle according to a rate limit
*
* @param {Number|String} requests
* @param {Number|String} rate
* @return {Number|Crawler}
* @api public
*/
Crawler.prototype.throttle = function(requests, rate) {
if (!arguments.length) return this.state.throttle;
else if (1 == arguments.length) rate = requests, requests = 1;
rate = /^\d/.test(rate) ? rate : 1 + rate;
rate = 'string' == typeof rate ? ms(rate) : rate;
this.state.throttle = rate_limit(requests, rate);
return this;
};
/**
* Delay subsequent requests
*
* @param {String|Number} from
* @param {String|Number} to (optional)
* @return {Number|Crawler}
* @api public
*/
Crawler.prototype.delay = function(from, to) {
if (!arguments.length) return this.state.delay;
from = 'string' == typeof from ? ms(from) : from;
to = 'string' == typeof to ? ms(to) : to;
this.state.delay = range(from, to);
return this;
};
/**
* Specify a request timeout
*
* @param {String|Number} timeout
* @return {Number|Crawler}
* @api public
*/
Crawler.prototype.timeout = function(timeout) {
if (!arguments.length) return this.state.timeout;
timeout = 'string' == typeof timeout ? ms(timeout) : timeout;
this.state.timeout = timeout;
return this;
};
/**
* Paginate through responses
*
* @param {String|Function} fn
* @return {String|Function|Crawler}
* @api public
*/
Crawler.prototype.paginate = function(fn) {
if (!arguments.length) return this.state.paginate;
this.state.paginate = 'function' != typeof fn
? compile
: fn;
function compile($, ctx) {
var response = ctx.response;
if (response.is('html') || response.is('xml')) {
return select($, fn);
} else if (response.is('json')) {
return json_select(fn, $);
queue = enqueue(get, options)
queue(url, fn)
} else {
return [];
schedule(url, fn)
}
return crawler
}
return this;
};
/**
* Fetch the `url` based on the `driver`
*
* @param {String} url
* @param {Function} fn
*/
/**
* Initiate the crawl
*
* @param {Function} done (optional)
* @return {Crawl|Generator}
* @api public
*/
function get(url, fn) {
debug('getting: %s', url)
var ctx = context()
ctx.url = url
Crawler.prototype.crawl = yieldly(function(done) {
done = done || noop;
// request hook
request(ctx.request)
var limit = this.paginate() ? this.limit() : 1;
var concurrency = this.concurrency();
var paginate = this.paginate();
var throttle = this.throttle();
var throws = this.throws();
var delay = this.delay();
var url = this.url;
var scheduled = 0;
var self = this;
var tid = {};
wrapfn(driver, result)(ctx)
// initial request
debug('initial request: %s', url);
// HTTP response
function result(err, res) {
if (err) return fn(err)
// queue options
var options = {
concurrency: this.concurrency(),
timeout: this.timeout(),
limit: limit
};
// update the context
if (res && res != ctx) ctx.body = res
// setup the queue
var queue = enqueue(function(url, next) {
self.get(url, next);
}, options);
// post-flight. modify the response
response(ctx.response)
// kick us off
scheduled++;
queue(url, onjobfinish(url));
// handle the response
function onresponse(ctx) {
var response = ctx.response;
var isJSON = response.is('json');
var isHTML = response.is('html');
var isXML = response.is('xml');
var urls = [];
var $;
debug('response: %j', {
url: ctx.url,
status: ctx.status,
type: ctx.type
});
// load response
if (isHTML || isXML) {
$ = cheerio.load(ctx.body, { xmlMode: isXML });
$ = absolutes(ctx.url, $);
} else {
$ = ctx.body;
fn(null, ctx)
}
}
// send a response
self.emit('response', $, ctx);
/**
* Schedule another request for later
*
* @param {String} url
* @param {Function} fn
*/
// where we going next?
var next_page = paginate($, ctx);
if (next_page) {
debug('next page(s): %j', next_page)
urls = urls.concat(next_page).filter(canRequest);
}
function schedule(url, fn) {
// if we've reached the limit, don't request anymore
if (--limit <= 0) return
// queue up the next round of urls
if (urls.length) {
urls.forEach(schedule);
} else {
debug('no next page, finishing up.')
}
}
// schedule the next url
function schedule(url) {
// if we've reached the limit, don't request anymore
if (--limit <= 0) return;
// if specified, throttle requests and add a delay
var wait = throttle() + delay();
debug('queued "%s", waiting "%sms"', url, wait);
var wait = throttle() + delay()
scheduled++;
debug('queued: "%s", waiting "%sms"', url, wait)
setTimeout(function() {
// queue up next request
var queued = queue(url, onjobfinish(url));
if (!queued) return;
}, wait);
var queued = queue(url, fn)
if (!queued) return
}, wait)
}
// handle jobs finishing
function onjobfinish(url) {
return function(err, ctx) {
if (err) {
debug('job (%s) error: %s', url, err.message);
err.url = url;
self.emit('error', err);
} else if (ctx) {
onresponse(ctx);
debug('job finished: %s', url);
}
/**
* Throttle according to a rate limit
*
* @param {Number|String} requests
* @param {Number|String} rate
* @return {Number|Crawler}
* @api public
*/
if (--scheduled <= 0) {
return done();
}
crawler.throttle = function(requests, rate) {
if (!arguments.length) return throttle
if (1 == arguments.length) {
rate = requests
requests = 1
}
rate = /^\d/.test(rate) ? rate : 1 + rate
rate = 'string' == typeof rate ? ms(rate) : rate
throttle = rate_limit(requests, rate)
return crawler
}
return this;
});
/**
* Delay subsequent requests
*
* @param {String|Number} from
* @param {String|Number} to (optional)
* @return {Number|Crawler}
* @api public
*/
/**
* Make a request using the
* specified driver
*
* @param {String} url
* @param {Function} fn
* @return {Crawler}
* @api private
*/
crawler.delay = function(from, to) {
if (!arguments.length) return delay
from = 'string' == typeof from ? ms(from) : from
to = 'string' == typeof to ? ms(to) : to
delay = range(from, to)
return crawler
}
Crawler.prototype.get = function(url, fn) {
var response = this.response();
var request = this.request();
var driver = this.driver();
var ctx = context();
ctx.url = url;
/**
* Specify a request timeout
*
* @param {String|Number} timeout
* @return {Number|Crawler}
* @api public
*/
// pre-flight. modify the request
request(ctx.request);
crawler.timeout = function(_) {
if (!arguments.length) return _
timeout = 'string' == typeof _ ? ms(_) : _
return crawler
}
// call the driver
debug('request %j', {
driver: driver.name,
url: url
});
/**
* Specify a request concurrency
*
* @param {Number} n
* @return {Number|crawler}
*/
// short circuit the driver for testing
if (ctx.body) {
done(null, ctx);
} else {
wrapfn(driver, done)(ctx);
crawler.concurrency = function(n) {
if (!arguments.length) return concurrency
concurrency = n
return crawler
}
function done(err, res) {
if (err) {
return fn(err);
}
/**
* Hook into the request
*
* @param {Function} fn
* @return {Function|crawler}
*/
// update the context
if (res && res != ctx) ctx.body = res;
crawler.request = function(fn) {
if (!arguments.length) return request
request = fn
return crawler
}
// post-flight. modify the response
response(ctx.response);
/**
* Hook into the response
*
* @param {Function} fn
* @return {Function|crawler}
*/
fn(null, ctx);
crawler.response = function(fn) {
if (!arguments.length) return response
response = fn
return crawler
}
return this;
};
/**
* Limit the total number of requests
*
* @param {Number} n
* @return {Number|crawler}
*/
/**
* Query a JSON object
*
* @param {String} selector
* @param {Object} json
* @return {String|Array}
*/
crawler.limit = function(n) {
if (!arguments.length) return limit
limit = n
return crawler
}
function json_select(selector, json) {
return isArray(json)
? json.map(function(obj) { return selectn(selector, obj); })
: selectn(selector, obj);
return crawler
}
/**
* Can we make a request?
*
* @param {String} url
* @return {Boolean}
*/
function canRequest(url) {
return 'string' == typeof url
? parse(url).protocol
: false;
}

@@ -5,3 +5,3 @@ /**

module.exports = range;
module.exports = range

@@ -17,8 +17,8 @@ /**

function range(from, to) {
from = from || 0;
to = to || from;
from = from || 0
to = to || from
return function() {
return Math.floor(Math.random() * to) + from;
return Math.floor(Math.random() * (to - from + 1) + from)
}
}

@@ -5,3 +5,3 @@ /**

module.exports = rate_limit;
module.exports = rate_limit

@@ -17,25 +17,25 @@ /**

function rate_limit(requests, rate) {
requests = requests || Infinity;
rate = rate || 0;
requests = requests || Infinity
rate = rate || 0
var rate = Math.round(rate / requests);
var waiting = 0;
var called = 0;
var tids = [];
var rate = Math.round(rate / requests)
var waiting = 0
var called = 0
var tids = []
return function _rate_limit(fn) {
// clear all timeouts if _rate_limit(0);
if (0 === fn) return tids.forEach(clearTimeout);
// clear all timeouts if _rate_limit(0)
if (0 === fn) return tids.forEach(clearTimeout)
var calling = new Date();
var delta = calling - called;
var free = delta > rate && !waiting;
var calling = new Date()
var delta = calling - called
var free = delta > rate && !waiting
if (free) {
called = calling;
return 0;
called = calling
return 0
} else {
var wait = (rate - delta) + (waiting++ * rate);
timer(wait);
return wait;
var wait = (rate - delta) + (waiting++ * rate)
timer(wait)
return wait
}

@@ -45,7 +45,7 @@

tids[tids.length] = setTimeout(function() {
called = new Date();
waiting--;
}, ms);
called = new Date()
waiting--
}, ms)
}
}
}
{
"name": "x-ray-crawler",
"version": "1.0.1",
"version": "2.0.0",
"description": "x-ray's crawler",

@@ -12,3 +12,3 @@ "main": "lib/index.js",

"enqueue": "^1.0.2",
"http-context": "^1.0.0",
"http-context": "^1.1.0",
"ms": "^0.7.0",

@@ -15,0 +15,0 @@ "selectn": "^0.9.6",

@@ -7,3 +7,2 @@ # x-ray Crawler

- Flexible pagination
- Extensible drivers

@@ -20,15 +19,17 @@ - Request and response hooks

```js
crawler('http://google.com')
function http(ctx, fn) {
superagent.get(ctx.url, fn)
}
var crawl = Crawler(http)
.throttle(3, '1s')
.delay('1s', '10s')
.concurrency(2)
.paginate('a @ href')
.limit(20)
.on('response', function($, ctx) {
console.log('title: %s', $('title').text().trim());
})
.crawl(function(err, res) {
if (err) throw err;
console.log('done!');
});
crawl('http://lapwinglabs.com', function(err, ctx) {
if (err) throw err
console.log('status code: %s', ctx.status)
console.log('status body: %s', ctx.body)
})
```

@@ -35,0 +36,0 @@

@@ -5,3 +5,3 @@ /**

var crawler = require('./');
var crawler = require('./')

@@ -12,14 +12,11 @@ /**

crawler('http://google.com')
var crawl = crawler()
.throttle(3, '1s')
.delay('1s', '10s')
.concurrency(2)
.paginate('a @ href')
.limit(20)
.on('response', function($, ctx) {
console.log('title: %s', $('title').text().trim());
})
.crawl(function(err, res) {
if (err) throw err;
console.log('done!');
});
crawl('http://lapwinglabs.com', function(err, ctx) {
if (err) throw err
console.log(ctx.status)
})
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc