x-ray-crawler
Advanced tools
Comparing version 1.0.1 to 2.0.0
2.0.0 / 2015-03-25 | ||
================== | ||
* sparkling new (functional) api | ||
1.0.1 / 2015-03-20 | ||
@@ -3,0 +8,0 @@ ================== |
@@ -5,3 +5,3 @@ /** | ||
var superagent = require('superagent'); | ||
var superagent = require('superagent') | ||
@@ -12,3 +12,3 @@ /** | ||
module.exports = driver; | ||
module.exports = driver | ||
@@ -23,3 +23,3 @@ /** | ||
function driver(opts) { | ||
var agent = superagent.agent(opts || {}); | ||
var agent = superagent.agent(opts || {}) | ||
@@ -31,12 +31,10 @@ return function http_driver(ctx, fn) { | ||
.end(function(err, res) { | ||
if (err) return fn(err); | ||
if (err) return fn(err) | ||
ctx.status = res.status; | ||
ctx.set(res.headers); | ||
ctx.status = res.status | ||
ctx.set(res.headers) | ||
if ('application/json' == ctx.type) { | ||
ctx.body = res.body; | ||
} else { | ||
ctx.body = res.text; | ||
} | ||
ctx.body = 'application/json' == ctx.type | ||
? res.body | ||
: res.text | ||
@@ -46,7 +44,7 @@ // update the URL if there were redirects | ||
? res.redirects.pop() | ||
: ctx.url; | ||
: ctx.url | ||
return fn(null, ctx); | ||
}); | ||
return fn(null, ctx) | ||
}) | ||
} | ||
} |
464
lib/index.js
/** | ||
* Export `Crawler` | ||
* Export `Create` | ||
*/ | ||
module.exports = Crawler; | ||
module.exports = Crawler | ||
@@ -11,29 +11,21 @@ /** | ||
var Emitter = require('emitter-component'); | ||
var context = require('http-context'); | ||
var delegate = require('delegates'); | ||
var parse = require('url').parse; | ||
var cheerio = require('cheerio'); | ||
var selectn = require('selectn'); | ||
var enqueue = require('enqueue'); | ||
var yieldly = require('yieldly'); | ||
var wrapfn = require('wrap-fn'); | ||
var isArray = Array.isArray; | ||
var ms = require('ms'); | ||
var context = require('http-context') | ||
var enqueue = require('enqueue') | ||
var wrapfn = require('wrap-fn') | ||
var noop = function(){} | ||
var ms = require('ms') | ||
/** | ||
* Debug | ||
* Locals | ||
*/ | ||
var debug = require('debug')('x-ray-crawler'); | ||
var rate_limit = require('./rate-limit') | ||
var http = require('./http-driver') | ||
var range = require('./range') | ||
/** | ||
* Locals | ||
* Debug | ||
*/ | ||
var absolutes = require('./absolute-path'); | ||
var rate_limit = require('./rate-limit'); | ||
var driver = require('./http-driver'); | ||
var select = require('./select'); | ||
var range = require('./range'); | ||
var debug = require('debug')('x-ray-crawler') | ||
@@ -43,319 +35,209 @@ /** | ||
* | ||
* @param {String} url | ||
* @return {Crawler} | ||
* @param {Function} driver (options) | ||
* @return {Function} crawler(url, fn) | ||
* @api public | ||
*/ | ||
function Crawler(url) { | ||
if (!(this instanceof Crawler)) return new Crawler(url); | ||
function Crawler(driver) { | ||
driver = driver || http() | ||
// default state | ||
this.state = { | ||
response: function(){}, | ||
paginate: function(){}, | ||
throttle: rate_limit(), | ||
request: function(){}, | ||
driver: driver(), | ||
delay_to: false, | ||
limit: Infinity, | ||
delay: range(), | ||
timeout: false, | ||
throws: false, | ||
url: url | ||
}; | ||
} | ||
// defaults | ||
var throttle = rate_limit() | ||
var concurrency = Infinity | ||
var limit = Infinity | ||
var request = noop | ||
var timeout = false | ||
var response = noop | ||
var delay = range() | ||
var driver = driver | ||
var queue = false | ||
/** | ||
* Mixin `Emitter` | ||
*/ | ||
/** | ||
* Make a request | ||
*/ | ||
Emitter(Crawler.prototype); | ||
function crawler(url, fn) { | ||
// co support | ||
if (1 == arguments.length) { | ||
return function _crawler(fn) { | ||
return crawler(url, fn) | ||
} | ||
} | ||
/** | ||
* Delegates | ||
*/ | ||
if (!queue) { | ||
var options = { | ||
concurrency: concurrency, | ||
timeout: timeout, | ||
limit: limit | ||
} | ||
delegate(Crawler.prototype, 'state') | ||
.fluent('concurrency') | ||
.fluent('response') | ||
.fluent('request') | ||
.fluent('driver') | ||
.fluent('throws') | ||
.fluent('limit') | ||
.getter('url'); | ||
/** | ||
* Throttle according to a rate limit | ||
* | ||
* @param {Number|String} requests | ||
* @param {Number|String} rate | ||
* @return {Number|Crawler} | ||
* @api public | ||
*/ | ||
Crawler.prototype.throttle = function(requests, rate) { | ||
if (!arguments.length) return this.state.throttle; | ||
else if (1 == arguments.length) rate = requests, requests = 1; | ||
rate = /^\d/.test(rate) ? rate : 1 + rate; | ||
rate = 'string' == typeof rate ? ms(rate) : rate; | ||
this.state.throttle = rate_limit(requests, rate); | ||
return this; | ||
}; | ||
/** | ||
* Delay subsequent requests | ||
* | ||
* @param {String|Number} from | ||
* @param {String|Number} to (optional) | ||
* @return {Number|Crawler} | ||
* @api public | ||
*/ | ||
Crawler.prototype.delay = function(from, to) { | ||
if (!arguments.length) return this.state.delay; | ||
from = 'string' == typeof from ? ms(from) : from; | ||
to = 'string' == typeof to ? ms(to) : to; | ||
this.state.delay = range(from, to); | ||
return this; | ||
}; | ||
/** | ||
* Specify a request timeout | ||
* | ||
* @param {String|Number} timeout | ||
* @return {Number|Crawler} | ||
* @api public | ||
*/ | ||
Crawler.prototype.timeout = function(timeout) { | ||
if (!arguments.length) return this.state.timeout; | ||
timeout = 'string' == typeof timeout ? ms(timeout) : timeout; | ||
this.state.timeout = timeout; | ||
return this; | ||
}; | ||
/** | ||
* Paginate through responses | ||
* | ||
* @param {String|Function} fn | ||
* @return {String|Function|Crawler} | ||
* @api public | ||
*/ | ||
Crawler.prototype.paginate = function(fn) { | ||
if (!arguments.length) return this.state.paginate; | ||
this.state.paginate = 'function' != typeof fn | ||
? compile | ||
: fn; | ||
function compile($, ctx) { | ||
var response = ctx.response; | ||
if (response.is('html') || response.is('xml')) { | ||
return select($, fn); | ||
} else if (response.is('json')) { | ||
return json_select(fn, $); | ||
queue = enqueue(get, options) | ||
queue(url, fn) | ||
} else { | ||
return []; | ||
schedule(url, fn) | ||
} | ||
return crawler | ||
} | ||
return this; | ||
}; | ||
/** | ||
* Fetch the `url` based on the `driver` | ||
* | ||
* @param {String} url | ||
* @param {Function} fn | ||
*/ | ||
/** | ||
* Initiate the crawl | ||
* | ||
* @param {Function} done (optional) | ||
* @return {Crawl|Generator} | ||
* @api public | ||
*/ | ||
function get(url, fn) { | ||
debug('getting: %s', url) | ||
var ctx = context() | ||
ctx.url = url | ||
Crawler.prototype.crawl = yieldly(function(done) { | ||
done = done || noop; | ||
// request hook | ||
request(ctx.request) | ||
var limit = this.paginate() ? this.limit() : 1; | ||
var concurrency = this.concurrency(); | ||
var paginate = this.paginate(); | ||
var throttle = this.throttle(); | ||
var throws = this.throws(); | ||
var delay = this.delay(); | ||
var url = this.url; | ||
var scheduled = 0; | ||
var self = this; | ||
var tid = {}; | ||
wrapfn(driver, result)(ctx) | ||
// initial request | ||
debug('initial request: %s', url); | ||
// HTTP response | ||
function result(err, res) { | ||
if (err) return fn(err) | ||
// queue options | ||
var options = { | ||
concurrency: this.concurrency(), | ||
timeout: this.timeout(), | ||
limit: limit | ||
}; | ||
// update the context | ||
if (res && res != ctx) ctx.body = res | ||
// setup the queue | ||
var queue = enqueue(function(url, next) { | ||
self.get(url, next); | ||
}, options); | ||
// post-flight. modify the response | ||
response(ctx.response) | ||
// kick us off | ||
scheduled++; | ||
queue(url, onjobfinish(url)); | ||
// handle the response | ||
function onresponse(ctx) { | ||
var response = ctx.response; | ||
var isJSON = response.is('json'); | ||
var isHTML = response.is('html'); | ||
var isXML = response.is('xml'); | ||
var urls = []; | ||
var $; | ||
debug('response: %j', { | ||
url: ctx.url, | ||
status: ctx.status, | ||
type: ctx.type | ||
}); | ||
// load response | ||
if (isHTML || isXML) { | ||
$ = cheerio.load(ctx.body, { xmlMode: isXML }); | ||
$ = absolutes(ctx.url, $); | ||
} else { | ||
$ = ctx.body; | ||
fn(null, ctx) | ||
} | ||
} | ||
// send a response | ||
self.emit('response', $, ctx); | ||
/** | ||
* Schedule another request for later | ||
* | ||
* @param {String} url | ||
* @param {Function} fn | ||
*/ | ||
// where we going next? | ||
var next_page = paginate($, ctx); | ||
if (next_page) { | ||
debug('next page(s): %j', next_page) | ||
urls = urls.concat(next_page).filter(canRequest); | ||
} | ||
function schedule(url, fn) { | ||
// if we've reached the limit, don't request anymore | ||
if (--limit <= 0) return | ||
// queue up the next round of urls | ||
if (urls.length) { | ||
urls.forEach(schedule); | ||
} else { | ||
debug('no next page, finishing up.') | ||
} | ||
} | ||
// schedule the next url | ||
function schedule(url) { | ||
// if we've reached the limit, don't request anymore | ||
if (--limit <= 0) return; | ||
// if specified, throttle requests and add a delay | ||
var wait = throttle() + delay(); | ||
debug('queued "%s", waiting "%sms"', url, wait); | ||
var wait = throttle() + delay() | ||
scheduled++; | ||
debug('queued: "%s", waiting "%sms"', url, wait) | ||
setTimeout(function() { | ||
// queue up next request | ||
var queued = queue(url, onjobfinish(url)); | ||
if (!queued) return; | ||
}, wait); | ||
var queued = queue(url, fn) | ||
if (!queued) return | ||
}, wait) | ||
} | ||
// handle jobs finishing | ||
function onjobfinish(url) { | ||
return function(err, ctx) { | ||
if (err) { | ||
debug('job (%s) error: %s', url, err.message); | ||
err.url = url; | ||
self.emit('error', err); | ||
} else if (ctx) { | ||
onresponse(ctx); | ||
debug('job finished: %s', url); | ||
} | ||
/** | ||
* Throttle according to a rate limit | ||
* | ||
* @param {Number|String} requests | ||
* @param {Number|String} rate | ||
* @return {Number|Crawler} | ||
* @api public | ||
*/ | ||
if (--scheduled <= 0) { | ||
return done(); | ||
} | ||
crawler.throttle = function(requests, rate) { | ||
if (!arguments.length) return throttle | ||
if (1 == arguments.length) { | ||
rate = requests | ||
requests = 1 | ||
} | ||
rate = /^\d/.test(rate) ? rate : 1 + rate | ||
rate = 'string' == typeof rate ? ms(rate) : rate | ||
throttle = rate_limit(requests, rate) | ||
return crawler | ||
} | ||
return this; | ||
}); | ||
/** | ||
* Delay subsequent requests | ||
* | ||
* @param {String|Number} from | ||
* @param {String|Number} to (optional) | ||
* @return {Number|Crawler} | ||
* @api public | ||
*/ | ||
/** | ||
* Make a request using the | ||
* specified driver | ||
* | ||
* @param {String} url | ||
* @param {Function} fn | ||
* @return {Crawler} | ||
* @api private | ||
*/ | ||
crawler.delay = function(from, to) { | ||
if (!arguments.length) return delay | ||
from = 'string' == typeof from ? ms(from) : from | ||
to = 'string' == typeof to ? ms(to) : to | ||
delay = range(from, to) | ||
return crawler | ||
} | ||
Crawler.prototype.get = function(url, fn) { | ||
var response = this.response(); | ||
var request = this.request(); | ||
var driver = this.driver(); | ||
var ctx = context(); | ||
ctx.url = url; | ||
/** | ||
* Specify a request timeout | ||
* | ||
* @param {String|Number} timeout | ||
* @return {Number|Crawler} | ||
* @api public | ||
*/ | ||
// pre-flight. modify the request | ||
request(ctx.request); | ||
crawler.timeout = function(_) { | ||
if (!arguments.length) return _ | ||
timeout = 'string' == typeof _ ? ms(_) : _ | ||
return crawler | ||
} | ||
// call the driver | ||
debug('request %j', { | ||
driver: driver.name, | ||
url: url | ||
}); | ||
/** | ||
* Specify a request concurrency | ||
* | ||
* @param {Number} n | ||
* @return {Number|crawler} | ||
*/ | ||
// short circuit the driver for testing | ||
if (ctx.body) { | ||
done(null, ctx); | ||
} else { | ||
wrapfn(driver, done)(ctx); | ||
crawler.concurrency = function(n) { | ||
if (!arguments.length) return concurrency | ||
concurrency = n | ||
return crawler | ||
} | ||
function done(err, res) { | ||
if (err) { | ||
return fn(err); | ||
} | ||
/** | ||
* Hook into the request | ||
* | ||
* @param {Function} fn | ||
* @return {Function|crawler} | ||
*/ | ||
// update the context | ||
if (res && res != ctx) ctx.body = res; | ||
crawler.request = function(fn) { | ||
if (!arguments.length) return request | ||
request = fn | ||
return crawler | ||
} | ||
// post-flight. modify the response | ||
response(ctx.response); | ||
/** | ||
* Hook into the response | ||
* | ||
* @param {Function} fn | ||
* @return {Function|crawler} | ||
*/ | ||
fn(null, ctx); | ||
crawler.response = function(fn) { | ||
if (!arguments.length) return response | ||
response = fn | ||
return crawler | ||
} | ||
return this; | ||
}; | ||
/** | ||
* Limit the total number of requests | ||
* | ||
* @param {Number} n | ||
* @return {Number|crawler} | ||
*/ | ||
/** | ||
* Query a JSON object | ||
* | ||
* @param {String} selector | ||
* @param {Object} json | ||
* @return {String|Array} | ||
*/ | ||
crawler.limit = function(n) { | ||
if (!arguments.length) return limit | ||
limit = n | ||
return crawler | ||
} | ||
function json_select(selector, json) { | ||
return isArray(json) | ||
? json.map(function(obj) { return selectn(selector, obj); }) | ||
: selectn(selector, obj); | ||
return crawler | ||
} | ||
/** | ||
* Can we make a request? | ||
* | ||
* @param {String} url | ||
* @return {Boolean} | ||
*/ | ||
function canRequest(url) { | ||
return 'string' == typeof url | ||
? parse(url).protocol | ||
: false; | ||
} |
@@ -5,3 +5,3 @@ /** | ||
module.exports = range; | ||
module.exports = range | ||
@@ -17,8 +17,8 @@ /** | ||
function range(from, to) { | ||
from = from || 0; | ||
to = to || from; | ||
from = from || 0 | ||
to = to || from | ||
return function() { | ||
return Math.floor(Math.random() * to) + from; | ||
return Math.floor(Math.random() * (to - from + 1) + from) | ||
} | ||
} |
@@ -5,3 +5,3 @@ /** | ||
module.exports = rate_limit; | ||
module.exports = rate_limit | ||
@@ -17,25 +17,25 @@ /** | ||
function rate_limit(requests, rate) { | ||
requests = requests || Infinity; | ||
rate = rate || 0; | ||
requests = requests || Infinity | ||
rate = rate || 0 | ||
var rate = Math.round(rate / requests); | ||
var waiting = 0; | ||
var called = 0; | ||
var tids = []; | ||
var rate = Math.round(rate / requests) | ||
var waiting = 0 | ||
var called = 0 | ||
var tids = [] | ||
return function _rate_limit(fn) { | ||
// clear all timeouts if _rate_limit(0); | ||
if (0 === fn) return tids.forEach(clearTimeout); | ||
// clear all timeouts if _rate_limit(0) | ||
if (0 === fn) return tids.forEach(clearTimeout) | ||
var calling = new Date(); | ||
var delta = calling - called; | ||
var free = delta > rate && !waiting; | ||
var calling = new Date() | ||
var delta = calling - called | ||
var free = delta > rate && !waiting | ||
if (free) { | ||
called = calling; | ||
return 0; | ||
called = calling | ||
return 0 | ||
} else { | ||
var wait = (rate - delta) + (waiting++ * rate); | ||
timer(wait); | ||
return wait; | ||
var wait = (rate - delta) + (waiting++ * rate) | ||
timer(wait) | ||
return wait | ||
} | ||
@@ -45,7 +45,7 @@ | ||
tids[tids.length] = setTimeout(function() { | ||
called = new Date(); | ||
waiting--; | ||
}, ms); | ||
called = new Date() | ||
waiting-- | ||
}, ms) | ||
} | ||
} | ||
} |
{ | ||
"name": "x-ray-crawler", | ||
"version": "1.0.1", | ||
"version": "2.0.0", | ||
"description": "x-ray's crawler", | ||
@@ -12,3 +12,3 @@ "main": "lib/index.js", | ||
"enqueue": "^1.0.2", | ||
"http-context": "^1.0.0", | ||
"http-context": "^1.1.0", | ||
"ms": "^0.7.0", | ||
@@ -15,0 +15,0 @@ "selectn": "^0.9.6", |
@@ -7,3 +7,2 @@ # x-ray Crawler | ||
- Flexible pagination | ||
- Extensible drivers | ||
@@ -20,15 +19,17 @@ - Request and response hooks | ||
```js | ||
crawler('http://google.com') | ||
function http(ctx, fn) { | ||
superagent.get(ctx.url, fn) | ||
} | ||
var crawl = Crawler(http) | ||
.throttle(3, '1s') | ||
.delay('1s', '10s') | ||
.concurrency(2) | ||
.paginate('a @ href') | ||
.limit(20) | ||
.on('response', function($, ctx) { | ||
console.log('title: %s', $('title').text().trim()); | ||
}) | ||
.crawl(function(err, res) { | ||
if (err) throw err; | ||
console.log('done!'); | ||
}); | ||
crawl('http://lapwinglabs.com', function(err, ctx) { | ||
if (err) throw err | ||
console.log('status code: %s', ctx.status) | ||
console.log('status body: %s', ctx.body) | ||
}) | ||
``` | ||
@@ -35,0 +36,0 @@ |
17
test.js
@@ -5,3 +5,3 @@ /** | ||
var crawler = require('./'); | ||
var crawler = require('./') | ||
@@ -12,14 +12,11 @@ /** | ||
crawler('http://google.com') | ||
var crawl = crawler() | ||
.throttle(3, '1s') | ||
.delay('1s', '10s') | ||
.concurrency(2) | ||
.paginate('a @ href') | ||
.limit(20) | ||
.on('response', function($, ctx) { | ||
console.log('title: %s', $('title').text().trim()); | ||
}) | ||
.crawl(function(err, res) { | ||
if (err) throw err; | ||
console.log('done!'); | ||
}); | ||
crawl('http://lapwinglabs.com', function(err, ctx) { | ||
if (err) throw err | ||
console.log(ctx.status) | ||
}) |
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
60
9487
9
306
2
Updated: http-context@^1.1.0