Comparing version 1.2.2 to 1.3.0
node-crawler ChangeLog | ||
------------------------- | ||
1.3.0 | ||
- [#367](https://github.com/bda-research/node-crawler/pull/367) add http2 functionality (@BeijingProtoHuman) | ||
- [#364](https://github.com/bda-research/node-crawler/pull/364) Fix some typos (@pzmarzly) | ||
- [#363](https://github.com/bda-research/node-crawler/pull/363) Remove stale vendored jQuery version (@pzmarzly) | ||
1.2.2 | ||
@@ -5,0 +10,0 @@ - [#353](https://github.com/bda-research/node-crawler/pull/353) Release automate (@mike442144) |
@@ -11,3 +11,3 @@ # Node Crawler Examples | ||
### Use Proxy with Crawler | ||
Most large scale webscraping tasks requires us to perform countless amounts of access to a specific website. This could be higly risky using only one IP address since the website could permanately or temporarily block our IP address. Instead, we can use a proxy that gives us the freedom to access websites using multiple different IPs. **Below is an example of how to use a proxy with Crawler:** | ||
Most large-scale web scraping tasks require us to access a specific website countless times. This could be very risky when using only one IP address, since the website could permanently or temporarily block it. Instead, we can use a proxy, which gives us the freedom to access websites from multiple different IPs. **Below is an example of how to use a proxy with Crawler:** | ||
```javascript | ||
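// Illustrative sketch only (the repository's full example is collapsed in this diff);
// the proxy address and target URI below are placeholder assumptions.
const Crawler = require("crawler");

const crawler = new Crawler({
    maxConnections: 10,
    callback: (error, res, done) => {
        if (error) {
            console.error(error);
        } else {
            console.log(res.$("title").text()); // res.$ is the injected cheerio instance
        }
        done();
    }
});

// Each queued task can carry its own `proxy` option, so different tasks may go out via different IPs.
crawler.queue({
    uri: "http://www.example.com",                  // hypothetical target
    proxy: "http://user:password@203.0.113.10:8080" // hypothetical proxy
});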
@@ -32,3 +32,3 @@ const Crawler = require("crawler"); | ||
Some of our web scraping tasks involve downloading images or other file types, such as grabbing images to train image recognition algorithms. | ||
With crawler, a few settings will do the trick; simply set ```encoding``` and ```jQurey``` options to ```null``` and ```false``` respectively when queuing a task. **Below is an example of downloading images with Crawler:** | ||
With crawler, a few settings will do the trick; simply set ```encoding``` and ```jQuery``` options to ```null``` and ```false``` respectively when queuing a task. **Below is an example of downloading images with Crawler:** | ||
```javascript | ||
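// Illustrative sketch only (the repository's full example is collapsed in this diff);
// the image URL and output filename below are placeholder assumptions.
const Crawler = require("crawler");
const fs = require("fs");

const crawler = new Crawler();

crawler.queue({
    uri: "https://example.com/some-image.png", // hypothetical image URL
    encoding: null, // keep response.body as a raw Buffer instead of a decoded string
    jQuery: false,  // skip cheerio injection, the body is binary data
    callback: (error, res, done) => {
        if (error) {
            console.error(error);
            return done();
        }
        // res.body is a Buffer because encoding is null
        fs.writeFile("image.png", res.body, (err) => {
            if (err) console.error(err);
            done();
        });
    }
});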
@@ -35,0 +35,0 @@ const Crawler = require("crawler"); |
@@ -1,2 +0,1 @@ | ||
'use strict'; | ||
@@ -14,17 +13,27 @@ | ||
, iconvLite = require('iconv-lite') | ||
, typeis = require('type-is').is; | ||
, typeis = require('type-is').is | ||
, qs = require('querystring'), | ||
URL = require('url').URL; | ||
var whacko=null, level, levels = ['silly','debug','verbose','info','warn','error','critical']; | ||
try{ | ||
//NOTE for polyfill purposes, because http2 is only stable since Node 10.0 | ||
let http2; | ||
try { | ||
http2 = require('http2'); | ||
} catch (e) { | ||
//NOTE left empty to pass eslint | ||
} | ||
var whacko = null, level, levels = ['silly', 'debug', 'verbose', 'info', 'warn', 'error', 'critical']; | ||
try { | ||
whacko = require('whacko'); | ||
}catch(e){ | ||
} catch (e) { | ||
e.code; | ||
} | ||
function defaultLog(){ //2016-11-24T12:22:55.639Z - debug: | ||
if( levels.indexOf(arguments[0]) >= levels.indexOf(level) ) | ||
console.log(new Date().toJSON()+' - '+ arguments[0] +': CRAWLER %s', util.format.apply(util, Array.prototype.slice.call(arguments, 1))); | ||
function defaultLog() { //2016-11-24T12:22:55.639Z - debug: | ||
if (levels.indexOf(arguments[0]) >= levels.indexOf(level)) | ||
console.log(new Date().toJSON() + ' - ' + arguments[0] + ': CRAWLER %s', util.format.apply(util, Array.prototype.slice.call(arguments, 1))); | ||
} | ||
function checkJQueryNaming (options) { | ||
function checkJQueryNaming(options) { | ||
if ('jquery' in options) { | ||
@@ -37,5 +46,5 @@ options.jQuery = options.jquery; | ||
function readJqueryUrl (url, callback) { | ||
function readJqueryUrl(url, callback) { | ||
if (url.match(/^(file:\/\/|\w+:|\/)/)) { | ||
fs.readFile(url.replace(/^file:\/\//,''),'utf-8', function(err,jq) { | ||
fs.readFile(url.replace(/^file:\/\//, ''), 'utf-8', function (err, jq) { | ||
callback(err, jq); | ||
@@ -48,7 +57,7 @@ }); | ||
function contentType(res){ | ||
return get(res,'content-type').split(';').filter(item => item.trim().length !== 0).join(';'); | ||
function contentType(res) { | ||
return get(res, 'content-type').split(';').filter(item => item.trim().length !== 0).join(';'); | ||
} | ||
function get(res,field){ | ||
function get(res, field) { | ||
return res.headers[field.toLowerCase()] || ''; | ||
@@ -59,7 +68,7 @@ } | ||
function Crawler (options) { | ||
function Crawler(options) { | ||
var self = this; | ||
options = options||{}; | ||
if(['onDrain','cache'].some(key => key in options)){ | ||
options = options || {}; | ||
if (['onDrain', 'cache'].some(key => key in options)) { | ||
throw new Error('Support for "onDrain", "cache" has been removed! For more details, see https://github.com/bda-research/node-crawler'); | ||
@@ -73,26 +82,27 @@ } | ||
Crawler.prototype.init = function init (options) { | ||
Crawler.prototype.init = function init(options) { | ||
var self = this; | ||
var defaultOptions = { | ||
autoWindowClose: true, | ||
forceUTF8: true, | ||
gzip: true, | ||
incomingEncoding: null, | ||
jQuery: true, | ||
maxConnections: 10, | ||
method: 'GET', | ||
priority: 5, | ||
priorityRange: 10, | ||
rateLimit: 0, | ||
referer: false, | ||
retries: 3, | ||
retryTimeout: 10000, | ||
timeout: 15000, | ||
skipDuplicates: false, | ||
rotateUA: false, | ||
homogeneous: false | ||
autoWindowClose: true, | ||
forceUTF8: true, | ||
gzip: true, | ||
incomingEncoding: null, | ||
jQuery: true, | ||
maxConnections: 10, | ||
method: 'GET', | ||
priority: 5, | ||
priorityRange: 10, | ||
rateLimit: 0, | ||
referer: false, | ||
retries: 3, | ||
retryTimeout: 10000, | ||
timeout: 15000, | ||
skipDuplicates: false, | ||
rotateUA: false, | ||
homogeneous: false, | ||
http2: false | ||
}; | ||
//return defaultOptions with overriden properties from options. | ||
// return defaultOptions with overridden properties from options. | ||
self.options = _.extend(defaultOptions, options); | ||
@@ -106,7 +116,10 @@ | ||
self.limiters = new Bottleneck.Cluster(self.options.maxConnections,self.options.rateLimit,self.options.priorityRange, self.options.priority, self.options.homogeneous); | ||
self.limiters = new Bottleneck.Cluster(self.options.maxConnections, self.options.rateLimit, self.options.priorityRange, self.options.priority, self.options.homogeneous); | ||
//maintain the http2 sessions | ||
self.http2Connections = {}; | ||
level = self.options.debug ? 'debug' : 'info'; | ||
if(self.options.logger) | ||
if (self.options.logger) | ||
log = self.options.logger.log.bind(self.options.logger); | ||
@@ -117,17 +130,20 @@ | ||
self.seen = new seenreq(self.options.seenreq); | ||
self.seen.initialize().then(()=> log('debug', 'seenreq is initialized.')).catch(e => log('error', e)); | ||
self.seen.initialize().then(() => log('debug', 'seenreq is initialized.')).catch(e => log('error', e)); | ||
self.on('_release', function(){ | ||
log('debug','Queue size: %d',this.queueSize); | ||
self.on('_release', function () { | ||
log('debug', 'Queue size: %d', this.queueSize); | ||
if(this.limiters.empty) | ||
if (this.limiters.empty) { | ||
if (Object.keys(self.http2Connections).length > 0) self._clearHttp2Session(); | ||
return this.emit('drain'); | ||
} | ||
}); | ||
}; | ||
Crawler.prototype.setLimiterProperty = function setLimiterProperty (limiter, property, value) { | ||
Crawler.prototype.setLimiterProperty = function setLimiterProperty(limiter, property, value) { | ||
var self = this; | ||
switch(property) { | ||
case 'rateLimit': self.limiters.key(limiter).setRateLimit(value);break; | ||
switch (property) { | ||
case 'rateLimit': self.limiters.key(limiter).setRateLimit(value); break; | ||
default: break; | ||
@@ -137,7 +153,44 @@ } | ||
Crawler.prototype._inject = function _inject (response, options, callback) { | ||
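//Derive the http2 pseudo-headers (:method, :path, :scheme, :authority) from the task's uri and method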
Crawler.prototype.generateHttp2RequestLine = function (options) { | ||
const urlObj = new URL(options.uri); | ||
const requestLine = { | ||
':method': options.method || 'GET', | ||
':path': urlObj.pathname, | ||
':scheme': urlObj.protocol.replace(':', ''), | ||
':authority': urlObj.hostname | ||
}; | ||
return requestLine; | ||
}; | ||
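//Serialize the form / json / raw body options into an http2 request body, setting the content-type header when it is missing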
Crawler.prototype.generateHttp2RequestBody = function (options) { | ||
let data = null; | ||
if (options.form) { | ||
if (!/^application\/x-www-form-urlencoded\b/.test(options.headers['content-type'])) { | ||
options.headers['content-type'] = 'application/x-www-form-urlencoded'; | ||
} | ||
data = (typeof options.form === 'string') ? encodeURIComponent(options.form) : qs.stringify(options.form); | ||
} else if (options.json) { | ||
if (!/^application\/x-www-form-urlencoded\b/.test(options.headers['content-type'])) { | ||
data = JSON.stringify(options.body); | ||
} | ||
if (!options.headers['content-type']) options.headers['content-type'] = 'application/json'; | ||
} else if (options.body !== undefined) { | ||
data = options.body; | ||
} | ||
//NOTE in the default case, nothing is done to the request body | ||
return data; | ||
}; | ||
Crawler.prototype._inject = function _inject(response, options, callback) { | ||
var $; | ||
if (options.jQuery === 'whacko') { | ||
if(!whacko){ | ||
if (!whacko) { | ||
throw new Error('Please install whacko yourself, since `crawler` detected that you specified it explicitly'); | ||
@@ -148,3 +201,3 @@ } | ||
callback(null, response, options, $); | ||
}else if (options.jQuery === 'cheerio' || options.jQuery.name === 'cheerio' || options.jQuery === true) { | ||
} else if (options.jQuery === 'cheerio' || options.jQuery.name === 'cheerio' || options.jQuery === true) { | ||
var defaultCheerioOptions = { | ||
@@ -159,3 +212,3 @@ normalizeWhitespace: false, | ||
callback(null, response, options, $); | ||
}else if (options.jQuery.jsdom) { | ||
} else if (options.jQuery.jsdom) { | ||
var jsdom = options.jQuery.jsdom; | ||
@@ -165,3 +218,3 @@ var scriptLocation = path.resolve(__dirname, '../vendor/jquery-2.1.1.min.js'); | ||
//Use promises | ||
readJqueryUrl(scriptLocation, function(err, jquery) { | ||
readJqueryUrl(scriptLocation, function (err, jquery) { | ||
try { | ||
@@ -180,3 +233,3 @@ jsdom.env({ | ||
} catch (err) { | ||
log('error',err); | ||
log('error', err); | ||
} | ||
@@ -187,3 +240,3 @@ | ||
} catch (e) { | ||
options.callback(e,{options}, options.release); | ||
options.callback(e, { options }, options.release); | ||
} | ||
@@ -198,15 +251,15 @@ }); | ||
Crawler.prototype.isIllegal = function isIllegal (options) { | ||
Crawler.prototype.isIllegal = function isIllegal(options) { | ||
return (_.isNull(options) || _.isUndefined(options) || (!_.isString(options) && !_.isPlainObject(options))); | ||
}; | ||
Crawler.prototype.direct = function direct (options) { | ||
Crawler.prototype.direct = function direct(options) { | ||
var self = this; | ||
if(self.isIllegal(options) || !_.isPlainObject(options)) { | ||
return log('warn','Illegal queue option: ', JSON.stringify(options)); | ||
if (self.isIllegal(options) || !_.isPlainObject(options)) { | ||
return log('warn', 'Illegal queue option: ', JSON.stringify(options)); | ||
} | ||
if(!('callback' in options) || !_.isFunction(options.callback)) { | ||
return log('warn','must specify callback function when using sending direct request with crawler'); | ||
if (!('callback' in options) || !_.isFunction(options.callback)) { | ||
return log('warn', 'must specify a callback function when sending a direct request with crawler'); | ||
} | ||
@@ -232,3 +285,3 @@ | ||
Crawler.prototype.queue = function queue (options) { | ||
Crawler.prototype.queue = function queue(options) { | ||
var self = this; | ||
@@ -241,9 +294,9 @@ | ||
for(var i = 0; i < options.length; ++i) { | ||
if(self.isIllegal(options[i])) { | ||
log('warn','Illegal queue option: ', JSON.stringify(options[i])); | ||
for (var i = 0; i < options.length; ++i) { | ||
if (self.isIllegal(options[i])) { | ||
log('warn', 'Illegal queue option: ', JSON.stringify(options[i])); | ||
continue; | ||
} | ||
self._pushToQueue( | ||
_.isString(options[i]) ? {uri: options[i]} : options[i] | ||
_.isString(options[i]) ? { uri: options[i] } : options[i] | ||
); | ||
@@ -253,3 +306,3 @@ } | ||
Crawler.prototype._pushToQueue = function _pushToQueue (options) { | ||
Crawler.prototype._pushToQueue = function _pushToQueue(options) { | ||
var self = this; | ||
@@ -268,3 +321,3 @@ | ||
// If duplicate skipping is enabled, avoid queueing entirely for URLs we already crawled | ||
if (!self.options.skipDuplicates){ | ||
if (!self.options.skipDuplicates) { | ||
self._schedule(options); | ||
@@ -275,3 +328,3 @@ return; | ||
self.seen.exists(options, options.seenreq).then(rst => { | ||
if(!rst){ | ||
if (!rst) { | ||
self._schedule(options); | ||
@@ -282,9 +335,10 @@ } | ||
Crawler.prototype._schedule = function _scheduler(options){ | ||
Crawler.prototype._schedule = function _scheduler(options) { | ||
var self = this; | ||
self.emit('schedule',options); | ||
//NOTE this event can be used to add a proxy outside the class | ||
self.emit('schedule', options); | ||
self.limiters.key(options.limiter||'default').submit(options.priority,function(done, limiter){ | ||
options.release = function(){ done();self.emit('_release'); }; | ||
if(!options.callback) | ||
self.limiters.key(options.limiter || 'default').submit(options.priority, function (done, limiter) { | ||
options.release = function () { done(); self.emit('_release'); }; | ||
if (!options.callback) | ||
options.callback = options.release; | ||
@@ -297,5 +351,5 @@ | ||
if (options.html) { | ||
self._onContent(null, options, {body:options.html,headers:{'content-type':'text/html'}}); | ||
self._onContent(null, options, { body: options.html, headers: { 'content-type': 'text/html' } }); | ||
} else if (typeof options.uri === 'function') { | ||
options.uri(function(uri) { | ||
options.uri(function (uri) { | ||
options.uri = uri; | ||
@@ -308,10 +362,26 @@ self._buildHttpRequest(options); | ||
}); | ||
}; | ||
Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) { | ||
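//Close and forget every cached http2 session (called once the limiter cluster drains)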
Crawler.prototype._clearHttp2Session = function _clearHttp2Session() { | ||
log('debug', `Crawler clear all ${Object.keys(this.http2Connections).length} http2 connections`); | ||
Object.keys(this.http2Connections).forEach(hostName => { | ||
this._closeAndDeleteHttp2Session(hostName); | ||
log('debug', `http2 connection to ${hostName} closed`); | ||
}); | ||
}; | ||
Crawler.prototype._closeAndDeleteHttp2Session = function _closeAndDeleteHttp2Session(targetHost) { | ||
if (this.http2Connections[targetHost]) { | ||
this.http2Connections[targetHost].close(); | ||
delete this.http2Connections[targetHost]; | ||
} | ||
}; | ||
Crawler.prototype._buildHttpRequest = function _buildHTTPRequest(options) { | ||
var self = this; | ||
log('debug',options.method+' '+options.uri); | ||
if(options.proxy) | ||
log('debug','Use proxy: %s', options.proxy); | ||
log('debug', options.method + ' ' + options.uri); | ||
if (options.proxy) | ||
log('debug', 'Use proxy: %s', options.proxy); | ||
@@ -323,16 +393,16 @@ // Cloning keeps the opts parameter clean: | ||
var ropts = _.assign({},options); | ||
var ropts = _.assign({}, options); | ||
if (!ropts.headers) { ropts.headers={}; } | ||
if (ropts.forceUTF8) {ropts.encoding=null;} | ||
if (!ropts.headers) { ropts.headers = {}; } | ||
if (ropts.forceUTF8) { ropts.encoding = null; } | ||
// specifying json in request will have request sets body to JSON representation of value and | ||
// adds Content-type: application/json header. Additionally, parses the response body as JSON | ||
// so the response will be JSON object, no need to deal with encoding | ||
if (ropts.json) {options.encoding=null;} | ||
if (ropts.json) { options.encoding = null; } | ||
if (ropts.userAgent) { | ||
if(self.options.rotateUA && _.isArray(ropts.userAgent)){ | ||
if (self.options.rotateUA && _.isArray(ropts.userAgent)) { | ||
ropts.headers['User-Agent'] = ropts.userAgent[0]; | ||
// If "rotateUA" is true, rotate User-Agent | ||
options.userAgent.push(options.userAgent.shift()); | ||
}else{ | ||
} else { | ||
ropts.headers['User-Agent'] = ropts.userAgent; | ||
@@ -350,11 +420,11 @@ } | ||
var doRequest = function(err) { | ||
if(err) { | ||
err.message = 'Error in preRequest' + (err.message ? ', '+err.message : err.message); | ||
switch(err.op) { | ||
case 'retry': log('debug', err.message + ', retry ' + options.uri);self._onContent(err,options);break; | ||
case 'fail': log('debug', err.message + ', fail ' + options.uri);options.callback(err,{options:options},options.release);break; | ||
case 'abort': log('debug', err.message + ', abort ' + options.uri);options.release();break; | ||
case 'queue': log('debug', err.message + ', queue ' + options.uri);self.queue(options);options.release();break; | ||
default: log('debug', err.message + ', retry ' + options.uri);self._onContent(err,options);break; | ||
var doRequest = function (err) { | ||
if (err) { | ||
err.message = 'Error in preRequest' + (err.message ? ', ' + err.message : err.message); | ||
switch (err.op) { | ||
case 'retry': log('debug', err.message + ', retry ' + options.uri); self._onContent(err, options); break; | ||
case 'fail': log('debug', err.message + ', fail ' + options.uri); options.callback(err, { options: options }, options.release); break; | ||
case 'abort': log('debug', err.message + ', abort ' + options.uri); options.release(); break; | ||
case 'queue': log('debug', err.message + ', queue ' + options.uri); self.queue(options); options.release(); break; | ||
default: log('debug', err.message + ', retry ' + options.uri); self._onContent(err, options); break; | ||
} | ||
@@ -364,15 +434,28 @@ return; | ||
if(ropts.skipEventRequest !== true) { | ||
self.emit('request',ropts); | ||
} | ||
//do http2.* request | ||
if (ropts.http2) { | ||
if (!http2) { | ||
process.nextTick(() => { | ||
const notSupportedHttp2Error = new Error('you are trying to use http2 API which may not be supported for your current environment or node version'); | ||
notSupportedHttp2Error.code = 'NOHTTP2SUPPORT'; | ||
self._onContent(notSupportedHttp2Error, options); | ||
}); | ||
return; | ||
} | ||
self._http2request(ropts, options); | ||
} else { | ||
if (ropts.skipEventRequest !== true) { | ||
self.emit('request', ropts); | ||
} | ||
var requestArgs = ['uri','url','qs','method','headers','body','form','formData','json','multipart','followRedirect','followAllRedirects','maxRedirects','removeRefererHeader','encoding','pool','timeout','proxy','auth','oauth','strictSSL','jar','aws','gzip','time','tunnel','proxyHeaderWhiteList','proxyHeaderExclusiveList','localAddress','forever', 'agent', 'strictSSL', 'agentOptions', 'agentClass']; | ||
var requestArgs = ['uri', 'url', 'qs', 'method', 'headers', 'body', 'form', 'formData', 'json', 'multipart', 'followRedirect', 'followAllRedirects', 'maxRedirects', 'removeRefererHeader', 'encoding', 'pool', 'timeout', 'proxy', 'auth', 'oauth', 'strictSSL', 'jar', 'aws', 'gzip', 'time', 'tunnel', 'proxyHeaderWhiteList', 'proxyHeaderExclusiveList', 'localAddress', 'forever', 'agent', 'strictSSL', 'agentOptions', 'agentClass']; | ||
request(_.pick.apply(self,[ropts].concat(requestArgs)), function(error,response) { | ||
if (error) { | ||
return self._onContent(error, options); | ||
} | ||
request(_.pick.apply(self, [ropts].concat(requestArgs)), function (error, response) { | ||
if (error) { | ||
return self._onContent(error, options); | ||
} | ||
self._onContent(error,options,response); | ||
}); | ||
self._onContent(error, options, response); | ||
}); | ||
} | ||
}; | ||
@@ -387,30 +470,130 @@ | ||
Crawler.prototype._onContent = function _onContent (error, options, response) { | ||
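//Open an http2 session to the target origin, cache it on the crawler, and log its lifecycle events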
Crawler.prototype._buildHttp2Session = function _buildHttp2Session(targetHost) { | ||
const self = this; | ||
const newHttp2Connection = self.http2Connections[targetHost] = http2.connect(targetHost); | ||
log('debug', `connect to a new ${targetHost}`); | ||
newHttp2Connection.on('error', (err) => { | ||
log('warn', `Http2 session error ${targetHost}, got error ${err}`); | ||
}).on('goaway', () => { | ||
log('debug', `Http2 session ${targetHost} connection goaway`); | ||
}).on('connect', () => { | ||
log('debug', `Http2 session ${targetHost} connection init`); | ||
}).once('close', () => { | ||
log('debug', `Http2 session ${targetHost} connection closed`); | ||
}); | ||
}; | ||
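//Perform a single request over http2: reuse (or build) the session for the origin, stream the response into a buffer, then hand the assembled response to _onContent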
Crawler.prototype._http2request = function _http2request(ropts, options) { | ||
const self = this; | ||
const targetHost = new URL(ropts.uri).origin; | ||
ropts.headers = Object.assign(ropts.headers, self.generateHttp2RequestLine(ropts)); | ||
const requestBody = ropts.headers[':method'] === 'GET' ? null : self.generateHttp2RequestBody(ropts); | ||
const response = { | ||
headers: {} | ||
}; | ||
const chunks = []; | ||
let http2Error = null; | ||
if (!self.http2Connections[targetHost] || self.http2Connections[targetHost].destroyed) { | ||
self._buildHttp2Session(targetHost); | ||
} | ||
let req = null; | ||
try { | ||
req = self.http2Connections[targetHost].request(ropts.headers); | ||
} catch (e) { | ||
//to handle the goaway issue: a goaway makes the session impossible to establish, | ||
//but this cannot be detected at the moment the stream is initialized, | ||
//so try/catch seems to be the way to solve it | ||
self._onContent(e, options, response); | ||
return; | ||
} | ||
req.on('response', headers => { | ||
//Build the response object here | ||
response.statusCode = headers[':status']; | ||
response.request = { | ||
uri: `${req.sentHeaders[':scheme']}://${req.sentHeaders[':authority']}${req.sentHeaders[':path']}`, | ||
method: req.sentHeaders[':method'], | ||
headers: Object.assign({}, req.sentHeaders, req.sentInfoHeaders) | ||
}; | ||
for (const name in headers) { | ||
response.headers[name] = headers[name]; | ||
} | ||
}); | ||
req.on('error', (err) => { | ||
log('debug', `Http2 stream error ${ropts.uri}, got error ${err}`); | ||
http2Error = err; | ||
}); | ||
req.on('data', chunk => { | ||
chunks.push(chunk); | ||
}); | ||
req.setTimeout(self.options.timeout); | ||
req.on('timeout', () => { | ||
const error = new Error('ESOCKETTIMEDOUT'); | ||
error.code = 'ESOCKETTIMEDOUT'; | ||
http2Error = error; | ||
req.close(); | ||
}); | ||
req.once('close', () => { | ||
if (http2Error) self._onContent(http2Error, options, response); | ||
else { | ||
response.body = Buffer.concat(chunks); | ||
self._onContent(null, options, response); | ||
} | ||
}); | ||
req.on('end', () => { | ||
log('debug', `${ropts.uri} stream ends`); | ||
}); | ||
//set request body | ||
req.end(requestBody); | ||
}; | ||
Crawler.prototype._onContent = function _onContent(error, options, response) { | ||
var self = this; | ||
if (error) { | ||
log('error','Error '+error+' when fetching '+ (options.uri||options.url)+(options.retries ? ' ('+options.retries+' retries left)' : '')); | ||
if (options.retries) { | ||
setTimeout(function() { | ||
options.retries--; | ||
self._schedule(options); | ||
options.release(); | ||
},options.retryTimeout); | ||
} else{ | ||
options.callback(error,{options:options},options.release); | ||
switch (error.code) { | ||
case 'NOHTTP2SUPPORT': | ||
//if the environment does not support the http2 API, any request relying on the http2 protocol | ||
//is aborted immediately, no matter how many retries are left | ||
log('error', 'Error ' + error + ' when fetching ' + (options.uri || options.url) + ', skipping all retries'); | ||
break; | ||
default: | ||
log('error', 'Error ' + error + ' when fetching ' + (options.uri || options.url) + (options.retries ? ' (' + options.retries + ' retries left)' : '')); | ||
if (options.retries) { | ||
setTimeout(function () { | ||
options.retries--; | ||
self._schedule(options); | ||
options.release(); | ||
}, options.retryTimeout); | ||
return; | ||
} | ||
break; | ||
} | ||
options.callback(error, { options: options }, options.release); | ||
return; | ||
} | ||
if (!response.body) { response.body=''; } | ||
if (!response.body) { response.body = ''; } | ||
log('debug','Got '+(options.uri||'html')+' ('+response.body.length+' bytes)...'); | ||
log('debug', 'Got ' + (options.uri || 'html') + ' (' + response.body.length + ' bytes)...'); | ||
try{ | ||
self._doEncoding(options,response); | ||
}catch(e){ | ||
log('error',e); | ||
return options.callback(e,{options:options},options.release); | ||
try { | ||
self._doEncoding(options, response); | ||
} catch (e) { | ||
return options.callback(e, { options: options }, options.release); | ||
} | ||
@@ -420,13 +603,13 @@ | ||
if(options.method === 'HEAD' || !options.jQuery){ | ||
return options.callback(null,response,options.release); | ||
if (options.method === 'HEAD' || !options.jQuery) { | ||
return options.callback(null, response, options.release); | ||
} | ||
var injectableTypes = ['html','xhtml','text/xml', 'application/xml', '+xml']; | ||
if (!options.html && !typeis(contentType(response), injectableTypes)){ | ||
log('warn','response body is not HTML, skip injecting. Set jQuery to false to suppress this message'); | ||
return options.callback(null,response,options.release); | ||
var injectableTypes = ['html', 'xhtml', 'text/xml', 'application/xml', '+xml']; | ||
if (!options.html && !typeis(contentType(response), injectableTypes)) { | ||
log('warn', 'response body is not HTML, skip injecting. Set jQuery to false to suppress this message'); | ||
return options.callback(null, response, options.release); | ||
} | ||
log('debug','Injecting'); | ||
log('debug', 'Injecting'); | ||
@@ -436,4 +619,4 @@ self._inject(response, options, self._injected.bind(self)); | ||
Crawler.prototype._injected = function(errors, response, options, $){ | ||
log('debug','Injected'); | ||
Crawler.prototype._injected = function (errors, response, options, $) { | ||
log('debug', 'Injected'); | ||
@@ -444,6 +627,6 @@ response.$ = $; | ||
Crawler.prototype._doEncoding = function(options,response){ | ||
Crawler.prototype._doEncoding = function (options, response) { | ||
var self = this; | ||
if(options.encoding === null){ | ||
if (options.encoding === null) { | ||
return; | ||
@@ -455,3 +638,3 @@ } | ||
response.charset = charset; | ||
log('debug','Charset ' + charset); | ||
log('debug', 'Charset ' + charset); | ||
@@ -466,6 +649,6 @@ if (charset !== 'utf-8' && charset !== 'ascii') {// convert response.body into 'utf-8' encoded buffer | ||
Crawler.prototype._parseCharset = function(res){ | ||
Crawler.prototype._parseCharset = function (res) { | ||
//Browsers treat gb2312 as gbk, but iconv-lite not. | ||
//Replace gb2312 with gbk, in order to parse the pages which say gb2312 but actually are gbk. | ||
function getCharset(str){ | ||
function getCharset(str) { | ||
var charset = (str && str.match(/charset=['"]?([\w.-]+)/i) || [0, null])[1]; | ||
@@ -479,7 +662,7 @@ return charset && charset.replace(/:\d{4}$|[^0-9a-z]/g, '') == 'gb2312' ? 'gbk' : charset; | ||
var charset = charsetParser(contentType(res)); | ||
if(charset) | ||
if (charset) | ||
return charset; | ||
if(!typeis(contentType(res), ['html'])){ | ||
log('debug','Charset not detected in response headers, please specify using `incomingEncoding`, use `utf-8` by default'); | ||
if (!typeis(contentType(res), ['html'])) { | ||
log('debug', 'Charset not detected in response headers, please specify using `incomingEncoding`, use `utf-8` by default'); | ||
return 'utf-8'; | ||
@@ -489,3 +672,3 @@ } | ||
var body = res.body instanceof Buffer ? res.body.toString() : res.body; | ||
charset = charsetParser(contentType(res),body,'utf-8'); | ||
charset = charsetParser(contentType(res), body, 'utf-8'); | ||
@@ -495,4 +678,4 @@ return charset; | ||
Object.defineProperty(Crawler.prototype,'queueSize',{ | ||
get:function(){ | ||
Object.defineProperty(Crawler.prototype, 'queueSize', { | ||
get: function () { | ||
return this.limiters.unfinishedClients; | ||
@@ -499,0 +682,0 @@ } |
{ | ||
"name": "crawler", | ||
"version": "1.2.2", | ||
"version": "1.3.0", | ||
"description": "Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously", | ||
@@ -12,2 +12,3 @@ "main": "./lib/crawler.js", | ||
"test": "mocha --timeout=15000 tests/*.test.js", | ||
"http2test": "mocha --timeout=15000 tests/http2*.test.js", | ||
"cover": "nyc --reporter=lcovonly --reporter=text --reporter=text-summary mocha --timeout=15000 --reporter spec tests/*.test.js" | ||
@@ -20,3 +21,3 @@ }, | ||
"engine-strict": { | ||
"node": ">=4.0.0" | ||
"node": ">=10.0.0" | ||
}, | ||
@@ -38,4 +39,4 @@ "dependencies": { | ||
"mocha": "^6.1.0", | ||
"nock": "^13.0.5", | ||
"mocha-testdata": "^1.2.0", | ||
"nock": "^10.0.6", | ||
"nyc": "^13.1.0", | ||
@@ -42,0 +43,0 @@ "sinon": "^7.0.0", |
@@ -249,2 +249,25 @@ | ||
## Work with Http2 | ||
Node-crawler now supports http2 requests. Proxy functionality for http2 requests is not included yet; it will be added in the future. | ||
```js | ||
crawler.queue({ | ||
//the unit tests work against the httpbin http2 server, which can also be used for testing | ||
uri: 'https://nghttp2.org/httpbin/status/200', | ||
method: 'GET', | ||
http2: true, //setting http2 to true makes the request use the http2 protocol | ||
callback: (error, response, done) => { | ||
if(error) { | ||
console.error(error); | ||
return done(); | ||
} | ||
console.log(`inside callback`); | ||
console.log(response.body); | ||
return done(); | ||
} | ||
}) | ||
``` | ||
## Work with bottleneck | ||
@@ -425,2 +448,6 @@ | ||
### Http2 | ||
* `options.http2`: [Boolean](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#Boolean_type) If true, the request will be sent over the http2 protocol (Default false) | ||
### Https socks5 | ||
@@ -427,0 +454,0 @@ ```js |