crawler - npm Package Compare versions

Comparing version 1.2.2 to 1.3.0

tests/http2errorHandling.test.js


CHANGELOG.md
node-crawler ChangeLog
-------------------------
1.3.0
- [#367](https://github.com/bda-research/node-crawler/pull/367) add http2 functionality (@BeijingProtoHuman)
- [#364](https://github.com/bda-research/node-crawler/pull/364) Fix some typos (@pzmarzly)
- [#363](https://github.com/bda-research/node-crawler/pull/363) Remove stale vendored jQuery version (@pzmarzly)
1.2.2

@@ -5,0 +10,0 @@ - [#353](https://github.com/bda-research/node-crawler/pull/353) Release automate (@mike442144)


examples/README.md

@@ -11,3 +11,3 @@ # Node Crawler Examples

### Use Proxy with Crawler
Most large scale webscraping tasks requires us to perform countless amounts of access to a specific website. This could be higly risky using only one IP address since the website could permanately or temporarily block our IP address. Instead, we can use a proxy that gives us the freedom to access websites using multiple different IPs. **Below is an example of how to use a proxy with Crawler:**
Most large scale webscraping tasks requires us to perform countless amounts of access to a specific website. This could be very risky using only one IP address since the website could permanently or temporarily block our IP address. Instead, we can use a proxy that gives us the freedom to access websites using multiple different IPs. **Below is an example of how to use a proxy with Crawler:**
```javascript
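// The example body is elided in this compare view; below is a minimal sketch
// of proxy usage (the proxy URL is a placeholder, not a real endpoint).
const Crawler = require("crawler");

const c = new Crawler({
    maxConnections: 10,
    callback: (error, res, done) => {
        if (error) console.error(error);
        else console.log(res.statusCode);
        done();
    }
});

c.queue({
    uri: "http://www.example.com",
    // `proxy` is forwarded to the underlying `request` library
    proxy: "http://user:password@proxy.example.com:8080"
});
```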

@@ -32,3 +32,3 @@ const Crawler = require("crawler");

Some of our web scraping tasks involves downloading images or other file types, like grabbing images to train image recognition algorithms.
With crawler, a few settings will do the trick; simply set ```encoding``` and ```jQurey``` options to ```null``` and ```false``` respectively when queuing a task. **Below is an example of downloading images with Crawler:**
With crawler, a few settings will do the trick; simply set ```encoding``` and ```jQuery``` options to ```null``` and ```false``` respectively when queuing a task. **Below is an example of downloading images with Crawler:**
```javascript
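// Elided here as well; a minimal sketch following the text above:
// `encoding: null` keeps res.body as a raw Buffer and `jQuery: false`
// skips HTML injection for binary responses.
const fs = require("fs");
const Crawler = require("crawler");

const c = new Crawler();

c.queue({
    uri: "https://nodejs.org/static/images/logo.svg", // placeholder image URL
    encoding: null,
    jQuery: false,
    callback: (err, res, done) => {
        if (err) console.error(err.stack);
        else fs.writeFileSync("logo.svg", res.body); // res.body is a Buffer here
        done();
    }
});
```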

@@ -35,0 +35,0 @@ const Crawler = require("crawler");

lib/crawler.js

@@ -1,2 +0,1 @@

'use strict';

@@ -14,17 +13,27 @@

, iconvLite = require('iconv-lite')
, typeis = require('type-is').is;
, typeis = require('type-is').is
, qs = require('querystring'),
URL = require('url').URL;
var whacko=null, level, levels = ['silly','debug','verbose','info','warn','error','critical'];
try{
//NOTE for polyfill purposes, because http2 is only stable since Node 10.0
let http2;
try {
http2 = require('http2');
} catch (e) {
//NOTE left empty to pass eslint
}
var whacko = null, level, levels = ['silly', 'debug', 'verbose', 'info', 'warn', 'error', 'critical'];
try {
whacko = require('whacko');
}catch(e){
} catch (e) {
e.code;
}
function defaultLog(){ //2016-11-24T12:22:55.639Z - debug:
if( levels.indexOf(arguments[0]) >= levels.indexOf(level) )
console.log(new Date().toJSON()+' - '+ arguments[0] +': CRAWLER %s', util.format.apply(util, Array.prototype.slice.call(arguments, 1)));
function defaultLog() { //2016-11-24T12:22:55.639Z - debug:
if (levels.indexOf(arguments[0]) >= levels.indexOf(level))
console.log(new Date().toJSON() + ' - ' + arguments[0] + ': CRAWLER %s', util.format.apply(util, Array.prototype.slice.call(arguments, 1)));
}
function checkJQueryNaming (options) {
function checkJQueryNaming(options) {
if ('jquery' in options) {

@@ -37,5 +46,5 @@ options.jQuery = options.jquery;

function readJqueryUrl (url, callback) {
function readJqueryUrl(url, callback) {
if (url.match(/^(file:\/\/|\w+:|\/)/)) {
fs.readFile(url.replace(/^file:\/\//,''),'utf-8', function(err,jq) {
fs.readFile(url.replace(/^file:\/\//, ''), 'utf-8', function (err, jq) {
callback(err, jq);

@@ -48,7 +57,7 @@ });

function contentType(res){
return get(res,'content-type').split(';').filter(item => item.trim().length !== 0).join(';');
function contentType(res) {
return get(res, 'content-type').split(';').filter(item => item.trim().length !== 0).join(';');
}
function get(res,field){
function get(res, field) {
return res.headers[field.toLowerCase()] || '';

@@ -59,7 +68,7 @@ }

function Crawler (options) {
function Crawler(options) {
var self = this;
options = options||{};
if(['onDrain','cache'].some(key => key in options)){
options = options || {};
if (['onDrain', 'cache'].some(key => key in options)) {
throw new Error('Support for "onDrain", "cache" has been removed! For more details, see https://github.com/bda-research/node-crawler');

@@ -73,26 +82,27 @@ }

Crawler.prototype.init = function init (options) {
Crawler.prototype.init = function init(options) {
var self = this;
var defaultOptions = {
autoWindowClose: true,
forceUTF8: true,
gzip: true,
incomingEncoding: null,
jQuery: true,
maxConnections: 10,
method: 'GET',
priority: 5,
priorityRange: 10,
rateLimit: 0,
referer: false,
retries: 3,
retryTimeout: 10000,
timeout: 15000,
skipDuplicates: false,
rotateUA: false,
homogeneous: false
autoWindowClose: true,
forceUTF8: true,
gzip: true,
incomingEncoding: null,
jQuery: true,
maxConnections: 10,
method: 'GET',
priority: 5,
priorityRange: 10,
rateLimit: 0,
referer: false,
retries: 3,
retryTimeout: 10000,
timeout: 15000,
skipDuplicates: false,
rotateUA: false,
homogeneous: false,
http2: false
};
//return defaultOptions with overriden properties from options.
// return defaultOptions with overridden properties from options.
self.options = _.extend(defaultOptions, options);

@@ -106,7 +116,10 @@

self.limiters = new Bottleneck.Cluster(self.options.maxConnections,self.options.rateLimit,self.options.priorityRange, self.options.priority, self.options.homogeneous);
self.limiters = new Bottleneck.Cluster(self.options.maxConnections, self.options.rateLimit, self.options.priorityRange, self.options.priority, self.options.homogeneous);
//maintain the http2 sessions
self.http2Connections = {};
level = self.options.debug ? 'debug' : 'info';
if(self.options.logger)
if (self.options.logger)
log = self.options.logger.log.bind(self.options.logger);
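The two lines above mean any object exposing a `log(level, ...args)` method can replace the default logger; a minimal sketch (the console-backed logger is illustrative, not part of the package):

```js
const Crawler = require('crawler');

// Illustrative custom logger: crawler calls logger.log(level, ...args)
// with levels such as 'debug', 'info', 'warn' and 'error'.
const myLogger = {
    log: function (level, ...args) {
        console.log(`[${level}]`, ...args);
    }
};

const c = new Crawler({ logger: myLogger });
```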

@@ -117,17 +130,20 @@

self.seen = new seenreq(self.options.seenreq);
self.seen.initialize().then(()=> log('debug', 'seenreq is initialized.')).catch(e => log('error', e));
self.seen.initialize().then(() => log('debug', 'seenreq is initialized.')).catch(e => log('error', e));
self.on('_release', function(){
log('debug','Queue size: %d',this.queueSize);
self.on('_release', function () {
log('debug', 'Queue size: %d', this.queueSize);
if(this.limiters.empty)
if (this.limiters.empty) {
if (Object.keys(self.http2Connections).length > 0) self._clearHttp2Session();
return this.emit('drain');
}
});
};
Crawler.prototype.setLimiterProperty = function setLimiterProperty (limiter, property, value) {
Crawler.prototype.setLimiterProperty = function setLimiterProperty(limiter, property, value) {
var self = this;
switch(property) {
case 'rateLimit': self.limiters.key(limiter).setRateLimit(value);break;
switch (property) {
case 'rateLimit': self.limiters.key(limiter).setRateLimit(value); break;
default: break;
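Only `rateLimit` is handled by the switch above; any other property name falls through to the no-op default. A usage sketch (the limiter name is arbitrary):

```js
// Slow the 'slow-site' limiter down to one request every 2000 ms at runtime.
crawler.setLimiterProperty('slow-site', 'rateLimit', 2000);
```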

@@ -137,7 +153,44 @@ }

Crawler.prototype._inject = function _inject (response, options, callback) {
Crawler.prototype.generateHttp2RequestLine = function (options) {
const urlObj = new URL(options.uri);
const requestLine = {
':method': options.method || 'GET',
':path': urlObj.pathname,
':scheme': urlObj.protocol.replace(':', ''),
':authority': urlObj.hostname
};
return requestLine;
};
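For illustration, the pseudo-headers this helper derives from a sample options object (the URL is taken from the http2 example further down):

```js
crawler.generateHttp2RequestLine({ uri: 'https://nghttp2.org/httpbin/status/200' });
// => { ':method': 'GET',
//      ':path': '/httpbin/status/200',
//      ':scheme': 'https',
//      ':authority': 'nghttp2.org' }
```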
Crawler.prototype.generateHttp2RequestBody = function (options) {
let data = null;
if (options.form) {
if (!/^application\/x-www-form-urlencoded\b/.test(options.headers['content-type'])) {
options.headers['content-type'] = 'application/x-www-form-urlencoded';
}
data = (typeof options.form === 'string') ? encodeURIComponent(options.form) : qs.stringify(options.form);
} else if (options.json) {
if (!/^application\/x-www-form-urlencoded\b/.test(options.headers['content-type'])) {
data = JSON.stringify(options.body);
}
if (!options.headers['content-type']) options.headers['content-type'] = 'application/json';
} else if (options.body !== undefined) {
data = options.body;
}
//NOTE in the default case, do nothing to the request body (data stays null)
return data;
};
Crawler.prototype._inject = function _inject(response, options, callback) {
var $;
if (options.jQuery === 'whacko') {
if(!whacko){
if (!whacko) {
throw new Error('Please install whacko by your own since `crawler` detected you specify explicitly');

@@ -148,3 +201,3 @@ }

callback(null, response, options, $);
}else if (options.jQuery === 'cheerio' || options.jQuery.name === 'cheerio' || options.jQuery === true) {
} else if (options.jQuery === 'cheerio' || options.jQuery.name === 'cheerio' || options.jQuery === true) {
var defaultCheerioOptions = {

@@ -159,3 +212,3 @@ normalizeWhitespace: false,

callback(null, response, options, $);
}else if (options.jQuery.jsdom) {
} else if (options.jQuery.jsdom) {
var jsdom = options.jQuery.jsdom;

@@ -165,3 +218,3 @@ var scriptLocation = path.resolve(__dirname, '../vendor/jquery-2.1.1.min.js');

//Use promises
readJqueryUrl(scriptLocation, function(err, jquery) {
readJqueryUrl(scriptLocation, function (err, jquery) {
try {

@@ -180,3 +233,3 @@ jsdom.env({

} catch (err) {
log('error',err);
log('error', err);
}

@@ -187,3 +240,3 @@

} catch (e) {
options.callback(e,{options}, options.release);
options.callback(e, { options }, options.release);
}

@@ -198,15 +251,15 @@ });

Crawler.prototype.isIllegal = function isIllegal (options) {
Crawler.prototype.isIllegal = function isIllegal(options) {
return (_.isNull(options) || _.isUndefined(options) || (!_.isString(options) && !_.isPlainObject(options)));
};
Crawler.prototype.direct = function direct (options) {
Crawler.prototype.direct = function direct(options) {
var self = this;
if(self.isIllegal(options) || !_.isPlainObject(options)) {
return log('warn','Illegal queue option: ', JSON.stringify(options));
if (self.isIllegal(options) || !_.isPlainObject(options)) {
return log('warn', 'Illegal queue option: ', JSON.stringify(options));
}
if(!('callback' in options) || !_.isFunction(options.callback)) {
return log('warn','must specify callback function when using sending direct request with crawler');
if (!('callback' in options) || !_.isFunction(options.callback)) {
return log('warn', 'must specify callback function when using sending direct request with crawler');
}

@@ -232,3 +285,3 @@

Crawler.prototype.queue = function queue (options) {
Crawler.prototype.queue = function queue(options) {
var self = this;

@@ -241,9 +294,9 @@

for(var i = 0; i < options.length; ++i) {
if(self.isIllegal(options[i])) {
log('warn','Illegal queue option: ', JSON.stringify(options[i]));
for (var i = 0; i < options.length; ++i) {
if (self.isIllegal(options[i])) {
log('warn', 'Illegal queue option: ', JSON.stringify(options[i]));
continue;
}
self._pushToQueue(
_.isString(options[i]) ? {uri: options[i]} : options[i]
_.isString(options[i]) ? { uri: options[i] } : options[i]
);

@@ -253,3 +306,3 @@ }

Crawler.prototype._pushToQueue = function _pushToQueue (options) {
Crawler.prototype._pushToQueue = function _pushToQueue(options) {
var self = this;

@@ -268,3 +321,3 @@

// If duplicate skipping is enabled, avoid queueing entirely for URLs we already crawled
if (!self.options.skipDuplicates){
if (!self.options.skipDuplicates) {
self._schedule(options);

@@ -275,3 +328,3 @@ return;

self.seen.exists(options, options.seenreq).then(rst => {
if(!rst){
if (!rst) {
self._schedule(options);

@@ -282,9 +335,10 @@ }

Crawler.prototype._schedule = function _scheduler(options){
Crawler.prototype._schedule = function _scheduler(options) {
var self = this;
self.emit('schedule',options);
//NOTE this event can be used to add a proxy from outside the class
self.emit('schedule', options);
self.limiters.key(options.limiter||'default').submit(options.priority,function(done, limiter){
options.release = function(){ done();self.emit('_release'); };
if(!options.callback)
self.limiters.key(options.limiter || 'default').submit(options.priority, function (done, limiter) {
options.release = function () { done(); self.emit('_release'); };
if (!options.callback)
options.callback = options.release;

@@ -297,5 +351,5 @@

if (options.html) {
self._onContent(null, options, {body:options.html,headers:{'content-type':'text/html'}});
self._onContent(null, options, { body: options.html, headers: { 'content-type': 'text/html' } });
} else if (typeof options.uri === 'function') {
options.uri(function(uri) {
options.uri(function (uri) {
options.uri = uri;

@@ -308,10 +362,26 @@ self._buildHttpRequest(options);

});
};
Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) {
Crawler.prototype._clearHttp2Session = function _clearHttp2Session() {
log('debug', `Crawler clear all ${Object.keys(this.http2Connections).length} http2 connections`);
Object.keys(this.http2Connections).forEach(hostName => {
this._closeAndDeleteHttp2Session(hostName);
log('debug', `http2 connection to ${hostName} closed`);
});
};
Crawler.prototype._closeAndDeleteHttp2Session = function _closeAndDeleteHttp2Session(targetHost) {
if (this.http2Connections[targetHost]) {
this.http2Connections[targetHost].close();
delete this.http2Connections[targetHost];
}
};
Crawler.prototype._buildHttpRequest = function _buildHTTPRequest(options) {
var self = this;
log('debug',options.method+' '+options.uri);
if(options.proxy)
log('debug','Use proxy: %s', options.proxy);
log('debug', options.method + ' ' + options.uri);
if (options.proxy)
log('debug', 'Use proxy: %s', options.proxy);

@@ -323,16 +393,16 @@ // Cloning keeps the opts parameter clean:

var ropts = _.assign({},options);
var ropts = _.assign({}, options);
if (!ropts.headers) { ropts.headers={}; }
if (ropts.forceUTF8) {ropts.encoding=null;}
if (!ropts.headers) { ropts.headers = {}; }
if (ropts.forceUTF8) { ropts.encoding = null; }
// specifying json in request will have request sets body to JSON representation of value and
// adds Content-type: application/json header. Additionally, parses the response body as JSON
// so the response will be JSON object, no need to deal with encoding
if (ropts.json) {options.encoding=null;}
if (ropts.json) { options.encoding = null; }
if (ropts.userAgent) {
if(self.options.rotateUA && _.isArray(ropts.userAgent)){
if (self.options.rotateUA && _.isArray(ropts.userAgent)) {
ropts.headers['User-Agent'] = ropts.userAgent[0];
// If "rotateUA" is true, rotate User-Agent
options.userAgent.push(options.userAgent.shift());
}else{
} else {
ropts.headers['User-Agent'] = ropts.userAgent;

@@ -350,11 +420,11 @@ }

var doRequest = function(err) {
if(err) {
err.message = 'Error in preRequest' + (err.message ? ', '+err.message : err.message);
switch(err.op) {
case 'retry': log('debug', err.message + ', retry ' + options.uri);self._onContent(err,options);break;
case 'fail': log('debug', err.message + ', fail ' + options.uri);options.callback(err,{options:options},options.release);break;
case 'abort': log('debug', err.message + ', abort ' + options.uri);options.release();break;
case 'queue': log('debug', err.message + ', queue ' + options.uri);self.queue(options);options.release();break;
default: log('debug', err.message + ', retry ' + options.uri);self._onContent(err,options);break;
var doRequest = function (err) {
if (err) {
err.message = 'Error in preRequest' + (err.message ? ', ' + err.message : err.message);
switch (err.op) {
case 'retry': log('debug', err.message + ', retry ' + options.uri); self._onContent(err, options); break;
case 'fail': log('debug', err.message + ', fail ' + options.uri); options.callback(err, { options: options }, options.release); break;
case 'abort': log('debug', err.message + ', abort ' + options.uri); options.release(); break;
case 'queue': log('debug', err.message + ', queue ' + options.uri); self.queue(options); options.release(); break;
default: log('debug', err.message + ', retry ' + options.uri); self._onContent(err, options); break;
}

@@ -364,15 +434,28 @@ return;

if(ropts.skipEventRequest !== true) {
self.emit('request',ropts);
}
//do http2.* request
if (ropts.http2) {
if (!http2) {
process.nextTick(() => {
const notSupportedHttp2Error = new Error('you are trying to use http2 API which may not be supported for your current environment or node version');
notSupportedHttp2Error.code = 'NOHTTP2SUPPORT';
self._onContent(notSupportedHttp2Error, options);
});
return;
}
self._http2request(ropts, options);
} else {
if (ropts.skipEventRequest !== true) {
self.emit('request', ropts);
}
var requestArgs = ['uri','url','qs','method','headers','body','form','formData','json','multipart','followRedirect','followAllRedirects','maxRedirects','removeRefererHeader','encoding','pool','timeout','proxy','auth','oauth','strictSSL','jar','aws','gzip','time','tunnel','proxyHeaderWhiteList','proxyHeaderExclusiveList','localAddress','forever', 'agent', 'strictSSL', 'agentOptions', 'agentClass'];
var requestArgs = ['uri', 'url', 'qs', 'method', 'headers', 'body', 'form', 'formData', 'json', 'multipart', 'followRedirect', 'followAllRedirects', 'maxRedirects', 'removeRefererHeader', 'encoding', 'pool', 'timeout', 'proxy', 'auth', 'oauth', 'strictSSL', 'jar', 'aws', 'gzip', 'time', 'tunnel', 'proxyHeaderWhiteList', 'proxyHeaderExclusiveList', 'localAddress', 'forever', 'agent', 'strictSSL', 'agentOptions', 'agentClass'];
request(_.pick.apply(self,[ropts].concat(requestArgs)), function(error,response) {
if (error) {
return self._onContent(error, options);
}
request(_.pick.apply(self, [ropts].concat(requestArgs)), function (error, response) {
if (error) {
return self._onContent(error, options);
}
self._onContent(error,options,response);
});
self._onContent(error, options, response);
});
}
};
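The `err.op` values handled in `doRequest` above ('retry', 'fail', 'abort', 'queue') let a `preRequest` hook steer the request flow; a hedged sketch (the https-only rule is invented for illustration):

```js
const c = new Crawler({
    preRequest: function (options, done) {
        if (!/^https:/.test(options.uri)) {
            const err = new Error('plain http is not allowed');
            err.op = 'abort'; // 'retry' | 'fail' | 'abort' | 'queue', as handled above
            return done(err);
        }
        done();
    },
    callback: function (error, res, done) {
        if (error) console.error(error);
        done();
    }
});
```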

@@ -387,30 +470,130 @@

Crawler.prototype._onContent = function _onContent (error, options, response) {
Crawler.prototype._buildHttp2Session = function _buildHttp2Session(targetHost) {
const self = this;
const newHttp2Connection = self.http2Connections[targetHost] = http2.connect(targetHost);
log('debug', `connect to a new ${targetHost}`);
newHttp2Connection.on('error', (err) => {
log('warn', `Http2 session error ${targetHost}, got error ${err}`);
}).on('goaway', () => {
log('debug', `Http2 session ${targetHost} connection goaway`);
}).on('connect', () => {
log('debug', `Http2 session ${targetHost} connection init`);
}).once('close', () => {
log('debug', `Http2 session ${targetHost} connection closed`);
});
};
Crawler.prototype._http2request = function _http2request(ropts, options) {
const self = this;
const targetHost = new URL(ropts.uri).origin;
ropts.headers = Object.assign(ropts.headers, self.generateHttp2RequestLine(ropts));
const requestBody = ropts.headers[':method'] === 'GET' ? null : self.generateHttp2RequestBody(ropts);
const response = {
headers: {}
};
const chunks = [];
let http2Error = null;
if (!self.http2Connections[targetHost] || self.http2Connections[targetHost].destroyed) {
self._buildHttp2Session(targetHost);
}
let req = null;
try {
req = self.http2Connections[targetHost].request(ropts.headers);
} catch (e) {
//to handle the goaway issue: goaway makes the session impossible to establish,
//but that cannot be detected at the moment the stream is initialized;
//try/catch seems to be the way to solve it
self._onContent(e, options, response);
return;
}
req.on('response', headers => {
//build the response object here
response.statusCode = headers[':status'];
response.request = {
uri: `${req.sentHeaders[':scheme']}://${req.sentHeaders[':authority']}${req.sentHeaders[':path']}`,
method: req.sentHeaders[':method'],
headers: Object.assign({}, req.sentHeaders, req.sentInfoHeaders)
};
for (const name in headers) {
response.headers[name] = headers[name];
}
});
req.on('error', (err) => {
log('debug', `Http2 stream error ${ropts.uri}, got error ${err}`);
http2Error = err;
});
req.on('data', chunk => {
chunks.push(chunk);
});
req.setTimeout(self.options.timeout);
req.on('timeout', () => {
const error = new Error('ESOCKETTIMEDOUT');
error.code = 'ESOCKETTIMEDOUT';
http2Error = error;
req.close();
});
req.once('close', () => {
if (http2Error) self._onContent(http2Error, options, response);
else {
response.body = Buffer.concat(chunks);
self._onContent(null, options, response);
}
});
req.on('end', () => {
log('debug', `${ropts.uri} stream ends`);
});
//set request body
req.end(requestBody);
};
Crawler.prototype._onContent = function _onContent(error, options, response) {
var self = this;
if (error) {
log('error','Error '+error+' when fetching '+ (options.uri||options.url)+(options.retries ? ' ('+options.retries+' retries left)' : ''));
if (options.retries) {
setTimeout(function() {
options.retries--;
self._schedule(options);
options.release();
},options.retryTimeout);
} else{
options.callback(error,{options:options},options.release);
switch (error.code) {
case 'NOHTTP2SUPPORT':
//if the environment does not support the http2 api, all requests relying on the http2 protocol
//are aborted immediately, no matter how many retries are left
log('error', 'Error ' + error + ' when fetching ' + (options.uri || options.url) + ' skip all retry times');
break;
default:
log('error', 'Error ' + error + ' when fetching ' + (options.uri || options.url) + (options.retries ? ' (' + options.retries + ' retries left)' : ''));
if (options.retries) {
setTimeout(function () {
options.retries--;
self._schedule(options);
options.release();
}, options.retryTimeout);
return;
}
break;
}
options.callback(error, { options: options }, options.release);
return;
}
if (!response.body) { response.body=''; }
if (!response.body) { response.body = ''; }
log('debug','Got '+(options.uri||'html')+' ('+response.body.length+' bytes)...');
log('debug', 'Got ' + (options.uri || 'html') + ' (' + response.body.length + ' bytes)...');
try{
self._doEncoding(options,response);
}catch(e){
log('error',e);
return options.callback(e,{options:options},options.release);
try {
self._doEncoding(options, response);
} catch (e) {
return options.callback(e, { options: options }, options.release);
}

@@ -420,13 +603,13 @@

if(options.method === 'HEAD' || !options.jQuery){
return options.callback(null,response,options.release);
if (options.method === 'HEAD' || !options.jQuery) {
return options.callback(null, response, options.release);
}
var injectableTypes = ['html','xhtml','text/xml', 'application/xml', '+xml'];
if (!options.html && !typeis(contentType(response), injectableTypes)){
log('warn','response body is not HTML, skip injecting. Set jQuery to false to suppress this message');
return options.callback(null,response,options.release);
var injectableTypes = ['html', 'xhtml', 'text/xml', 'application/xml', '+xml'];
if (!options.html && !typeis(contentType(response), injectableTypes)) {
log('warn', 'response body is not HTML, skip injecting. Set jQuery to false to suppress this message');
return options.callback(null, response, options.release);
}
log('debug','Injecting');
log('debug', 'Injecting');

@@ -436,4 +619,4 @@ self._inject(response, options, self._injected.bind(self));

Crawler.prototype._injected = function(errors, response, options, $){
log('debug','Injected');
Crawler.prototype._injected = function (errors, response, options, $) {
log('debug', 'Injected');

@@ -444,6 +627,6 @@ response.$ = $;

Crawler.prototype._doEncoding = function(options,response){
Crawler.prototype._doEncoding = function (options, response) {
var self = this;
if(options.encoding === null){
if (options.encoding === null) {
return;

@@ -455,3 +638,3 @@ }

response.charset = charset;
log('debug','Charset ' + charset);
log('debug', 'Charset ' + charset);

@@ -466,6 +649,6 @@ if (charset !== 'utf-8' && charset !== 'ascii') {// convert response.body into 'utf-8' encoded buffer

Crawler.prototype._parseCharset = function(res){
Crawler.prototype._parseCharset = function (res) {
//Browsers treat gb2312 as gbk, but iconv-lite not.
//Replace gb2312 with gbk, in order to parse the pages which say gb2312 but actually are gbk.
function getCharset(str){
function getCharset(str) {
var charset = (str && str.match(/charset=['"]?([\w.-]+)/i) || [0, null])[1];

@@ -479,7 +662,7 @@ return charset && charset.replace(/:\d{4}$|[^0-9a-z]/g, '') == 'gb2312' ? 'gbk' : charset;

var charset = charsetParser(contentType(res));
if(charset)
if (charset)
return charset;
if(!typeis(contentType(res), ['html'])){
log('debug','Charset not detected in response headers, please specify using `incomingEncoding`, use `utf-8` by default');
if (!typeis(contentType(res), ['html'])) {
log('debug', 'Charset not detected in response headers, please specify using `incomingEncoding`, use `utf-8` by default');
return 'utf-8';

@@ -489,3 +672,3 @@ }

var body = res.body instanceof Buffer ? res.body.toString() : res.body;
charset = charsetParser(contentType(res),body,'utf-8');
charset = charsetParser(contentType(res), body, 'utf-8');

@@ -495,4 +678,4 @@ return charset;

Object.defineProperty(Crawler.prototype,'queueSize',{
get:function(){
Object.defineProperty(Crawler.prototype, 'queueSize', {
get: function () {
return this.limiters.unfinishedClients;

@@ -499,0 +682,0 @@ }

package.json

{
"name": "crawler",
"version": "1.2.2",
"version": "1.3.0",
"description": "Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously",

@@ -12,2 +12,3 @@ "main": "./lib/crawler.js",

"test": "mocha --timeout=15000 tests/*.test.js",
"http2test": "mocha --timeout=15000 tests/http2*.test.js",
"cover": "nyc --reporter=lcovonly --reporter=text --reporter=text-summary mocha --timeout=15000 --reporter spec tests/*.test.js"

@@ -20,3 +21,3 @@ },

"engine-strict": {
"node": ">=4.0.0"
"node": ">=10.0.0"
},

@@ -38,4 +39,4 @@ "dependencies": {

"mocha": "^6.1.0",
"nock": "^13.0.5",
"mocha-testdata": "^1.2.0",
"nock": "^10.0.6",
"nyc": "^13.1.0",

@@ -42,0 +43,0 @@ "sinon": "^7.0.0",

README.md

@@ -249,2 +249,25 @@

## Work with Http2
Node-crawler now supports http2 requests. Proxy functionality for http2 requests is not included yet; it will be added in the future.
```js
crawler.queue({
//the unit tests work against the httpbin http2 server; it can be used for testing
uri: 'https://nghttp2.org/httpbin/status/200',
method: 'GET',
http2: true, //setting http2 to true makes an http2 request
callback: (error, response, done) => {
if(error) {
console.error(error);
return done();
}
console.log(`inside callback`);
console.log(response.body);
return done();
}
})
```
## Work with bottleneck

@@ -425,2 +448,6 @@

### Http2
* `options.http2`: [Boolean](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#Boolean_type) If true, the request will be sent via the http2 protocol (default false)
### Https socks5

@@ -427,0 +454,0 @@ ```js
