Comparing version 1.0.4 to 1.0.5
node-webcrawler ChangeLog | ||
------------------------- | ||
1.0.5 | ||
* fix missing debugging messages [#213](https://github.com/bda-research/node-crawler/issues/213) | ||
* fix bug of 'drain' never called [#210](https://github.com/bda-research/node-crawler/issues/210) | ||
1.0.4 | ||
* fix bug of charset detecting [#203](https://github.com/bda-research/node-crawler/issues/203) | ||
* keep node version update to date in travis scripts | ||
* keep node version up to date in travis scripts | ||
@@ -8,0 +12,0 @@ 1.0.3 |
@@ -109,5 +109,5 @@ | ||
self.log = log; | ||
self.on('_release', function(){ | ||
if(this.debug) | ||
log('debug',"Queue size: %d",this.queueSize); | ||
log('debug',"Queue size: %d",this.queueSize); | ||
@@ -344,3 +344,3 @@ if(this.limiters.empty) | ||
if (!options.html && !typeis(contentType(response), ['html','xhtml'])){ | ||
log("warn","response body is not HTML, skip injecting"); | ||
log("warn","response body is not HTML, skip injecting. Set jQuery to false to suppress this message"); | ||
return options.callback(null,response,options.release); | ||
@@ -355,4 +355,3 @@ } | ||
Crawler.prototype._injected = function(errors, response, options, $){ | ||
if(this.debug) | ||
log("debug","Injected") | ||
log("debug","Injected") | ||
@@ -359,0 +358,0 @@ response.$ = $; |
{ | ||
"name": "crawler", | ||
"version": "1.0.4", | ||
"version": "1.0.5", | ||
"description": "Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously", | ||
@@ -20,3 +20,3 @@ "main": "./lib/crawler.js", | ||
"dependencies": { | ||
"bottleneckp": "~1.1.0", | ||
"bottleneckp": "~1.1.2", | ||
"charset-parser": "^0.2.0", | ||
@@ -23,0 +23,0 @@ "cheerio": "^0.22.0", |
142
README.md
@@ -7,5 +7,6 @@ | ||
[![Dependency Status](https://david-dm.org/bda-research/node-crawler/status.svg)](https://david-dm.org/bda-research/node-crawler) | ||
[![Gitter](https://img.shields.io/badge/gitter-join_chat-blue.svg?style=flat-square)](https://gitter.im/node-crawler/discuss?utm_source=badge) | ||
Most powerful crawling/scraping package for Node, happy hacking :). Now we are looking for a logo design, which need your help! | ||
Most powerful, popular and production crawling/scraping package for Node, happy hacking :). Now we are looking for a logo design, which needs your help! | ||
@@ -23,8 +24,10 @@ Features: | ||
# How to install | ||
# Get started | ||
## How to install | ||
$ npm install crawler | ||
# Crash course | ||
## Usage | ||
@@ -34,3 +37,2 @@ | ||
var Crawler = require("crawler"); | ||
var url = require('url'); | ||
@@ -81,52 +83,20 @@ var c = new Crawler({ | ||
# Work with `bottleneck` | ||
## Slow down | ||
Use `rateLimit` to slow down when you are visiting web sites. | ||
Control rate limit with a limiter. All tasks submitted to a limiter will abide by the `rateLimit` and `maxConnections` restrictions of the limiter. `rateLimit` is the minimum time gap between two tasks. `maxConnections` is the maximum number of tasks that can be running at the same time. Limiters are independent of each other. One common use case is setting different limiters for different proxies. One thing worth noticing: when `rateLimit` is set to a non-zero value, `maxConnections` will be forced to 1. | ||
```javascript | ||
var crawler = require('crawler'); | ||
var crawler = require("crawler"); | ||
var c = new Crawler({ | ||
rateLimit: 2000, | ||
maxConnections: 1, | ||
callback: function(error, res, done) { | ||
if(error) { | ||
console.log(error) | ||
} else { | ||
var $ = res.$; | ||
console.log($('title').text()) | ||
} | ||
rateLimit: 1000, // `maxConnections` will be forced to 1 | ||
callback: function(err, res, done){ | ||
console.log(res.$("title").text()); | ||
done(); | ||
} | ||
}) | ||
}); | ||
// if you want to crawl some website with 2000ms gap between requests | ||
c.queue('http://www.somewebsite.com/page/1') | ||
c.queue('http://www.somewebsite.com/page/2') | ||
c.queue('http://www.somewebsite.com/page/3') | ||
// if you want to crawl some website using proxy with 2000ms gap between requests for each proxy | ||
c.queue({ | ||
uri:'http://www.somewebsite.com/page/1', | ||
limiter:'proxy_1', | ||
proxy:'proxy_1' | ||
}) | ||
c.queue({ | ||
uri:'http://www.somewebsite.com/page/2', | ||
limiter:'proxy_2', | ||
proxy:'proxy_2' | ||
}) | ||
c.queue({ | ||
uri:'http://www.somewebsite.com/page/3', | ||
limiter:'proxy_3', | ||
proxy:'proxy_3' | ||
}) | ||
c.queue({ | ||
uri:'http://www.somewebsite.com/page/4', | ||
limiter:'proxy_1', | ||
proxy:'proxy_1' | ||
}) | ||
c.queue(tasks);//between two tasks, minimum time gap is 1000 (ms) | ||
``` | ||
# Options reference | ||
## Options reference | ||
@@ -193,7 +163,6 @@ | ||
# Class:Crawler | ||
## Class:Crawler | ||
## Event: 'schedule' | ||
### Event: 'schedule' | ||
* `options` [Options](#options-reference) | ||
@@ -209,3 +178,3 @@ | ||
## Event: 'limiterChange' | ||
### Event: 'limiterChange' | ||
* `options` [Options](#options-reference) | ||
@@ -216,3 +185,3 @@ * `limiter` [String](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#String_type) | ||
## Event: 'request' | ||
### Event: 'request' | ||
* `options` [Options](#options-reference) | ||
@@ -230,3 +199,3 @@ | ||
## Event: 'drain' | ||
### Event: 'drain' | ||
@@ -242,3 +211,3 @@ Emitted when queue is empty. | ||
## crawler.queue(uri|options) | ||
### crawler.queue(uri|options) | ||
* `uri` [String](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#String_type) | ||
@@ -249,3 +218,3 @@ * `options` [Options](#options-reference) | ||
## crawler.queueSize | ||
### crawler.queueSize | ||
* [Number](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#Number_type) | ||
@@ -255,9 +224,59 @@ | ||
# Working with Cheerio or JSDOM | ||
## Work with bottleneck | ||
Control rate limit with a limiter. All tasks submitted to a limiter will abide by the `rateLimit` and `maxConnections` restrictions of the limiter. `rateLimit` is the minimum time gap between two tasks. `maxConnections` is the maximum number of tasks that can be running at the same time. Limiters are independent of each other. One common use case is setting different limiters for different proxies. One thing worth noticing: when `rateLimit` is set to a non-zero value, `maxConnections` will be forced to 1. | ||
```javascript | ||
var crawler = require('crawler'); | ||
var c = new Crawler({ | ||
rateLimit: 2000, | ||
maxConnections: 1, | ||
callback: function(error, res, done) { | ||
if(error) { | ||
console.log(error) | ||
} else { | ||
var $ = res.$; | ||
console.log($('title').text()) | ||
} | ||
done(); | ||
} | ||
}) | ||
// if you want to crawl some website with 2000ms gap between requests | ||
c.queue('http://www.somewebsite.com/page/1') | ||
c.queue('http://www.somewebsite.com/page/2') | ||
c.queue('http://www.somewebsite.com/page/3') | ||
// if you want to crawl some website using proxy with 2000ms gap between requests for each proxy | ||
c.queue({ | ||
uri:'http://www.somewebsite.com/page/1', | ||
limiter:'proxy_1', | ||
proxy:'proxy_1' | ||
}) | ||
c.queue({ | ||
uri:'http://www.somewebsite.com/page/2', | ||
limiter:'proxy_2', | ||
proxy:'proxy_2' | ||
}) | ||
c.queue({ | ||
uri:'http://www.somewebsite.com/page/3', | ||
limiter:'proxy_3', | ||
proxy:'proxy_3' | ||
}) | ||
c.queue({ | ||
uri:'http://www.somewebsite.com/page/4', | ||
limiter:'proxy_1', | ||
proxy:'proxy_1' | ||
}) | ||
``` | ||
## Work with Cheerio or JSDOM | ||
Crawler by default uses [Cheerio](https://github.com/cheeriojs/cheerio) instead of [JSDOM](https://github.com/tmpvar/jsdom). JSDOM is more robust; if you want to use JSDOM you will have to require it (`require('jsdom')`) in your own script before passing it to crawler. | ||
## Working with Cheerio | ||
### Working with Cheerio | ||
```javascript | ||
@@ -290,3 +309,3 @@ jQuery: true //(default) | ||
## Working with JSDOM | ||
### Work with JSDOM | ||
@@ -306,4 +325,2 @@ In order to work with JSDOM you will have to install it in your project folder `npm install jsdom`, and pass it to crawler. | ||
## Install and run Httpbin | ||
@@ -342,5 +359,4 @@ | ||
* Make Sizzle tests pass (JSDOM bug? https://github.com/tmpvar/jsdom/issues#issue/81) | ||
# ChangeLog | ||
See [CHANGELOG](https://github.com/bda-research/node-crawler/blob/master/CHANGELOG.md) | ||
* Promise support | ||
* Commander support | ||
* Middleware support |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
249558
28
349
2002
Updatedbottleneckp@~1.1.2