Comparing version 1.0.2 to 1.0.3
@@ -206,3 +206,3 @@
     // If duplicate skipping is enabled, avoid queueing entirely for URLs we already crawled
-    if (options.skipDuplicates && self.seen.exists(options)) {
+    if (self.options.skipDuplicates && self.seen.exists(options)) {
        return
@@ -277,3 +277,3 @@ }
    if (ropts.userAgent) {
-        if(ropts.rotateUA && _.isArray(ropts.userAgent)){
+        if(self.options.rotateUA && _.isArray(ropts.userAgent)){
            ropts.headers['User-Agent'] = ropts.userAgent[0];
@@ -280,0 +280,0 @@ // If "rotateUA" is true, rotate User-Agent
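Both hunks make the same kind of fix: `skipDuplicates` and `rotateUA` are now read from `self.options`, the crawler-wide configuration, instead of the per-task options object. A minimal sketch of a crawler-level configuration that exercises both corrected options (URLs and User-Agent strings are placeholders):

```javascript
var Crawler = require("crawler");

var c = new Crawler({
    skipDuplicates: true, // read from self.options after this change
    rotateUA: true,       // likewise; expects userAgent to be an array
    userAgent: ["agent-a/1.0", "agent-b/1.0"], // placeholder UA strings
    callback: function (error, res, done) {
        if (error) {
            console.error(error);
        } else {
            console.log(res.statusCode);
        }
        done();
    }
});

c.queue("http://example.com/page"); // crawled once
c.queue("http://example.com/page"); // skipped: already seen when skipDuplicates is on
```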
 {
   "name": "crawler",
-  "version": "1.0.2",
+  "version": "1.0.3",
   "description": "Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously",
@@ -5,0 +5,0 @@ "main": "./lib/crawler.js",
@@ -80,51 +80,47 @@
-Control rate limit for with limiter. All tasks submit to a limiter will abide the `rateLimit` and `maxConnections` restrictions of the limiter. `rateLimit` is the minimum time gap between two tasks. `maxConnections` is the maximum number of tasks that can be running at the same time. Limiters are independent of each other. One common use case is setting different limiters for different proxies.
+Control rate limit for with limiter. All tasks submit to a limiter will abide the `rateLimit` and `maxConnections` restrictions of the limiter. `rateLimit` is the minimum time gap between two tasks. `maxConnections` is the maximum number of tasks that can be running at the same time. Limiters are independent of each other. One common use case is setting different limiters for different proxies. One thing is worth noticing, when `rateLimit` is set to a non-zero value, `maxConnections` will be forced to 1.
+To help you better understand `maxConnections`, here's an example. Say we have 10 tasks to do, `rateLimit` is set to 2000 ms, `maxConnections` is set to 3 and each task takes 10000 ms to finish. What happens will be as follows:
+```
+00'----start doing task1
+02'----start doing task2
+04'----start doing task3
+10'----task1 done, start doing task4
+12'----task2 done, start doing task5
+...
+```
+Below is an example:
 ```javascript
-var Crawler = require("crawler");
+var crawler = require('crawler');
 var c = new Crawler({
-    maxConnections : 1,
-    rateLimit:2000,
-    callback : function (error, res, done) {
-        if(error){
-            console.error(error);
-        }else{
+    rateLimit: 2000,
+    maxConnections: 1,
+    callback: function(error, res, done) {
+        if(error) {
+            console.log(error)
+        } else {
             var $ = res.$;
-            console.log($('title').text());
+            console.log($('title').text())
         }
         done();
     }
-});
+})
-c.queue({
-    uri:"http://www.google.com",
-    limiter:"key1",// for connection of 'key1'
-    proxy:"http://user:pass@127.0.0.1:8080"
-});
-c.queue({
-    uri:"http://www.google.com",
-    limiter:"key2", // for connection of 'key2'
-    proxy:"http://user:pass@127.0.0.1:8082"
-});
-c.queue({
-    uri:"http://www.google.com",
-    limiter:"key3", // for connection of 'key3'
-    proxy:"http://user:pass@127.0.0.1:8081"
-});
+// if you want to crawl some website with 2000ms gap between requests
+c.queue('http://www.somewebsite.com/page/1')
+c.queue('http://www.somewebsite.com/page/2')
+c.queue('http://www.somewebsite.com/page/3')
+// if you want to crawl some website using proxy with 2000ms gap between requests for each proxy
+c.queue({
+    uri:'http://www.somewebsite.com/page/1',
+    limiter:'proxy_1',
+    proxy:'proxy_1'
+})
+c.queue({
+    uri:'http://www.somewebsite.com/page/2',
+    limiter:'proxy_2',
+    proxy:'proxy_2'
+})
+c.queue({
+    uri:'http://www.somewebsite.com/page/3',
+    limiter:'proxy_3',
+    proxy:'proxy_3'
+})
+c.queue({
+    uri:'http://www.somewebsite.com/page/4',
+    limiter:'proxy_1',
+    proxy:'proxy_1'
+})
 ```
@@ -177,3 +173,3 @@
-* `jQuery`: [Boolean](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#Boolean_type)|[String](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#String_type)|[Object](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Object) Use `cheerio` with default configrations to inject document if true or "cheerio". Or use customized `cheerio` if an object with [Parser options](https://github.com/fb55/htmlparser2/wiki/Parser-options). Disable injecting jQuery selector if false. If you have memory leak issue in your project, use "whacko", an alternative parser,to avoid that. (Default true)
+* `jQuery`: [Boolean](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#Boolean_type)|[String](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#String_type)|[Object](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Object) Use `cheerio` with default configurations to inject document if true or "cheerio". Or use customized `cheerio` if an object with [Parser options](https://github.com/fb55/htmlparser2/wiki/Parser-options). Disable injecting jQuery selector if false. If you have memory leak issue in your project, use "whacko", an alternative parser,to avoid that. (Default true)
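A short sketch of the accepted forms described above; the `name`/`options` object shape and the specific parser options are taken from cheerio/htmlparser2 conventions and should be treated as assumptions:

```javascript
var Crawler = require("crawler");

// true (or "cheerio"): inject cheerio with its default configuration
var withDefaults = new Crawler({ jQuery: true });

// "whacko": alternative parser, an option if cheerio causes memory leaks
var withWhacko = new Crawler({ jQuery: "whacko" });

// object: customized cheerio; assumed shape passing htmlparser2 parser options
var withCustomCheerio = new Crawler({
    jQuery: {
        name: "cheerio",
        options: {
            normalizeWhitespace: true,
            xmlMode: true
        }
    }
});

// false: disable injection; res.$ will not be populated in the callback
var withoutInjection = new Crawler({ jQuery: false });
```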
@@ -244,3 +240,3 @@ Charset encoding:
-Enqueue a task and wait for it to be excuted.
+Enqueue a task and wait for it to be executed.
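For illustration, assuming the `c` crawler instance from the example above and a placeholder URL, a task can be enqueued either as a bare URI string or as an options object:

```javascript
// string form: enqueue a single URI
c.queue("http://example.com/");

// options-object form: per-task settings override the constructor defaults
c.queue({
    uri: "http://example.com/",
    jQuery: false // e.g. skip DOM injection for this task only
});
```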
@@ -256,4 +252,4 @@ ## crawler.queueSize
-Crawler by default use [Cheerio](https://github.com/cheeriojs/cheerio) instead of [Jsdom](https://github.com/tmpvar/jsdom). Jsdom is more robust but can be hard to install (espacially on windows) because of [contextify](https://github.com/tmpvar/jsdom#contextify).
-Which is why, if you want to use jsdom you will have to build it, and `require('jsdom')` in your own script before passing it to crawler. This is to avoid cheerio crawler user to build jsdom when installing crawler.
+Crawler by default use [Cheerio](https://github.com/cheeriojs/cheerio) instead of [JSDOM](https://github.com/tmpvar/jsdom). JSDOM is more robust but can be hard to install (especially on windows) because of [contextify](https://github.com/tmpvar/jsdom#contextify).
+Which is why, if you want to use JSDOM you will have to build it, and `require('jsdom')` in your own script before passing it to crawler. This is to avoid cheerio crawler user to build JSDOM when installing crawler.
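A sketch of that pattern, assuming jsdom has already been installed and built separately; handing the required module to the `jQuery` option follows the project's examples, but treat the exact wiring as an assumption:

```javascript
// jsdom is not a dependency of crawler: install and build it yourself,
// require it in your own script, then pass the module to the crawler.
var jsdom = require("jsdom");
var Crawler = require("crawler");

var c = new Crawler({
    jQuery: jsdom, // use jsdom instead of the default cheerio
    callback: function (error, res, done) {
        if (error) {
            console.error(error);
        } else {
            var $ = res.$; // selector backed by jsdom
            console.log($("title").text());
        }
        done();
    }
});

c.queue("http://example.com/");
```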
@@ -336,4 +332,4 @@ ## Working with Cheerio | ||
* Introducing zombie to deal with page with complex ajax | ||
* Refactoring the code to be more maintenable | ||
* Make Sizzle tests pass (jsdom bug? https://github.com/tmpvar/jsdom/issues#issue/81) | ||
* Refactoring the code to be more maintainable | ||
* Make Sizzle tests pass (JSDOM bug? https://github.com/tmpvar/jsdom/issues#issue/81) | ||
@@ -340,0 +336,0 @@ # ChangeLog |
New author (supply chain risk)
A new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but they do indicate a change to the security surface area of a package.
Found 1 instance in 1 package.