Comparing version 2.0.0-beta.4 to 2.0.0-beta.5
@@ -11,2 +11,3 @@ import { EventEmitter } from "events"; | ||
import { Logger } from "tslog"; | ||
// @todo: remove seenreq dependency | ||
process.env.NODE_ENV = process.env.NODE_ENV ?? process.argv[2]; | ||
@@ -285,2 +286,5 @@ // process.env.NODE_ENV = "debug"; | ||
this.options = { ...defaultOptions, ...options }; | ||
if (this.options.rateLimit > 0) { | ||
this.options.maxConnections = 1; | ||
} | ||
this.globalOnlyOptions = [ | ||
@@ -287,0 +291,0 @@ "maxConnections", |
{ | ||
"name": "crawler", | ||
"version": "2.0.0-beta.4", | ||
"version": "2.0.0-beta.5", | ||
"description": "New TypeScript Crawler Test version", | ||
@@ -5,0 +5,0 @@ "repository": { |
184
README.md
@@ -9,3 +9,3 @@ <p align="center"> | ||
[![npm package](https://nodei.co/npm/crawler.png?downloads=true&downloadRank=true&stars=true)](https://www.npmjs.com/package/crawler/v/2.0.0-beta.3) | ||
[![npm package](https://nodei.co/npm/crawler.png?downloads=true&downloadRank=true&stars=true)](https://www.npmjs.com/package/crawler/v/2.0.0-beta.5) | ||
@@ -30,56 +30,4 @@ [![NPM download][download-image]][download-url] | ||
If you have prior experience with Crawler v1, for fast migration, please proceed to the section [Differences and Breaking Changes](#differences-and-breaking-changes). | ||
# Table of Contents | ||
- [Table of Contents](#table-of-contents) | ||
- [Quick start](#quick-start) | ||
- [Install](#install) | ||
- [Usage](#usage) | ||
- [Direct request](#direct-request) | ||
- [Execute asynchronously via custom options](#execute-asynchronously-via-custom-options) | ||
- [Slow down](#slow-down) | ||
- [Custom parameters](#custom-parameters) | ||
- [Raw body](#raw-body) | ||
- [preRequest](#prerequest) | ||
- [Advanced](#advanced) | ||
- [Work with Http2](#work-with-http2) | ||
- [Work with rateLimiters](#work-with-ratelimiters) | ||
- [Class: Crawler](#class-crawler) | ||
- [Event: 'schedule'](#event-schedule) | ||
- [Event: 'limiterChange'](#event-limiterchange) | ||
- [Event: 'request'](#event-request) | ||
- [Event: 'drain'](#event-drain) | ||
- [crawler.add(url|options)](#crawleraddurloptions) | ||
- [crawler.queueSize](#crawlerqueuesize) | ||
- [Options](#options) | ||
- [Global only options](#global-only-options) | ||
- [`maxConnections`](#maxconnections) | ||
- [`priorityLevels`](#prioritylevels) | ||
- [`rateLimit`](#ratelimit) | ||
- [`skipDuplicates`](#skipduplicates) | ||
- [`homogeneous`](#homogeneous) | ||
- [`userAgents`](#useragents) | ||
- [Crawler General options](#crawler-general-options) | ||
- [`url | method | headers | body | searchParams...`](#url--method--headers--body--searchparams) | ||
- [`forceUTF8`](#forceutf8) | ||
- [`jQuery`](#jquery) | ||
- [`encoding`](#encoding) | ||
- [`rateLimiterId`](#ratelimiterid) | ||
- [`retries`](#retries) | ||
- [`retryInterval`](#retryinterval) | ||
- [`timeout`](#timeout) | ||
- [`priority`](#priority) | ||
- [`skipEventRequest`](#skipeventrequest) | ||
- [`html`](#html) | ||
- [`proxies`](#proxies) | ||
- [`proxy`](#proxy) | ||
- [`http2`](#http2) | ||
- [`referer`](#referer) | ||
- [`userParams`](#userparams) | ||
- [`preRequest`](#prerequest-1) | ||
- [`Callback`](#callback) | ||
- [Work with Cheerio](#work-with-cheerio) | ||
- [How to test](#how-to-test) | ||
# Quick start | ||
@@ -104,3 +52,3 @@ | ||
```js | ||
import Crawler from "../node-crawler/dist/index.js"; | ||
import Crawler from "crawler"; | ||
@@ -180,3 +128,3 @@ const crawler = new Crawler(); | ||
please refer to [options](#options-reference) for detail. | ||
please refer to [options](#options) for detail. | ||
@@ -286,4 +234,54 @@ ## Slow down | ||
# Advanced | ||
# Table | ||
- [Content](#content) | ||
- [Work with Http2](#work-with-http2) | ||
- [Work with rateLimiters](#work-with-ratelimiters) | ||
- [Class: Crawler](#class-crawler) | ||
- [Event: 'schedule'](#event-schedule) | ||
- [Event: 'limiterChange'](#event-limiterchange) | ||
- [Event: 'request'](#event-request) | ||
- [Event: 'drain'](#event-drain) | ||
- [crawler.add(url|options)](#crawleraddurloptions) | ||
- [crawler.queueSize](#crawlerqueuesize) | ||
- [Options](#options) | ||
- [Global only options](#global-only-options) | ||
- [`maxConnections`](#maxconnections) | ||
- [`priorityLevels`](#prioritylevels) | ||
- [`rateLimit`](#ratelimit) | ||
- [`skipDuplicates`](#skipduplicates) | ||
- [`homogeneous`](#homogeneous) | ||
- [`userAgents`](#useragents) | ||
- [Crawler General options](#crawler-general-options) | ||
- [`url | method | headers | body | searchParams...`](#url--method--headers--body--searchparams) | ||
- [`forceUTF8`](#forceutf8) | ||
- [`jQuery`](#jquery) | ||
- [`encoding`](#encoding) | ||
- [`rateLimiterId`](#ratelimiterid) | ||
- [`retries`](#retries) | ||
- [`retryInterval`](#retryinterval) | ||
- [`timeout`](#timeout) | ||
- [`priority`](#priority) | ||
- [`skipEventRequest`](#skipeventrequest) | ||
- [`html`](#html) | ||
- [`proxies`](#proxies) | ||
- [`proxy`](#proxy) | ||
- [`http2`](#http2) | ||
- [`referer`](#referer) | ||
- [`userParams`](#userparams) | ||
- [`preRequest`](#prerequest-1) | ||
- [`Callback`](#callback) | ||
- [Work with Cheerio](#work-with-cheerio) | ||
- [Differences and Breaking Changes](#differences-and-breaking-changes) | ||
- [renaming](#renaming) | ||
- [Crawler Options](#crawler-options) | ||
- [Origin Request Options](#origin-request-options) | ||
- [Behavior Changes](#behavior-changes) | ||
- [How to test](#how-to-test) | ||
# Content | ||
## Work with Http2 | ||
@@ -610,18 +608,66 @@ | ||
- `error`: [Error](https://nodejs.org/api/errors.html) caught by the crawler | ||
- `response` : [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) A response of standard IncomingMessage includes `$` and `options` | ||
- `res.options`: [Options](#options-reference) of this task | ||
- `res.$`: [jQuery Selector](https://api.jquery.com/category/selectors/) A selector for html or xml document. | ||
- `res.statusCode`: [Number](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#Number_type) HTTP status code. E.G.`200` | ||
- `res.body`: [Buffer](https://nodejs.org/api/buffer.html) | [String](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#String_type) HTTP response content which could be a html page, plain text or xml document e.g. | ||
- `res.headers`: [Object](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Object) HTTP response headers | ||
- `res.request`: [Request](https://github.com/request/request) An instance of Mikeal's `Request` instead of [http.ClientRequest](https://nodejs.org/api/http.html#http_class_http_clientrequest) | ||
- `res.request.url`: [urlObject](https://nodejs.org/api/url.html#url_url_strings_and_url_objects) HTTP request entity of parsed url | ||
- `res.request.method`: [String](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#String_type) HTTP request method. E.G. `GET` | ||
- `res.request.headers`: [Object](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Object) HTTP request headers | ||
- `response` : A standard IncomingMessage response that also includes `$` and `options` | ||
- `response.options`: [Options](#options) of this task | ||
- `response.$`: [jQuery Selector](https://api.jquery.com/category/selectors/) A selector for html or xml document. | ||
- `response.statusCode`: `Number` HTTP status code. E.G.`200` | ||
- `response.body`: `Buffer` | `String` | `JSON` HTTP response content, e.g. an HTML page, plain text or an XML document. | ||
- `response.headers`: HTTP response headers | ||
- `done` : The function must be called when you've done your work in callback. This is the only way to tell the crawler that the task is finished. | ||
## Work with Cheerio | ||
Crawler by default use [Cheerio](https://github.com/cheeriojs/cheerio). We are temporarily no longer supporting jsdom for certain reasons. | ||
Crawler uses [Cheerio](https://github.com/cheeriojs/cheerio) by default. We are temporarily no longer supporting jsdom for certain reasons; support may return later. | ||
# Differences and Breaking Changes | ||
## renaming | ||
*Options listed here are renamed, but most of the old ones are still supported for backward compatibility.* | ||
### Crawler Options | ||
`options.priorityRange` → `options.priorityLevels` | ||
`options.uri` → `options.url` | ||
`options.json` → `options.isJson` (Boolean. The "json" option now works completely differently in Got.) | ||
`options.limiter` → `options.rateLimiterId` | ||
`options.retryTimeout` → `options.retryInterval` | ||
`crawler.direct` → `crawler.send` | ||
`crawler.queue` → `crawler.add` | ||
`crawler.setLimiterProperty` → `crawler.setLimiter` | ||
### Origin Request Options | ||
*Since we have switched from `request` to `got`, the following option names have been updated accordingly.* | ||
`incomingEncoding` → `encoding` | ||
`qs` → `searchParams` | ||
`strictSSL` → `rejectUnauthorized` | ||
`gzip` → `decompress` | ||
`jar` → `cookieJar` (accepts `tough-cookie` jar) | ||
`jsonReviver` → `parseJson` | ||
`jsonReplacer` → `stringifyJson` | ||
## Behavior Changes | ||
- default retries: 3 => 2 | ||
**Some practices that were acceptable and often used in version 1 but not in version 2:** | ||
- use "jquery/JQuery/..." => **Only "jQuery" will be accepted.** | ||
- use "body" as the POST form => **Please use "form" instead. For more, see [got options](https://github.com/sindresorhus/got/blob/main/documentation/2-options.md) .** | ||
- add custom options on request options => **Not allowed. Only options.userParams could pass through the response.** | ||
- We are temporarily no longer supporting jsdom for certain reasons. | ||
# How to test | ||
@@ -628,0 +674,0 @@ |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
105046
1326
674