Comparing version 5.0.0 to 5.0.1
package.json
{
  "name": "x-crawl",
-  "version": "5.0.0",
+  "version": "5.0.1",
  "author": "coderHXL",
@@ -5,0 +5,0 @@ "description": "x-crawl is a flexible nodejs crawler library.",
README.md
@@ -45,3 +45,4 @@ # x-crawl [![npm](https://img.shields.io/npm/v/x-crawl.svg)](https://www.npmjs.com/package/x-crawl) [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/coder-hxl/x-crawl/blob/main/LICENSE)
- [Priority queue](#Priority-queue)
-- [About the results](#About the results)
+- [About results](#About-results)
- [TypeScript](#TypeScript)
- [API](#API)
@@ -68,27 +69,30 @@ - [xCrawl](#xCrawl)
- [API Config](#API-Config)
-- [IntervalTime](#IntervalTime)
-- [Method](#Method)
-- [PageRequestConfigCookies](#PageRequestConfigCookies)
-- [PageRequestConfig](#PageRequestConfig)
-- [DataRequestConfig](#DataRequestConfig)
-- [FileRequestConfig](#FileRequestConfig)
-- [CrawlPageConfigObject](#CrawlPageConfigObject)
-- [CrawlDataConfigObject](#CrawlDataConfigObject)
-- [CrawlFileConfigObject](#CrawlFileConfigObject)
-- [XCrawlBaseConfig](#XCrawlBaseConfig)
-- [CrawlPageConfig](#CrawlPageConfig)
-- [CrawlDataConfig](#CrawlDataConfig)
-- [CrawlFileConfig](#CrawlFileConfig)
-- [StartPollingConfig](#StartPollingConfig)
-- [API Result](#API-Result)
-- [XCrawlInstance](#XCrawlInstance)
-- [CrawlCommonRes](#CrawlCommonRes)
-- [CrawlPageSingleRes](#CrawlPageSingleRes)
-- [CrawlDataSingleRes](#CrawlDataSingleRes)
-- [CrawlFileSingleRes](#CrawlFileSingleRes)
-- [CrawlPageRes](#CrawlPageRes)
-- [CrawlDataRes](#CrawlDataRes)
-- [CrawlFileRes](#CrawlFileRes)
-- [API Other](#API-Other)
-- [AnyObject](#AnyObject)
+- [API Config Other](#API-Config-Other)
+- [IntervalTime](#IntervalTime)
+- [Method](#Method)
+- [PageRequestConfigCookies](#PageRequestConfigCookies)
+- [API Config Request](#API-Config-Request)
+- [PageRequestConfig](#PageRequestConfig)
+- [DataRequestConfig](#DataRequestConfig)
+- [FileRequestConfig](#FileRequestConfig)
+- [API Config Crawl](#API-Config-Crawl)
+- [XCrawlBaseConfig](#XCrawlBaseConfig)
+- [CrawlPageConfigObject](#CrawlPageConfigObject)
+- [CrawlDataConfigObject](#CrawlDataConfigObject)
+- [CrawlFileConfigObject](#CrawlFileConfigObject)
+- [CrawlPageConfig](#CrawlPageConfig)
+- [CrawlDataConfig](#CrawlDataConfig)
+- [CrawlFileConfig](#CrawlFileConfig)
+- [StartPollingConfig](#StartPollingConfig)
+- [API Result](#API-Result)
+- [XCrawlInstance](#XCrawlInstance)
+- [CrawlCommonRes](#CrawlCommonRes)
+- [CrawlPageSingleRes](#CrawlPageSingleRes)
+- [CrawlDataSingleRes](#CrawlDataSingleRes)
+- [CrawlFileSingleRes](#CrawlFileSingleRes)
+- [CrawlPageRes](#CrawlPageRes)
+- [CrawlDataRes](#CrawlDataRes)
+- [CrawlFileRes](#CrawlFileRes)
+- [API Other](#API-Other)
+- [AnyObject](#AnyObject)
- [More](#More)
@@ -106,3 +110,3 @@
-Timing capture: Take the automatic capture of the cover image of Airbnb Plus listings every day as an example:
+Take automatically crawling some pictures of Airbnb Hawaii experiences and Plus listings every day as an example:
@@ -123,19 +127,30 @@ ```js
  // Call crawlPage API to crawl Page
-  const res = await myXCrawl.crawlPage('https://zh.airbnb.com/s/*/plus_homes')
-  const { page } = res.data
+  const res = await myXCrawl.crawlPage([
+    'https://zh.airbnb.com/s/hawaii/experiences',
+    'https://zh.airbnb.com/s/hawaii/plus_homes'
+  ])
-  // set request configuration
-  const plusBoxHandle = await page.$('.a1stauiv')
-  const requestConfigs = await plusBoxHandle!.$$eval(
-    'picture img',
-    (imgEls) => {
+  // Store the image URL
+  const imgUrls: string[] = []
+  const elSelectorMap = ['.c14whb16', '.a1stauiv']
+  for (const item of res) {
+    const { id } = item
+    const { page } = item.data
+    // Gets the URL of the page's wheel image element
+    const boxHandle = await page.$(elSelectorMap[id - 1])
+    const urls = await boxHandle!.$$eval('picture img', (imgEls) => {
      return imgEls.map((item) => item.src)
-    }
-  )
+    })
+    imgUrls.push(...urls)
+    // Close page
+    page.close()
+  }
  // Call the crawlFile API to crawl pictures
-  myXCrawl.crawlFile({ requestConfigs, fileConfig: { storeDir: './upload' } })
-  // Close page
-  page.close()
+  myXCrawl.crawlFile({
+    requestConfigs: imgUrls,
+    fileConfig: { storeDir: './upload' }
+  })
})
@@ -329,7 +344,7 @@ ```
-- Examples of crawler applications
-- Spider API
-- request configuration
+- Crawler application instance (global)
+- Crawler API (local)
+- Request configuration (separate)
-The priority is: request config > API config > base config
+The priority is: request config > API config > application config
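For orientation, a minimal sketch of the three levels using `maxRetry` as the overridden option; the base and request-level fields appear elsewhere in this diff, while the call-level `maxRetry`/`requestConfigs` on the object form of `crawlData` is an assumption here (mirroring `CrawlPageConfigObject`):

```ts
import xCrawl from 'x-crawl'

// Application (base) config: the default for every crawl made with this instance
const myXCrawl = xCrawl({ maxRetry: 1, intervalTime: { max: 3000, min: 1000 } })

myXCrawl
  .crawlData({
    // API config (assumed call-level option): overrides the base config for this call
    maxRetry: 3,
    requestConfigs: [
      'https://xxx.com/xxxx',
      // Request config: overrides both levels for this single request
      { url: 'https://xxx.com/xxxx', maxRetry: 5 }
    ]
  })
  .then((res) => {})
```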
@@ -364,3 +379,3 @@ ### Interval time
-Failed retries can be re-requested when timeouts and the like.
+Failed retry: in the event of an error such as a timeout, the request will wait for the round to end and then retry.
@@ -370,5 +385,3 @@ ```js
-const myXCrawl = xCrawl({
-  intervalTime: { max: 3000, min: 1000 }
-})
+const myXCrawl = xCrawl()
@@ -387,5 +400,3 @@ myXCrawl.crawlData({ url: 'https://xxx.com/xxxx', maxRetry: 1 }).then((res) => {})
-const myXCrawl = xCrawl({
-  intervalTime: { max: 3000, min: 1000 }
-})
+const myXCrawl = xCrawl()
@@ -403,6 +414,16 @@ myXCrawl
-### About the results
+### About results
+The result of each request is uniformly wrapped in an object that provides information about that request, such as: id, result data, whether it succeeded, the maximum retry, the number of retries, collected error information, and so on. Whether the return value is wrapped in an array is determined automatically by the configuration you choose, and the type fits perfectly in TS.
+The id of each object is determined by the order of the requests in your configuration; if a priority is used, the requests are sorted by priority instead.
+Details about configuration methods and results are as follows: [crawlPage config](#config), [crawlData config](#config-1), [crawlFile config](#config-2).
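As a rough sketch of the wrapper described above (only `id` and `data` are read here, since those appear in this diff's main example; the success flag, retry counts, and collected errors live on the same object, see `CrawlCommonRes`):

```ts
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

// An array config resolves to an array of wrapped results, ordered by id
// (the request order, or the priority order when priorities are set)
myXCrawl.crawlData(['https://xxx.com/xxxx', 'https://xxx.com/xxxx']).then((res) => {
  res.forEach((item) => {
    // item also carries success/retry/error details described by CrawlCommonRes
    console.log(item.id, item.data)
  })
})
```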
### TypeScript
Type systems like TypeScript can detect many common errors at compile time through static analysis. This reduces runtime errors and gives us more confidence when refactoring large projects. TypeScript also improves the development experience and efficiency through type-based auto-completion in the IDE.
x-crawl itself is written in TypeScript and supports TypeScript. It comes with a type declaration file and works out of the box.
## API
@@ -492,2 +513,4 @@
+The res you get will be an object.
**2. PageRequestConfig**
@@ -513,2 +536,4 @@
+The res you get will be an object.
**3. (string | PageRequestConfig)[]**
@@ -530,2 +555,4 @@
+The res you get will be an array of objects.
**4. CrawlPageConfigObject**
@@ -553,4 +580,6 @@
-It can be selected according to the actual situation.
+The res you get will be an array of objects.
+More information about the results can be found in [About results](#About-results); choose the form that fits your actual situation.
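A hedged sketch of how the four config forms map to result shapes (it assumes `PageRequestConfig` takes a `url` field, as `DataRequestConfig` does):

```ts
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

// 1/2: a string or a single PageRequestConfig resolves to one result object
myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => {
  const { page } = res.data
  page.close()
})

// 3/4: an array (or the CrawlPageConfigObject form with requestConfigs)
// resolves to an array of result objects
myXCrawl
  .crawlPage(['https://xxx.com/xxxx', { url: 'https://xxx.com/xxxx' }])
  .then((res) => {
    res.forEach((item) => item.data.page.close())
  })
```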
### crawlData
@@ -616,2 +645,4 @@
+The res you get will be an object.
**2. DataRequestConfig**
@@ -637,2 +668,4 @@
+The res you get will be an object.
**3. (string | DataRequestConfig)[]**
@@ -654,2 +687,4 @@
+The res you get will be an array of objects.
**4. CrawlDataConfigObject**
@@ -677,4 +712,6 @@
-It can be selected according to the actual situation.
+The res you get will be an array of objects.
+More information about the results can be found in [About results](#About-results); choose the form that fits your actual situation.
### crawlFile
@@ -749,2 +786,4 @@
+The res you get will be an object.
**2. FileRequestConfig[]**
@@ -769,2 +808,4 @@
+The res you get will be an array of objects.
**3. CrawlFileConfigObject**
@@ -792,4 +833,6 @@
-It can be selected according to the actual situation.
+The res you get will be an array of objects.
+More information about the results can be found in [About results](#About-results); choose the form that fits your actual situation.
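A minimal sketch of the object form, following the `requestConfigs` plus `fileConfig.storeDir` call used in this diff's main example (the URLs are placeholders):

```ts
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

// Download each URL in requestConfigs into fileConfig.storeDir;
// res is an array of result objects, one per file
myXCrawl
  .crawlFile({
    requestConfigs: ['https://xxx.com/xxxx.jpg', 'https://xxx.com/xxxx.jpg'],
    fileConfig: { storeDir: './upload' }
  })
  .then((res) => {
    res.forEach((item) => console.log(item.id, item.data))
  })
```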
### startPolling
@@ -831,4 +874,6 @@
-#### IntervalTime
+#### API Config Other
+##### IntervalTime
```ts
@@ -838,3 +883,3 @@ export type IntervalTime = number | { max: number; min?: number }
-#### Method
+##### Method
@@ -865,3 +910,3 @@ ```ts
-#### PageRequestConfigCookies
+##### PageRequestConfigCookies
@@ -875,4 +920,6 @@ ```ts
-#### PageRequestConfig
+#### API Config Request
+##### PageRequestConfig
```ts
@@ -890,3 +937,3 @@ export interface PageRequestConfig {
-#### DataRequestConfig
+##### DataRequestConfig
@@ -907,3 +954,3 @@ ```ts
-#### FileRequestConfig
+##### FileRequestConfig
@@ -924,5 +971,20 @@ ```ts
-#### CrawlPageConfigObject
+#### API Config Crawl
+##### XCrawlBaseConfig
+```ts
+export interface XCrawlBaseConfig {
+  baseUrl?: string
+  timeout?: number
+  intervalTime?: IntervalTime
+  mode?: 'async' | 'sync'
+  proxy?: string
+  maxRetry?: number
+}
+```
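For reference, a sketch of an instance created with every `XCrawlBaseConfig` field from the interface above (all values, including the proxy address, are illustrative placeholders):

```ts
import xCrawl from 'x-crawl'

// Every field is optional; these values are placeholders, not recommendations
const myXCrawl = xCrawl({
  baseUrl: 'https://xxx.com',
  timeout: 10000,
  intervalTime: { max: 3000, min: 1000 },
  mode: 'async',
  proxy: 'http://localhost:7890',
  maxRetry: 1
})
```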
+##### CrawlPageConfigObject
```ts
export interface CrawlPageConfigObject {
@@ -938,3 +1000,3 @@ requestConfigs: (string | PageRequestConfig)[]
-#### CrawlDataConfigObject
+##### CrawlDataConfigObject
@@ -951,3 +1013,3 @@ ```ts
-#### CrawlFileConfigObject
+##### CrawlFileConfigObject
@@ -974,18 +1036,5 @@ ```ts
-#### XCrawlBaseConfig
+##### CrawlPageConfig
-```ts
-export interface XCrawlBaseConfig {
-  baseUrl?: string
-  timeout?: number
-  intervalTime?: IntervalTime
-  mode?: 'async' | 'sync'
-  proxy?: string
-  maxRetry?: number
-}
-```
-#### CrawlPageConfig
```ts
export type CrawlPageConfig =
@@ -998,3 +1047,3 @@ | string
-#### CrawlDataConfig
+##### CrawlDataConfig
@@ -1009,3 +1058,3 @@ ```ts
-#### CrawlFileConfig
+##### CrawlFileConfig
@@ -1016,3 +1065,3 @@ ```ts
-#### StartPollingConfig
+##### StartPollingConfig
@@ -1019,0 +1068,0 @@ ```js